Casperjs实战 · 小路口

Casperjs系列之实践操作。接上一篇Casperjs的理论介绍之后，我们来介绍一下Casperjs如何在实际场景中使用，以及可能会遇到的一些问题。

怎么用？

Casperjs的使用比较简单，基本可以分为三步：“创建并打开一个浏览器”、“进行网页浏览、点击等一系列操作”、“关闭浏览器”。这里仅举两个例子作为介绍，其他更多的用法请参考Casperjs官方API。

网页截图
截图可分为2种，一种是简单的直接截图，一种是先登录再截图。前者很简单，直接调api即可；难点在于后者，即如何登录。
而登录又分为2种。一种是正常填写form表单并提交，由网页跳转完成登录；但这种方式往往会遇到需要填写验证码的情况，尤其是图片验证码，可能需要先进行图片识别，操作会比较复杂。另一种是带着cookie直接打开需要截取的页面；但这种方式也存在弊端，即需要解决cookie过期的问题。

对于表单提交可以使用fill, fillSelectors的方法，关键代码片段如下：

// Create the Casper instance used by every step of this script.
var casper = require('casper').create({
	// Print log messages in real time, down to debug level.
	logLevel: 'debug',
	verbose: true,
	// Author's note: clientScripts does not accept online URLs, only
	// relative or absolute local paths, e.g.:
	// clientScripts: ["includes/jquery.min.js"],
	pageSettings: {
		// User agent string configured for the page.
		userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2593.0 Safari/537.36"
	}
});
// Step 1: start() initialises the browser; the first queued step only
// announces that the run has begun.
casper.start();
casper.then(function() {
	this.echo('....begin.....');
});
// Step 2: set the viewport size, open the page, then print its title.
casper.viewport(1024, 768);
casper.thenOpen('https://www.so.com');
casper.then(function() {
	var pageTitle = this.getTitle();
	this.echo(pageTitle);
});
// Click the link labelled '登录' (login) when the login entry is present;
// otherwise report that we are already logged in and exit.
casper.then(function() {
	var needsLogin = this.exists('#user-login');
	if (!needsLogin) {
		this.echo('已登录');
		this.exit();
		return;
	}
	this.echo('还未登录');
	// click() would work as well, but it needs a more precise selector.
	this.clickLabel('登录', 'a');
});
// Wait until the login form shows up, then fill it in and submit it.
casper.waitForSelector('form.quc-form', function() {
	var loginForm = 'form.quc-form';
	// Maps each input's CSS selector to the value typed into it.
	var credentials = {
		'.quc-input-account': '你的账号',
		'.quc-input-password': '你的密码'
	};
	// Screenshot: the home page before logging in.
	this.captureSelector('a.png', 'body');
	this.echo('登录表单已出现, 即将开始填写表单');
	// Filling the form is a dead end if the site asks for an image captcha.
	// Option 1: address fields by CSS selector; the trailing `true`
	// submits the form after filling it.
	this.fillSelectors(loginForm, credentials, true);
	// Option 2: fill() addresses the fields by their 'name' attribute.
	// this.fill('form.quc-form', {
	// 	'account': 'your account',
	// 	'password': 'your password'
	// }, true);
	// Option 3: submit the form manually from inside the page.
	// this.evaluate(function(){
	// 	document.querySelector('.quc-form').submit();
	// });
});
// Pause for 2000 ms after submitting, then check whether the login worked.
casper.wait(2000, function() {
	var unameSelector = '.login .uname';
	var name = '';
	this.echo('表单填写完成，正在验证.....');
	// The user name element is looked up to verify the login; keep the
	// empty string when it is absent.
	if (this.exists(unameSelector)) {
		name = this.getElementInfo(unameSelector).text;
	}
	this.echo('the name is: ' + name);
	// Screenshot: the home page after logging in.
	this.captureSelector('b1.png', 'body');
});
// Step 3: run() actually executes the steps queued above.
casper.run(function() {
	// Step 4: print the closing banner, then shut the browser down.
	this.echo('===================the progress is over!===================');
	this.exit();
});

另一种是带着cookie打开网页的方法。Casperjs自身没有提供cookie相关的方法，好在我们还可以使用phantom中的方法，代码片段如下：

// Turn on cookie support on the phantom object before adding cookies.
phantom.cookiesEnabled = true;
casper.then(function() {
	// Cookies to inject; replace each placeholder value with the real one.
	var cookiesToAdd = [
		{ name: 'Q', value: '具体的值', domain: '.so.com' },
		{ name: 'T', value: '具体的值', domain: '.so.com' }
	];
	var i;
	for (i = 0; i < cookiesToAdd.length; i += 1) {
		phantom.addCookie(cookiesToAdd[i]);
	}
	// NOTE(review): showAllCookie is not defined in this snippet; it is
	// presumably a helper that prints the current cookies - confirm.
	showAllCookie();
	// The page has to be opened again so that it is loaded with the cookies.
	casper.thenOpen('https://www.so.com', function() {
		var name = '';
		// A visible login entry means we are not logged in yet: click it.
		if (this.exists('#user-login')) {
			this.echo('还未登录');
			// click() would work as well, but it needs a more precise selector.
			this.clickLabel('登录', 'a');
		}
		// The user name element is looked up to verify the login; keep the
		// empty string when it is absent.
		if (this.exists('.login .uname')) {
			name = this.getElementInfo('.login .uname').text;
		}
		this.echo('the name is: ' + name);
		// Screenshot: the home page after logging in.
		this.captureSelector('CB1.png', 'body');
	});
});

爬虫
爬虫的基本思路就是，在网页中根据有序的dom结构获得到自己想要的信息。Casperjs自身提供了很多简单的可以操作dom的api，比如getTitle，getCurrentUrl，getHTML等等。除此之外，还可以在创建Casperjs对象时引入其他脚本，比如jquery。
有时我们没有具体的抓取目标，需要保存页面中的所有链接，并沿着子链接逐层向下抓取内容，这时就可能会连续用到then和thenOpen方法来打开url。这里贴出一段官方的例子：遇到a链接就打开，并在新的页面中继续查找链接并打开，直到页面中没有链接为止：

/*eslint strict:0*/
/*global CasperError, console, phantom, require*/
var casper = require("casper").create({ verbose: true });

// Seed URL list; links discovered while crawling are appended to it.
var links = ["http://www.so.com/"];

// Index into `links` of the next URL to visit.
var currentLink = 0;

// Maximum number of links to visit, read from the first positional CLI
// argument and coerced to an integer with ~~; falls back to 10 when that
// yields 0. Without a limit the crawl could go on forever.
var upTo = ~~casper.cli.get(0) || 10;
// Queue a step that collects the links of the currently loaded page and
// appends them to the shared `links` array.
// (It could be done all in one step, but it is intentionally split.)
//
// Must be invoked with the casper instance as `this`.
// link: URL of the page being scanned; only used in the log message.
function addLinks(link) {
    this.then(function() {
        // evaluate() may hand back null instead of an array when the
        // in-page evaluation fails; treat that as "no links" rather than
        // crashing on `.length`.
        var found = this.evaluate(searchLinks) || [];
        this.echo(found.length + " links found on " + link);
        links = links.concat(found);
    });
}
// Fetch all <a> elements from the page and return the href attribute of
// each one whose href starts with 'http://' or 'https://' (case-insensitive).
// Anchors without an href, and relative or other-scheme links, are skipped.
//
// This function is passed to evaluate(), so it reads the global `document`
// and must not rely on variables from the surrounding script.
// Returns: an array of href strings, in document order.
function searchLinks() {
    var anchors = document.querySelectorAll("a");
    // https was previously rejected, which dropped every https link.
    var absoluteUrl = /^https?:\/\//i;
    // querySelectorAll yields an array-like, hence the borrowed filter.
    return Array.prototype.filter.call(anchors, function(a) {
        return absoluteUrl.test(a.getAttribute("href"));
    }).map(function(a) {
        return a.getAttribute("href");
    });
}
// Open `link` and, once it is loaded, print the page title.
// Must be invoked with the casper instance as `this`.
function start(link) {
    var printTitle = function() {
        var title = this.getTitle();
        this.echo('Page title: ' + title);
    };
    this.start(link, printTitle);
}
// Visit the next link while one exists and the limit has not been reached;
// otherwise report completion and exit.
// Must be invoked with the casper instance as `this`; it passes itself to
// run() again for every link it visits.
function check() {
    var link = links[currentLink];
    if (!link || !(currentLink < upTo)) {
        this.echo("All done.");
        this.exit();
        return;
    }
    this.echo('--- Link ' + currentLink + ' ---');
    start.call(this, link);
    addLinks.call(this, link);
    currentLink += 1;
    this.run(check);
}
// Begin with an empty start(), log a banner, then hand check to run() as
// its completion callback.
casper.start();
casper.then(function() {
    this.echo("Starting");
});
casper.run(check);

我遇到过的问题

viewport的设置:
phantom默认设置是300*400。实践证明在不重写的情况下，不同页面截取下来的大小不一，有时是300*y，有时是x*400。
如果需要根据页面的内容来截取特定大小的图，需要先设置viewport的width、height中的任意一个，再通过计算内容的宽高，动态地再次改变viewport的大小。
不同系统对字体和highdpi支持不好
比如在某些linux系统上无法显示汉字和某些特定的字体，需要重新安装。
截出来的图在highdpi的屏幕上显示会发虚（暂时还没找到解决办法）
不支持多进程，耗时长
虽然可以通过配置loadImages: false、loadPlugins: false来提高页面加载速度，但是在进行数据量较大的爬虫时，与python相比还是有较大差距。
找不到环境变量，如Phantomjs，python
我自己是在使用crontab命令时遇到的，主要原因是环境变量没生效。
解决办法：在命令运行之前，先source一下环境变量
截图在开发机上莫名崩溃
没有原因，不能稳定复现；不是占用内存或cpu过高，事实上才占用百分之几到0点几。
暂且解决办法是，多执行几次截图命令直到截取成功为止。
目前没有找到真正的原因和解决办法
截取vue组件渲染的页面，空白
具体场景：使用webpack来运行一套view，使用thinkjs来运行webpack打包后的view，截取后者view页面时，总是空白
解决办法：去截取webpack运行的view