// 使用 promise 和 co 实现延时处理，实现爬虫逻辑
//以下以抓取某网站电影为例说明
var Promise = require("bluebird");
var request = Promise.promisifyAll(require('request'));
var co = require('co');
//读取dom树
var cheerio = require('cheerio');
// Crawler "class"; behavior is attached to the prototype below.
function Kb(){};
/**
 * Endless crawl loop: shuffles the category id list, then for each category
 * fetches every listing page via getDataLink and pushes the scraped links
 * onto the Redis list 'kb_movie'. After a full pass over all categories it
 * sleeps 24 hours and starts over.
 *
 * NOTE(review): `redis` is not defined anywhere in this file — presumably a
 * module-level client created elsewhere; confirm before running.
 *
 * Fixes vs. the original:
 *  - loop condition `i > 0` skipped arr[0] entirely; now runs down to 0.
 *  - page 1 was fetched twice (once for totalpage, again in the page loop).
 *  - the "抓取完毕" log ran after `i` could be reset, naming the wrong category.
 *  - the delay comment claimed five seconds while the code waits 3000 ms.
 */
Kb.prototype.executeMovieQuene = co.wrap(function*() {
    var self = this;
    var key = 'kb_movie';
    var arr = ['26', '13', '10', '12', '11', '8', '9'];
    // Shuffle so categories are crawled in a different order each run.
    arr.sort(function() { return 0.5 - Math.random(); });
    var i = arr.length - 1;
    while (i >= 0) {
        var cid = arr[i];
        // The first request both reveals the total page count and yields page 1's links.
        var data = yield self.getDataLink(cid, 1);
        var totalpage = data.totalpage;
        console.log('分类 ' + cid + ' 开始进入队列...' + ' 总共:' + totalpage + '页');
        var page = 1;
        while (page <= totalpage) {
            if (page > 1) {
                data = yield self.getDataLink(cid, page);
            }
            yield redis.lpush(key, data.list);
            console.log('抓取分类 ' + cid + ' 第' + page + '页');
            // Throttle: wait 3 seconds between page requests.
            yield Promise.delay(3000);
            page++;
        }
        console.log('分类 ' + cid + ' 抓取完毕...');
        i--;
        if (i < 0) {
            // Finished a full pass over every category: sleep a day, then restart.
            i = arr.length - 1;
            yield Promise.delay(24 * 60 * 60 * 1000);
        }
    }
});
//获取某个分类的总的页数和视频链接
/**
 * Fetch one listing page for a category and scrape it.
 *
 * @param {string} cid  category id
 * @param {number} page 1-based page number
 * @returns {{totalpage: number, list: string[]}} total page count for the
 *   category plus the video detail URLs found on this page; zeros/empty on
 *   any non-200 response.
 *
 * NOTE(review): `cid` and `page` are never used to build the request URL —
 * every call fetches the site root. The real listing URL format is not
 * visible in this file; TODO wire cid/page into `options.url`.
 */
Kb.prototype.getDataLink = function*(cid, page) {
    var options = {
        url: "http://www.ikulive.com",
        headers: {
            "Referer": "http://www.ikulive.com",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest"
        }
    };
    var json = {
        totalpage: 0,
        list: []
    };
    var response = yield request.getAsync(options);
    if (response.statusCode === 200) {
        var $ = cheerio.load(response.body.toString());
        // "#long-page strong" holds text like "共123部.../..."; capture the count.
        var match = /共(.*?)部/.exec($('#long-page').find('strong').text());
        var total = match ? (parseInt(match[1], 10) || 0) : 0;
        var li = $('#contents').find('li');
        // Guard against an empty item list (original divided by zero,
        // producing Infinity/NaN for totalpage).
        json.totalpage = li.length > 0 ? Math.ceil(total / li.length) : 0;
        // Collect each item's detail-page link.
        li.each(function() {
            var url = $(this).find('.play-img').attr('href');
            if (url) {
                json.list.push(url);
            }
        });
    }
    return json;
};
// Export a shared singleton crawler instance.
module.exports = new Kb();
// 2 回复
// bluebird、co + generator、request、cheerio都掌握了也挺难为人的,哈哈
// 之前也做过类似的,用async做的,也还不错