使用promise和co实现延时处理,实现爬虫逻辑
发布于 4 个月前 作者 yxz1025 627 次浏览 来自 分享
//以下以抓取某网站电影为例说明
var Promise = require("bluebird");
var request = Promise.promisifyAll(require('request'));
var co = require('co');
//读取dom树
var cheerio = require('cheerio');

/**
 * Crawler constructor. It takes no arguments and assigns no instance state;
 * the crawler's methods are attached to Kb.prototype.
 */
function Kb() {}
/**
 * Endless crawl loop: walks every category id, pushes the links found on
 * each result page onto the redis list `kb_movie`, then sleeps 24 hours and
 * starts over.
 *
 * Wrapped with co.wrap, so calling it returns a promise. The loop never
 * finishes on its own; the promise only settles (rejects) if a yielded
 * operation fails, because no error is caught here.
 *
 * NOTE(review): `redis` is not defined in the visible part of this file; it
 * must be supplied elsewhere as a client whose `lpush` returns a yieldable
 * (promise) — confirm.
 *
 * @returns {Promise} promise produced by co.wrap for the crawl loop
 */
Kb.prototype.executeMovieQuene = co.wrap(function*() {
    const self = this;
    const key = 'kb_movie';
    const PAGE_DELAY_MS = 3000; // pause between two page requests (3 seconds)
    const CYCLE_DELAY_MS = 24 * 60 * 60 * 1000; // pause between two full passes
    const categories = ['26', '13', '10', '12', '11', '8', '9'];

    // Fisher-Yates shuffle: a random sort comparator gives a biased order.
    for (let j = categories.length - 1; j > 0; j--) {
        const k = Math.floor(Math.random() * (j + 1));
        [categories[j], categories[k]] = [categories[k], categories[j]];
    }

    while (true) {
        // Visit every category, including the one at index 0.
        for (const cid of categories) {
            // The first page also tells us how many pages this category has.
            let data = yield self.getDataLink(cid, 1);
            const totalpage = data.totalpage;

            console.log('分类 ' + cid + ' 开始进入队列...' + ' 总共:' + totalpage + '页');
            for (let page = 1; page <= totalpage; page++) {
                // Page 1 is already loaded above; only fetch the later pages.
                if (page > 1) {
                    data = yield self.getDataLink(cid, page);
                }
                // LPUSH needs at least one element, so skip pages without links.
                if (data.list.length > 0) {
                    yield redis.lpush(key, data.list);
                }
                console.log('抓取分类 ' + cid + ' 第' + page + '页');
                // Throttle requests to the target site.
                yield Promise.delay(PAGE_DELAY_MS);
            }

            // Logged before moving on so it names the category just finished.
            console.log('分类 ' + cid + ' 抓取完毕...');
        }

        // All categories done: wait a day before the next pass.
        yield Promise.delay(CYCLE_DELAY_MS);
    }
});
/**
 * Fetches a listing page and extracts the total page count and the video
 * links found on it.
 *
 * Generator function intended to be yielded inside a co-driven generator.
 *
 * NOTE(review): `cid` and `page` are not used when building the request —
 * the URL below is fixed, so every call fetches the same page. The real
 * per-category/per-page URL scheme is not visible here; confirm and wire the
 * two parameters into `options.url`.
 *
 * @param {string} cid  category id (currently unused, see note above)
 * @param {number} page 1-based page number (currently unused, see note above)
 * @returns {{totalpage: number, list: string[]}} `totalpage` is 0 when the
 *   response status is not 200, when no list items are found, or when the
 *   total cannot be parsed; `list` holds the `href` of each `.play-img`
 *   element found inside `#contents li`.
 */
Kb.prototype.getDataLink = function*(cid, page) {
    const options = {
        url: "http://www.ikulive.com",
        headers: {
            "Referer": "http://www.ikulive.com",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest"
        }
    };
    const json = {
        totalpage: 0,
        list: []
    };

    const response = yield request.getAsync(options);
    if (response.statusCode !== 200) {
        return json;
    }

    const $ = cheerio.load(response.body.toString());
    const strong = $('#long-page').find('strong').text();
    const li = $('#contents').find('li');

    // The first capture group is the text between "共" and "部" (the item total).
    const match = /共(.*?)部(.*?)\/(.*?)/g.exec(strong);
    const total = match ? parseInt(match[1], 10) : 0;

    // Guard the division: with no <li> the quotient would be Infinity or NaN,
    // and an Infinity page count makes the caller's page loop endless.
    if (li.length > 0 && Number.isFinite(total) && total > 0) {
        json.totalpage = Math.ceil(total / li.length);
    }

    // Collect the link of every list item that has one.
    li.each(function() {
        const url = $(this).find('.play-img').attr('href');
        if (url) {
            json.list.push(url);
        }
    });

    return json;
};
// Export a single Kb instance, created when this module is loaded.
module.exports = new Kb();
2 回复

bluebird、co + generator、request、cheerio都掌握了也挺难为人的,哈哈

之前也做过类似的,用async做的,也还不错

回到顶部