精华 博客爬虫技术分享(9.26日更新书籍爬取实例)
发布于 5个月前 作者 yessirpopesama 1970 次浏览 最后一次编辑是 4个月前 来自 分享

前略

最近扑在nodejs上学习研究,尝试各种有趣的功(姿)能(势),根本停不下来。前些天偶听同事谈起爬虫,有感,顺道上网查了些资料,编(改)写了几个nodejs版的爬虫程序作为demo,与大家分享,欢迎拍砖。 (9.21日更新) 新增在线阅读爬虫demo (9.26日更新) 重构novelspider代码结构,增加策略选择机制,针对不同网站选择不同的爬虫策略,强化代码健壮性。

准备

1 几个依赖库: request : https://github.com/mikeal/request 非常实用的HTTP客户端模块 cheerio : https://github.com/cheeriojs/cheerio 服务器端的jQuery核心实现 iconv-lite: https://github.com/ashtuchkin/iconv-lite 实现编码转换的模块,比较常用的功能是把buffer二进制数据按指定编码转成string bufferhelper: https://github.com/JacksonTian/bufferhelper 一个buffer的加强类模块 promise: 实现Promise异步回调后操作功能模块 2 了解正则表达式

demo思路

爬虫思想即为抓取我们所需要的目标元素内的信息。 示例demo博客爬虫程序以抓取新浪博客列表为目的,将抓取的博客列表内容存储到本地文件中。代码中,通过配置引入url,程序分两块,一块为数据抓取,一块为文件读写。程序可判断哪些文章是推荐文章。 示例demo书籍爬虫程序以抓取我们想看的小说为目的,方法与博客爬虫基本一致。唯一需要注意的是编码问题,有不少网站的编码是gb2312,如果按照之前的做法,爬下来的是乱码。为解决这个问题,先获取buffer流,按gbk编码解码成字符串,再通过cheerio获取书籍内容。

博客爬虫实现代码

var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');
var config = require('./config');

/**
 * Writes the collected text to `output.txt`, replacing any previous content.
 *
 * @param {string} data - Text to store in the output file.
 */
function writeIntoFile(data) {
  fs.writeFile('output.txt', data, function(err) {
    if (err) {
      // A value returned from this async callback goes nowhere, so the
      // failure has to be reported here or it is lost silently.
      console.error(err.stack || err);
      return;
    }
    console.log('File write done!');
  });
}

/**
 * Crawls a blog article-list page and then, recursively, each page reachable
 * through the "next page" link, collecting one record per article link.
 *
 * @param {string} url - Address of the list page to fetch.
 * @param {function(?Error, Array<Object>=)} callback - Called with `(err)`
 *   when a request fails, or with `(null, articleList)` where every item has
 *   `title`, `url`, `time`, `hasRecommand` and `id`.
 */
function BlogSpider(url, callback) {
  // Fetch the page with the request module.
  request(url, function(err, res) {
    // Hand the failure to the caller; a bare `return err.stack` would leave
    // the callback (and every outer recursion level) waiting forever.
    if (err) return callback(err);
    var $ = cheerio.load(res.body.toString());

    var articleList = [];
    $('.articleList .articleCell').each(function() {
      var $me = $(this);
      var $title = $me.find('.atc_title a');
      var $time = $me.find('.atc_tm');

      // A cell without a title link has no href; skip it instead of calling
      // .match() on undefined.
      var href = $title.attr('href');
      if (!href) return;

      // Keep only links that point at a blog post and extract the post id.
      var matched = href.match(/blog_([a-zA-Z0-9]+)\.html/);
      if (!matched) return;

      // The recommendation icon, when present, carries the SG_icon class.
      var $img = $me.find('.atc_main .atc_ic_f img');

      articleList.push({
        title: $title.text().trim(),
        url: href,
        time: $time.text().trim(),
        hasRecommand: $img.hasClass('SG_icon'),
        id: matched[1]
      });
    });

    var nextUrl = $('.SG_pgnext a').attr('href');
    if (nextUrl) {
      // Follow the pagination link and append the later pages' articles.
      BlogSpider(nextUrl, function(err, articleList2) {
        if (err) return callback(err);
        callback(null, articleList.concat(articleList2));
      });
    } else {
      callback(null, articleList);
    }
  });
}

// Crawl every list page starting from the configured URL, then write one
// summary line per article to the output file.
BlogSpider(config.url, function(err, articleList) {
  if (err) return console.error(err.stack);
  var summaryLines = articleList.map(function(article) {
    // Articles recommended by Sina get a leading marker.
    var marker = article.hasRecommand ? '荐 ' : '';
    return marker + '发表文章:' + article.title + ' 发表时间: ' + article.time + '\n';
  });
  writeIntoFile(summaryLines.join(''));
});

书籍爬虫实现代码

Spider.js
/**
 * @author lsy
 * Program entry point of NovelSpider.
 */
var novelSpider = require('./NovelSpider');

// Name of the site to crawl; createSpiderServer looks it up among the
// configured targets.
var siteName = 'qidian';
novelSpider.createSpiderServer(siteName);
NovelSpider.js
/**
 * NovelSpider的爬虫模块
 */
var http = require('http');
var config = require('./Config');
var strategy = require('./SpiderStrategy');
var fs = require('fs');
var iconv = require('iconv-lite');
var BufferHelper = require('bufferhelper');
var cheerio = require('cheerio');

/**
 * Writes the given text to `output.txt`, replacing any previous content.
 *
 * @private
 * @param {string} data - Text to store in the output file.
 */
function writeIntoFile(data) {
  fs.writeFile('output.txt', data, function(err) {
    if (err) {
      // A value returned from this async callback goes nowhere, so the
      // failure has to be reported here or it is lost silently.
      console.error(err.stack || err);
      return;
    }
    console.log('File write done!');
  });
}

/**
 * Appends the given text to `output.txt`.
 * Intended for concatenating novel chapters; the original author marked this
 * as not finished yet.
 *
 * @private
 * @param {string} data - Text to append to the output file.
 */
function appendToFile(data) {
  fs.appendFile('output.txt', data, function(err) {
    if (err) {
      // A value returned from this async callback goes nowhere, so the
      // failure has to be reported here or it is lost silently.
      console.error(err.stack || err);
      return;
    }
    console.log('File write done!');
  });
}

/**
 * Decodes a downloaded page, asks the site strategy to extract the main text
 * and appends whatever it returns to the output file.
 *
 * @private
 * @param {Object} buffer - Holder of the raw response bytes; only its
 *   `toBuffer()` method is used.
 * @param {string} objName - Site name; selects the decoding and the strategy.
 */
function getWebsiteContent(buffer, objName) {
  // Only the 'shubao' site is decoded as GBK; everything else is UTF-8.
  var charset = objName === 'shubao' ? 'GBK' : 'utf-8';
  var pageText = iconv.decode(buffer.toBuffer(), charset);

  // The strategy delivers its result through a callback.
  strategy.findStrategy(pageText, objName, function(content) {
    if (!content) {
      console.log('Content spider failed!!!');
      return;
    }
    // Extraction succeeded: persist the text.
    appendToFile(content);
  });
}

/**
 * Downloads a chapter page and passes the collected response body on for
 * content extraction.
 *
 * @private
 * @param {string} url - Address of the chapter page.
 * @param {string} objName - Site name forwarded to getWebsiteContent.
 */
function visitURL(url, objName) {
  var req = http.get(url, function(res) {
    var bufferHelper = new BufferHelper();
    // Collect raw chunks; decoding happens once the whole body has arrived.
    res.on('data', function(chunk) {
      bufferHelper.concat(chunk);
    });
    res.on('end', function() {
      getWebsiteContent(bufferHelper, objName);
    });
    res.on('error', function(err) {
      console.error('Response error for ' + url + ': ' + err.message);
    });
  });
  // Without an 'error' listener, a DNS or connection failure is emitted as an
  // unhandled 'error' event and terminates the process.
  req.on('error', function(err) {
    console.error('Request failed for ' + url + ': ' + err.message);
  });
}

exports.createSpiderServer = function(targetName) {
  // 根据target点查看是否存在爬虫策略
  var findFlag = false;
  config.forEach(function(object) {
    var objName = object.targetName;
    if (objName === targetName) {
      console.log('Spider Strategy ready');
      findFlag = true;
      visitURL(object.url, objName);
    }
  });
  //没有找到爬虫策略
  if (!findFlag) {
    console.log('Spider Strategy Not Found!!!! owo');
  }
}
SpiderStrategy.js
/**
 * 爬虫抓取的策略文件, 根据不同网站类型进行解析
 */
var cheerio = require('cheerio');
var http = require('http');
var BufferHelper = require('bufferhelper');
var iconv = require('iconv-lite');
var Promise = require('promise');

/**
 * Shubao strategy: the chapter text sits directly in the page, inside the
 * element with id `view_content_txt`.
 *
 * @param {string} content - Decoded HTML source of the chapter page.
 * @returns {string} Text of that element with surrounding whitespace removed.
 */
function method_shubao(content) {
    var $page = cheerio.load(content);
    var rawText = $page('#view_content_txt').text();
    return rawText.trim();
}

/**
 * Qidian strategy: the chapter page does not contain the text itself. Its
 * source holds the URL of a .txt file, which is fetched over HTTP and decoded
 * as gb2312.
 *
 * @param {string} content - Decoded HTML source of the chapter page.
 * @returns {Promise} Resolves with the decoded text of the .txt file; rejects
 *   with an Error when no .txt URL is found or the download fails.
 */
function method_qidian(content) {
    return new Promise(function(resolve, reject) {
        // Log every failure and pass the reason on, so the promise always
        // settles and callers can see what went wrong.
        function fail(err) {
            console.log('http get error! ' + (err && err.message ? err.message : err));
            reject(err);
        }

        // No `g` flag: only the first link is wanted. With `g`, match()
        // returns every link and toString() glues them together with commas
        // into an unusable URL.
        var found = /https?:\/\/.*?\.txt/i.exec(content);
        if (!found) {
            fail(new Error('No .txt URL found in page content'));
            return;
        }

        var targetUrl = found[0].trim();
        console.log('txt url : '+ targetUrl);

        try {
            // http.get can throw synchronously for a URL it cannot handle.
            http.get(targetUrl, function(res) {
                var bufferHelper = new BufferHelper();
                res.on('data', function(chunk) {
                    bufferHelper.concat(chunk);
                });
                res.on('end', function() {
                    var finalContent = iconv.decode(bufferHelper.toBuffer(), 'gb2312');
                    console.log(finalContent);
                    resolve(finalContent);
                });
                res.on('error', fail);
            }).on('error', fail);
        } catch (e) {
            fail(e);
        }
    });
}

exports.findStrategy = function(content, targetName, callback) {
    if (targetName === 'shubao') {
        callback(method_shubao(content));
    }
    if (targetName === 'qidian') {
        method_qidian(content).then(function(finalContent){
            callback(finalContent);
        })
    }
}
Config.js
    module.exports = [{
        url: 'http://free.qidian.com/Free/ReadChapter.aspx?bookId=2493293&chapterId=41916349',
        targetName: 'qidian'
        }, {
        url: 'http://www.bookbao.com/view/201208/09/id_XMjg0NTE3.html',
        targetName: 'shubao'
    }]

小结

9月21日完成了书籍爬虫demo程序,甚是高兴,希望能帮到大家。 9月26日完成程序更新,采用策略模式进行对应网站爬取,增加promise异步回调方法。 未完成内容:多页面抓取 PS: 1 代码在别的程序上改进的,不要在意这些细节~.~ 2 我只是随意找了个书籍网站,不要吐槽内容owo 3 第一次发帖 请多关照~

###参考文献 1 http://cnodejs.org/topic/5034b141f767cc9a51baf9b0 2 nodejs实战

20 回复

其实很多人 node 入门也都是爬虫,对有收集癖的人来说挺好的。

比如说这里就是汉纸们的福利。

@xadillax 我最大的爱好也是写爬虫。。。前几天看一个黄网不错,爬了整站 20G 的图片下来

@xadillax lofter没有反爬机制的?

@alsotang 公众场合,收敛点 0. 0

看到精华有些受宠若惊,感谢大家支持~

@xadillax 多谢大神分享,我在写这个也有这个想法,一个命令把图片抓下来,没想到有实现的了。

@xadillax 到处都能看到你。

前阵才写了一个下小说的(本人不看小说),那个站居然还不防爬虫,太爽了。

@coolicer 圈子本来就很小😂

哪个站防爬虫,我想挑战一下,还没遇见过爬不到的。

@alsotang 好人,求分享。。。

更新了书籍爬虫部分

回到顶部