分享一段网页爬虫代码
发布于 2年前 作者 nihgwu 3841 次浏览

看到这么多人对爬虫感兴趣,这里我把我之前写的爬虫代码贴出来,它用于之前的一个采集网站(见《发布一个用nodejs建的小站》)。网站使用geddy框架,不过因为域名备案的问题,目前暂时停掉了。爬虫使用async做任务调度,iconv-lite转换gb2312编码,request请求页面内容,cheerio解析内容数据,gm生成缩略图。代码删掉了部分处理代码,只保留了完整的流程,所以不能正常运行。代码没有半句注释,我认为好的代码就是最好的注释,不过我不是说我代码写得好,本人js菜鸟,只是没有写注释的习惯而已。在此顺便提个问题:怎么在内存中生成缩略图?看了gm的文档好多遍都没有找到解决办法,找了github上其他的图像处理库,在windows下都没法用,只得先保存为临时文件再删除。

var async = require('async');
var iconv=require('iconv-lite');
var request=require('request');
var cheerio=require('cheerio');
var querystring=require('querystring');
var Buffer=require('buffer').Buffer;
var gm=require('gm');
var fs=require('fs');

/**
 * Write a message to stdout, prefixed with the current time formatted
 * as 'YYYY.MM.DD HH:MM:SS' by geddy's date helper.
 *
 * @param {*} str - value to print; it is string-concatenated after the prefix.
 */
var log=function(str){
  var now=new Date();
  var prefix=geddy.date.strftime(now, '%Y.%m.%d %H:%M:%S')+': ';
  console.log(prefix+str);
}

/**
 * Work queue for crawling individual posts.
 *
 * Each task is {board, file, replyCount}. getPost() is started but not
 * awaited; the worker reports completion after a fixed 5 second delay,
 * so the delay acts as the throttle between consecutive post requests.
 *
 * The concurrency is given explicitly (1), matching userQueue and
 * imageQueue, instead of relying on the library default for an omitted
 * argument.
 */
var postQueue=async.queue(function(task,callback){
  log('get post ---> board: ' + task.board + ' file: ' +task.file);
  getPost(task.board,task.file,task.replyCount);
  setTimeout(callback,5000);
},1);

/**
 * Work queue for crawling user profiles, one task at a time.
 *
 * Each task is {userid}. getUser() is started but not awaited; the
 * worker signals completion 5 seconds later.
 */
var userQueue=async.queue(function(job,done){
  var uid=job.userid;
  log('get user ---> userid: ' + uid);
  getUser(uid);
  setTimeout(done,5000);
},1);

/**
 * Work queue for downloading images, up to five tasks at once.
 *
 * Each task is {id, url}; avatar tasks carry only {url}. getImage() is
 * started but not awaited; the worker signals completion 2 seconds later.
 */
var imageQueue=async.queue(function(job,done){
  var target=job.url;
  log('get image ---> url: ' + target );
  getImage(job.id,target);
  setTimeout(done,2000);
},5);

// Session cookie string sent with post and image requests; set by login().
var cookie='';

/**
 * Log in and store the session cookie in the module-level `cookie`.
 *
 * Requests 'bbslogin' with the (blank) id/pw query parameters, decodes
 * the gb2312 XML reply and builds the cookie from its <utmpnum>,
 * <utmpuserid> and <utmpkey> elements.
 *
 * Failures (transport error, non-200 status, or an <error> element in
 * the reply, the same marker getPost and getUser check for) are logged
 * and leave the existing cookie untouched.
 */
function login(){
  var qs=querystring.stringify({
    id:'',
    pw:'',
    xml:1
  });
  request.get('bbslogin?'+qs,{encoding: null},function(err,res,data){
    if(err){
      log('login failed: '+err);
      return;
    }
    if(res.statusCode!=200){
      log('login failed: status '+res.statusCode);
      return;
    }
    var xml=iconv.decode(data,'gb2312');
    if(xml.indexOf('<error>')!==-1){
      // Do not overwrite a possibly valid cookie with empty values.
      log('login failed: server returned an error');
      return;
    }
    var $=cheerio.load(xml);
    var utmpnum= $('utmpnum').text();
    var utmpuserid=$('utmpuserid').text();
    var utmpkey=$('utmpkey').text();
    cookie='utmpnum='+utmpnum+'; utmpuserid='+utmpuserid+'; utmpkey='+utmpkey;
    log("login success");
  })
}

/**
 * Fetch the top-10 feed ('posttop10.xml') and schedule crawls.
 *
 * Only the first three <post> entries are considered. For each one the
 * stored Post is looked up by board and file:
 *   - not stored yet, or stored with a different non-null replyCount:
 *     a task is pushed onto postQueue;
 *   - otherwise only the stored post's crawlTime is refreshed.
 *
 * Request and lookup errors are logged and the affected work is skipped.
 */
function getPosts(){
  request.get('posttop10.xml',{ encoding: null },function(err,res,data){
    if(err){
      log(err);
      return;
    }
    if(res.statusCode!=200){
      log('get posts failed ---> status: '+res.statusCode);
      return;
    }
    var xml=iconv.decode(data,'gb2312');
    var $=cheerio.load(xml);
    $('post').each(function(i,item){
      // Only the first three entries of the feed are crawled.
      if(i>2) return;
      var entry=$(item);
      var board=entry.children('board').text();
      var file=entry.children('file').text();
      // Explicit radix so the count is always read as decimal.
      var replyCount=parseInt(entry.children('reply_count').text(),10);
      geddy.model.Post.first({board:board, file:file},function (err, post) {
        if(err){
          log(err);
          return;
        }
        if(null == post || (post.replyCount!=null&&post.replyCount!=replyCount)){
          log('add post task ---> board: '+board + ', file: ' +file);
          postQueue.push({
            board:board,
            file:file,
            replyCount:replyCount
          })
        }else{
          post.crawlTime=new Date();
          post.save();
        }
      });
    })
  });
}

/**
 * Placeholder: the real processing was removed from this listing, so the
 * input is handed back untouched.
 *
 * @param {*} content - text to process.
 * @returns {*} the same value that was passed in.
 */
function replaceEmotion(content) {
  var unchanged = content;
  return unchanged;
}
/**
 * Placeholder: the real processing was removed from this listing, so the
 * input is handed back untouched.
 *
 * @param {*} content - markup to process.
 * @returns {*} the same value that was passed in.
 */
function purifyContent(content) {
  var unchanged = content;
  return unchanged;
}
/**
 * Placeholder: the argument is ignored in this listing and the current
 * time is returned.
 *
 * @param {*} content - unused here.
 * @returns {Date} a new Date for "now".
 */
function getPostTime(content) {
  return new Date();
}
/**
 * Placeholder: the real processing was removed from this listing, so the
 * input is handed back untouched.
 *
 * @param {*} content - text to process.
 * @returns {*} the same value that was passed in.
 */
function replace(content) {
  var unchanged = content;
  return unchanged;
}
/**
 * Queue a profile crawl for `userid` unless the stored user was already
 * crawled today.
 *
 * The comparison uses the full calendar date, so a crawl from the same
 * day of an earlier month no longer counts as "today", and a stored user
 * without a crawlTime is treated as stale instead of throwing.
 */
function queueUserIfStale(userid){
  geddy.model.User.first({userid:userid},function (err, user) {
    if(err) return;
    var crawled=user ? user.crawlTime : null;
    if(crawled && typeof crawled.toDateString==='function'
      && crawled.toDateString()===new Date().toDateString()) return;
    userQueue.push({
      userid:userid
    });
  });
}

/**
 * True when `text` contains one of the image extensions the crawler
 * downloads (.gif, .jpg, .jpeg, .bmp, .png), ignoring case.
 */
function isImageLink(text){
  var lower=text.toLowerCase();
  var extensions=['.gif','.jpg','.jpeg','.bmp','.png'];
  for(var i=0;i<extensions.length;i++){
    if(lower.indexOf(extensions[i])!==-1) return true;
  }
  return false;
}

/**
 * Crawl one thread and store it.
 *
 * Requests 'bbsnewtcon' with the session cookie, decodes the gb2312 XML
 * and walks its <article> elements in order. The first article is saved
 * as the Post, every following one as a Comment with its floor number.
 * Links whose text looks like an image URL are registered as Image
 * records, queued on imageQueue and rewritten to point at
 * '/images/<id>'. Authors of newly created posts/comments are queued on
 * userQueue unless they were already crawled today.
 *
 * @param {string} board - board name of the thread.
 * @param {string} file - file name identifying the thread.
 * @param {number} replyCount - reply count stored on the Post; supplied
 *   by the caller, not derived from the number of articles seen here.
 */
function getPost(board,file,replyCount){
  var qs=querystring.stringify({
    board:board,
    file:file,
    xml:1
  });
  var r = request.defaults({ encoding: null, headers: { cookie: cookie} });
  r.get('bbsnewtcon?'+qs,function(err,res,data){
    if(err)
    {
      log(err);
      return;
    }
    if(res.statusCode==403){
      // Re-login on 403 was commented out in the original; kept disabled.
      return;
    }
    if(res.statusCode!=200){
      log('get post failed ---> board: ' + board + ' file: ' +file + ' status: ' +res.statusCode);
      return;
    }
    var xml=iconv.decode(data,'gb2312');
    // !== -1 so an <error> at the very start of the reply is caught too.
    if(xml.indexOf('<error>')!==-1){
      login();
      return;
    }
    var $=cheerio.load(xml);
    // Filled in once the first article (the post itself) has been saved,
    // then copied onto every comment.
    var postid,posttitle;
    var floor=0;
    async.eachSeries($('article'),function(item,callback1){
      var title=$(item).children('title').html();
      var content=$(item).children('content').text();
      var userid=$(item).children('owner').text();
      var filename=$(item).children('filename').text();
      var crawlTime=new Date();
      var replyTo=null;
      if(floor>0){
        // Replies may open with a quote header naming the user replied to.
        var reg=/【 在 (\w+) [\w\W]*的大作中提到: 】/;
        var quoted=content.match(reg);
        if(quoted!=null){
          replyTo=quoted[1];
          content=content.replace(reg,'');
        }
      }
      content = replace(content);
      var postTime = getPostTime(content);
      var $$=cheerio.load(content);
      async.each($$('a'),function(link,callback){
        // The link text is used as the image URL.
        var url=$$(link).html() || '';
        if(!isImageLink(url)){
          callback();
          return;
        }
        geddy.model.Image.first({url:url} , function(err,image){
          if(null==image){
            image = geddy.model.Image.create({
              url:url
            })
          }
          image.save(function(err){
            if(err){
              // Best effort: leave the link as it is and keep going.
              log(err);
              callback();
              return;
            }
            log('add image task ---> id: '+image.id + ', url: ' +url);
            imageQueue.push({
              id:image.id,
              url:url
            });
            var src='/images/'+ image.id;
            $$(link).html('<div/><img class="img" src="'+ src +'" />');
            $$(link).attr('href',src+'.jpg').attr('alt',src+'.jpg');
            callback();
          });
        });
      },function(){
        content=$$.html();
        content = purifyContent(content);
        if(floor==0){
          geddy.model.Post.first({board:board,file:file} , function(err,post){
            if(null==post){
              post = geddy.model.Post.create({
                board : board ,
                file : file ,
                title : title
              });
              queueUserIfStale(userid);
            }
            post.userid = userid;
            post.postTime = postTime;
            post.crawlTime=crawlTime;
            post.title = title;
            post.content = content;
            post.replyCount = replyCount;
            post.save(function(err){
              if(err){
                // Stop the series: comments would otherwise be stored
                // without a post id.
                callback1(err);
                return;
              }
              postid=post.id;
              posttitle=post.title;
              floor++;
              callback1();
            })
          });
        }
        else{
          geddy.model.Comment.first({board:board,file:filename} , function(err,comment){
            if(null==comment){
              comment = geddy.model.Comment.create({
                board : board ,
                file : filename
              });
              queueUserIfStale(userid);
            }
            comment.postid=postid;
            comment.posttitle=posttitle;
            comment.replyTo=replyTo;
            comment.userid = userid;
            comment.postTime = postTime;
            comment.crawlTime=crawlTime;
            comment.title = title;
            comment.content = content;
            comment.floor = floor;
            comment.save(function(err){
              if(err){
                callback1(err);
                return;
              }
              floor++;
              callback1();
            });
          });
        }
      })
    },function(err){
      if(!err)
        log('got post ---> board: ' + board + ' file: ' +file);
      else
        log('got post error ---> board: ' + board + ' file: ' +file);
    })
  })
}

/**
 * Crawl one user profile, store it, and queue the user's avatar.
 *
 * Requests 'bbsqry', decodes the gb2312 XML, reads the fields under
 * <userinfo>, creates or updates the matching User record and, once it
 * is saved, pushes the avatar URL
 * ('faceimg/<first letter upper-cased>/<userid>.jpg') onto imageQueue.
 *
 * Failed requests, <error> replies, replies without <userinfo> and
 * failed saves end the crawl for this user; all but the <error> reply
 * are logged.
 *
 * @param {string} userid - id to query; the id reported in the reply is
 *   preferred for storage, falling back to this value when it is empty.
 */
function getUser(userid){
  var qs=querystring.stringify({
    userid:userid,
    xml:1
  });
  request.get('bbsqry?'+qs,{ encoding: null },function(err,res,data){
    if(err || res.statusCode!=200){
      log('get user failed ---> userid: '+userid);
      return;
    }
    var xml=iconv.decode(data,'gb2312');
    // !== -1 so an <error> at the very start of the reply is caught too.
    if(xml.indexOf('<error>')!==-1)  return;
    var info=cheerio.load(xml)('userinfo').html();
    if(info==null){
      // No <userinfo> element: nothing to parse.
      log('get user failed ---> no userinfo for userid: '+userid);
      return;
    }
    var $=cheerio.load(info);
    var textOf=function(tag){ return $(tag).text(); };
    // Explicit radix so numeric fields are always read as decimal.
    var intOf=function(tag){ return parseInt($(tag).text(),10); };

    var uid=textOf('userid') || userid;
    // Turn the first 19 characters of the date into 'Y-M-D ...' form
    // before parsing.
    var lastloginstr=textOf('lastlogin').substring(0,19).replace('年', '-').replace('月', '-').replace('日', ' ');
    var fields={
      nick : replaceEmotion(textOf('nick')),
      horoscope : textOf('horoscope'),
      lastlogin : geddy.date.parse(lastloginstr),
      strposts : textOf('strposts'),
      strnetage : textOf('strnetage'),
      strexp : textOf('strexp'),
      strmoney : textOf('strmoney'),
      strmedals : textOf('strmedals'),
      duty : textOf('duty'),
      individual : textOf('individual'),
      plans : replace(textOf('plans')),
      numlogins : intOf('numlogins'),
      gender : intOf('gender'),
      newmail : intOf('newmail'),
      numposts : intOf('numposts'),
      netage : intOf('netage'),
      life : intOf('life'),
      exp : intOf('exp'),
      money : intOf('money'),
      medals : intOf('medals'),
      crawlTime : new Date()
    };
    geddy.model.User.first({userid:uid},function (err, user) {
      if(null==user){
        user = geddy.model.User.create({
          userid : uid
        })
      }
      Object.keys(fields).forEach(function(key){
        user[key]=fields[key];
      });
      user.save(function(err){
        if(err){
          log('save user failed ---> userid: '+uid);
          return;
        }
        log('got user ---> userid: '+uid);
        var url='faceimg/'+uid.substring(0,1).toUpperCase()+'/'+uid+'.jpg';
        imageQueue.push({
          url:url
        })
      });
    });
  });
}

/**
 * Read a temporary file into a Buffer and delete it afterwards.
 *
 * fs.unlink is always given a callback (current Node versions throw
 * without one) and its outcome is ignored: a file that was never written
 * is already reported through the read error.
 *
 * @param {string} tmpfile - path of the temporary file.
 * @param {function(Error, Buffer)} callback - receives the read error or
 *   the file contents.
 */
function readAndRemoveTempFile(tmpfile,callback){
  fs.readFile(tmpfile,function(readErr,buf){
    fs.unlink(tmpfile,function(){
      callback(readErr,readErr ? null : buf);
    });
  });
}

/**
 * Download an image and store it together with a thumbnail.
 *
 * With an `id` (post image): the bytes are stored on the existing Image
 * record; when the image is wider than 600px a 600px-wide thumbnail is
 * rendered through a temporary file named after the id and saved on the
 * same record.
 *
 * Without an `id` (avatar): a 100x100 thumbnail is rendered through a
 * temporary file named after the last URL segment, and the Image record
 * for `url` is created or updated with both data and thumbnail (the
 * thumbnail is null when rendering fails).
 *
 * NOTE(review): newer gm releases may offer an in-memory toBuffer(),
 * which would remove the temporary file; confirm against the installed
 * gm version before switching.
 *
 * @param {*} id - Image record id, or a falsy value for avatars.
 * @param {string} url - address to download.
 */
function getImage(id,url){
  var r = request.defaults({ encoding: null, headers: { cookie: cookie} });
  r.get(url,function(err,res,data){
    if(err)
    {
      log(err);
      return;
    }
    if(res.statusCode!=200) return;

    if(id){
      geddy.model.Image.first({id:id} , function(err,image){
        if(err || null==image){
          // The record was expected to exist; nothing to attach data to.
          log('image not found ---> id: '+id +' url: '+url);
          return;
        }
        image.data=data;
        image.crawlTime=new Date();
        image.save();
        log('saved image ---> id: '+id +' url: '+url);
        var tmpfile=id;
        gm(data).size({bufferStream:true},function(err,size){
          // Only images wider than 600px get a thumbnail.
          if(err || !(size.width>600)) return;
          this.resize(600,size.height*600/size.width)
            .write(tmpfile,function(err){
              if(err) return;
              readAndRemoveTempFile(tmpfile,function(err,buf){
                if(err){
                  log(err);
                  return;
                }
                image.thumbnail=buf;
                image.save();
                log('thumbnailed image ---> id: '+id +' url: '+url);
              });
            })
        })
      });
      return;
    }

    var tmpfile=url.substring(url.lastIndexOf('/')+1);
    // Strip the extension; the previous "-1" also cut the last character
    // of the user id.
    var dot=tmpfile.lastIndexOf('.');
    var userid=dot===-1 ? tmpfile : tmpfile.substring(0,dot);
    async.waterfall([
      function(callback){
        gm(data).thumb(100,100,tmpfile,100,function(err){
          if(err){
            callback(null,data,null);
            return;
          }
          readAndRemoveTempFile(tmpfile,function(readErr,buf){
            callback(null,data,readErr ? null : buf);
          });
        })
      },function(data,thumbnail,callback){
        geddy.model.Image.first({url:url},function(err,image){
          if(null == image){
            image=geddy.model.Image.create({
              data:data,
              thumbnail:thumbnail,
              url:url,
              crawlTime:new Date()
            })
          }else{
            image.data=data;
            image.thumbnail=thumbnail;
            image.url=url;
            image.crawlTime=new Date();
          }
          // Report completion so the final callback actually runs.
          image.save(function(err){
            callback(err);
          });
        })
      }
    ],function(err){
      if(!err)
        log('saved gravatar ---> userid: '+userid );
      else
        log(err);
    })
  });
}

exports.getCookie=function(){return cookie;};
exports.postQueue=postQueue;
exports.userQueue=userQueue;
exports.imageQueue=imageQueue;
exports.run=function(){
  login();
  async.whilst(
    function () { return true; },
    function (callback) {
      if(postQueue.tasks.length==0){
        log('request top10 posts');
        getPosts();
      }
      setTimeout(callback, 1000*60*5);
    },
    function (err) {
      log(err);
    }
  );
};
6 回复

nodejs专注于爬虫,图片交给云,我的小站就是这样处理~~图片我用了七牛云储存~ http://www.17qingsong.com

感觉不错的样子,不过官网怎么没找到价位表呢

@neavo 还没正式推出~~估计是在这个月正式推出~现在有3个月试用期~

@xieren58 大概看了一眼API,针对爬虫类网站的话得先手动抓到服务器上再上传至云端?

@neavo 有api,可以全自动啦~

爬虫用Ruby是不是更好呢?

回到顶部