我正在学习编写爬虫程序,遇到一个问题:read.js 可以正常工作,但 save.js 不能工作。要爬取的页面就是咱们这个 cnodejs.org,想要获取的内容是话题列表(list)。其中 read.js 负责读取页面内容,save.js 负责把结果保存到数据库中;目前数据库已经配置好,也经过测试,可以正常使用。运行 work.js 时会出现下面的情况:save.js 始终不能正常运行。请各位哥哥姐姐指教,谢谢了!
c:\node\lianxi\cnode\update>node work.js
read is completely
c:\node\lianxi\cnode\update>
>read.js如下:
var request = require('request');
var cheerio = require('cheerio');
var debug = require('debug')('cnode:update:read');
//读取帖子列表
exports.topicList = function(url,callback){
debug('get the topic list :%s',url);
request(url,function(err,res){
if(err){
return callback(err);
}
//根据网页内容创建DOM对象
var $ = cheerio.load(res.body.toString());
//读取topic列表
var topicList = [];
$('.topic_list .cell').each(function(){
var $me = $(this);
var $title = $me.find('topic_title_wrapper a');
var $time = $me.find('last_time last_active_time');
var item = {
title : $title.text().trim(),
url : $title.attr('href'),
time : $time.text().trim()
};
//从URL中提取出id
var s = item.url.match(/([a-zA-Z0-9]+)/);
if(Array.isArray(s)){
item.id = s[0];
topicList.push(item);
}
});
});
};
>save.js如下:
var async = require('async');
var db = require('../config').db;
var debug = require('debug')('cnode:update:save');
//保存帖子列表
exports.topicList = function(list,callback){
debug('save the topic list :%s',list.length);
async.eachSeries(list,function(item,next){
//查询文章是否已经存在
db.query('select * from topic_list where id = ? limit 1',[item.id],function(err,data){
if(err){
return next(err);
}
//将发布时间改成时间戳
var created_time = new Date(item.time).getTime()/1000;
if(Array.isArray(data) && data.length > 0){
//分类已存在,更新一下
db.query('update topic_list set title = ?,url =?,created_time =? where id =?',[item.title,item.url,created_time,item.id],next);
}else{
//分类不存在,添加
db.query('insert into topic_list(id,title,url,created_time) values(?,?,?,?)',[item.id,item.title,item.url,created_time],next);
}
});
},callback);
};
>work.js如下:
var async = require('async');
var config = require('../config');
var read = require('./read');
var save = require('./save');
var debug = require('debug')('cnode:update:all');
// Topics fetched by the first step and consumed by the second.
var topicList;
// Run the two steps strictly in order: fetch the topic list, then store it.
async.series([
  // Step 1: fetch the topic list.
  function (done) {
    read.topicList(config.cnode.url, function (err, list) {
      if (err) {
        // Return so a failed read does not fall through to the success path.
        return done(err);
      }
      topicList = list;
      // Log only once the read has actually finished.
      console.log('read is completely');
      // Signal completion; async.series will not start the next step
      // until `done` is called.
      done();
    });
  },
  // Step 2: save the topic list to the database.
  function (done) {
    save.topicList(topicList, function (err) {
      if (err) {
        return done(err);
      }
      // Log only once the save has actually finished.
      console.log('save is completely');
      done();
    });
  }
], function (err) {
  if (err) {
    // Fall back to the raw value in case a non-Error was passed along.
    console.log(err.stack || err);
  }
  console.log('completely');
  // Exit code 0 on success, 1 on failure.
  process.exit(err ? 1 : 0);
});