node初级学,不知道怎么用node爬取动态网页?有没有哪位大神分享下实例?
抓取动态内容无非就是构造请求,你可以参考我下面的这个例子,虽然内容不是动态的,但关联页面的id是动态获取的,道理一样。
// NOTE(review): Nightmare example — this `yield` is only valid inside a
// generator function driven by a runner (e.g. vo/co); it is a fragment,
// not a runnable top-level script.
yield Nightmare()
.goto('http://yahoo.com')
.type('input[title="Search"]', 'github nightmare')
.click('.searchsubmit');
thank you @tower1229 @atian25
正好以前分析某网站营收情况时用过,因为那边没有做反爬,url什么的打了点马赛克。 其实用起来非常简单的,就是 new Crawler({…}); 选项里传递 callback 处理 result, 然后 c.queue(url); 就行了。
var sqlite3 = require('sqlite3').verbose();
var Crawler = require("crawler");
var url = require('url');
// Open the sqlite-backed store used by the rest of the script.
var db = initDB();

// Shared crawler instance, throttled so we stay polite to the target site.
var c = new Crawler({
  maxConnections: 2,
  rateLimit: 2000,
  skipDuplicates: true,
  timeout: 5000,
  // Global callback; `result` is a node http.IncomingMessage.
  callback: function (err, result, done) {
    if (err) {
      // NOTE(review): errors are only logged; `done` is not invoked here,
      // matching the original behavior.
      console.log(err);
      return;
    }
    handleResult(result, done);
    console.log(result.request.uri.href);
    done();
  }
});

// Seed the queue with the 11 top-level category listing pages.
var i;
for (i = 1; i <= 11; i++) {
  c.queue('http://www.some.com/list/c' + i + '/');
}

// When the queue drains, dump what we collected and shut the DB down.
c.on('drain', function () {
  dumpDB();
  closeDB();
  console.log("finished.");
});
// Open (or create) the sqlite file, ensure the target table exists, and
// return a small store object: { db, stmtCourse, saveCourse }.
function initDB() {
  var conn = new sqlite3.Database('some_table.sqlite');
  var store;
  conn.serialize(function () {
    conn.run(
      'CREATE TABLE IF NOT EXISTS some_table ( url TEXT UNIQUE, c1 TEXT, c2 TEXT, t1 TEXT, p1 TEXT, l1 TEXT, v1 INT, t2 TEXT, s1 INT, i1 TEXT, r1 INT, r2 TEXT, r3 TEXT)',
      function (err) {
        if (err) {
          console.log(err);
        } else {
          console.log("table created");
        }
      }
    );
    store = {
      db: conn,
      // Prepared once, reused for every row; 13 placeholders = 13 columns.
      stmtCourse: conn.prepare('INSERT INTO some_table VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'),
      // Persist one scraped record; insert errors (e.g. duplicate url
      // against the UNIQUE constraint) are logged and otherwise ignored.
      saveCourse: function save(r) {
        this.stmtCourse.run(
          r.url, r.c1, r.c2, r.t1,
          r.p1, r.l1, r.v1, r.t2,
          r.s1, r.i1, r.r1, r.r2, r.r3.toString(),
          (err) => { if (err) console.log('ignore ' + r.url, err); }
        );
      }
    };
  });
  // serialize() runs its callback synchronously, so `store` is assigned
  // by the time we return.
  return store;
}
// Tear down the store: release the prepared statement, then close the
// underlying sqlite connection.
function closeDB() {
  var stmt = db.stmtCourse;
  stmt.finalize();
  db.db.close();
}
// Print every stored row (url, title, student count) to the console.
function dumpDB() {
  db.db.each("SELECT url, t1, s1 FROM some_table", function(err, row) {
    // BUG FIX: the original ignored `err`, so a query failure printed
    // `undefined` instead of the actual error.
    if (err) {
      console.log(err);
    } else {
      console.log(row);
    }
  });
}
// Debug helper: echo all thirteen fields of one scraped record in the
// same order as the table columns.
function dumpRecord(r) {
  var fields = [
    r.url, r.c1, r.c2, r.t1,
    r.p1, r.l1, r.v1, r.t2,
    r.s1, r.i1, r.r1, r.r2, r.r3
  ];
  console.log.apply(console, fields);
}
// Entry point for listing pages: log which category page we landed on,
// then hand off to the category-page processor. `done` is accepted but
// unused — the global crawler callback invokes it after we return.
function handleResult(res, done) {
  var $ = res.$;
  var title = $("title").text();
  console.log("category: " + title + " at " + res.request.uri.href);
  processCategoryPage(res, $);
}
// Process one category listing page: queue its "next page" link and all
// sub-category links. `done` is accepted but never used.
function processCategoryPage(res, $, done) {
// cat 1~11 are the top-level categories, cat12+ are second-level ones;
// /?catetagid=n pages are attribute tags under a second-level category.
processNextPage(res, $)
processCate(res, $)
}
// If the listing page has an "a.next" pager link, queue that page for
// crawling as well.
function processNextPage(res, $) {
  var nextHref = $("a.next").attr('href');
  if (!nextHref) {
    return;
  }
  console.log('page:' + nextHref);
  c.queue("http://www.some.com" + nextHref);
}
// Queue every sub-category link (".cate_tit") found on a listing page.
// Each queued request carries its own callback, so the crawler's global
// callback is NOT invoked for these detail pages.
function processCate(res, $) {
  // IDIOM FIX: the original used .map() purely for side effects;
  // .each() expresses the intent and avoids building a throwaway result.
  $(".cate_tit").each(function (idx, item) {
    c.queue([{
      uri: "http://www.some.com" + item.attribs.href,
      callback: function (error, res, done) {
        if (error) {
          console.log(error);
        } else {
          // savePage4Debug(res); // uncomment to snapshot raw HTML locally
          try {
            processCourseDetails(res);
          } catch (e) {
            // A broken detail page must not kill the whole crawl.
            console.log("error in course detail!", e);
          }
        }
        done();
      }
    }]);
  });
}
// Snapshot the raw HTML of a fetched page into the working directory,
// under a filesystem-safe name derived from the page URL.
function savePage4Debug(res) {
  var fs = require('fs');
  var fname = res.request.uri.href.replace(/[:_/.]/g, '_');
  // BUG FIX: writeFileSync takes (file, data, options); the original
  // passed a stray 4th argument `null`, which was silently ignored.
  fs.writeFileSync(fname, res.body, 'utf8');
}
// Scrape one course-detail page and persist a record via db.saveCourse.
// Throws on unexpected markup; the caller wraps us in try/catch.
function processCourseDetails(res) {
  var $ = res.$;
  // Breadcrumb texts — [0] is the home link, [1]/[2] the category levels.
  // Renamed from `c` to avoid shadowing the global crawler instance.
  var crumbs = $(".nav-link").map((idx, item) => ($(item).text().trim()));
  var t1 = $('.title-txt').text().trim();
  var t2Url = $('.a-info>.pic-box>a').attr('href');
  var t2 = $('.a-info>.cell>a').text().trim();
  var p1 = $('#details-topfixed .price-num').text().trim();
  if (p1.startsWith('¥'))
    p1 = p1.substring(1).trim();
  // Difficulty label lives in the first text node under ".difficullty".
  var l1Node = $('.difficullty')[0];
  var l1 = 'N/A';
  if (l1Node && l1Node.childNodes[0])
    // BUG FIX: the original read `l1.childNodes[0]` — i.e. off the
    // string 'N/A' — which always threw a TypeError here.
    l1 = l1Node.childNodes[0].nodeValue.trim();
  else
    console.log("no l1 info: " + res.request.uri.href);
  // Student count; text like "1.2万" means 1.2 * 10000.
  var s1CountText = $('.counter-st>.counter-num').text().trim();
  var s1 = Number.parseFloat(s1CountText);
  if (s1CountText.endsWith('万')) {
    s1 = s1 * 10000;
  }
  // substring(1) drops the first character of the counter text —
  // presumably a leading bracket/symbol; TODO confirm the page format.
  var r1 = Number.parseInt($('#courseEval .section-title-num').text().trim().substring(1));
  var courseIntro = $('#sectionIntro #hideSummary').attr('value');
  var v1 = Number.parseInt($('.tit-outline-num').text().trim().substring(1));
  var r2 = $('.star-txt').text().trim();
  var r3 = []; // per-aspect scores: 学习收获、互动氛围、授课老师、课程内容
  // BUG FIX: the original assigned into an undeclared `rate_parts`,
  // which threw; the scores were meant to go into r3.
  $('.evaluate-score').map((idx, s) => { r3[idx] = $(s).text(); });
  db.saveCourse({
    url: res.request.uri.href,
    t2: t2 + '(' + t2Url + ')',
    // [0] is the home crumb; the catetag seems unavailable on detail
    // pages — revisit if the analysis ever needs it.
    c1: crumbs[1], c2: crumbs[2],
    t1: t1, p1: p1, l1: l1, v1: v1,
    // BUG FIX: the original wrote `i1: c1` (an undeclared name); the
    // scraped intro text was clearly the intended value.
    s1: s1, i1: courseIntro,
    r1: r1, r2: r2, r3: r3
  });
}
@tower1229 @atian25 ok谢谢,我现在先看了一下 网络爬虫与数据库操作,然后去理解两位大神的。
@Jackzhangpan Chrome render 可以满足你的需求 https://github.com/gwuhaolin/chrome-render
@i5ting 请问一下,利用node-crawler爬虫时,如何利用爬下来的第一层数据进入到下一层
@zhoujinhai 数据缓存起来,遍历不就可以了么
@i5ting 不好意思啊,又来问了,第二层我的会报错。 error: CRAWLER Error Error: ETIMEDOUT when fetching https://www.youtube.com/playlist?list=PL3ZQ5CpNulQk8-p0CWo9ufI81IdrGoyNZ
// Reformatted from the reply above. BUG FIX: the forum replaced the
// straight quotes with curly quotes (‘ ’), which are syntax errors;
// restored plain ASCII quotes and conventional layout. Assumes a
// node-crawler instance `c` is in scope.
function video(categoryList) {
  console.log(categoryList);
  c.queue([{
    uri: categoryList,
    forceUTF8: true,
    // The global callback won't be called
    callback: function (error, res, done) {
      var $ = res.$;
      var s = $('#pl-header').text();
      console.log(s);
      done();
    }
  }]);
}

var catelist = 'https://www.youtube.com/playlist?list=PL3ZQ5CpNulQk8-p0CWo9ufI81IdrGoyNZ';
video(catelist);