描述
- 本月刚接触nodejs,想用它做一个网页爬虫
- 因为有的网站没有header就不能爬,所以要加上header才可以。
- 但是总是加上会很麻烦,所以考虑把这部分获取网页代码的功能提取出来。
//helper.js
var request = require('request');
var iconv = require('iconv-lite');
function _get (_url,type,callback){
request({url:_url,headers:{'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Accept':'text/html;q=0.9,*/*;q=0.8',
'Accept-Charset':'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Connection':'close',
'Referer':'None'},encoding:null}, function (err, res,body) {
if (err) return callback(err);
// 根据网页内容创建DOM操作对象
body = iconv.decode(body, 'gbk');
// console.log(body);
// return body;
callback(body);
});
};
exports.get = get;
测试代码如下
//test.js
var helper = require('./helper.js');
var cheerio = require('cheerio');
var body = helper.get("http://taobao.com");
var $ = cheerio.load(body);
console.log(body);
结果:
node test.js undefined Iconv-lite warning: decode()-ing strings is deprecated. Refer to https://github.com/ashtuchkin/iconv-lite/wiki/Use-Buffers-when-decoding
TypeError: undefined is not a function at Request._callback (d:\workplace\crawler\update\helper.js:34:4) at Request.self.callback (d:\workplace\crawler\node_modules\request\request.js:129:22) at Request.emit (events.js:98:17) at Request.<anonymous> (d:\workplace\crawler\node_modules\request\request.js:873:14) at Request.emit (events.js:117:20) at IncomingMessage.<anonymous> (d:\workplace\crawler\node_modules\request\request.js:824:12) at IncomingMessage.emit (events.js:117:20) at _stream_readable.js:943:16 at process._tickCallback (node.js:419:13)
如果是callback的话,就是上面那个结果,如果是return的话,就只有一句undefined。
接触nodejs以来,一直对这方面不太明白,求各位大神指点一二。。。