首页 > 技术文章 > node爬虫 Crawler superagent

fm060 2017-01-19 15:05 原文

 

先获取各入口链接,再分别获取每个链接的内容:

 

// Listing crawler: scans a search-results page, collects the link of every
// result row, empties the `news` table, then hands the collected links to the
// `link` crawler (declared further down in this file) for detail scraping.
var c = new Crawler({
    maxConnections : 1000,
    /**
     * Called once for each crawled listing page.
     *
     * @param {Error|null} error - request/parse error reported by the crawler
     * @param {Object} res - crawler response; `res.$` is the page's selector API
     * @param {Function} done - must be invoked once this page is fully handled
     */
    callback : function (error, res, done) {
        if (error) {
            console.log(error);
            // Release the crawler slot even when the request failed.
            done();
            return;
        }

        var $ = res.$;
        var items = [];
        $('#resultList .el').each(function (idx, element) {
            var plink = $(element).find('.t1 a').attr('href');
            // Rows without an anchor (e.g. a header row) yield no href.
            if (plink != null) {
                items.push(plink);
            }
        });

        var connection = mysql.createConnection({
            host : 'localhost',
            user : 'root',
            password : 'root',
            database: 'node',
        });

        connection.connect();
        var userDelSql = 'DELETE FROM news ';
        connection.query(userDelSql, function (err, result) {
            if (err) {
                console.log('[DELETE ERROR] - ', err.message);
            } else {
                console.log('-------------DELETE--------------');
            }
            // Close the connection; the callback keeps a failure to close
            // (e.g. after a fatal connect error) from surfacing as an
            // unhandled 'error' event.
            connection.end(function (endErr) {
                if (endErr) {
                    console.log('[CONNECTION END ERROR] - ', endErr.message);
                }
            });
            // Queue the detail pages only after the DELETE has finished, so
            // rows inserted by `link` cannot be wiped by a still-pending DELETE.
            // As before, the links are queued whether or not the DELETE succeeded.
            link.queue(items);
            done();
        });
    }
});

var ttt=[];

/**
 * Replace hexadecimal numeric character references (`&#x...;`) in `html`
 * with the characters they denote. All other text, including semicolons that
 * are not part of such a reference, is left untouched.
 *
 * @param {string} html - markup that may contain `&#x...;` references
 * @returns {string} the markup with those references decoded
 */
function decodeHexEntities(html) {
    return html.replace(/&#x([0-9a-fA-F]+);/g, function (match, hex) {
        var codePoint = parseInt(hex, 16);
        // Leave out-of-range references as they are instead of throwing.
        if (codePoint > 0x10FFFF) {
            return match;
        }
        return String.fromCodePoint(codePoint);
    });
}

// Detail crawler: scrapes one job page and stores it as a row in `news`.
var link = new Crawler({
    maxConnections : 1000,
    forceUTF8:true,
    /**
     * Called once for each crawled detail page.
     *
     * @param {Error|null} error - request/parse error reported by the crawler
     * @param {Object} res - crawler response; `res.$` is the page's selector API
     * @param {Function} done - must be invoked once this page is fully handled
     */
    callback : function (error, res, done) {
        if (error) {
            console.log(error);
            // Release the crawler slot even when the request failed.
            done();
            return;
        }

        var $ = res.$;
        var $company = $('.cname').find('a');
        var cname = $company.attr('title');
        var clink = $company.attr('href');
        var cpric = $('.cn').find('strong').text();
        var caad = $('.bmsg.inbox').find('.fp').eq(2).text().trim();

        // .html() returns null when the description element is missing;
        // fall back to an empty string instead of throwing on .replace().
        var rawHtml = $('.bmsg.job_msg.inbox').html() || '';
        // Strip all whitespace, then turn &#x...; references back into text.
        var tt = decodeHexEntities(rawHtml.replace(/\s/g, ""));

        var connection = mysql.createConnection({
            host : 'localhost',
            user : 'root',
            password : 'root',
            database: 'node',
        });

        connection.connect();
        var userAddSql = 'INSERT INTO news(title,link,txt,pric,aad) VALUES(?,?,?,?,?)';
        var userAddSql_Params = [cname,clink,tt,cpric,caad];
        connection.query(userAddSql, userAddSql_Params, function (err, result) {
            if (err) {
                console.log('[INSERT ERROR] - ', err.message);
            } else {
                console.log('-------INSERT----------');
            }
            // One connection is opened per page, so it has to be closed here;
            // the callback keeps a failure to close from surfacing as an
            // unhandled 'error' event.
            connection.end(function (endErr) {
                if (endErr) {
                    console.log('[CONNECTION END ERROR] - ', endErr.message);
                }
            });
            done();
        });
    }

});

// Render the 'bb' view, passing the literal string "ttt" as `code`.
// NOTE(review): `res` is not defined in the visible code — presumably the
// response object of an enclosing route handler; confirm against the caller.
res.render('bb', { code: "ttt" });

// Start the crawl: queue a single search.51job.com results URL on crawler `c`.
c.queue('http://search.51job.com/list/030200,000000,0000,00,9,06%252C07%252C08%252C09%252C10,%25E5%2589%258D%25E7%25AB%25AF%25E5%25BC%2580%25E5%258F%2591%25E5%25B7%25A5%25E7%25A8%258B%25E5%25B8%2588,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=03%2C04%2C05%2C06%2C07&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=%E5%91%A8%E6%9C%AB%E5%8F%8C%E4%BC%91');

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

// Fetch the chapter index page, turn every entry under `#list dd` into an
// <li><a> link, append it to `cont`, and render the 'bb' view with the result.
// NOTE(review): `ul`, `cont`, `res` and `next` come from the enclosing scope,
// which is not visible here — presumably a URL prefix, an HTML accumulator
// string, and the route handler's response/next arguments; confirm.
superagent.get('http://www.37zw.com/0/330/')
    .end(function (err, sres) {
        // Standard error handling: pass the failure on to `next`.
        if (err) {
            return next(err);
        }
        // sres.text holds the page's HTML. Passing it to cheerio.load yields
        // an object implementing the jQuery API, conventionally named `$`;
        // everything after that is plain jQuery.
        var $ = cheerio.load(sres.text);
        $('#list dd').each(function (idx, element) {
            var $anchor = $(element).find('a');
            // The opening tag is closed with '>' and the anchor with '</a>',
            // so the emitted markup is well-formed.
            var li = "<li><a href='" + ul + $anchor.attr('href') + "'>" + $anchor.text() + "</a></li>";
            console.log(li);
            cont += li;
        });
        res.render('bb', { title: cont });
    });

 

 

 

 

将章节加入数据库:

// Insert one (title, cont) row into the table named `book<sid>` and log the
// outcome.
// NOTE(review): `sid` is concatenated straight into the SQL text; confirm it
// never comes from untrusted input.
var chapterInsertSql = 'INSERT INTO book' + sid + '(title,cont) VALUES(?,?)';
connection.query(
    chapterInsertSql,
    [_title, _cont],
    function (queryError, queryResult) {
        // Guard clause: report the failure and stop.
        if (queryError) {
            console.log('[ERROR] - ', queryError.message);
            return;
        }
        console.log(_title + "章节成功加入");
    }
);

推荐阅读