这篇文章介绍的是一个后端爬虫项目,它是在我之前写的前端爬虫的基础上优化而来的。
点击查看前端爬取的方法
这个实例代码需要运行在node环境下
First, install the dependency ‘node-xlsx’:
npm i node-xlsx
Main code:
//http://c.v.qq.com/vchannelinfo?otype=json&uin=ea1b1b8626787db2d198c58fb86eb4dc&qm=1&pagenum=1&num=50&sorttype=0&orderflag=0&callback=callback&low_login=1&_=1554648724958
const
http = require('http'),
fs = require('fs'),
xlsx = require('node-xlsx');
// Rows collected for the spreadsheet. The first row is the header
// (title, view count, upload time); one row per video is pushed after it.
let list = [['标题', '阅读量', '时间']];
// Page number used for the next request; startRequest appends it to the URL
// as `pagenum` and increments it after each page.
let pageNum = 1;
/**
 * Builds a single-sheet workbook named "Tencent Video Reading" from the given
 * rows and writes it to ./harvest/tencent/new.xlsx, overwriting any existing
 * file at that path.
 *
 * @param {Array<Array>} data rows of cell values, passed to node-xlsx as the sheet data
 */
const writeXlsx = data => {
  const outputDir = './harvest/tencent';
  const buffer = xlsx.build([{
    name: 'Tencent Video Reading',
    data
  }]);
  // writeFileSync does not create missing parent directories; without this a
  // fresh checkout fails with ENOENT after all pages were already fetched.
  fs.mkdirSync(outputDir, { recursive: true });
  fs.writeFileSync(`${outputDir}/new.xlsx`, buffer, { flag: 'w' }); // generate the Excel file
};
// Multipliers for the unit suffix on a play-count string.
// NOTE(review): only '万' appears in the original code; '亿' is assumed to be
// used for larger counts — confirm against real responses.
const PLAY_COUNT_UNITS = { '万': 1e4, '亿': 1e8 };

/**
 * Converts a play-count value such as "1.2万" or "523" to a number.
 * The multiplier is applied only when the unit suffix is actually present,
 * and the product is rounded to hide float artefacts (1.15 * 10000).
 *
 * @param {String|Number} raw play count as delivered by the API
 * @returns {Number} numeric play count (NaN when the value is not numeric)
 */
const parsePlayCount = raw => {
  const text = String(raw).trim();
  const multiplier = PLAY_COUNT_UNITS[text.slice(-1)];
  if (multiplier === undefined) {
    return Number(text);
  }
  return Math.round(Number(text.slice(0, -1)) * multiplier);
};

/**
 * Extracts and parses the JSON argument of a JSONP response of the form
 * `name({...})`, with an optional trailing semicolon.
 *
 * @param {String} body raw response text
 * @returns {*} the parsed JSON value
 * @throws {Error} when the body is not a call expression or its argument is not valid JSON
 */
const parseJsonpPayload = body => {
  const open = body.indexOf('(');
  const close = body.lastIndexOf(')');
  if (open === -1 || close <= open) {
    throw new Error('Unexpected response: not a JSONP call');
  }
  return JSON.parse(body.slice(open + 1, close));
};

/**
 * Fetches one page of the video list, appends its rows to the module-level
 * `list`, and then either requests the next page or writes the spreadsheet.
 * The current page number is the module-level `pageNum`.
 *
 * @param {Number} page last page number to fetch
 * @param {String} url request URL without the `pagenum` parameter
 */
const startRequest = (page, url) => {
  // Build this page's URL locally: `url` must stay untouched so the recursive
  // call does not accumulate `&pagenum=1&pagenum=2...`.
  const pageUrl = `${url}&pagenum=${pageNum}`;
  // Issue a GET request with the http module.
  http.get(pageUrl, res => {
    let body = ''; // accumulates the whole response text
    res.setEncoding('utf-8'); // avoid garbled Chinese characters
    // Collect the response chunk by chunk.
    res.on('data', chunk => {
      body += chunk;
    });
    // Process the page once the whole response has arrived.
    res.on('end', () => {
      let videos;
      try {
        // SECURITY: the original ran eval() on this remote plain-HTTP response,
        // which lets the server (or anyone on the path) execute code here.
        // The JSONP wrapper is unwrapped and parsed as data instead.
        const payload = parseJsonpPayload(body);
        videos = payload && Array.isArray(payload.videolst) ? payload.videolst : [];
      } catch (err) {
        console.log(err);
        return;
      }
      videos.forEach(({
        title,
        play_count,
        uploadtime
      }) => {
        // NOTE(review): as written both replacements are no-ops; the patterns
        // were presumably full-width punctuation originally — confirm.
        const cleanTitle = title.replace(/,/g, ",").replace(/:/g, ":");
        list.push([cleanTitle, parsePlayCount(play_count), uploadtime]);
      });
      // `>=` also terminates for page < 1; an empty page means there is
      // nothing further to fetch.
      if (pageNum >= page || videos.length === 0) {
        writeXlsx(list);
        console.log('Complete')
        return;
      }
      pageNum++;
      startRequest(page, url);
    });
  }).on('error', err => {
    console.log(err);
  });
};
/**
 * Script entry point: calls startRequest(page, url), which fetches pages
 * starting at the module-level page counter until that counter reaches `page`.
 *
 * @param {Number} page page number at which fetching stops (3 here)
 * @param {String} url URL to request; startRequest appends the `pagenum` parameter itself
 */
startRequest(3, "http://c.v.qq.com/vchannelinfo?otype=json&uin=ea1b1b8626787db2d198c58fb86eb4dc&qm=1&num=24&sorttype=0&orderflag=0&low_login=1&_=1554648724958&callback=callback"); // start the main program
或者克隆我的项目:
//克隆项目
git clone https://github.com/JimmieMax/explore-spider
//进入项目目录
cd explore-spider
//安装依赖
npm i
//运行程序
node new-tencent-spider