Node.js 爬虫的简单实现
const fs = require('fs');
const http = require('http');
const https = require('https');
const URL = require('url');
const gbk = require('gbk');
const { JSDOM } = require('jsdom');
/**
 * Minimal image scraper.
 *
 * `get` downloads a listing page, reads the `data-original` attribute of every
 * `.Left_list_cont2 img` element and hands each one to `set`, which downloads
 * the resource and writes it under `img/`.
 *
 * `Getdata.a` counts every request attempt, retries included.
 */
class Getdata {
  /** Number of extra attempts made after a failed request before giving up. */
  static get maxRetries() {
    return 3;
  }

  /**
   * Choose the transport module for a URL and split out the request target.
   *
   * @param {string} url - URL to request.
   * @returns {{http: object, hostname: ?string, path: ?string, port: ?string}}
   *   `http` is the `http` module for `http:` URLs and the `https` module for
   *   everything else; the remaining fields come from the parsed URL.
   */
  static http(url) {
    const parsed = URL.parse(url);
    return {
      http: parsed.protocol === 'http:' ? http : https,
      hostname: parsed.hostname,
      path: parsed.path,
      port: parsed.port,
    };
  }

  /**
   * Fetch a listing page and download every lazily-loaded image it references.
   *
   * @param {string} url - Address of the listing page.
   * @param {string} Filename - Prefix for saved files; the image index is appended.
   */
  get(url, Filename) {
    this._fetch(url, (body) => {
      // The raw Buffer is passed on purpose so the DOM library can pick the
      // character encoding itself (NOTE(review): relies on jsdom's encoding
      // sniffing - confirm for GBK pages).
      const dom = new JSDOM(body);
      const images = dom.window.document.querySelectorAll('.Left_list_cont2 img');
      for (let i = 0; i < images.length; i++) {
        const src = images[i].getAttribute('data-original');
        if (!src) {
          // No lazy-load source on this element: nothing to download.
          continue;
        }
        // Resolve against the page URL so relative and protocol-relative
        // sources end up with a usable host.
        this.set(URL.resolve(url, src), `${Filename}${i}`, 'jpg');
      }
    });
  }

  /**
   * Download a resource and save it as `img/<Filename>.<kz>`.
   *
   * @param {string} url - Address of the resource.
   * @param {string} Filename - Target file name without extension.
   * @param {string} [kz='html'] - File extension.
   */
  set(url, Filename, kz = 'html') {
    this._fetch(url, (body) => {
      const target = `img/${Filename}.${kz}`;
      fs.writeFile(target, body, (err) => {
        if (err) {
          console.error(`failed to write ${target}: ${err.message}`);
          return;
        }
        console.log('成功了');
      });
    });
  }

  /**
   * Request `url`, buffer the whole body and pass it to `onBody`.
   *
   * Non-200 responses and request errors are logged and the same request is
   * retried, at most `attemptsLeft` more times.
   *
   * @param {string} url - Address to request.
   * @param {function(Buffer): void} onBody - Receives the complete body of a 200 response.
   * @param {number} [attemptsLeft=Getdata.maxRetries] - Remaining retries.
   */
  _fetch(url, onBody, attemptsLeft = Getdata.maxRetries) {
    Getdata.a += 1;
    const app = Getdata.http(url);
    const options = { hostname: app.hostname, path: app.path };
    if (app.port) {
      options.port = app.port;
    }

    // Guards against handling the same attempt twice (e.g. an error event
    // arriving after the response was already processed).
    let finished = false;

    const fail = (...details) => {
      if (finished) {
        return;
      }
      finished = true;
      console.log(Getdata.a);
      console.log(...details);
      if (attemptsLeft > 0) {
        this._fetch(url, onBody, attemptsLeft - 1);
      } else {
        console.error(`giving up on ${url}`);
      }
    };

    const req = app.http.request(options, (res) => {
      if (res.statusCode !== 200) {
        // Drain the unwanted body so the socket is released.
        res.resume();
        fail(res.statusCode, res.headers);
        return;
      }
      const chunks = [];
      res.on('data', (chunk) => {
        chunks.push(chunk);
      });
      res.on('error', (err) => {
        fail(err.message);
      });
      res.on('end', () => {
        if (finished) {
          return;
        }
        finished = true;
        onBody(Buffer.concat(chunks));
      });
    });
    req.on('error', (err) => {
      fail(err.message);
    });
    req.end();
  }
}
Getdata.a = 0;
更多精彩

