// Base URL that every category path is appended to.
let baseUrl = 'http://blog.huqing.site'

// The three category index pages to crawl (paths are already percent-encoded).
let categories = [
  '/categories/%E5%89%8D%E7%AB%AF-FrontEnd/',
  '/categories/%E5%B7%A5%E5%85%B7-Tools/',
  '/categories/%E7%AE%97%E6%B3%95-Algorithm/',
]

/**
 * Downloads `baseUrl + url` and resolves with whatever `parseHtml` returns
 * for the response body.
 *
 * @param {string} url - Path appended verbatim to `baseUrl`.
 * @returns {Promise<*>} Resolves with the result of `parseHtml(body)`;
 *   rejects if the request or response stream errors, or if `parseHtml`
 *   throws.
 */
function getCatagory(url) {
  return new Promise((resolve, reject) => {
    console.log(`正在爬取:${url}`)
    http
      .get(`${baseUrl}${url}`, (res) => {
        // Keep the raw chunks and decode once at the end: decoding chunk by
        // chunk corrupts multi-byte UTF-8 characters that straddle a chunk
        // boundary, and starting from an uninitialised string would prefix
        // the body with "undefined".
        const chunks = []
        res.on('data', (chunk) => {
          chunks.push(chunk)
        })
        res.on('error', reject)
        res.on('end', () => {
          // A throw inside this callback would otherwise escape the promise
          // and leave it pending forever.
          try {
            resolve(parseHtml(Buffer.concat(chunks).toString('utf8')))
          } catch (error) {
            reject(error)
          }
        })
      })
      .on('error', (error) => {
        console.log('获取数据出错!')
        reject(error)
      })
  })
}
处理HTML获取需要的内容
这里使用到了 cheerio 这个库,它的使用方法几乎和 jQuery 一样。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
/**
 * Extracts the title and publication time of every post listed in a page.
 *
 * Each `.post-header` element yields one entry: the title is the text of
 * `.post-title-link span`, and the time is the `content` attribute of
 * `.post-meta time` (`undefined` when that attribute is absent).
 *
 * @param {string} html - HTML markup handed to `cheerio.load`.
 * @returns {Array<{title: string, time: (string|undefined)}>} One entry per
 *   `.post-header`, in document order.
 */
function parseHtml(html) {
  const $ = cheerio.load(html)
  return $('.post-header')
    .toArray()
    .map((header) => {
      const section = $(header)
      return {
        title: section.find('.post-title-link span').text(),
        time: section.find('.post-meta time').attr('content'),
      }
    })
}