- 在商品信息示例中,将抓取频率的默认值从 2 小时调整为 8 小时,以提高抓取效率
- 增强了日志输出,添加了更清晰的抓取和保存商品信息的提示,提升了用户体验
- 在爬虫配置中,添加了乐天平台的选择器和请求头信息,确保爬虫能够正确抓取乐天商品信息
- 这些更改旨在提高代码可读性和爬虫的功能性,符合 SOLID 原则和命名规范

master
axios.defaults.baseURL = serverClient.baseURL; | axios.defaults.baseURL = serverClient.baseURL; | ||||
axios.defaults.timeout = serverClient.timeout; | axios.defaults.timeout = serverClient.timeout; | ||||
// 获取商品列表和抓取配置的执行频率 | |||||
const frequency = 1000 * 60 * 60 * 24; // 24小时 | |||||
// 设置抓取配置 | // 设置抓取配置 | ||||
const config = { | const config = { | ||||
platform: "amazon", // 平台 | platform: "amazon", // 平台 | ||||
needScreenshot: true, // 是否需要截图 | needScreenshot: true, // 是否需要截图 | ||||
monitorFrequency: 2, // 监控频率(小时) | |||||
monitorFrequency: 8, // 监控频率(小时) | |||||
goodsList: [], // 商品列表 | goodsList: [], // 商品列表 | ||||
isRunning: false, // 是否正在执行抓取任务 | isRunning: false, // 是否正在执行抓取任务 | ||||
timer: null, // 定时器 | timer: null, // 定时器 | ||||
*/ | */ | ||||
async function fetchProductInfo(goods, isRetry = false) { | async function fetchProductInfo(goods, isRetry = false) { | ||||
try { | try { | ||||
console.log(`${isRetry ? "重试" : "开始"}抓取商品: ${goods.goodsSkuSn}`); | |||||
console.log(`\n`); | |||||
console.log(`3️⃣ ${isRetry ? "重试" : "开始"}抓取商品: ${goods.goodsSkuSn}`); | |||||
const productInfo = await localClient.getProductInfo({ | const productInfo = await localClient.getProductInfo({ | ||||
url: goods.goodsSkuUrl, | url: goods.goodsSkuUrl, | ||||
platform: goods.platform, | platform: goods.platform, | ||||
needScreenshot: config.needScreenshot, | needScreenshot: config.needScreenshot, | ||||
}); | }); | ||||
console.log( | |||||
`${isRetry ? "重试" : "商品"} 抓取成功: ${goods.goodsSkuSn} - ${new Date().toLocaleString()}` | |||||
); | |||||
console.log(productInfo); | |||||
console.log(`\n`); | |||||
console.log(`----------------抓取结果----------------`); | |||||
console.log(`抓取商品: ${productInfo[0].title}`); | |||||
console.log(`抓取SKU : ${goods.goodsSkuSn}`); | |||||
console.log(`基准价格: ${goods.initPrice}`); | |||||
console.log(`抓取价格: ${productInfo[0].price}`); | |||||
console.log(`----------------------------------------`); | |||||
console.log(`\n`); | |||||
return productInfo; | return productInfo; | ||||
} catch (error) { | } catch (error) { | ||||
console.error( | console.error( | ||||
async function saveProductInfo(goods, productInfo) { | async function saveProductInfo(goods, productInfo) { | ||||
try { | try { | ||||
console.log( | console.log( | ||||
`开始保存商品信息: ${goods.goodsSkuSn} - ${new Date().toLocaleString()}` | |||||
`4️⃣ 开始保存商品信息: ${goods.goodsSkuSn} - ${new Date().toLocaleString()}` | |||||
); | ); | ||||
const { title, price, sku, screenshotUrl } = productInfo[0]; | const { title, price, sku, screenshotUrl } = productInfo[0]; | ||||
const res = await axios.post( | const res = await axios.post( | ||||
screenshotUrl: screenshotUrl, | screenshotUrl: screenshotUrl, | ||||
} | } | ||||
); | ); | ||||
console.log(res.data); | |||||
console.log(`\n`); | |||||
console.log(res.data.success ? `✅️ ${goods.goodsSkuSn} 保存成功` : `❌️ ${goods.goodsSkuSn} 保存失败`); | |||||
console.log(`\n`); | |||||
return true; | return true; | ||||
} catch (saveError) { | } catch (saveError) { | ||||
console.log(`\n`); | |||||
console.error( | console.error( | ||||
`商品信息保存失败: ${goods.goodsSkuSn} - ${new Date().toLocaleString()}`, | |||||
`保存失败: ${goods.goodsSkuSn} - ${new Date().toLocaleString()}`, | |||||
saveError.message | saveError.message | ||||
); | ); | ||||
console.log(`\n`); | |||||
return false; | return false; | ||||
} | } | ||||
} | } | ||||
*/ | */ | ||||
async function fetchConfig() { | async function fetchConfig() { | ||||
try { | try { | ||||
console.log(`开始获取抓取配置: ${new Date().toLocaleString()}`); | |||||
console.log(`\n\n\n\n`); | |||||
console.log(`1️⃣ 开始获取抓取配置`); | |||||
const res = await axios.get(serverClient.baseURL + "/system/operationWarnconfig/noVerifyList", { | const res = await axios.get(serverClient.baseURL + "/system/operationWarnconfig/noVerifyList", { | ||||
params: serverClient.params, | params: serverClient.params, | ||||
}); | }); | ||||
console.log(res.data); | |||||
console.log(`阈值: ${res.data.rows[0].priceChangeThreshold}`); | |||||
console.log(`频率: ${res.data.rows[0].monitorFrequency}小时`); | |||||
const { rows } = res.data; | const { rows } = res.data; | ||||
if (rows.length > 0) { | if (rows.length > 0) { | ||||
config.monitorFrequency = rows[0].monitorFrequency; | config.monitorFrequency = rows[0].monitorFrequency; | ||||
} else { | } else { | ||||
config.monitorFrequency = 2; // 默认2小时 | |||||
config.monitorFrequency = 8; // 默认8小时 | |||||
} | } | ||||
console.log(`抓取频率设置为 ${config.monitorFrequency} 小时`); | |||||
return true; | return true; | ||||
} catch (error) { | } catch (error) { | ||||
console.error(`获取抓取配置失败: ${new Date().toLocaleString()}`, error.message); | console.error(`获取抓取配置失败: ${new Date().toLocaleString()}`, error.message); | ||||
console.log(`---------------------------------------------------------`); | |||||
return false; | return false; | ||||
} | } | ||||
} | } | ||||
} | } | ||||
config.isRunning = true; | config.isRunning = true; | ||||
console.log(`开始执行抓取任务: ${new Date().toLocaleString()}`); | |||||
try { | try { | ||||
const res = await axios.get(serverClient.baseURL + "/system/operationGoods/noVerifyList", { | const res = await axios.get(serverClient.baseURL + "/system/operationGoods/noVerifyList", { | ||||
params: { | params: { | ||||
...serverClient.params, | ...serverClient.params, | ||||
isDisabled: 1, | |||||
isDisabled: 0, | |||||
}, | }, | ||||
}); | }); | ||||
console.log(res.data); | |||||
console.log(`\n`); | |||||
console.log(`2️⃣ 开始获取商品列表, 共${res.data.rows.length}个商品`); | |||||
const d = res.data.rows; | |||||
d.forEach((row, index) => { | |||||
console.log(`(${index + 1} / ${d.length})`); | |||||
console.log(`商品名称: ${row.goodsSkuName}`); | |||||
console.log(`商品SKU : ${row.goodsSkuSn}`); | |||||
console.log(`基准价格: ${row.initPrice}`); | |||||
console.log(`备注: ${row.remark}`); | |||||
}); | |||||
const { rows } = res.data; | const { rows } = res.data; | ||||
config.goodsList = rows; | config.goodsList = rows; | ||||
await processProduct(goods); | await processProduct(goods); | ||||
} | } | ||||
console.log("所有商品抓取完成", new Date().toLocaleString()); | |||||
console.log("✌️ 所有商品抓取完成", new Date().toLocaleString()); | |||||
} catch (error) { | } catch (error) { | ||||
console.error(`获取商品列表失败: ${new Date().toLocaleString()}`, error.message); | |||||
console.error(`⛔️ 获取商品列表失败: ${new Date().toLocaleString()}`, error.message); | |||||
} finally { | } finally { | ||||
config.isRunning = false; | config.isRunning = false; | ||||
} | } | ||||
// 设置定时器,根据monitorFrequency的小时数定时执行 | // 设置定时器,根据monitorFrequency的小时数定时执行 | ||||
const intervalMs = config.monitorFrequency * 60 * 60 * 1000; // 转换为毫秒 | const intervalMs = config.monitorFrequency * 60 * 60 * 1000; // 转换为毫秒 | ||||
console.log(`\n\n\n\n`); | |||||
console.log(`设置定时任务,每 ${config.monitorFrequency} 小时执行一次,下次执行时间: ${new Date(Date.now() + intervalMs).toLocaleString()}`); | console.log(`设置定时任务,每 ${config.monitorFrequency} 小时执行一次,下次执行时间: ${new Date(Date.now() + intervalMs).toLocaleString()}`); | ||||
console.log(`\n\n\n\n --------------------------------------------------------------------------------------------------------- \n\n\n\n`); | |||||
config.timer = setInterval(async () => { | config.timer = setInterval(async () => { | ||||
console.log(`定时任务触发: ${new Date().toLocaleString()}`); | console.log(`定时任务触发: ${new Date().toLocaleString()}`); | ||||
// 重新获取配置(频率可能会改变) | // 重新获取配置(频率可能会改变) | ||||
// 启动调度器 | // 启动调度器 | ||||
startScheduler(); | startScheduler(); | ||||
// 输出启动信息 | |||||
console.log(`抓取服务已启动: ${new Date().toLocaleString()}`); | |||||
console.log(`初始监控频率: ${config.monitorFrequency} 小时`); |
}, | }, | ||||
page: { | page: { | ||||
locale: 'ja-JP', | locale: 'ja-JP', | ||||
userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' | |||||
} | } | ||||
}, | }, | ||||
delay: 6000 | delay: 6000 | ||||
}, | }, | ||||
headers: { | headers: { | ||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', | |||||
'Accept-Language': 'ja-JP,ja;q=0.9,en-US;q=0.8,en;q=0.7' | |||||
} | } | ||||
}, | }, | ||||
// 乐天配置(预留) | |||||
// 乐天配置 | |||||
rakuten: { | rakuten: { | ||||
selectors: {}, | |||||
selectors: { | |||||
title: '.normal_reserve_item_name', | |||||
price: 'span[irc="Price"] .number-display--3s4mj', | |||||
coupon: '', | |||||
variants: '' | |||||
}, | |||||
timeouts: { | timeouts: { | ||||
pageLoad: 5000, | |||||
elementWait: 1000, | |||||
networkIdle: 500 | |||||
pageLoad: 60000, | |||||
elementWait: 10000, | |||||
networkIdle: 50000 | |||||
}, | }, | ||||
retry: { | retry: { | ||||
maxAttempts: 3, | maxAttempts: 3, | ||||
delay: 1000 | |||||
delay: 6000 | |||||
}, | |||||
headers: { | |||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', | |||||
'Accept-Language': 'ja-JP,ja;q=0.9,en-US;q=0.8,en;q=0.7' | |||||
} | } | ||||
}, | }, | ||||
const AmazonCrawler = require('./crawlers/amazon/AmazonCrawler'); | const AmazonCrawler = require('./crawlers/amazon/AmazonCrawler'); | ||||
const RakutenCrawler = require('./crawlers/rakuten/RakutenCrawler'); | |||||
const CrawlerError = require('./errors/CrawlerError'); | const CrawlerError = require('./errors/CrawlerError'); | ||||
/** | /** | ||||
switch (platform.toLowerCase()) { | switch (platform.toLowerCase()) { | ||||
case 'amazon': | case 'amazon': | ||||
return new AmazonCrawler(config); | return new AmazonCrawler(config); | ||||
// 可以在这里添加其他平台的爬虫 | |||||
// case 'rakuten': | |||||
// return new RakutenCrawler(config); | |||||
case 'rakuten': | |||||
return new RakutenCrawler(config); | |||||
// case 'yahoo': | // case 'yahoo': | ||||
// return new YahooCrawler(config); | // return new YahooCrawler(config); | ||||
default: | default: |
const BaseCrawler = require("../base/BaseCrawler"); | |||||
const CrawlerError = require("../../errors/CrawlerError"); | |||||
const path = require("path"); | |||||
const fs = require("fs"); | |||||
const fsPromises = require("fs").promises; | |||||
const axios = require("axios"); | |||||
const FormData = require("form-data"); | |||||
/**
 * Crawler for Rakuten product pages.
 *
 * Drives the browser page provided by BaseCrawler (`this.page`,
 * `initBrowser()`, `closeBrowser()`), extracts title / price / SKU from a
 * single product page and, optionally, uploads a full-page screenshot.
 */
class RakutenCrawler extends BaseCrawler {
  /**
   * @param {Object} config - Platform configuration.
   * @param {Object} config.selectors - CSS selectors (`title`, `price`, ...).
   * @param {Object} config.timeouts - Timeouts in ms (`pageLoad`, `elementWait`).
   * @param {Object} config.retry - Retry policy (`maxAttempts`, `delay` in ms).
   * @param {Object} [config.common] - Shared settings; `common.upload` holds
   *   the screenshot upload settings (`url`, `scene`, `timeout`).
   */
  constructor(config) {
    super(config);
    // The whole config used to be dumped to stdout here; that debug output
    // was removed because it prints upload endpoints on every instantiation.
    this.selectors = config.selectors;
    this.timeouts = config.timeouts;
    this.retryConfig = config.retry;
    this.uploadConfig = config.common?.upload;
  }

  /**
   * Ensure the local screenshot directory exists.
   * @returns {Promise<string>} Absolute path of the screenshot directory.
   * @throws {CrawlerError} SCREENSHOT_DIR_ERROR when the directory cannot be created.
   */
  async createScreenshotDir() {
    const dir = path.join(process.cwd(), "screenshots");
    try {
      await fsPromises.mkdir(dir, { recursive: true });
    } catch (error) {
      if (error.code !== "EEXIST") {
        throw new CrawlerError(
          "创建截图目录失败",
          "SCREENSHOT_DIR_ERROR",
          "rakuten",
          error
        );
      }
    }
    return dir;
  }

  /**
   * Upload an image file to the configured upload endpoint.
   * @param {string} imagePath - Path of the image on disk.
   * @returns {Promise<string>} URL of the uploaded image (`response.data.url`).
   * @throws {CrawlerError} IMAGE_UPLOAD_ERROR on missing configuration,
   *   transport failure or an unexpected response body.
   */
  async uploadImage(imagePath) {
    // `config.common.upload` is optional in the constructor, so fail with a
    // clear error instead of a TypeError on `undefined.scene`.
    if (!this.uploadConfig?.url) {
      throw new CrawlerError(
        "图片上传失败: 缺少上传配置",
        "IMAGE_UPLOAD_ERROR",
        "rakuten",
        new Error("config.common.upload is not configured")
      );
    }

    try {
      const formData = new FormData();
      formData.append("file", fs.createReadStream(imagePath));
      formData.append("scene", this.uploadConfig.scene);

      const response = await axios.post(this.uploadConfig.url, formData, {
        // getHeaders() already returns the multipart Content-Type *including
        // the boundary*. Overriding it with a bare "multipart/form-data"
        // strips the boundary and makes the body unparseable by the server.
        headers: formData.getHeaders(),
        timeout: this.uploadConfig.timeout,
      });

      if (!response.data || !response.data.url) {
        throw new Error("上传响应格式错误");
      }
      return response.data.url;
    } catch (error) {
      const message = error.response
        ? `图片上传失败: ${error.response.status} ${error.response.statusText}`
        : "图片上传失败";
      throw new CrawlerError(message, "IMAGE_UPLOAD_ERROR", "rakuten", error);
    }
  }

  /**
   * Apply the per-page settings this crawler relies on: default timeouts and
   * request interception that drops font and media requests (images are
   * still loaded so screenshots stay complete).
   *
   * Must be re-applied whenever the page is re-created.
   * @returns {Promise<void>}
   */
  async configureRakutenPage() {
    const blockedResourceTypes = ["font", "media"];

    this.page.setDefaultTimeout(this.timeouts.elementWait);
    this.page.setDefaultNavigationTimeout(this.timeouts.pageLoad);

    await this.page.route("**/*", (route) => {
      const isBlocked = blockedResourceTypes.includes(
        route.request().resourceType()
      );
      // Return the promise so it is not left floating.
      return isBlocked ? route.abort() : route.continue();
    });
  }

  /**
   * Navigate to `url`, retrying with a fresh browser on failure.
   * @param {string} url - Target URL.
   * @returns {Promise<void>}
   * @throws {CrawlerError} NAVIGATION_ERROR after `retry.maxAttempts` failures.
   */
  async navigateWithRetry(url) {
    const { maxAttempts, delay } = this.retryConfig;
    let lastError;

    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
      try {
        await this.page.goto(url, {
          waitUntil: "networkidle",
          timeout: this.timeouts.pageLoad,
        });
        return;
      } catch (error) {
        lastError = error;
        console.log(`导航尝试 ${attempt}/${maxAttempts} 失败:`, error.message);

        if (attempt < maxAttempts) {
          console.log(`等待 ${delay}ms 后重试...`);
          await new Promise((resolve) => setTimeout(resolve, delay));

          // Restart the browser before the next attempt.
          await this.closeBrowser();
          await this.initBrowser();
          // NOTE(review): assumes initBrowser() replaces this.page with a
          // fresh page, which would otherwise lose the timeouts and request
          // interception installed by crawl() — confirm against BaseCrawler.
          await this.configureRakutenPage();
        }
      }
    }

    throw new CrawlerError(
      `页面导航失败,已重试 ${maxAttempts} 次`,
      "NAVIGATION_ERROR",
      "rakuten",
      lastError
    );
  }

  /**
   * Wait until an element matching `selector` is present.
   * @param {string} selector - CSS selector.
   * @returns {Promise<void>}
   * @throws {CrawlerError} ELEMENT_WAIT_ERROR when the wait times out or fails.
   */
  async waitForElement(selector) {
    try {
      await this.page.waitForSelector(selector, {
        timeout: this.timeouts.elementWait,
      });
    } catch (error) {
      throw new CrawlerError(
        `等待元素超时: ${selector}`,
        "ELEMENT_WAIT_ERROR",
        "rakuten",
        error
      );
    }
  }

  /**
   * Read title, price, SKU and URL of the product on the current page.
   * @returns {Promise<{title: string, price: string, sku: string, url: string}>}
   *   `price` is the raw text of the price element (not parsed to a number).
   * @throws {CrawlerError} SKU_INFO_GET_ERROR when the page cannot be read or
   *   any of title / price / sku is missing.
   */
  async getSingleSkuInfo() {
    try {
      await this.page.waitForLoadState("networkidle");
      await this.waitForElement(this.selectors.title);

      // Coupon handling is not implemented for Rakuten. The previous code
      // still passed an undeclared `couponValue` into evaluate(), which threw
      // a ReferenceError on every call.
      const info = await this.page.evaluate(
        // Runs in the browser context: must only use its argument and DOM APIs.
        ({ selectors }) => {
          const textOf = (selector) =>
            document.querySelector(selector)?.textContent.trim() || null;

          return {
            title: textOf(selectors.title),
            price: textOf(selectors.price),
            // Rakuten item-number element; not part of the selector config.
            sku: textOf(".normal_reserve_item_number"),
            url: window.location.href,
          };
        },
        { selectors: this.selectors }
      );
      console.log(info);

      if (!info.title || !info.price || !info.sku) {
        throw new Error("商品信息不完整");
      }
      return info;
    } catch (error) {
      throw new CrawlerError(
        "获取SKU信息失败",
        "SKU_INFO_GET_ERROR",
        "rakuten",
        error
      );
    }
  }

  /**
   * Take a full-page screenshot of the current page, upload it and remove the
   * temporary file afterwards (also when the screenshot or upload fails).
   * @returns {Promise<string>} URL of the uploaded screenshot.
   */
  async captureAndUploadScreenshot() {
    const dir = await this.createScreenshotDir();
    const shot = path.join(dir, `${Date.now()}.png`);

    try {
      await this.page.waitForLoadState("networkidle");
      await this.page.screenshot({
        path: shot,
        fullPage: true,
        timeout: this.timeouts.elementWait,
      });
      return await this.uploadImage(shot);
    } finally {
      try {
        await fsPromises.unlink(shot);
      } catch (error) {
        // ENOENT just means the screenshot was never written.
        if (error.code !== "ENOENT") {
          console.error("删除临时截图文件失败:", error);
        }
      }
    }
  }

  /**
   * Entry point: crawl one product page.
   * @param {string} url - Product URL.
   * @param {boolean} [needScreenshot=false] - Also capture and upload a
   *   full-page screenshot; its URL is stored as `screenshotUrl` on each item.
   * @returns {Promise<Array<Object>>} Array with a single product info object.
   * @throws {CrawlerError} CRAWL_ERROR wrapping any failure except screenshot
   *   problems, which are logged and ignored.
   */
  async crawl(url, needScreenshot = false) {
    try {
      await this.initBrowser();
      await this.configureRakutenPage();
      await this.navigateWithRetry(url);

      // Only the single SKU shown on the page is collected.
      const data = [await this.getSingleSkuInfo()];

      if (needScreenshot) {
        try {
          const imageUrl = await this.captureAndUploadScreenshot();
          for (const item of data) {
            item.screenshotUrl = imageUrl;
          }
        } catch (error) {
          // A failed screenshot must not fail the crawl itself.
          console.error("截图处理失败:", error);
        }
      }

      return data;
    } catch (error) {
      throw new CrawlerError(
        "商品信息抓取失败",
        "CRAWL_ERROR",
        "rakuten",
        error
      );
    } finally {
      await this.closeBrowser();
    }
  }
}
module.exports = RakutenCrawler; |