- 在商品信息示例中,将抓取频率的默认值从 2 小时调整为 8 小时,以提高抓取效率
- 增强了日志输出,为抓取和保存商品信息添加了更清晰的提示,提升了用户体验
- 在爬虫配置中添加了乐天平台的选择器和请求头信息,确保爬虫能够正确抓取乐天商品信息
- 这些更改旨在提高代码可读性和爬虫的功能性,符合 SOLID 原则和命名规范

master
@@ -21,14 +21,11 @@ const serverClient = { | |||
axios.defaults.baseURL = serverClient.baseURL; | |||
axios.defaults.timeout = serverClient.timeout; | |||
// 获取商品列表和抓取配置的执行频率 | |||
const frequency = 1000 * 60 * 60 * 24; // 24小时 | |||
// 设置抓取配置 | |||
const config = { | |||
platform: "amazon", // 平台 | |||
needScreenshot: true, // 是否需要截图 | |||
monitorFrequency: 2, // 监控频率(小时) | |||
monitorFrequency: 8, // 监控频率(小时) | |||
goodsList: [], // 商品列表 | |||
isRunning: false, // 是否正在执行抓取任务 | |||
timer: null, // 定时器 | |||
@@ -42,16 +39,22 @@ const config = { | |||
*/ | |||
async function fetchProductInfo(goods, isRetry = false) { | |||
try { | |||
console.log(`${isRetry ? "重试" : "开始"}抓取商品: ${goods.goodsSkuSn}`); | |||
console.log(`\n`); | |||
console.log(`3️⃣ ${isRetry ? "重试" : "开始"}抓取商品: ${goods.goodsSkuSn}`); | |||
const productInfo = await localClient.getProductInfo({ | |||
url: goods.goodsSkuUrl, | |||
platform: goods.platform, | |||
needScreenshot: config.needScreenshot, | |||
}); | |||
console.log( | |||
`${isRetry ? "重试" : "商品"} 抓取成功: ${goods.goodsSkuSn} - ${new Date().toLocaleString()}` | |||
); | |||
console.log(productInfo); | |||
console.log(`\n`); | |||
console.log(`----------------抓取结果----------------`); | |||
console.log(`抓取商品: ${productInfo[0].title}`); | |||
console.log(`抓取SKU : ${goods.goodsSkuSn}`); | |||
console.log(`基准价格: ${goods.initPrice}`); | |||
console.log(`抓取价格: ${productInfo[0].price}`); | |||
console.log(`----------------------------------------`); | |||
console.log(`\n`); | |||
return productInfo; | |||
} catch (error) { | |||
console.error( | |||
@@ -71,7 +74,7 @@ async function fetchProductInfo(goods, isRetry = false) { | |||
async function saveProductInfo(goods, productInfo) { | |||
try { | |||
console.log( | |||
`开始保存商品信息: ${goods.goodsSkuSn} - ${new Date().toLocaleString()}` | |||
`4️⃣ 开始保存商品信息: ${goods.goodsSkuSn} - ${new Date().toLocaleString()}` | |||
); | |||
const { title, price, sku, screenshotUrl } = productInfo[0]; | |||
const res = await axios.post( | |||
@@ -85,13 +88,17 @@ async function saveProductInfo(goods, productInfo) { | |||
screenshotUrl: screenshotUrl, | |||
} | |||
); | |||
console.log(res.data); | |||
console.log(`\n`); | |||
console.log(res.data.success ? `✅️ ${goods.goodsSkuSn} 保存成功` : `❌️ ${goods.goodsSkuSn} 保存失败`); | |||
console.log(`\n`); | |||
return true; | |||
} catch (saveError) { | |||
console.log(`\n`); | |||
console.error( | |||
`商品信息保存失败: ${goods.goodsSkuSn} - ${new Date().toLocaleString()}`, | |||
`保存失败: ${goods.goodsSkuSn} - ${new Date().toLocaleString()}`, | |||
saveError.message | |||
); | |||
console.log(`\n`); | |||
return false; | |||
} | |||
} | |||
@@ -127,21 +134,24 @@ async function processProduct(goods) { | |||
*/ | |||
async function fetchConfig() {
  // Loads the monitoring configuration from the server and stores the
  // monitoring frequency (in hours) on the shared `config` object.
  // Resolves to true when the configuration was applied (including the
  // fallback default) and to false when the request or its handling failed.
  try {
    console.log(`\n\n\n\n`);
    console.log(`1️⃣ 开始获取抓取配置`);

    const res = await axios.get(serverClient.baseURL + "/system/operationWarnconfig/noVerifyList", {
      params: serverClient.params,
    });
    const { rows } = res.data;

    if (rows.length > 0) {
      // Only touch rows[0] once we know it exists; reading it before this
      // check made an empty list throw, so the default below was unreachable.
      const [first] = rows;
      console.log(`阈值: ${first.priceChangeThreshold}`);
      console.log(`频率: ${first.monitorFrequency}小时`);
      config.monitorFrequency = first.monitorFrequency;
    } else {
      // No configuration row on the server: fall back to 8 hours.
      config.monitorFrequency = 8;
    }

    console.log(`抓取频率设置为 ${config.monitorFrequency} 小时`);
    return true;
  } catch (error) {
    console.error(`获取抓取配置失败: ${new Date().toLocaleString()}`, error.message);
    console.log(`---------------------------------------------------------`);
    return false;
  }
}
@@ -157,16 +167,25 @@ async function fetchGoodsListAndProcess() { | |||
} | |||
config.isRunning = true; | |||
console.log(`开始执行抓取任务: ${new Date().toLocaleString()}`); | |||
try { | |||
const res = await axios.get(serverClient.baseURL + "/system/operationGoods/noVerifyList", { | |||
params: { | |||
...serverClient.params, | |||
isDisabled: 1, | |||
isDisabled: 0, | |||
}, | |||
}); | |||
console.log(res.data); | |||
console.log(`\n`); | |||
console.log(`2️⃣ 开始获取商品列表, 共${res.data.rows.length}个商品`); | |||
const d = res.data.rows; | |||
d.forEach((row, index) => { | |||
console.log(`(${index + 1} / ${d.length})`); | |||
console.log(`商品名称: ${row.goodsSkuName}`); | |||
console.log(`商品SKU : ${row.goodsSkuSn}`); | |||
console.log(`基准价格: ${row.initPrice}`); | |||
console.log(`备注: ${row.remark}`); | |||
}); | |||
const { rows } = res.data; | |||
config.goodsList = rows; | |||
@@ -175,9 +194,9 @@ async function fetchGoodsListAndProcess() { | |||
await processProduct(goods); | |||
} | |||
console.log("所有商品抓取完成", new Date().toLocaleString()); | |||
console.log("✌️ 所有商品抓取完成", new Date().toLocaleString()); | |||
} catch (error) { | |||
console.error(`获取商品列表失败: ${new Date().toLocaleString()}`, error.message); | |||
console.error(`⛔️ 获取商品列表失败: ${new Date().toLocaleString()}`, error.message); | |||
} finally { | |||
config.isRunning = false; | |||
} | |||
@@ -200,8 +219,10 @@ async function startScheduler() { | |||
// 设置定时器,根据monitorFrequency的小时数定时执行 | |||
const intervalMs = config.monitorFrequency * 60 * 60 * 1000; // 转换为毫秒 | |||
console.log(`\n\n\n\n`); | |||
console.log(`设置定时任务,每 ${config.monitorFrequency} 小时执行一次,下次执行时间: ${new Date(Date.now() + intervalMs).toLocaleString()}`); | |||
console.log(`\n\n\n\n --------------------------------------------------------------------------------------------------------- \n\n\n\n`); | |||
config.timer = setInterval(async () => { | |||
console.log(`定时任务触发: ${new Date().toLocaleString()}`); | |||
// 重新获取配置(频率可能会改变) | |||
@@ -231,6 +252,3 @@ async function startScheduler() { | |||
// 启动调度器 | |||
startScheduler(); | |||
// 输出启动信息 | |||
console.log(`抓取服务已启动: ${new Date().toLocaleString()}`); | |||
console.log(`初始监控频率: ${config.monitorFrequency} 小时`); |
@@ -16,7 +16,6 @@ module.exports = { | |||
}, | |||
page: { | |||
locale: 'ja-JP', | |||
userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' | |||
} | |||
}, | |||
@@ -38,22 +37,29 @@ module.exports = { | |||
delay: 6000 | |||
}, | |||
headers: { | |||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', | |||
'Accept-Language': 'ja-JP,ja;q=0.9,en-US;q=0.8,en;q=0.7' | |||
} | |||
}, | |||
// 乐天配置(预留) | |||
// 乐天配置 | |||
rakuten: { | |||
selectors: {}, | |||
selectors: { | |||
title: '.normal_reserve_item_name', | |||
price: 'span[irc="Price"] .number-display--3s4mj', | |||
coupon: '', | |||
variants: '' | |||
}, | |||
timeouts: { | |||
pageLoad: 5000, | |||
elementWait: 1000, | |||
networkIdle: 500 | |||
pageLoad: 60000, | |||
elementWait: 10000, | |||
networkIdle: 50000 | |||
}, | |||
retry: { | |||
maxAttempts: 3, | |||
delay: 1000 | |||
delay: 6000 | |||
}, | |||
headers: { | |||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', | |||
'Accept-Language': 'ja-JP,ja;q=0.9,en-US;q=0.8,en;q=0.7' | |||
} | |||
}, | |||
@@ -1,4 +1,5 @@ | |||
const AmazonCrawler = require('./crawlers/amazon/AmazonCrawler'); | |||
const RakutenCrawler = require('./crawlers/rakuten/RakutenCrawler'); | |||
const CrawlerError = require('./errors/CrawlerError'); | |||
/** | |||
@@ -17,9 +18,8 @@ class CrawlerFactory { | |||
switch (platform.toLowerCase()) { | |||
case 'amazon': | |||
return new AmazonCrawler(config); | |||
// 可以在这里添加其他平台的爬虫 | |||
// case 'rakuten': | |||
// return new RakutenCrawler(config); | |||
case 'rakuten': | |||
return new RakutenCrawler(config); | |||
// case 'yahoo': | |||
// return new YahooCrawler(config); | |||
default: |
@@ -0,0 +1,287 @@ | |||
const BaseCrawler = require("../base/BaseCrawler"); | |||
const CrawlerError = require("../../errors/CrawlerError"); | |||
const path = require("path"); | |||
const fs = require("fs"); | |||
const fsPromises = require("fs").promises; | |||
const axios = require("axios"); | |||
const FormData = require("form-data"); | |||
/**
 * Rakuten crawler.
 *
 * Opens a product page, reads title / price / item number through the
 * configured CSS selectors and optionally uploads a full-page screenshot.
 * `this.page`, `initBrowser()` and `closeBrowser()` are not defined in this
 * class; they are expected to come from BaseCrawler (not visible here).
 */
class RakutenCrawler extends BaseCrawler {
  /**
   * @param {Object} config - Crawler configuration. `selectors`, `timeouts`
   *   and `retry` are read directly; upload settings are taken from
   *   `config.common.upload` when `config.common` exists.
   */
  constructor(config) {
    super(config);
    console.log(config);
    this.selectors = config.selectors;
    this.timeouts = config.timeouts;
    this.retryConfig = config.retry;
    this.uploadConfig = config.common?.upload;
  }

  /**
   * Ensures the `screenshots` directory exists under the current working directory.
   * @returns {Promise<string>} Absolute path of the screenshot directory.
   * @throws {CrawlerError} SCREENSHOT_DIR_ERROR when the directory cannot be created.
   */
  async createScreenshotDir() {
    const dir = path.join(process.cwd(), "screenshots");
    try {
      await fsPromises.mkdir(dir, { recursive: true });
    } catch (error) {
      if (error.code !== "EEXIST") {
        throw new CrawlerError(
          "创建截图目录失败",
          "SCREENSHOT_DIR_ERROR",
          "rakuten",
          error
        );
      }
    }
    return dir;
  }

  /**
   * Uploads an image file as multipart form data.
   * @param {string} imagePath - Path of the image on disk.
   * @returns {Promise<string>} The `url` field of the upload response.
   * @throws {CrawlerError} IMAGE_UPLOAD_ERROR on any failure (missing upload
   *   configuration, HTTP error, or a response without `url`).
   */
  async uploadImage(imagePath) {
    try {
      if (!this.uploadConfig) {
        // Fail with a readable cause instead of a TypeError on `.scene`.
        throw new Error("上传配置缺失");
      }

      const formData = new FormData();
      formData.append("file", fs.createReadStream(imagePath));
      formData.append("scene", this.uploadConfig.scene);

      const response = await axios.post(this.uploadConfig.url, formData, {
        // getHeaders() already supplies Content-Type including the multipart
        // boundary; overriding it with a bare "multipart/form-data" would
        // drop the boundary and make the body unparseable.
        headers: formData.getHeaders(),
        timeout: this.uploadConfig.timeout,
      });

      if (!response.data || !response.data.url) {
        throw new Error("上传响应格式错误");
      }
      return response.data.url;
    } catch (error) {
      const message = error.response
        ? `图片上传失败: ${error.response.status} ${error.response.statusText}`
        : "图片上传失败";
      throw new CrawlerError(message, "IMAGE_UPLOAD_ERROR", "rakuten", error);
    }
  }

  /**
   * Applies the default timeouts and the resource filter to the current page.
   * Must be repeated whenever the browser (and therefore the page) is
   * re-created, because these settings belong to the page instance.
   * @returns {Promise<void>}
   */
  async applyPageSettings() {
    this.page.setDefaultTimeout(this.timeouts.elementWait);
    this.page.setDefaultNavigationTimeout(this.timeouts.pageLoad);

    // Block only fonts and media; images stay enabled so screenshots render.
    await this.page.route("**/*", (route) => {
      const blocked = ["font", "media"].includes(route.request().resourceType());
      return blocked ? route.abort() : route.continue();
    });
  }

  /**
   * Navigates to `url`, retrying with a fresh browser between attempts.
   * @param {string} url - Target URL.
   * @returns {Promise<void>}
   * @throws {CrawlerError} NAVIGATION_ERROR after `retry.maxAttempts` failed attempts.
   */
  async navigateWithRetry(url) {
    const { maxAttempts, delay } = this.retryConfig;
    let lastError;

    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
      try {
        await this.page.goto(url, {
          waitUntil: "networkidle",
          timeout: this.timeouts.pageLoad,
        });
        return;
      } catch (error) {
        lastError = error;
        console.log(`导航尝试 ${attempt}/${maxAttempts} 失败:`, error.message);

        if (attempt < maxAttempts) {
          console.log(`等待 ${delay}ms 后重试...`);
          await new Promise((resolve) => setTimeout(resolve, delay));
          // Restart the browser, then restore timeouts and the route filter
          // on the new page (assumes initBrowser() replaces this.page).
          await this.closeBrowser();
          await this.initBrowser();
          await this.applyPageSettings();
        }
      }
    }

    throw new CrawlerError(
      `页面导航失败,已重试 ${maxAttempts} 次`,
      "NAVIGATION_ERROR",
      "rakuten",
      lastError
    );
  }

  /**
   * Waits until an element matching `selector` appears.
   * @param {string} selector - CSS selector.
   * @returns {Promise<void>}
   * @throws {CrawlerError} ELEMENT_WAIT_ERROR when the wait fails or times out.
   */
  async waitForElement(selector) {
    try {
      await this.page.waitForSelector(selector, {
        timeout: this.timeouts.elementWait,
      });
    } catch (error) {
      throw new CrawlerError(
        `等待元素超时: ${selector}`,
        "ELEMENT_WAIT_ERROR",
        "rakuten",
        error
      );
    }
  }

  /**
   * Reads title, price, item number and URL from the currently loaded page.
   * @returns {Promise<{title: string, price: string, sku: string, url: string}>}
   * @throws {CrawlerError} SKU_INFO_GET_ERROR when the page cannot be read or
   *   title, price or item number is missing.
   */
  async getSingleSkuInfo() {
    try {
      await this.page.waitForLoadState("networkidle");
      await this.waitForElement(this.selectors.title);

      // No coupon handling here: only the selectors are handed to the page.
      // Passing an undeclared `couponValue` made this call throw a
      // ReferenceError before the page was ever evaluated.
      const info = await this.page.evaluate(
        ({ selectors }) => {
          // Trimmed text of the first match, or null when absent/empty.
          const textOf = (selector) =>
            document.querySelector(selector)?.textContent.trim() || null;

          return {
            title: textOf(selectors.title),
            // Price is returned as the raw page text; it is not parsed.
            price: textOf(selectors.price),
            sku: textOf(".normal_reserve_item_number"),
            url: window.location.href,
          };
        },
        { selectors: this.selectors }
      );
      console.log(info);

      if (!info.title || !info.price || !info.sku) {
        throw new Error("商品信息不完整");
      }
      return info;
    } catch (error) {
      throw new CrawlerError(
        "获取SKU信息失败",
        "SKU_INFO_GET_ERROR",
        "rakuten",
        error
      );
    }
  }

  /**
   * Takes a full-page screenshot, uploads it and removes the temporary file.
   * @returns {Promise<string>} URL of the uploaded screenshot.
   */
  async captureAndUploadScreenshot() {
    const dir = await this.createScreenshotDir();
    const shot = path.join(dir, `${Date.now()}.png`);

    await this.page.waitForLoadState("networkidle");
    try {
      await this.page.screenshot({
        path: shot,
        fullPage: true,
        timeout: this.timeouts.elementWait,
      });
      return await this.uploadImage(shot);
    } finally {
      // Always clean up, including when the upload fails; a file that was
      // never written (screenshot failed) is not worth reporting.
      try {
        await fsPromises.unlink(shot);
      } catch (error) {
        if (error.code !== "ENOENT") {
          console.error("删除临时截图文件失败:", error);
        }
      }
    }
  }

  /**
   * Entry point: crawls one product page.
   * @param {string} url - Product URL.
   * @param {boolean} needScreenshot - Whether to attach a screenshot URL.
   * @returns {Promise<Array>} Array holding a single product-info object;
   *   `screenshotUrl` is set on it when the screenshot step succeeds.
   * @throws {CrawlerError} CRAWL_ERROR wrapping any failure other than a
   *   screenshot failure.
   */
  async crawl(url, needScreenshot = false) {
    try {
      await this.initBrowser();
      await this.applyPageSettings();
      await this.navigateWithRetry(url);

      // Only a single SKU is read per page.
      const data = [await this.getSingleSkuInfo()];

      if (needScreenshot) {
        try {
          const imageUrl = await this.captureAndUploadScreenshot();
          data.forEach((item) => {
            item.screenshotUrl = imageUrl;
          });
        } catch (error) {
          // Best effort: a failed screenshot must not fail the crawl.
          console.error("截图处理失败:", error);
        }
      }

      return data;
    } catch (error) {
      throw new CrawlerError(
        "商品信息抓取失败",
        "CRAWL_ERROR",
        "rakuten",
        error
      );
    } finally {
      await this.closeBrowser();
    }
  }
}
module.exports = RakutenCrawler; |