瀏覽代碼

refactor: 优化商品信息示例和爬虫配置

- 在商品信息示例中,调整了抓取频率的默认值,从 2 小时更改为 8 小时,以提高抓取效率
- 增强了日志输出,添加了更清晰的抓取和保存商品信息的提示,提升了用户体验
- 在爬虫配置中,添加了乐天平台的选择器和请求头信息,确保爬虫能够正确抓取乐天商品信息
- 这些更改旨在提高代码可读性和爬虫的功能性,符合 SOLID 原则和命名规范
master
lizhuang 1 月之前
父節點
當前提交
5cfc9ddb0e

+ 43
- 25
examples/productInfoExample.js 查看文件

@@ -21,14 +21,11 @@ const serverClient = {
axios.defaults.baseURL = serverClient.baseURL;
axios.defaults.timeout = serverClient.timeout;

// 获取商品列表和抓取配置的执行频率
const frequency = 1000 * 60 * 60 * 24; // 24小时

// 设置抓取配置
const config = {
platform: "amazon", // 平台
needScreenshot: true, // 是否需要截图
monitorFrequency: 2, // 监控频率(小时)
monitorFrequency: 8, // 监控频率(小时)
goodsList: [], // 商品列表
isRunning: false, // 是否正在执行抓取任务
timer: null, // 定时器
@@ -42,16 +39,22 @@ const config = {
*/
async function fetchProductInfo(goods, isRetry = false) {
try {
console.log(`${isRetry ? "重试" : "开始"}抓取商品: ${goods.goodsSkuSn}`);
console.log(`\n`);
console.log(`3️⃣ ${isRetry ? "重试" : "开始"}抓取商品: ${goods.goodsSkuSn}`);
const productInfo = await localClient.getProductInfo({
url: goods.goodsSkuUrl,
platform: goods.platform,
needScreenshot: config.needScreenshot,
});
console.log(
`${isRetry ? "重试" : "商品"} 抓取成功: ${goods.goodsSkuSn} - ${new Date().toLocaleString()}`
);
console.log(productInfo);
console.log(`\n`);
console.log(`----------------抓取结果----------------`);
console.log(`抓取商品: ${productInfo[0].title}`);
console.log(`抓取SKU : ${goods.goodsSkuSn}`);
console.log(`基准价格: ${goods.initPrice}`);
console.log(`抓取价格: ${productInfo[0].price}`);
console.log(`----------------------------------------`);
console.log(`\n`);
return productInfo;
} catch (error) {
console.error(
@@ -71,7 +74,7 @@ async function fetchProductInfo(goods, isRetry = false) {
async function saveProductInfo(goods, productInfo) {
try {
console.log(
`开始保存商品信息: ${goods.goodsSkuSn} - ${new Date().toLocaleString()}`
`4️⃣ 开始保存商品信息: ${goods.goodsSkuSn} - ${new Date().toLocaleString()}`
);
const { title, price, sku, screenshotUrl } = productInfo[0];
const res = await axios.post(
@@ -85,13 +88,17 @@ async function saveProductInfo(goods, productInfo) {
screenshotUrl: screenshotUrl,
}
);
console.log(res.data);
console.log(`\n`);
console.log(res.data.success ? `✅️ ${goods.goodsSkuSn} 保存成功` : `❌️ ${goods.goodsSkuSn} 保存失败`);
console.log(`\n`);
return true;
} catch (saveError) {
console.log(`\n`);
console.error(
`商品信息保存失败: ${goods.goodsSkuSn} - ${new Date().toLocaleString()}`,
`保存失败: ${goods.goodsSkuSn} - ${new Date().toLocaleString()}`,
saveError.message
);
console.log(`\n`);
return false;
}
}
@@ -127,21 +134,24 @@ async function processProduct(goods) {
*/
async function fetchConfig() {
  // Fetches the monitoring configuration from the server and stores the
  // polling frequency (in hours) on config.monitorFrequency.
  // Resolves to true when the request succeeds and false when it throws.
  try {
    console.log(`\n\n\n\n`);
    console.log(`1️⃣ 开始获取抓取配置`);
    const res = await axios.get(serverClient.baseURL + "/system/operationWarnconfig/noVerifyList", {
      params: serverClient.params,
    });

    const { rows } = res.data;
    if (rows.length > 0) {
      // Only read rows[0] once we know it exists; logging it before this
      // check threw on an empty list and skipped the default below.
      const [firstRow] = rows;
      console.log(`阈值: ${firstRow.priceChangeThreshold}`);
      console.log(`频率: ${firstRow.monitorFrequency}小时`);
      config.monitorFrequency = firstRow.monitorFrequency;
    } else {
      config.monitorFrequency = 8; // default: 8 hours
    }
    console.log(`抓取频率设置为 ${config.monitorFrequency} 小时`);
    return true;
  } catch (error) {
    console.error(`获取抓取配置失败: ${new Date().toLocaleString()}`, error.message);
    console.log(`---------------------------------------------------------`);
    return false;
  }
}
@@ -157,16 +167,25 @@ async function fetchGoodsListAndProcess() {
}

config.isRunning = true;
console.log(`开始执行抓取任务: ${new Date().toLocaleString()}`);

try {
const res = await axios.get(serverClient.baseURL + "/system/operationGoods/noVerifyList", {
params: {
...serverClient.params,
isDisabled: 1,
isDisabled: 0,
},
});
console.log(res.data);
console.log(`\n`);
console.log(`2️⃣ 开始获取商品列表, 共${res.data.rows.length}个商品`);
const d = res.data.rows;

d.forEach((row, index) => {
console.log(`(${index + 1} / ${d.length})`);
console.log(`商品名称: ${row.goodsSkuName}`);
console.log(`商品SKU : ${row.goodsSkuSn}`);
console.log(`基准价格: ${row.initPrice}`);
console.log(`备注: ${row.remark}`);
});
const { rows } = res.data;
config.goodsList = rows;

@@ -175,9 +194,9 @@ async function fetchGoodsListAndProcess() {
await processProduct(goods);
}

console.log("所有商品抓取完成", new Date().toLocaleString());
console.log("✌️ 所有商品抓取完成", new Date().toLocaleString());
} catch (error) {
console.error(`获取商品列表失败: ${new Date().toLocaleString()}`, error.message);
console.error(`⛔️ 获取商品列表失败: ${new Date().toLocaleString()}`, error.message);
} finally {
config.isRunning = false;
}
@@ -200,8 +219,10 @@ async function startScheduler() {
// 设置定时器,根据monitorFrequency的小时数定时执行
const intervalMs = config.monitorFrequency * 60 * 60 * 1000; // 转换为毫秒
console.log(`\n\n\n\n`);
console.log(`设置定时任务,每 ${config.monitorFrequency} 小时执行一次,下次执行时间: ${new Date(Date.now() + intervalMs).toLocaleString()}`);
console.log(`\n\n\n\n --------------------------------------------------------------------------------------------------------- \n\n\n\n`);

config.timer = setInterval(async () => {
console.log(`定时任务触发: ${new Date().toLocaleString()}`);
// 重新获取配置(频率可能会改变)
@@ -231,6 +252,3 @@ async function startScheduler() {
// 启动调度器
startScheduler();

// 输出启动信息
console.log(`抓取服务已启动: ${new Date().toLocaleString()}`);
console.log(`初始监控频率: ${config.monitorFrequency} 小时`);

+ 15
- 9
src/config/crawler.config.js 查看文件

@@ -16,7 +16,6 @@ module.exports = {
},
page: {
locale: 'ja-JP',
userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
},

@@ -38,22 +37,29 @@ module.exports = {
delay: 6000
},
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Accept-Language': 'ja-JP,ja;q=0.9,en-US;q=0.8,en;q=0.7'
}
},

// 乐天配置(预留)
// 乐天配置
rakuten: {
selectors: {},
selectors: {
title: '.normal_reserve_item_name',
price: 'span[irc="Price"] .number-display--3s4mj',
coupon: '',
variants: ''
},
timeouts: {
pageLoad: 5000,
elementWait: 1000,
networkIdle: 500
pageLoad: 60000,
elementWait: 10000,
networkIdle: 50000
},
retry: {
maxAttempts: 3,
delay: 1000
delay: 6000
},
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Accept-Language': 'ja-JP,ja;q=0.9,en-US;q=0.8,en;q=0.7'
}
},


+ 3
- 3
src/services/crawlerFactory.js 查看文件

@@ -1,4 +1,5 @@
const AmazonCrawler = require('./crawlers/amazon/AmazonCrawler');
const RakutenCrawler = require('./crawlers/rakuten/RakutenCrawler');
const CrawlerError = require('./errors/CrawlerError');

/**
@@ -17,9 +18,8 @@ class CrawlerFactory {
switch (platform.toLowerCase()) {
case 'amazon':
return new AmazonCrawler(config);
// 可以在这里添加其他平台的爬虫
// case 'rakuten':
// return new RakutenCrawler(config);
case 'rakuten':
return new RakutenCrawler(config);
// case 'yahoo':
// return new YahooCrawler(config);
default:

+ 287
- 0
src/services/crawlers/rakuten/RakutenCrawler.js 查看文件

@@ -0,0 +1,287 @@
const BaseCrawler = require("../base/BaseCrawler");
const CrawlerError = require("../../errors/CrawlerError");
const path = require("path");
const fs = require("fs");
const fsPromises = require("fs").promises;
const axios = require("axios");
const FormData = require("form-data");

/**
 * Applies the per-page settings used for Rakuten crawling: default timeouts
 * and request interception that drops font and media requests.
 *
 * Kept as a module-level helper so it can be re-applied after the browser is
 * re-initialised during navigation retries.
 * NOTE(review): assumes initBrowser() assigns a fresh this.page — confirm
 * against BaseCrawler.
 *
 * @param {Object} page - Page object exposing setDefaultTimeout,
 *   setDefaultNavigationTimeout and route.
 * @param {Object} timeouts - Timeout settings (elementWait, pageLoad), in ms.
 * @returns {Promise<void>}
 */
async function applyRakutenPageSettings(page, timeouts) {
  page.setDefaultTimeout(timeouts.elementWait);
  page.setDefaultNavigationTimeout(timeouts.pageLoad);

  await page.route("**/*", (route) => {
    const resourceType = route.request().resourceType();
    // Block only fonts and media; images are still loaded.
    if (["font", "media"].includes(resourceType)) {
      route.abort();
    } else {
      route.continue();
    }
  });
}

/**
 * Rakuten crawler implementation.
 *
 * Navigates to a product page, extracts title / price / SKU using the
 * configured selectors and optionally uploads a full-page screenshot.
 */
class RakutenCrawler extends BaseCrawler {
  /**
   * @param {Object} config - Crawler configuration. Reads `selectors`,
   *   `timeouts`, `retry` and the optional `common.upload` section.
   */
  constructor(config) {
    super(config);
    this.selectors = config.selectors;
    this.timeouts = config.timeouts;
    this.retryConfig = config.retry;
    this.uploadConfig = config.common?.upload;
  }

  /**
   * Ensures the `screenshots` directory exists under the current working
   * directory.
   *
   * @returns {Promise<string>} Absolute path of the screenshot directory.
   * @throws {CrawlerError} Code SCREENSHOT_DIR_ERROR when the directory
   *   cannot be created.
   */
  async createScreenshotDir() {
    const dir = path.join(process.cwd(), "screenshots");
    try {
      await fsPromises.mkdir(dir, { recursive: true });
    } catch (error) {
      if (error.code !== "EEXIST") {
        throw new CrawlerError(
          "创建截图目录失败",
          "SCREENSHOT_DIR_ERROR",
          "rakuten",
          error
        );
      }
    }
    return dir;
  }

  /**
   * Uploads an image file to the configured upload endpoint.
   *
   * @param {string} imagePath - Path of the image file to upload.
   * @returns {Promise<string>} URL reported by the server (`response.data.url`).
   * @throws {CrawlerError} Code IMAGE_UPLOAD_ERROR on any failure, including a
   *   missing upload configuration or an unexpected response shape.
   */
  async uploadImage(imagePath) {
    try {
      if (!this.uploadConfig) {
        throw new Error("上传配置缺失");
      }

      const formData = new FormData();
      formData.append("file", fs.createReadStream(imagePath));
      formData.append("scene", this.uploadConfig.scene);

      const response = await axios.post(this.uploadConfig.url, formData, {
        // getHeaders() already provides the multipart Content-Type including
        // the boundary; adding a bare "multipart/form-data" value on top of it
        // can replace that header and drop the boundary.
        headers: formData.getHeaders(),
        timeout: this.uploadConfig.timeout,
      });

      if (!response.data || !response.data.url) {
        throw new Error("上传响应格式错误");
      }

      return response.data.url;
    } catch (error) {
      if (error.response) {
        throw new CrawlerError(
          `图片上传失败: ${error.response.status} ${error.response.statusText}`,
          "IMAGE_UPLOAD_ERROR",
          "rakuten",
          error
        );
      }
      throw new CrawlerError(
        "图片上传失败",
        "IMAGE_UPLOAD_ERROR",
        "rakuten",
        error
      );
    }
  }

  /**
   * Navigates to a URL, retrying up to `retry.maxAttempts` times.
   *
   * Between attempts the method waits `retry.delay` ms, then closes and
   * re-initialises the browser and re-applies the page settings.
   *
   * @param {string} url - Target URL.
   * @returns {Promise<void>}
   * @throws {CrawlerError} Code NAVIGATION_ERROR once all attempts failed.
   */
  async navigateWithRetry(url) {
    let lastError;
    for (let attempt = 1; attempt <= this.retryConfig.maxAttempts; attempt++) {
      try {
        await this.page.goto(url, {
          waitUntil: "networkidle",
          timeout: this.timeouts.pageLoad,
        });
        return;
      } catch (error) {
        lastError = error;
        console.log(
          `导航尝试 ${attempt}/${this.retryConfig.maxAttempts} 失败:`,
          error.message
        );

        if (attempt < this.retryConfig.maxAttempts) {
          console.log(`等待 ${this.retryConfig.delay}ms 后重试...`);
          await new Promise((resolve) =>
            setTimeout(resolve, this.retryConfig.delay)
          );

          // Restart the browser, then restore the timeouts and request
          // interception that were configured on the previous page.
          await this.closeBrowser();
          await this.initBrowser();
          await applyRakutenPageSettings(this.page, this.timeouts);
        }
      }
    }
    throw new CrawlerError(
      `页面导航失败,已重试 ${this.retryConfig.maxAttempts} 次`,
      "NAVIGATION_ERROR",
      "rakuten",
      lastError
    );
  }

  /**
   * Waits until an element matching the selector appears.
   *
   * @param {string} selector - Selector to wait for.
   * @returns {Promise<void>}
   * @throws {CrawlerError} Code ELEMENT_WAIT_ERROR when the wait fails.
   */
  async waitForElement(selector) {
    try {
      await this.page.waitForSelector(selector, {
        timeout: this.timeouts.elementWait,
      });
    } catch (error) {
      throw new CrawlerError(
        `等待元素超时: ${selector}`,
        "ELEMENT_WAIT_ERROR",
        "rakuten",
        error
      );
    }
  }

  /**
   * Reads title, price, SKU and URL of the product on the current page.
   *
   * The SKU selector comes from `selectors.sku` when configured and falls
   * back to `.normal_reserve_item_number` otherwise.
   *
   * @returns {Promise<Object>} Object with `title`, `price`, `sku`, `url`.
   * @throws {CrawlerError} Code SKU_INFO_GET_ERROR when the page cannot be
   *   read or title, price or SKU is missing.
   */
  async getSingleSkuInfo() {
    const defaultSkuSelector = ".normal_reserve_item_number";

    try {
      await this.page.waitForLoadState("networkidle");
      await this.waitForElement(this.selectors.title);

      // Coupon handling is not implemented for Rakuten, so only values that
      // actually exist are handed to the page context.
      const info = await this.page.evaluate(
        ({ selectors, skuSelector }) => {
          const readText = (selector) =>
            document.querySelector(selector)?.textContent.trim() || null;

          return {
            title: readText(selectors.title),
            // The price is returned as the trimmed page text, unparsed.
            price: readText(selectors.price),
            sku: readText(skuSelector),
            url: window.location.href,
          };
        },
        {
          selectors: this.selectors,
          skuSelector: this.selectors.sku || defaultSkuSelector,
        }
      );

      console.log(info);

      // All three fields are required for a usable result.
      if (!info.title || !info.price || !info.sku) {
        throw new Error("商品信息不完整");
      }

      return info;
    } catch (error) {
      throw new CrawlerError(
        "获取SKU信息失败",
        "SKU_INFO_GET_ERROR",
        "rakuten",
        error
      );
    }
  }

  /**
   * Entry point: crawls the product information for one URL.
   *
   * @param {string} url - Product URL.
   * @param {boolean} needScreenshot - Whether to capture and upload a
   *   full-page screenshot; its URL is stored as `screenshotUrl` on each item.
   * @returns {Promise<Array>} Array containing the single product info object.
   * @throws {CrawlerError} Code CRAWL_ERROR wrapping any failure of the main
   *   flow. Screenshot failures are logged and do not fail the crawl.
   */
  async crawl(url, needScreenshot = false) {
    try {
      await this.initBrowser();
      await applyRakutenPageSettings(this.page, this.timeouts);

      await this.navigateWithRetry(url);

      // Only a single SKU is read per page.
      const data = [await this.getSingleSkuInfo()];

      if (needScreenshot) {
        try {
          const dir = await this.createScreenshotDir();
          const shot = path.join(dir, `${Date.now()}.png`);

          await this.page.waitForLoadState("networkidle");
          await this.page.screenshot({
            path: shot,
            fullPage: true,
            timeout: this.timeouts.elementWait,
          });

          try {
            const imageUrl = await this.uploadImage(shot);
            for (const item of data) {
              item.screenshotUrl = imageUrl;
            }
          } finally {
            // Remove the temporary file whether or not the upload succeeded,
            // so failed uploads do not accumulate screenshots on disk.
            try {
              await fsPromises.unlink(shot);
            } catch (error) {
              console.error("删除临时截图文件失败:", error);
            }
          }
        } catch (error) {
          // A screenshot failure must not fail the crawl itself.
          console.error("截图处理失败:", error);
        }
      }

      return data;
    } catch (error) {
      throw new CrawlerError(
        "商品信息抓取失败",
        "CRAWL_ERROR",
        "rakuten",
        error
      );
    } finally {
      await this.closeBrowser();
    }
  }
}

module.exports = RakutenCrawler;

Loading…
取消
儲存