ソースを参照

refactor: 优化亚马逊爬虫代码格式和异常处理

- 统一了代码中的字符串引号风格,使用双引号替代单引号,以提高代码一致性
- 增强了异常处理逻辑,确保在处理积分和优惠券时发生错误时返回0,而不是抛出异常,提升了代码的健壮性
- 该更改旨在提高代码可读性和维护性,符合 SOLID 原则和命名规范
master
lizhuang 1ヶ月前
コミット
87028a3877
1個のファイルの変更194行の追加110行の削除
  1. 194
    110
      src/services/crawlers/amazon/AmazonCrawler.js

+ 194
- 110
src/services/crawlers/amazon/AmazonCrawler.js ファイルの表示

@@ -1,10 +1,10 @@
const BaseCrawler = require('../base/BaseCrawler');
const CrawlerError = require('../../errors/CrawlerError');
const path = require('path');
const fs = require('fs');
const fsPromises = require('fs').promises;
const axios = require('axios');
const FormData = require('form-data');
const BaseCrawler = require("../base/BaseCrawler");
const CrawlerError = require("../../errors/CrawlerError");
const path = require("path");
const fs = require("fs");
const fsPromises = require("fs").promises;
const axios = require("axios");
const FormData = require("form-data");

/**
* 亚马逊爬虫实现类
@@ -13,24 +13,25 @@ class AmazonCrawler extends BaseCrawler {
constructor(config) {
// Builds the crawler: passes `config` to BaseCrawler, then sets the CSS
// selectors, timeouts, retry policy and upload settings used by this class.
// NOTE(review): adjacent near-duplicate lines below (single- vs double-quoted,
// with/without trailing comma) look like the removed and added sides of a
// diff hunk rendered without +/- markers — confirm against the real file.
super(config);
// CSS selectors for the product-page elements this crawler reads.
this.selectors = {
title: '#productTitle',
price: 'span.a-price > span.a-offscreen',
title: "#productTitle",
price: "span.a-price > span.a-offscreen",
coupon: '.a-declarative[data-action="a-modal"], .couponLabelText',
variants: '.a-cardui-body #twister-plus-inline-twister > .a-section'
variants: ".a-cardui-body #twister-plus-inline-twister > .a-section",
point: "#points_feature_div .a-color-price",
};
// `||` replaces the whole object: a partial `config.timeouts` is NOT merged
// with these defaults.
this.timeouts = config.timeouts || {
pageLoad: 60000, // page-load timeout
pageLoad: 60000, // page-load timeout
elementWait: 10000, // element-wait timeout
networkIdle: 5000 // network-idle timeout
networkIdle: 5000, // network-idle timeout
};
// Same all-or-nothing fallback for the retry policy.
this.retryConfig = config.retry || {
maxAttempts: 1, // maximum number of attempts
delay: 2000 // delay between attempts (logged elsewhere in this file as ms)
maxAttempts: 1, // maximum number of attempts
delay: 2000, // delay between attempts (logged elsewhere in this file as ms)
};
// Upload endpoint settings; read from `config.common.upload` when present.
this.uploadConfig = config.common?.upload || {
url: 'https://apibase.sohomall.jp/uploaders',
scene: 'goods',
timeout: 600000
url: "https://apibase.sohomall.jp/uploaders",
scene: "goods",
timeout: 600000,
};
}

@@ -39,12 +40,17 @@ class AmazonCrawler extends BaseCrawler {
* @returns {Promise<string>} 截图目录路径
*/
async createScreenshotDir() {
const dir = path.join(process.cwd(), 'screenshots');
const dir = path.join(process.cwd(), "screenshots");
try {
await fsPromises.mkdir(dir, { recursive: true });
} catch (error) {
if (error.code !== 'EEXIST') {
throw new CrawlerError('创建截图目录失败', 'SCREENSHOT_DIR_ERROR', 'amazon', error);
if (error.code !== "EEXIST") {
throw new CrawlerError(
"创建截图目录失败",
"SCREENSHOT_DIR_ERROR",
"amazon",
error
);
}
}
return dir;
@@ -58,19 +64,19 @@ class AmazonCrawler extends BaseCrawler {
async uploadImage(imagePath) {
try {
const formData = new FormData();
formData.append('file', fs.createReadStream(imagePath));
formData.append('scene', this.uploadConfig.scene);
formData.append("file", fs.createReadStream(imagePath));
formData.append("scene", this.uploadConfig.scene);

const response = await axios.post(this.uploadConfig.url, formData, {
headers: {
...formData.getHeaders(),
'Content-Type': 'multipart/form-data'
"Content-Type": "multipart/form-data",
},
timeout: this.uploadConfig.timeout
timeout: this.uploadConfig.timeout,
});

if (!response.data || !response.data.url) {
throw new Error('上传响应格式错误');
throw new Error("上传响应格式错误");
}

return response.data.url;
@@ -78,12 +84,17 @@ class AmazonCrawler extends BaseCrawler {
if (error.response) {
throw new CrawlerError(
`图片上传失败: ${error.response.status} ${error.response.statusText}`,
'IMAGE_UPLOAD_ERROR',
'amazon',
"IMAGE_UPLOAD_ERROR",
"amazon",
error
);
}
throw new CrawlerError('图片上传失败', 'IMAGE_UPLOAD_ERROR', 'amazon', error);
throw new CrawlerError(
"图片上传失败",
"IMAGE_UPLOAD_ERROR",
"amazon",
error
);
}
}

@@ -97,18 +108,23 @@ class AmazonCrawler extends BaseCrawler {
for (let attempt = 1; attempt <= this.retryConfig.maxAttempts; attempt++) {
try {
await this.page.goto(url, {
waitUntil: 'networkidle',
timeout: this.timeouts.pageLoad
waitUntil: "networkidle",
timeout: this.timeouts.pageLoad,
});
return;
} catch (error) {
lastError = error;
console.log(`导航尝试 ${attempt}/${this.retryConfig.maxAttempts} 失败:`, error.message);
console.log(
`导航尝试 ${attempt}/${this.retryConfig.maxAttempts} 失败:`,
error.message
);

if (attempt < this.retryConfig.maxAttempts) {
console.log(`等待 ${this.retryConfig.delay}ms 后重试...`);
await new Promise(resolve => setTimeout(resolve, this.retryConfig.delay));
await new Promise((resolve) =>
setTimeout(resolve, this.retryConfig.delay)
);

// 重新初始化浏览器
await this.closeBrowser();
await this.initBrowser();
@@ -117,8 +133,8 @@ class AmazonCrawler extends BaseCrawler {
}
throw new CrawlerError(
`页面导航失败,已重试 ${this.retryConfig.maxAttempts} 次`,
'NAVIGATION_ERROR',
'amazon',
"NAVIGATION_ERROR",
"amazon",
lastError
);
}
@@ -131,13 +147,13 @@ class AmazonCrawler extends BaseCrawler {
async waitForElement(selector) {
try {
await this.page.waitForSelector(selector, {
timeout: this.timeouts.elementWait
timeout: this.timeouts.elementWait,
});
} catch (error) {
throw new CrawlerError(
`等待元素超时: ${selector}`,
'ELEMENT_WAIT_ERROR',
'amazon',
"ELEMENT_WAIT_ERROR",
"amazon",
error
);
}
@@ -151,23 +167,55 @@ class AmazonCrawler extends BaseCrawler {
async parsePrice(priceText) {
// Converts a displayed price string into an integer.
// Returns null for empty input or when no digits are found; any exception
// raised while parsing is wrapped in a CrawlerError with code
// "PRICE_PARSE_ERROR".
// NOTE(review): adjacent single-/double-quoted duplicates below look like
// the removed and added sides of a diff hunk — confirm against the real file.
try {
if (!priceText) return null;
// Strip currency characters and whitespace (the class removes each of the
// characters ¥, J, P individually, not the sequence "JP¥").
priceText = priceText.replace(/[¥JP¥\s]/g, '');
priceText = priceText.replace(/[¥JP¥\s]/g, "");
// If the text also carries point information, keep only the part before it
if (priceText.includes('ポイント')) {
priceText = priceText.split('ポイント')[0].trim();
if (priceText.includes("ポイント")) {
priceText = priceText.split("ポイント")[0].trim();
}
// Extract the first run of digits and commas
const match = priceText.match(/([\d,]+)/);
if (!match) return null;
// Drop thousands separators and convert to an integer
return parseInt(match[1].replace(/,/g, ''));
return parseInt(match[1].replace(/,/g, ""));
} catch (error) {
throw new CrawlerError(
"价格解析失败",
"PRICE_PARSE_ERROR",
"amazon",
error
);
}
}

/**
* 处理积分
* @returns {Promise<number>} 积分金额
*/
async handlePoint() {
try {
let pointValue = 0;

const pointTrigger = await this.page.$(this.selectors.point);
if (!pointTrigger) {
return 0; // 没有积分,直接返回0
} else {
const pointText = await this.page.$eval(this.selectors.point, (el) =>
el.textContent.trim()
);
const match = pointText.match(/\d+/);
if (match) {
pointValue = match[0];
}
return pointValue;
}
} catch (error) {
throw new CrawlerError('价格解析失败', 'PRICE_PARSE_ERROR', 'amazon', error);
console.log("积分处理失败:", error.message);
return 0; // 发生错误时返回0,而不是抛出异常
}
}

@@ -178,7 +226,7 @@ class AmazonCrawler extends BaseCrawler {
async handleCoupon() {
try {
let couponValue = 0;
// 等待优惠券元素出现
const couponTrigger = await this.page.$(this.selectors.coupon);
if (!couponTrigger) {
@@ -191,37 +239,39 @@ class AmazonCrawler extends BaseCrawler {
await this.page.waitForTimeout(1000); // 增加等待时间,确保弹窗完全显示

// 等待优惠券文本出现
const couponText = await this.page.$eval('.couponLabelText', el => el.textContent.trim());
const couponText = await this.page.$eval(".couponLabelText", (el) =>
el.textContent.trim()
);

// 解析优惠券金额
const match = couponText.match(/¥\s*([\d,]+)/);
if (match) {
couponValue = parseInt(match[1].replace(/,/g, ''));
couponValue = parseInt(match[1].replace(/,/g, ""));
}

// 尝试关闭弹窗
try {
await this.page.click('button.a-modal-close', { timeout: 2000 });
await this.page.click("button.a-modal-close", { timeout: 2000 });
} catch (closeError) {
// 如果找不到关闭按钮,尝试按ESC键
await this.page.keyboard.press('Escape');
await this.page.keyboard.press("Escape");
}

// 等待弹窗消失
await this.page.waitForTimeout(500);
} catch (clickError) {
console.log('没有优惠券', clickError.message);
console.log("没有优惠券", clickError.message);
// 如果点击失败,尝试按ESC键关闭可能的弹窗
try {
await this.page.keyboard.press('Escape');
await this.page.keyboard.press("Escape");
} catch (escError) {
console.log('ESC键关闭失败:', escError.message);
console.log("ESC键关闭失败:", escError.message);
}
}

return couponValue;
} catch (error) {
console.log('优惠券处理失败:', error.message);
console.log("优惠券处理失败:", error.message);
return 0; // 发生错误时返回0,而不是抛出异常
}
}
@@ -232,9 +282,16 @@ class AmazonCrawler extends BaseCrawler {
*/
async getTitle() {
// Reads the trimmed text content of the element matched by
// `this.selectors.title`. Any failure from `$eval` is wrapped in a
// CrawlerError with code "TITLE_GET_ERROR".
// NOTE(review): the paired one-line / multi-line statements below look like
// the removed and added sides of a diff hunk — confirm against the real file.
try {
return await this.page.$eval(this.selectors.title, el => el.textContent.trim());
return await this.page.$eval(this.selectors.title, (el) =>
el.textContent.trim()
);
} catch (error) {
throw new CrawlerError('获取标题失败', 'TITLE_GET_ERROR', 'amazon', error);
throw new CrawlerError(
"获取标题失败",
"TITLE_GET_ERROR",
"amazon",
error
);
}
}

@@ -247,7 +304,7 @@ class AmazonCrawler extends BaseCrawler {
const url = this.page.url();
return url.match(/\/dp\/([A-Z0-9]{10})/)?.[1] || null;
} catch (error) {
throw new CrawlerError('获取SKU失败', 'SKU_GET_ERROR', 'amazon', error);
throw new CrawlerError("获取SKU失败", "SKU_GET_ERROR", "amazon", error);
}
}

@@ -260,15 +317,20 @@ class AmazonCrawler extends BaseCrawler {
await this.page.waitForSelector(this.selectors.variants);
const groupEls = await this.page.$$(this.selectors.variants);
const groups = [];
for (const groupEl of groupEls) {
const btns = await groupEl.$$('.a-button-inner .a-button-input');
const btns = await groupEl.$$(".a-button-inner .a-button-input");
if (btns.length) groups.push(btns);
}
return groups;
} catch (error) {
throw new CrawlerError('获取变体信息失败', 'VARIANTS_GET_ERROR', 'amazon', error);
throw new CrawlerError(
"获取变体信息失败",
"VARIANTS_GET_ERROR",
"amazon",
error
);
}
}

@@ -279,45 +341,62 @@ class AmazonCrawler extends BaseCrawler {
async getSingleSkuInfo() {
// Collects title, price, SKU (ASIN) and URL for the product page currently
// loaded in `this.page`. The returned price is the parsed page price minus
// the coupon value (and, in the longer variant below, minus the point value),
// formatted with toLocaleString(). Throws a CrawlerError with code
// "SKU_INFO_GET_ERROR" on any failure, including incomplete data.
// NOTE(review): this span contains two versions of the page.evaluate call
// (one without and one with `pointValue`) plus other quote-style duplicates;
// they look like the removed and added sides of a diff hunk — confirm
// against the real file.
try {
// Wait for the page to finish loading
await this.page.waitForLoadState('networkidle');
await this.page.waitForLoadState("networkidle");
// Wait for the title element to appear
await this.waitForElement(this.selectors.title);
// Resolve the coupon value
const couponValue = await this.handleCoupon();

// Resolve the point value
const pointValue = await this.handlePoint();

// Gather the product information inside the page context
const info = await this.page.evaluate(({ selectors, couponValue }) => {
const title = document.querySelector(selectors.title)?.textContent.trim() || null;
let priceText = document.querySelector(selectors.price)?.textContent.trim() || null;
// Keep only the part of the price text before any point information
if (priceText?.includes('ポイント')) {
priceText = priceText.split('ポイント')[0].trim();
}
// Parse the price and subtract the coupon value
const price = priceText ? parseInt(priceText.replace(/[¥JP¥\s,]/g, '')) - couponValue : null;
const url = window.location.href;
// The ASIN is the 10-character segment after "/dp/" in the URL
const asin = url.match(/\/dp\/([A-Z0-9]{10})/)?.[1] || null;
return {
title,
price: price ? price.toLocaleString() : null,
sku: asin,
url
};
}, { selectors: this.selectors, couponValue });
const info = await this.page.evaluate(
({ selectors, couponValue, pointValue }) => {
const title =
document.querySelector(selectors.title)?.textContent.trim() || null;
let priceText =
document.querySelector(selectors.price)?.textContent.trim() || null;

// Keep only the part of the price text before any point information
if (priceText?.includes("ポイント")) {
priceText = priceText.split("ポイント")[0].trim();
}

// Parse the price, then subtract the coupon and point values
const price = priceText
? parseInt(priceText.replace(/[¥JP¥\s,]/g, "")) -
couponValue -
pointValue
: null;
const url = window.location.href;
// The ASIN is the 10-character segment after "/dp/" in the URL
const asin = url.match(/\/dp\/([A-Z0-9]{10})/)?.[1] || null;

return {
title,
// A price of 0 or NaN is falsy here and is reported as null
price: price ? price.toLocaleString() : null,
sku: asin,
url,
};
},
{ selectors: this.selectors, couponValue, pointValue }
);

// Validate the required fields
if (!info.title || !info.price || !info.sku) {
throw new Error('商品信息不完整');
throw new Error("商品信息不完整");
}

return info;
} catch (error) {
throw new CrawlerError('获取SKU信息失败', 'SKU_INFO_GET_ERROR', 'amazon', error);
throw new CrawlerError(
"获取SKU信息失败",
"SKU_INFO_GET_ERROR",
"amazon",
error
);
}
}

@@ -330,16 +409,16 @@ class AmazonCrawler extends BaseCrawler {
async crawl(url, needScreenshot = false) {
try {
await this.initBrowser();
// 设置页面超时
this.page.setDefaultTimeout(this.timeouts.elementWait);
this.page.setDefaultNavigationTimeout(this.timeouts.pageLoad);
// 设置请求拦截
await this.page.route('**/*', route => {
await this.page.route("**/*", (route) => {
const resourceType = route.request().resourceType();
// 只阻止字体和媒体资源,允许加载图片
if (['font', 'media'].includes(resourceType)) {
if (["font", "media"].includes(resourceType)) {
route.abort();
} else {
route.continue();
@@ -347,7 +426,7 @@ class AmazonCrawler extends BaseCrawler {
});

// 导航到目标页面
await this.navigateWithRetry(url.split('?')[0]);
await this.navigateWithRetry(url.split("?")[0]);

// 只获取单个SKU信息
const data = [await this.getSingleSkuInfo()];
@@ -357,22 +436,22 @@ class AmazonCrawler extends BaseCrawler {
const dir = await this.createScreenshotDir();
const filename = `${Date.now()}.png`;
const shot = path.join(dir, filename);
// 等待页面完全加载
await this.page.waitForLoadState('networkidle');
await this.page.waitForLoadState("networkidle");
// 截取全页面
await this.page.screenshot({
path: shot,
await this.page.screenshot({
path: shot,
fullPage: true,
timeout: this.timeouts.elementWait
timeout: this.timeouts.elementWait,
});
// 上传图片并获取URL
const imageUrl = await this.uploadImage(shot);
// 更新数据,添加图片URL
data.forEach(item => {
data.forEach((item) => {
item.screenshotUrl = imageUrl;
});

@@ -380,21 +459,26 @@ class AmazonCrawler extends BaseCrawler {
try {
await fsPromises.unlink(shot);
} catch (error) {
console.error('删除临时截图文件失败:', error);
console.error("删除临时截图文件失败:", error);
}
} catch (error) {
console.error('截图处理失败:', error);
console.error("截图处理失败:", error);
// 截图失败不影响主流程
}
}

return data;
} catch (error) {
throw new CrawlerError('商品信息抓取失败', 'CRAWL_ERROR', 'amazon', error);
throw new CrawlerError(
"商品信息抓取失败",
"CRAWL_ERROR",
"amazon",
error
);
} finally {
await this.closeBrowser();
}
}
}

module.exports = AmazonCrawler;
module.exports = AmazonCrawler;

読み込み中…
キャンセル
保存