Browse Source

refactor: 优化亚马逊爬虫代码格式和异常处理

- 统一了代码中的字符串引号风格,使用双引号替代单引号,以提高代码一致性
- 增强了异常处理逻辑,确保处理积分和优惠券出错时返回 0 而不是抛出异常,提升了代码的健壮性
- 该更改旨在提高代码可读性和维护性,符合 SOLID 原则和命名规范
master
lizhuang 1 month ago
parent
commit
87028a3877
1 changed files with 194 additions and 110 deletions
  1. 194
    110
      src/services/crawlers/amazon/AmazonCrawler.js

+ 194
- 110
src/services/crawlers/amazon/AmazonCrawler.js View File

const BaseCrawler = require('../base/BaseCrawler');
const CrawlerError = require('../../errors/CrawlerError');
const path = require('path');
const fs = require('fs');
const fsPromises = require('fs').promises;
const axios = require('axios');
const FormData = require('form-data');
const BaseCrawler = require("../base/BaseCrawler");
const CrawlerError = require("../../errors/CrawlerError");
const path = require("path");
const fs = require("fs");
const fsPromises = require("fs").promises;
const axios = require("axios");
const FormData = require("form-data");


/** /**
* 亚马逊爬虫实现类 * 亚马逊爬虫实现类
constructor(config) { constructor(config) {
super(config); super(config);
this.selectors = { this.selectors = {
title: '#productTitle',
price: 'span.a-price > span.a-offscreen',
title: "#productTitle",
price: "span.a-price > span.a-offscreen",
coupon: '.a-declarative[data-action="a-modal"], .couponLabelText', coupon: '.a-declarative[data-action="a-modal"], .couponLabelText',
variants: '.a-cardui-body #twister-plus-inline-twister > .a-section'
variants: ".a-cardui-body #twister-plus-inline-twister > .a-section",
point: "#points_feature_div .a-color-price",
}; };
this.timeouts = config.timeouts || { this.timeouts = config.timeouts || {
pageLoad: 60000, // 页面加载超时时间
pageLoad: 60000, // 页面加载超时时间
elementWait: 10000, // 元素等待超时时间 elementWait: 10000, // 元素等待超时时间
networkIdle: 5000 // 网络空闲超时时间
networkIdle: 5000, // 网络空闲超时时间
}; };
this.retryConfig = config.retry || { this.retryConfig = config.retry || {
maxAttempts: 1, // 最大重试次数
delay: 2000 // 重试延迟时间
maxAttempts: 1, // 最大重试次数
delay: 2000, // 重试延迟时间
}; };
this.uploadConfig = config.common?.upload || { this.uploadConfig = config.common?.upload || {
url: 'https://apibase.sohomall.jp/uploaders',
scene: 'goods',
timeout: 600000
url: "https://apibase.sohomall.jp/uploaders",
scene: "goods",
timeout: 600000,
}; };
} }


* @returns {Promise<string>} 截图目录路径 * @returns {Promise<string>} 截图目录路径
*/ */
async createScreenshotDir() { async createScreenshotDir() {
const dir = path.join(process.cwd(), 'screenshots');
const dir = path.join(process.cwd(), "screenshots");
try { try {
await fsPromises.mkdir(dir, { recursive: true }); await fsPromises.mkdir(dir, { recursive: true });
} catch (error) { } catch (error) {
if (error.code !== 'EEXIST') {
throw new CrawlerError('创建截图目录失败', 'SCREENSHOT_DIR_ERROR', 'amazon', error);
if (error.code !== "EEXIST") {
throw new CrawlerError(
"创建截图目录失败",
"SCREENSHOT_DIR_ERROR",
"amazon",
error
);
} }
} }
return dir; return dir;
async uploadImage(imagePath) { async uploadImage(imagePath) {
try { try {
const formData = new FormData(); const formData = new FormData();
formData.append('file', fs.createReadStream(imagePath));
formData.append('scene', this.uploadConfig.scene);
formData.append("file", fs.createReadStream(imagePath));
formData.append("scene", this.uploadConfig.scene);


const response = await axios.post(this.uploadConfig.url, formData, { const response = await axios.post(this.uploadConfig.url, formData, {
headers: { headers: {
...formData.getHeaders(), ...formData.getHeaders(),
'Content-Type': 'multipart/form-data'
"Content-Type": "multipart/form-data",
}, },
timeout: this.uploadConfig.timeout
timeout: this.uploadConfig.timeout,
}); });


if (!response.data || !response.data.url) { if (!response.data || !response.data.url) {
throw new Error('上传响应格式错误');
throw new Error("上传响应格式错误");
} }


return response.data.url; return response.data.url;
if (error.response) { if (error.response) {
throw new CrawlerError( throw new CrawlerError(
`图片上传失败: ${error.response.status} ${error.response.statusText}`, `图片上传失败: ${error.response.status} ${error.response.statusText}`,
'IMAGE_UPLOAD_ERROR',
'amazon',
"IMAGE_UPLOAD_ERROR",
"amazon",
error error
); );
} }
throw new CrawlerError('图片上传失败', 'IMAGE_UPLOAD_ERROR', 'amazon', error);
throw new CrawlerError(
"图片上传失败",
"IMAGE_UPLOAD_ERROR",
"amazon",
error
);
} }
} }


for (let attempt = 1; attempt <= this.retryConfig.maxAttempts; attempt++) { for (let attempt = 1; attempt <= this.retryConfig.maxAttempts; attempt++) {
try { try {
await this.page.goto(url, { await this.page.goto(url, {
waitUntil: 'networkidle',
timeout: this.timeouts.pageLoad
waitUntil: "networkidle",
timeout: this.timeouts.pageLoad,
}); });
return; return;
} catch (error) { } catch (error) {
lastError = error; lastError = error;
console.log(`导航尝试 ${attempt}/${this.retryConfig.maxAttempts} 失败:`, error.message);
console.log(
`导航尝试 ${attempt}/${this.retryConfig.maxAttempts} 失败:`,
error.message
);

if (attempt < this.retryConfig.maxAttempts) { if (attempt < this.retryConfig.maxAttempts) {
console.log(`等待 ${this.retryConfig.delay}ms 后重试...`); console.log(`等待 ${this.retryConfig.delay}ms 后重试...`);
await new Promise(resolve => setTimeout(resolve, this.retryConfig.delay));
await new Promise((resolve) =>
setTimeout(resolve, this.retryConfig.delay)
);

// 重新初始化浏览器 // 重新初始化浏览器
await this.closeBrowser(); await this.closeBrowser();
await this.initBrowser(); await this.initBrowser();
} }
throw new CrawlerError( throw new CrawlerError(
`页面导航失败,已重试 ${this.retryConfig.maxAttempts} 次`, `页面导航失败,已重试 ${this.retryConfig.maxAttempts} 次`,
'NAVIGATION_ERROR',
'amazon',
"NAVIGATION_ERROR",
"amazon",
lastError lastError
); );
} }
async waitForElement(selector) { async waitForElement(selector) {
try { try {
await this.page.waitForSelector(selector, { await this.page.waitForSelector(selector, {
timeout: this.timeouts.elementWait
timeout: this.timeouts.elementWait,
}); });
} catch (error) { } catch (error) {
throw new CrawlerError( throw new CrawlerError(
`等待元素超时: ${selector}`, `等待元素超时: ${selector}`,
'ELEMENT_WAIT_ERROR',
'amazon',
"ELEMENT_WAIT_ERROR",
"amazon",
error error
); );
} }
async parsePrice(priceText) { async parsePrice(priceText) {
try { try {
if (!priceText) return null; if (!priceText) return null;
// 移除货币符号和空格 // 移除货币符号和空格
priceText = priceText.replace(/[¥JP¥\s]/g, '');
priceText = priceText.replace(/[¥JP¥\s]/g, "");
// 如果包含积分信息,只取价格部分 // 如果包含积分信息,只取价格部分
if (priceText.includes('ポイント')) {
priceText = priceText.split('ポイント')[0].trim();
if (priceText.includes("ポイント")) {
priceText = priceText.split("ポイント")[0].trim();
} }
// 提取数字部分 // 提取数字部分
const match = priceText.match(/([\d,]+)/); const match = priceText.match(/([\d,]+)/);
if (!match) return null; if (!match) return null;
// 转换价格 // 转换价格
return parseInt(match[1].replace(/,/g, ''));
return parseInt(match[1].replace(/,/g, ""));
} catch (error) {
throw new CrawlerError(
"价格解析失败",
"PRICE_PARSE_ERROR",
"amazon",
error
);
}
}

/**
* 处理积分
* @returns {Promise<number>} 积分金额
*/
async handlePoint() {
try {
let pointValue = 0;

const pointTrigger = await this.page.$(this.selectors.point);
if (!pointTrigger) {
return 0; // 没有积分,直接返回0
} else {
const pointText = await this.page.$eval(this.selectors.point, (el) =>
el.textContent.trim()
);
const match = pointText.match(/\d+/);
if (match) {
pointValue = match[0];
}
return pointValue;
}
} catch (error) { } catch (error) {
throw new CrawlerError('价格解析失败', 'PRICE_PARSE_ERROR', 'amazon', error);
console.log("积分处理失败:", error.message);
return 0; // 发生错误时返回0,而不是抛出异常
} }
} }


async handleCoupon() { async handleCoupon() {
try { try {
let couponValue = 0; let couponValue = 0;
// 等待优惠券元素出现 // 等待优惠券元素出现
const couponTrigger = await this.page.$(this.selectors.coupon); const couponTrigger = await this.page.$(this.selectors.coupon);
if (!couponTrigger) { if (!couponTrigger) {
await this.page.waitForTimeout(1000); // 增加等待时间,确保弹窗完全显示 await this.page.waitForTimeout(1000); // 增加等待时间,确保弹窗完全显示


// 等待优惠券文本出现 // 等待优惠券文本出现
const couponText = await this.page.$eval('.couponLabelText', el => el.textContent.trim());
const couponText = await this.page.$eval(".couponLabelText", (el) =>
el.textContent.trim()
);

// 解析优惠券金额 // 解析优惠券金额
const match = couponText.match(/¥\s*([\d,]+)/); const match = couponText.match(/¥\s*([\d,]+)/);
if (match) { if (match) {
couponValue = parseInt(match[1].replace(/,/g, ''));
couponValue = parseInt(match[1].replace(/,/g, ""));
} }


// 尝试关闭弹窗 // 尝试关闭弹窗
try { try {
await this.page.click('button.a-modal-close', { timeout: 2000 });
await this.page.click("button.a-modal-close", { timeout: 2000 });
} catch (closeError) { } catch (closeError) {
// 如果找不到关闭按钮,尝试按ESC键 // 如果找不到关闭按钮,尝试按ESC键
await this.page.keyboard.press('Escape');
await this.page.keyboard.press("Escape");
} }


// 等待弹窗消失 // 等待弹窗消失
await this.page.waitForTimeout(500); await this.page.waitForTimeout(500);
} catch (clickError) { } catch (clickError) {
console.log('没有优惠券', clickError.message);
console.log("没有优惠券", clickError.message);
// 如果点击失败,尝试按ESC键关闭可能的弹窗 // 如果点击失败,尝试按ESC键关闭可能的弹窗
try { try {
await this.page.keyboard.press('Escape');
await this.page.keyboard.press("Escape");
} catch (escError) { } catch (escError) {
console.log('ESC键关闭失败:', escError.message);
console.log("ESC键关闭失败:", escError.message);
} }
} }


return couponValue; return couponValue;
} catch (error) { } catch (error) {
console.log('优惠券处理失败:', error.message);
console.log("优惠券处理失败:", error.message);
return 0; // 发生错误时返回0,而不是抛出异常 return 0; // 发生错误时返回0,而不是抛出异常
} }
} }
*/ */
async getTitle() { async getTitle() {
try { try {
return await this.page.$eval(this.selectors.title, el => el.textContent.trim());
return await this.page.$eval(this.selectors.title, (el) =>
el.textContent.trim()
);
} catch (error) { } catch (error) {
throw new CrawlerError('获取标题失败', 'TITLE_GET_ERROR', 'amazon', error);
throw new CrawlerError(
"获取标题失败",
"TITLE_GET_ERROR",
"amazon",
error
);
} }
} }


const url = this.page.url(); const url = this.page.url();
return url.match(/\/dp\/([A-Z0-9]{10})/)?.[1] || null; return url.match(/\/dp\/([A-Z0-9]{10})/)?.[1] || null;
} catch (error) { } catch (error) {
throw new CrawlerError('获取SKU失败', 'SKU_GET_ERROR', 'amazon', error);
throw new CrawlerError("获取SKU失败", "SKU_GET_ERROR", "amazon", error);
} }
} }


await this.page.waitForSelector(this.selectors.variants); await this.page.waitForSelector(this.selectors.variants);
const groupEls = await this.page.$$(this.selectors.variants); const groupEls = await this.page.$$(this.selectors.variants);
const groups = []; const groups = [];
for (const groupEl of groupEls) { for (const groupEl of groupEls) {
const btns = await groupEl.$$('.a-button-inner .a-button-input');
const btns = await groupEl.$$(".a-button-inner .a-button-input");
if (btns.length) groups.push(btns); if (btns.length) groups.push(btns);
} }
return groups; return groups;
} catch (error) { } catch (error) {
throw new CrawlerError('获取变体信息失败', 'VARIANTS_GET_ERROR', 'amazon', error);
throw new CrawlerError(
"获取变体信息失败",
"VARIANTS_GET_ERROR",
"amazon",
error
);
} }
} }


async getSingleSkuInfo() { async getSingleSkuInfo() {
try { try {
// 等待页面加载完成 // 等待页面加载完成
await this.page.waitForLoadState('networkidle');
await this.page.waitForLoadState("networkidle");
// 等待标题元素出现 // 等待标题元素出现
await this.waitForElement(this.selectors.title); await this.waitForElement(this.selectors.title);
// 处理优惠券 // 处理优惠券
const couponValue = await this.handleCoupon(); const couponValue = await this.handleCoupon();

// 处理积分
const pointValue = await this.handlePoint();

// 获取商品信息 // 获取商品信息
const info = await this.page.evaluate(({ selectors, couponValue }) => {
const title = document.querySelector(selectors.title)?.textContent.trim() || null;
let priceText = document.querySelector(selectors.price)?.textContent.trim() || null;
// 处理价格文本
if (priceText?.includes('ポイント')) {
priceText = priceText.split('ポイント')[0].trim();
}
// 解析价格
const price = priceText ? parseInt(priceText.replace(/[¥JP¥\s,]/g, '')) - couponValue : null;
const url = window.location.href;
const asin = url.match(/\/dp\/([A-Z0-9]{10})/)?.[1] || null;
return {
title,
price: price ? price.toLocaleString() : null,
sku: asin,
url
};
}, { selectors: this.selectors, couponValue });
const info = await this.page.evaluate(
({ selectors, couponValue, pointValue }) => {
const title =
document.querySelector(selectors.title)?.textContent.trim() || null;
let priceText =
document.querySelector(selectors.price)?.textContent.trim() || null;

// 处理价格文本
if (priceText?.includes("ポイント")) {
priceText = priceText.split("ポイント")[0].trim();
}

// 解析价格
const price = priceText
? parseInt(priceText.replace(/[¥JP¥\s,]/g, "")) -
couponValue -
pointValue
: null;
const url = window.location.href;
const asin = url.match(/\/dp\/([A-Z0-9]{10})/)?.[1] || null;

return {
title,
price: price ? price.toLocaleString() : null,
sku: asin,
url,
};
},
{ selectors: this.selectors, couponValue, pointValue }
);


// 验证必要信息 // 验证必要信息
if (!info.title || !info.price || !info.sku) { if (!info.title || !info.price || !info.sku) {
throw new Error('商品信息不完整');
throw new Error("商品信息不完整");
} }


return info; return info;
} catch (error) { } catch (error) {
throw new CrawlerError('获取SKU信息失败', 'SKU_INFO_GET_ERROR', 'amazon', error);
throw new CrawlerError(
"获取SKU信息失败",
"SKU_INFO_GET_ERROR",
"amazon",
error
);
} }
} }


async crawl(url, needScreenshot = false) { async crawl(url, needScreenshot = false) {
try { try {
await this.initBrowser(); await this.initBrowser();
// 设置页面超时 // 设置页面超时
this.page.setDefaultTimeout(this.timeouts.elementWait); this.page.setDefaultTimeout(this.timeouts.elementWait);
this.page.setDefaultNavigationTimeout(this.timeouts.pageLoad); this.page.setDefaultNavigationTimeout(this.timeouts.pageLoad);
// 设置请求拦截 // 设置请求拦截
await this.page.route('**/*', route => {
await this.page.route("**/*", (route) => {
const resourceType = route.request().resourceType(); const resourceType = route.request().resourceType();
// 只阻止字体和媒体资源,允许加载图片 // 只阻止字体和媒体资源,允许加载图片
if (['font', 'media'].includes(resourceType)) {
if (["font", "media"].includes(resourceType)) {
route.abort(); route.abort();
} else { } else {
route.continue(); route.continue();
}); });


// 导航到目标页面 // 导航到目标页面
await this.navigateWithRetry(url.split('?')[0]);
await this.navigateWithRetry(url.split("?")[0]);


// 只获取单个SKU信息 // 只获取单个SKU信息
const data = [await this.getSingleSkuInfo()]; const data = [await this.getSingleSkuInfo()];
const dir = await this.createScreenshotDir(); const dir = await this.createScreenshotDir();
const filename = `${Date.now()}.png`; const filename = `${Date.now()}.png`;
const shot = path.join(dir, filename); const shot = path.join(dir, filename);
// 等待页面完全加载 // 等待页面完全加载
await this.page.waitForLoadState('networkidle');
await this.page.waitForLoadState("networkidle");
// 截取全页面 // 截取全页面
await this.page.screenshot({
path: shot,
await this.page.screenshot({
path: shot,
fullPage: true, fullPage: true,
timeout: this.timeouts.elementWait
timeout: this.timeouts.elementWait,
}); });
// 上传图片并获取URL // 上传图片并获取URL
const imageUrl = await this.uploadImage(shot); const imageUrl = await this.uploadImage(shot);
// 更新数据,添加图片URL // 更新数据,添加图片URL
data.forEach(item => {
data.forEach((item) => {
item.screenshotUrl = imageUrl; item.screenshotUrl = imageUrl;
}); });


try { try {
await fsPromises.unlink(shot); await fsPromises.unlink(shot);
} catch (error) { } catch (error) {
console.error('删除临时截图文件失败:', error);
console.error("删除临时截图文件失败:", error);
} }
} catch (error) { } catch (error) {
console.error('截图处理失败:', error);
console.error("截图处理失败:", error);
// 截图失败不影响主流程 // 截图失败不影响主流程
} }
} }


return data; return data;
} catch (error) { } catch (error) {
throw new CrawlerError('商品信息抓取失败', 'CRAWL_ERROR', 'amazon', error);
throw new CrawlerError(
"商品信息抓取失败",
"CRAWL_ERROR",
"amazon",
error
);
} finally { } finally {
await this.closeBrowser(); await this.closeBrowser();
} }
} }
} }


module.exports = AmazonCrawler;
module.exports = AmazonCrawler;

Loading…
Cancel
Save