- const BaseCrawler = require("../base/BaseCrawler");
- const CrawlerError = require("../../errors/CrawlerError");
- const path = require("path");
- const fs = require("fs");
- const fsPromises = require("fs").promises;
- const axios = require("axios");
- const FormData = require("form-data");
-
/**
 * Amazon product-page crawler.
 *
 * Relies on members that are not defined in this class and are presumably
 * supplied by BaseCrawler (TODO confirm): `this.page`, `this.initBrowser()`
 * and `this.closeBrowser()`. The `page` object is used through a
 * Playwright-style API (`route`, `waitForLoadState`, `$eval`, ...).
 */
class AmazonCrawler extends BaseCrawler {
  /**
   * @param {Object} config - Crawler configuration, forwarded to BaseCrawler.
   * @param {Object} [config.timeouts] - Overrides for `pageLoad`, `elementWait`, `networkIdle` (ms).
   * @param {Object} [config.retry] - Overrides for `maxAttempts` and `delay` (ms).
   * @param {Object} [config.common] - Shared settings; `common.upload` overrides `url`, `scene`, `timeout`.
   */
  constructor(config) {
    super(config);

    // CSS selectors for the pieces of the product page that are read.
    this.selectors = {
      title: "#productTitle",
      price: "span.a-price > span.a-offscreen",
      coupon: '.a-declarative[data-action="a-modal"], .couponLabelText',
      variants: ".a-cardui-body #twister-plus-inline-twister > .a-section",
      point: "#points_feature_div .a-color-price",
    };

    // Defaults are merged with the caller's values so that a partial
    // override (e.g. only `pageLoad`) does not drop the other settings.
    this.timeouts = {
      pageLoad: 60000, // page navigation timeout (ms)
      elementWait: 10000, // element wait timeout (ms)
      networkIdle: 5000, // network-idle timeout (ms); not read anywhere in this class
      ...config?.timeouts,
    };
    this.retryConfig = {
      maxAttempts: 1, // total number of navigation attempts
      delay: 2000, // pause between attempts (ms)
      ...config?.retry,
    };
    this.uploadConfig = {
      url: "https://apibase.sohomall.jp/uploaders",
      scene: "digital-yy",
      timeout: 600000,
      ...config?.common?.upload,
    };
  }

  /**
   * Ensure the `screenshots` directory exists under the current working directory.
   *
   * @returns {Promise<string>} Absolute path of the screenshot directory.
   * @throws {CrawlerError} SCREENSHOT_DIR_ERROR when the directory cannot be created.
   */
  async createScreenshotDir() {
    const dir = path.join(process.cwd(), "screenshots");
    try {
      // With `recursive: true` an already existing directory is not an error,
      // so any rejection here (including EEXIST, which then means the path is
      // occupied by a non-directory) is a genuine failure.
      await fsPromises.mkdir(dir, { recursive: true });
    } catch (error) {
      throw new CrawlerError(
        "创建截图目录失败",
        "SCREENSHOT_DIR_ERROR",
        "amazon",
        error
      );
    }
    return dir;
  }

  /**
   * Upload an image file to the configured upload endpoint.
   *
   * @param {string} imagePath - Path of the image file on disk.
   * @returns {Promise<string>} The `url` field of the upload response.
   * @throws {CrawlerError} IMAGE_UPLOAD_ERROR on HTTP failure or when the response has no `url`.
   */
  async uploadImage(imagePath) {
    try {
      const formData = new FormData();
      formData.append("file", fs.createReadStream(imagePath));
      formData.append("scene", this.uploadConfig.scene);

      const response = await axios.post(this.uploadConfig.url, formData, {
        // getHeaders() already supplies `multipart/form-data; boundary=...`.
        // Overriding Content-Type by hand would drop the boundary parameter.
        headers: formData.getHeaders(),
        timeout: this.uploadConfig.timeout,
      });

      if (!response.data || !response.data.url) {
        throw new Error("上传响应格式错误");
      }

      return response.data.url;
    } catch (error) {
      // `error.response` is present when the server answered with an error status.
      const message = error.response
        ? `图片上传失败: ${error.response.status} ${error.response.statusText}`
        : "图片上传失败";
      throw new CrawlerError(message, "IMAGE_UPLOAD_ERROR", "amazon", error);
    }
  }

  /**
   * Apply per-page settings: default timeouts and the request filter that
   * aborts font and media requests (images and everything else still load).
   *
   * Must be re-run whenever `this.page` is replaced, because these settings
   * are attached to the page object itself.
   *
   * @returns {Promise<void>}
   */
  async applyPageSettings() {
    this.page.setDefaultTimeout(this.timeouts.elementWait);
    this.page.setDefaultNavigationTimeout(this.timeouts.pageLoad);

    const blockedTypes = new Set(["font", "media"]);
    await this.page.route("**/*", (route) =>
      blockedTypes.has(route.request().resourceType())
        ? route.abort()
        : route.continue()
    );
  }

  /**
   * Navigate to a URL, retrying according to `this.retryConfig`.
   * Between attempts the browser is closed and re-initialised and the page
   * settings are applied again to the fresh page.
   *
   * @param {string} url - Target URL.
   * @returns {Promise<void>}
   * @throws {CrawlerError} NAVIGATION_ERROR when every attempt fails.
   */
  async navigateWithRetry(url) {
    // Always try at least once, even if the configured value is 0 or invalid.
    const maxAttempts = Math.max(
      1,
      Number.parseInt(this.retryConfig.maxAttempts, 10) || 1
    );
    let lastError;

    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
      try {
        await this.page.goto(url, {
          waitUntil: "networkidle",
          timeout: this.timeouts.pageLoad,
        });
        return;
      } catch (error) {
        lastError = error;
        console.log(`导航尝试 ${attempt}/${maxAttempts} 失败:`, error.message);

        if (attempt < maxAttempts) {
          console.log(`等待 ${this.retryConfig.delay}ms 后重试...`);
          await new Promise((resolve) =>
            setTimeout(resolve, this.retryConfig.delay)
          );

          // Start over with a fresh browser; the new page has none of the
          // previous page's timeouts or route handlers, so re-apply them.
          await this.closeBrowser();
          await this.initBrowser();
          await this.applyPageSettings();
        }
      }
    }

    throw new CrawlerError(
      `页面导航失败,已重试 ${maxAttempts} 次`,
      "NAVIGATION_ERROR",
      "amazon",
      lastError
    );
  }

  /**
   * Wait until an element matching the selector appears.
   *
   * @param {string} selector - CSS selector.
   * @returns {Promise<void>}
   * @throws {CrawlerError} ELEMENT_WAIT_ERROR when the wait fails or times out.
   */
  async waitForElement(selector) {
    try {
      await this.page.waitForSelector(selector, {
        timeout: this.timeouts.elementWait,
      });
    } catch (error) {
      throw new CrawlerError(
        `等待元素超时: ${selector}`,
        "ELEMENT_WAIT_ERROR",
        "amazon",
        error
      );
    }
  }

  /**
   * Parse a price string such as "¥1,234" or "JP¥ 2,980 30ポイント(1%)".
   *
   * Only the text before the first "ポイント" is considered, and the first
   * group of digits (with optional thousands separators) in it is the price.
   *
   * @param {string} priceText - Raw price text.
   * @returns {Promise<number|null>} Integer price, or null when the text is empty or has no digits.
   * @throws {CrawlerError} PRICE_PARSE_ERROR on unexpected failure.
   */
  async parsePrice(priceText) {
    try {
      if (!priceText) return null;

      // Drop the trailing points information, if any.
      const pricePart = String(priceText).split("ポイント")[0];

      // Take the first digit group as-is. Whitespace must NOT be stripped
      // beforehand, otherwise "2,980 30" would collapse into "2,98030".
      const match = pricePart.match(/\d[\d,]*/);
      if (!match) return null;

      return Number.parseInt(match[0].replace(/,/g, ""), 10);
    } catch (error) {
      throw new CrawlerError(
        "价格解析失败",
        "PRICE_PARSE_ERROR",
        "amazon",
        error
      );
    }
  }

  /**
   * Read the points value shown on the page.
   * Best effort: any failure is logged and reported as 0.
   *
   * @returns {Promise<number>} Points as an integer, 0 when absent or unreadable.
   */
  async handlePoint() {
    try {
      const pointElement = await this.page.$(this.selectors.point);
      if (!pointElement) {
        return 0; // no points shown
      }

      const pointText = await this.page.$eval(this.selectors.point, (el) =>
        el.textContent.trim()
      );

      // First digit group, thousands separators allowed ("1,234ポイント").
      const match = pointText.match(/\d[\d,]*/);
      if (!match) return 0;

      const points = Number.parseInt(match[0].replace(/,/g, ""), 10);
      return Number.isNaN(points) ? 0 : points;
    } catch (error) {
      console.log("积分处理失败:", error.message);
      return 0; // deliberately not rethrown
    }
  }

  /**
   * Read the coupon amount by clicking the coupon trigger and parsing the
   * coupon label. Only fixed yen amounts are recognised; a label without a
   * yen amount (for example a percentage) yields 0.
   * Best effort: any failure is logged and reported as 0.
   *
   * @returns {Promise<number>} Coupon amount in yen, 0 when absent or unreadable.
   */
  async handleCoupon() {
    try {
      const couponTrigger = await this.page.$(this.selectors.coupon);
      if (!couponTrigger) {
        return 0; // no coupon on this page
      }

      let couponValue = 0;
      try {
        await couponTrigger.click();
        // Give the modal time to finish rendering.
        await this.page.waitForTimeout(1000);

        const couponText = await this.page.$eval(".couponLabelText", (el) =>
          el.textContent.trim()
        );

        // Accept both the half-width (U+00A5) and full-width (U+FFE5) yen sign.
        const match = couponText.match(/[\u00A5\uFFE5]\s*(\d[\d,]*)/);
        if (match) {
          couponValue = Number.parseInt(match[1].replace(/,/g, ""), 10);
        }

        // Close the modal; fall back to Escape when there is no close button.
        try {
          await this.page.click("button.a-modal-close", { timeout: 2000 });
        } catch (closeError) {
          await this.page.keyboard.press("Escape");
        }

        // Let the modal disappear before the page is read again.
        await this.page.waitForTimeout(500);
      } catch (clickError) {
        console.log("没有优惠券", clickError.message);
        // A modal may still be open; try to dismiss it.
        try {
          await this.page.keyboard.press("Escape");
        } catch (escError) {
          console.log("ESC键关闭失败:", escError.message);
        }
      }

      return couponValue;
    } catch (error) {
      console.log("优惠券处理失败:", error.message);
      return 0; // deliberately not rethrown
    }
  }

  /**
   * Read the product title.
   *
   * @returns {Promise<string>} Trimmed title text.
   * @throws {CrawlerError} TITLE_GET_ERROR when the title element cannot be read.
   */
  async getTitle() {
    try {
      return await this.page.$eval(this.selectors.title, (el) =>
        el.textContent.trim()
      );
    } catch (error) {
      throw new CrawlerError(
        "获取标题失败",
        "TITLE_GET_ERROR",
        "amazon",
        error
      );
    }
  }

  /**
   * Extract the 10-character identifier following `/dp/` in the current page URL.
   *
   * @returns {Promise<string|null>} The identifier, or null when the URL has no `/dp/<id>` segment.
   * @throws {CrawlerError} SKU_GET_ERROR when the URL cannot be read.
   */
  async getSku() {
    try {
      const url = this.page.url();
      return url.match(/\/dp\/([A-Z0-9]{10})/)?.[1] || null;
    } catch (error) {
      throw new CrawlerError("获取SKU失败", "SKU_GET_ERROR", "amazon", error);
    }
  }

  /**
   * Collect the variant selector buttons, grouped by variant section.
   *
   * @returns {Promise<Array<Array>>} One array of button element handles per
   *   variant section that contains at least one button.
   * @throws {CrawlerError} VARIANTS_GET_ERROR when the variant sections cannot be read.
   */
  async getVariants() {
    try {
      await this.page.waitForSelector(this.selectors.variants);
      const groupEls = await this.page.$$(this.selectors.variants);
      const groups = [];

      for (const groupEl of groupEls) {
        const btns = await groupEl.$$(".a-button-inner .a-button-input");
        if (btns.length) groups.push(btns);
      }

      return groups;
    } catch (error) {
      throw new CrawlerError(
        "获取变体信息失败",
        "VARIANTS_GET_ERROR",
        "amazon",
        error
      );
    }
  }

  /**
   * Read title, price, SKU and URL of the product currently displayed.
   * The reported price is the listed price minus coupon and points.
   *
   * @returns {Promise<{title: string, price: string, sku: string, url: string}>}
   *   `price` is formatted with thousands separators (e.g. "11,111").
   * @throws {CrawlerError} SKU_INFO_GET_ERROR when loading fails or any of
   *   title, price or sku is missing.
   */
  async getSingleSkuInfo() {
    try {
      await this.page.waitForLoadState("networkidle");
      await this.waitForElement(this.selectors.title);

      // Coupon first: it opens and closes a modal, so it must finish before
      // the rest of the page is read.
      const couponValue = await this.handleCoupon();
      const pointValue = await this.handlePoint();

      // Only raw strings are read inside the page; parsing happens here so
      // the same rules as parsePrice() apply.
      const raw = await this.page.evaluate(
        (selectors) => ({
          title:
            document.querySelector(selectors.title)?.textContent.trim() || null,
          priceText:
            document.querySelector(selectors.price)?.textContent.trim() || null,
          url: window.location.href,
        }),
        this.selectors
      );

      const listPrice = await this.parsePrice(raw.priceText);
      const price =
        listPrice === null ? null : listPrice - couponValue - pointValue;

      const info = {
        title: raw.title,
        // Fixed locale keeps the output independent of the host environment.
        price: price ? price.toLocaleString("en-US") : null,
        sku: raw.url.match(/\/dp\/([A-Z0-9]{10})/)?.[1] || null,
        url: raw.url,
      };

      if (!info.title || !info.price || !info.sku) {
        throw new Error("商品信息不完整");
      }

      return info;
    } catch (error) {
      throw new CrawlerError(
        "获取SKU信息失败",
        "SKU_INFO_GET_ERROR",
        "amazon",
        error
      );
    }
  }

  /**
   * Take a full-page screenshot, upload it and return the uploaded URL.
   * The temporary file is removed whether or not the upload succeeds.
   *
   * @returns {Promise<string>} URL of the uploaded screenshot.
   * @throws {CrawlerError} When the directory cannot be created or the upload fails.
   */
  async captureScreenshot() {
    const dir = await this.createScreenshotDir();
    const shot = path.join(dir, `${Date.now()}.png`);

    try {
      await this.page.waitForLoadState("networkidle");
      await this.page.screenshot({
        path: shot,
        fullPage: true,
        timeout: this.timeouts.elementWait,
      });
      return await this.uploadImage(shot);
    } finally {
      try {
        await fsPromises.unlink(shot);
      } catch (error) {
        // ENOENT just means the screenshot was never written.
        if (error.code !== "ENOENT") {
          console.error("删除临时截图文件失败:", error);
        }
      }
    }
  }

  /**
   * Entry point: crawl one product page.
   *
   * @param {string} url - Product URL; everything from the first "?" on is dropped before navigation.
   * @param {boolean} [needScreenshot=false] - Also capture and upload a full-page screenshot.
   * @returns {Promise<Array<Object>>} Single-element array with the product info;
   *   each item gains `screenshotUrl` when a screenshot was uploaded.
   * @throws {CrawlerError} CRAWL_ERROR wrapping any failure other than screenshot handling.
   */
  async crawl(url, needScreenshot = false) {
    try {
      await this.initBrowser();
      await this.applyPageSettings();

      await this.navigateWithRetry(url.split("?")[0]);

      // Only the currently selected SKU is collected.
      const data = [await this.getSingleSkuInfo()];

      if (needScreenshot) {
        try {
          const imageUrl = await this.captureScreenshot();
          for (const item of data) {
            item.screenshotUrl = imageUrl;
          }
        } catch (error) {
          // A failed screenshot must not fail the crawl.
          console.error("截图处理失败:", error);
        }
      }

      return data;
    } catch (error) {
      throw new CrawlerError(
        "商品信息抓取失败",
        "CRAWL_ERROR",
        "amazon",
        error
      );
    } finally {
      await this.closeBrowser();
    }
  }
}
-
- module.exports = AmazonCrawler;
|