123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441 |
- const BaseCrawler = require('../base/BaseCrawler');
- const CrawlerError = require('../../errors/CrawlerError');
- const path = require('path');
- const fs = require('fs');
- const fsPromises = require('fs').promises;
- const axios = require('axios');
- const FormData = require('form-data');
-
/**
 * Amazon product-page crawler.
 *
 * Drives the browser page exposed as `this.page` (the calls used here match
 * the Playwright page API; `initBrowser` / `closeBrowser` and `this.page` are
 * presumably provided by BaseCrawler — confirm against the base class) to read
 * title, price, coupon and variant information, and can optionally take a
 * full-page screenshot and upload it.
 */
class AmazonCrawler extends BaseCrawler {
  /**
   * @param {Object} config - Crawler configuration; forwarded to BaseCrawler.
   * @param {Object} [config.timeouts] - Overrides for pageLoad / elementWait / networkIdle (ms).
   * @param {Object} [config.retry] - Overrides for maxAttempts / delay (ms).
   * @param {Object} [config.common.upload] - Overrides for upload url / scene / timeout (ms).
   */
  constructor(config) {
    super(config);

    this.selectors = {
      title: '#productTitle',
      price: 'span.a-price > span.a-offscreen',
      coupon: '.a-declarative[data-action="a-modal"], .couponLabelText',
      variants: '.a-cardui-body #twister-plus-inline-twister > .a-section',
    };

    // Defaults are merged with the supplied values so that a partial override
    // (e.g. only `pageLoad`) no longer leaves the other settings undefined.
    this.timeouts = {
      pageLoad: 60000, // page navigation timeout
      elementWait: 10000, // element wait timeout
      networkIdle: 5000, // network idle timeout
      ...config.timeouts,
    };

    this.retryConfig = {
      maxAttempts: 3, // maximum number of navigation attempts
      delay: 2000, // pause between attempts
      ...config.retry,
    };

    this.uploadConfig = {
      url: 'https://apibase.sohomall.jp/uploaders',
      scene: 'goods',
      timeout: 600000,
      ...config.common?.upload,
    };
  }

  /**
   * Ensure the `screenshots` directory exists under the current working directory.
   * @returns {Promise<string>} Absolute path of the screenshot directory.
   * @throws {CrawlerError} SCREENSHOT_DIR_ERROR when the directory cannot be created.
   */
  async createScreenshotDir() {
    const dir = path.join(process.cwd(), 'screenshots');
    try {
      await fsPromises.mkdir(dir, { recursive: true });
    } catch (error) {
      if (error.code !== 'EEXIST') {
        throw new CrawlerError('创建截图目录失败', 'SCREENSHOT_DIR_ERROR', 'amazon', error);
      }
    }
    return dir;
  }

  /**
   * Upload an image file to the configured upload endpoint as multipart form data.
   * @param {string} imagePath - Path of the image file on disk.
   * @returns {Promise<string>} The `url` field of the upload response.
   * @throws {CrawlerError} IMAGE_UPLOAD_ERROR on HTTP failure, transport failure
   *   or a response without a `url` field.
   */
  async uploadImage(imagePath) {
    const fileStream = fs.createReadStream(imagePath);
    try {
      const formData = new FormData();
      formData.append('file', fileStream);
      formData.append('scene', this.uploadConfig.scene);

      const response = await axios.post(this.uploadConfig.url, formData, {
        // getHeaders() already supplies `multipart/form-data; boundary=...`.
        // Overriding Content-Type by hand would strip the boundary and make
        // the body unparseable for the server.
        headers: formData.getHeaders(),
        timeout: this.uploadConfig.timeout,
      });

      if (!response.data || !response.data.url) {
        throw new Error('上传响应格式错误');
      }

      return response.data.url;
    } catch (error) {
      if (error.response) {
        throw new CrawlerError(
          `图片上传失败: ${error.response.status} ${error.response.statusText}`,
          'IMAGE_UPLOAD_ERROR',
          'amazon',
          error
        );
      }
      throw new CrawlerError('图片上传失败', 'IMAGE_UPLOAD_ERROR', 'amazon', error);
    } finally {
      // Release the file descriptor even when the request failed before the
      // stream was fully consumed; a no-op for an already finished stream.
      fileStream.destroy();
    }
  }

  /**
   * Navigate to a URL, retrying with a fresh browser on failure.
   * @param {string} url - Target URL.
   * @returns {Promise<void>}
   * @throws {CrawlerError} NAVIGATION_ERROR after all attempts have failed.
   */
  async navigateWithRetry(url) {
    let lastError;
    for (let attempt = 1; attempt <= this.retryConfig.maxAttempts; attempt++) {
      try {
        await this.page.goto(url, {
          waitUntil: 'networkidle',
          timeout: this.timeouts.pageLoad,
        });
        return;
      } catch (error) {
        lastError = error;
        console.log(`导航尝试 ${attempt}/${this.retryConfig.maxAttempts} 失败:`, error.message);

        if (attempt < this.retryConfig.maxAttempts) {
          console.log(`等待 ${this.retryConfig.delay}ms 后重试...`);
          await new Promise((resolve) => setTimeout(resolve, this.retryConfig.delay));

          // Restart the browser before the next attempt.
          // NOTE(review): presumably this replaces `this.page`; if so, the
          // default timeouts and the route handler installed in crawl() are
          // not re-applied to the new page — confirm against BaseCrawler.
          await this.closeBrowser();
          await this.initBrowser();
        }
      }
    }
    throw new CrawlerError(
      `页面导航失败,已重试 ${this.retryConfig.maxAttempts} 次`,
      'NAVIGATION_ERROR',
      'amazon',
      lastError
    );
  }

  /**
   * Wait until an element matching the selector is present.
   * @param {string} selector - CSS selector.
   * @returns {Promise<void>}
   * @throws {CrawlerError} ELEMENT_WAIT_ERROR when the wait fails or times out.
   */
  async waitForElement(selector) {
    try {
      await this.page.waitForSelector(selector, {
        timeout: this.timeouts.elementWait,
      });
    } catch (error) {
      throw new CrawlerError(
        `等待元素超时: ${selector}`,
        'ELEMENT_WAIT_ERROR',
        'amazon',
        error
      );
    }
  }

  /**
   * Parse a displayed price string into an integer amount.
   * @param {string} priceText - Raw price text, e.g. "JP¥1,234".
   * @returns {Promise<number|null>} The integer price, or null when the text is
   *   empty or contains no digits.
   * @throws {CrawlerError} PRICE_PARSE_ERROR when the input cannot be processed
   *   (e.g. it is not a string).
   */
  async parsePrice(priceText) {
    try {
      if (!priceText) return null;

      // Drop currency markers and whitespace.
      let cleaned = priceText.replace(/[¥JP¥\s]/g, '');

      // When points information follows the price, keep only the price part.
      if (cleaned.includes('ポイント')) {
        cleaned = cleaned.split('ポイント')[0].trim();
      }

      // First run of digits (with thousands separators). Requiring a leading
      // digit prevents a lone "," from matching and producing NaN.
      const match = cleaned.match(/(\d[\d,]*)/);
      if (!match) return null;

      return parseInt(match[1].replace(/,/g, ''), 10);
    } catch (error) {
      throw new CrawlerError('价格解析失败', 'PRICE_PARSE_ERROR', 'amazon', error);
    }
  }

  /**
   * Open the coupon element (if any), read the coupon amount and close the dialog.
   * Best effort: every failure is logged and reported as "no coupon".
   * @returns {Promise<number>} Coupon amount, or 0 when none was found or an error occurred.
   */
  async handleCoupon() {
    try {
      let couponValue = 0;

      // No coupon trigger on the page: nothing to do.
      const couponTrigger = await this.page.$(this.selectors.coupon);
      if (!couponTrigger) {
        return 0;
      }

      try {
        await couponTrigger.click();
        // Fixed pause so the dialog has time to render completely.
        await this.page.waitForTimeout(1000);

        const couponText = await this.page.$eval('.couponLabelText', (el) => el.textContent.trim());

        // Extract the amount that follows the yen sign.
        const match = couponText.match(/¥\s*([\d,]+)/);
        if (match) {
          couponValue = parseInt(match[1].replace(/,/g, ''), 10);
        }

        // Close the dialog; fall back to the Escape key when the close
        // button cannot be clicked.
        try {
          await this.page.click('button.a-modal-close', { timeout: 2000 });
        } catch (closeError) {
          await this.page.keyboard.press('Escape');
        }

        // Short pause to let the dialog disappear.
        await this.page.waitForTimeout(500);
      } catch (clickError) {
        console.log('优惠券点击或处理失败:', clickError.message);
        // A dialog may still be open; try to dismiss it with Escape.
        try {
          await this.page.keyboard.press('Escape');
        } catch (escError) {
          console.log('ESC键关闭失败:', escError.message);
        }
      }

      return couponValue;
    } catch (error) {
      console.log('优惠券处理失败:', error.message);
      return 0; // errors are reported as "no coupon" instead of being thrown
    }
  }

  /**
   * Read the product title from the page.
   * @returns {Promise<string>} Trimmed product title.
   * @throws {CrawlerError} TITLE_GET_ERROR when the title element cannot be read.
   */
  async getTitle() {
    try {
      return await this.page.$eval(this.selectors.title, (el) => el.textContent.trim());
    } catch (error) {
      throw new CrawlerError('获取标题失败', 'TITLE_GET_ERROR', 'amazon', error);
    }
  }

  /**
   * Extract the 10-character product identifier from the `/dp/<id>` part of the page URL.
   * @returns {Promise<string|null>} The identifier, or null when the URL has no such segment.
   * @throws {CrawlerError} SKU_GET_ERROR when the URL cannot be read.
   */
  async getSku() {
    try {
      const url = this.page.url();
      return url.match(/\/dp\/([A-Z0-9]{10})/)?.[1] || null;
    } catch (error) {
      throw new CrawlerError('获取SKU失败', 'SKU_GET_ERROR', 'amazon', error);
    }
  }

  /**
   * Collect the variant buttons, grouped by variant dimension.
   * @returns {Promise<Array<Array>>} One array of button handles per variant
   *   group that has at least one button; an empty array when no variant
   *   section appears before the wait times out.
   * @throws {CrawlerError} VARIANTS_GET_ERROR on any failure other than that timeout.
   */
  async getVariants() {
    try {
      try {
        await this.page.waitForSelector(this.selectors.variants);
      } catch (error) {
        // A timeout here means the product has no variant section. Returning
        // an empty list lets getAllSkuInfo() fall back to the single-SKU path
        // instead of failing the whole crawl.
        if (error?.name === 'TimeoutError') return [];
        throw error;
      }

      const groupEls = await this.page.$$(this.selectors.variants);
      const groups = [];

      for (const groupEl of groupEls) {
        const btns = await groupEl.$$('.a-button-inner .a-button-input');
        if (btns.length) groups.push(btns);
      }

      return groups;
    } catch (error) {
      throw new CrawlerError('获取变体信息失败', 'VARIANTS_GET_ERROR', 'amazon', error);
    }
  }

  /**
   * Read title, price, identifier and URL for the currently displayed product.
   * @returns {Promise<Object>} `{ title, price, sku, url, remark }` where `price`
   *   is the coupon-adjusted amount formatted with `toLocaleString()` and
   *   `remark` is set only when a coupon was applied.
   * @throws {CrawlerError} SKU_INFO_GET_ERROR when the page cannot be read or
   *   title, price or sku is missing.
   */
  async getSingleSkuInfo() {
    try {
      await this.page.waitForLoadState('networkidle');
      await this.waitForElement(this.selectors.title);

      const couponValue = await this.handleCoupon();

      // Runs inside the page, so only the serialisable arguments passed below
      // are available — instance methods such as parsePrice() are not.
      const info = await this.page.evaluate(({ selectors, couponValue }) => {
        const title = document.querySelector(selectors.title)?.textContent.trim() || null;
        let priceText = document.querySelector(selectors.price)?.textContent.trim() || null;

        // When points information follows the price, keep only the price part.
        if (priceText?.includes('ポイント')) {
          priceText = priceText.split('ポイント')[0].trim();
        }

        // Take the first run of digits after removing whitespace and thousands
        // separators, so any leading currency marker is tolerated instead of
        // turning the price into NaN.
        const digits = priceText ? priceText.replace(/[\s,]/g, '').match(/\d+/)?.[0] : null;
        const price = digits ? parseInt(digits, 10) - couponValue : null;
        const url = window.location.href;
        const asin = url.match(/\/dp\/([A-Z0-9]{10})/)?.[1] || null;

        return {
          title,
          price: price ? price.toLocaleString() : null,
          sku: asin,
          url,
          remark: couponValue > 0 ? `Original Price: JP¥${priceText} Coupon Price: JP¥${couponValue}` : null
        };
      }, { selectors: this.selectors, couponValue });

      // Title, price and sku are all mandatory.
      if (!info.title || !info.price || !info.sku) {
        throw new Error('商品信息不完整');
      }

      return info;
    } catch (error) {
      throw new CrawlerError('获取SKU信息失败', 'SKU_INFO_GET_ERROR', 'amazon', error);
    }
  }

  /**
   * Visit every combination of variant buttons and collect the product info for each.
   * @returns {Promise<Array<Object>>} One info object per combination (with an
   *   added `variants` array of button labels), or a single-element array when
   *   the product has no variants.
   * @throws {CrawlerError} ALL_SKU_INFO_GET_ERROR on any failure.
   */
  async getAllSkuInfo() {
    try {
      const groups = await this.getVariants();
      if (!groups.length) return [await this.getSingleSkuInfo()];

      // Cartesian product over the variant groups: one button per group.
      const cartesian = (arr1, arr2) => arr1.flatMap((a) => arr2.map((b) => [...a, b]));
      let combos = groups[0].map((b) => [b]);
      for (let i = 1; i < groups.length; i++) {
        combos = cartesian(combos, groups[i]);
      }

      const results = [];
      for (const combo of combos) {
        // Select the combination by clicking one button per dimension.
        // NOTE(review): the handles were collected before any click; if the
        // page re-renders the variant section they may go stale — verify.
        for (const btn of combo) {
          await btn.click();
          await this.page.waitForLoadState('networkidle');
        }

        const info = await this.getSingleSkuInfo();
        // getAttribute() is asynchronous, so the result must be awaited before
        // `||` can fall back to `title`; a bare Promise is always truthy.
        info.variants = await Promise.all(
          combo.map(async (btn) =>
            (await btn.getAttribute('aria-label')) || (await btn.getAttribute('title'))
          )
        );
        results.push(info);
      }

      return results;
    } catch (error) {
      throw new CrawlerError('获取所有SKU信息失败', 'ALL_SKU_INFO_GET_ERROR', 'amazon', error);
    }
  }

  /**
   * Entry point: crawl a product page.
   * @param {string} url - Product URL; everything from the first `?` on is dropped.
   * @param {boolean} [needScreenshot=false] - Take a full-page screenshot, upload
   *   it and attach the resulting URL to every item as `screenshotUrl`.
   * @param {boolean} [includeAllSkus=false] - Collect every variant combination
   *   instead of only the currently displayed product.
   * @returns {Promise<Array<Object>>} Product info objects.
   * @throws {CrawlerError} CRAWL_ERROR wrapping any failure. Screenshot failures
   *   are only logged and do not fail the crawl.
   */
  async crawl(url, needScreenshot = false, includeAllSkus = false) {
    try {
      await this.initBrowser();

      this.page.setDefaultTimeout(this.timeouts.elementWait);
      this.page.setDefaultNavigationTimeout(this.timeouts.pageLoad);

      // Block fonts and media only; images are still loaded.
      await this.page.route('**/*', (route) => {
        const resourceType = route.request().resourceType();
        if (['font', 'media'].includes(resourceType)) {
          route.abort();
        } else {
          route.continue();
        }
      });

      await this.navigateWithRetry(url.split('?')[0]);

      const data = includeAllSkus
        ? await this.getAllSkuInfo()
        : [await this.getSingleSkuInfo()];

      if (needScreenshot) {
        try {
          const dir = await this.createScreenshotDir();
          const filename = `${Date.now()}.png`;
          const shot = path.join(dir, filename);

          await this.page.waitForLoadState('networkidle');

          try {
            await this.page.screenshot({
              path: shot,
              fullPage: true,
              timeout: this.timeouts.elementWait,
            });

            const imageUrl = await this.uploadImage(shot);

            data.forEach((item) => {
              item.screenshotUrl = imageUrl;
            });
          } finally {
            // Always remove the temporary file, including when the upload
            // failed. A missing file (the screenshot itself failed) is not
            // worth reporting.
            try {
              await fsPromises.unlink(shot);
            } catch (error) {
              if (error.code !== 'ENOENT') {
                console.error('删除临时截图文件失败:', error);
              }
            }
          }
        } catch (error) {
          console.error('截图处理失败:', error);
          // A screenshot failure must not fail the crawl.
        }
      }

      return data;
    } catch (error) {
      throw new CrawlerError('商品信息抓取失败', 'CRAWL_ERROR', 'amazon', error);
    } finally {
      await this.closeBrowser();
    }
  }
}
-
- module.exports = AmazonCrawler;
|