Переглянути джерело

feat: 完善商品信息爬虫服务功能

- 更新 package.json 和 package-lock.json,添加 axios、express-rate-limit、morgan 等依赖
- 修改入口文件为 src/server.js,优化启动脚本
- 新增环境配置和爬虫配置文件,支持不同环境的配置管理
- 实现请求限制和错误处理的中间件
- 创建爬虫工厂类,支持不同平台的爬虫实例化
- 实现亚马逊爬虫,支持商品信息抓取、价格解析和优惠券处理
- 更新 README 文档,增加平台参数说明和许可证信息
master
lizhuang 1 місяць тому
джерело
коміт
ab35d06d8b

+ 2
- 1
README.md Переглянути файл

@@ -76,6 +76,7 @@ npm start
- 参数:
- url: 商品URL(必填)
- needScreenshot: 是否需要截图(可选,默认 false)
- platform: 平台(可选,目前仅支持 amazon)

## 开发规范

@@ -102,4 +103,4 @@ npm test

## 许可证

MIT License
ISC License

+ 1875
- 23
package-lock.json
Різницю між файлами не показано, бо вона завелика
Переглянути файл


+ 32
- 6
package.json Переглянути файл

@@ -3,11 +3,17 @@
"version": "1.0.0",
"description": "商品信息爬虫服务",
"author": "lizhuang",
"main": "src/app.js",
"main": "src/server.js",
"scripts": {
"start": "node src/server.js",
"dev": "nodemon src/app.js",
"test": "jest"
"start": "cross-env NODE_ENV=production node src/server.js",
"dev": "cross-env NODE_ENV=development nodemon src/server.js",
"test": "jest",
"test:watch": "jest --watch",
"test:coverage": "jest --coverage",
"lint": "eslint src/**/*.js",
"lint:fix": "eslint src/**/*.js --fix",
"format": "prettier --write \"src/**/*.js\"",
"prepare": "husky install"
},
"keywords": [
"crawler",
@@ -16,21 +22,41 @@
],
"license": "ISC",
"dependencies": {
"axios": "^1.9.0",
"cors": "^2.8.5",
"dotenv": "^16.4.5",
"express": "^4.18.2",
"express-rate-limit": "^7.1.5",
"form-data": "^4.0.2",
"helmet": "^7.1.0",
"morgan": "^1.10.0",
"node-fetch": "^2.7.0",
"playwright": "^1.42.1",
"punycode": "^2.3.1",
"winston": "^3.11.0"
},
"devDependencies": {
"cross-env": "^7.0.3",
"eslint": "^8.57.0",
"husky": "^9.0.11",
"jest": "^29.7.0",
"nodemon": "^3.1.0"
"lint-staged": "^15.2.2",
"nodemon": "^3.1.0",
"prettier": "^3.2.5"
},
"engines": {
"node": ">=22.0.0"
},
"lint-staged": {
"*.js": [
"eslint --fix",
"prettier --write"
]
},
"jest": {
"testEnvironment": "node",
"coverageDirectory": "coverage",
"collectCoverageFrom": [
"src/**/*.js"
]
}
}

+ 73
- 0
src/config/crawler.config.js Переглянути файл

@@ -0,0 +1,73 @@
/**
* 爬虫配置
*/
module.exports = {
// 通用配置
common: {
screenshotDir: 'screenshots',
upload: {
url: 'https://apibase.sohomall.jp/uploaders',
scene: 'goods',
timeout: 30000
},
browser: {
headless: true,
args: ['--no-sandbox', '--disable-setuid-sandbox']
},
page: {
locale: 'ja-JP',
userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
},

// 亚马逊配置
amazon: {
selectors: {
title: '#productTitle',
price: 'span.a-price > span.a-offscreen',
coupon: '.a-declarative[data-action="a-modal"], .couponLabelText',
variants: '.a-cardui-body #twister-plus-inline-twister > .a-section'
},
timeouts: {
pageLoad: 60000,
elementWait: 10000,
networkIdle: 5000
},
retry: {
maxAttempts: 3,
delay: 2000
},
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Accept-Language': 'ja-JP,ja;q=0.9,en-US;q=0.8,en;q=0.7'
}
},

// 乐天配置(预留)
rakuten: {
selectors: {},
timeouts: {
pageLoad: 5000,
elementWait: 1000,
networkIdle: 500
},
retry: {
maxAttempts: 3,
delay: 1000
}
},

// 雅虎配置(预留)
yahoo: {
selectors: {},
timeouts: {
pageLoad: 5000,
elementWait: 1000,
networkIdle: 500
},
retry: {
maxAttempts: 3,
delay: 1000
}
}
};

+ 62
- 0
src/config/env.config.js Переглянути файл

@@ -0,0 +1,62 @@
/**
 * Environment configuration.
 *
 * Holds one settings object per environment (`development`, `production`) and
 * exports the one selected by `NODE_ENV`. Unknown environments fall back to
 * the development settings so that requiring this module never yields
 * `undefined`.
 */

/**
 * Parses a comma-separated list of allowed CORS origins.
 * Entries are trimmed and empty entries dropped; when nothing usable remains
 * (variable unset or blank) every origin is allowed.
 * @param {string|undefined} raw - Raw value of ALLOWED_ORIGINS
 * @returns {string[]|string} List of origins, or '*' when none are configured
 */
const parseAllowedOrigins = (raw) => {
  const origins = (raw ?? '')
    .split(',')
    .map((origin) => origin.trim())
    .filter(Boolean);
  return origins.length > 0 ? origins : '*';
};

const config = {
  // Development settings
  development: {
    port: 8991,
    host: '0.0.0.0',
    cors: {
      origin: '*',
      methods: ['GET', 'POST', 'PUT', 'DELETE', 'OPTIONS'],
      allowedHeaders: ['Content-Type', 'Authorization']
    },
    rateLimit: {
      windowMs: 15 * 60 * 1000, // 15 minutes
      max: 100 // at most 100 requests per IP per window
    },
    browser: {
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox']
    },
    logging: {
      level: 'debug',
      format: 'dev'
    }
  },

  // Production settings
  production: {
    port: process.env.PORT || 8991,
    host: '0.0.0.0',
    cors: {
      origin: parseAllowedOrigins(process.env.ALLOWED_ORIGINS),
      methods: ['GET', 'POST'],
      allowedHeaders: ['Content-Type', 'Authorization']
    },
    rateLimit: {
      windowMs: 15 * 60 * 1000,
      max: 50 // stricter limit in production
    },
    browser: {
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-accelerated-2d-canvas',
        '--disable-gpu'
      ]
    },
    logging: {
      level: 'info',
      format: 'combined'
    }
  }
};

// Name of the current environment; defaults to development when NODE_ENV is unset.
const env = process.env.NODE_ENV || 'development';

// Export the settings of the current environment. A NODE_ENV without an entry
// above (for example 'test' under a test runner) uses the development settings
// instead of exporting undefined.
module.exports = Object.hasOwn(config, env) ? config[env] : config.development;

+ 87
- 0
src/middlewares/crawlerMiddleware.js Переглянути файл

@@ -0,0 +1,87 @@
const rateLimit = require('express-rate-limit');
const CrawlerError = require('../services/errors/CrawlerError');

/**
 * Creates a rate-limiting middleware.
 *
 * Defaults: a 15 minute window and 100 requests per window. An explicit
 * `max: 0` is passed through unchanged instead of being replaced by the
 * default, and a `null` options argument is treated like an empty object.
 * NOTE(review): express-rate-limit v7 is understood to treat a limit of 0 as
 * "block every request" - confirm against the installed version.
 *
 * @param {Object} [options] - Limiter options
 * @param {number} [options.windowMs] - Window length in milliseconds
 * @param {number} [options.max] - Maximum number of requests per window
 * @returns {Function} Middleware function
 */
const createRateLimiter = (options = {}) => {
  const { windowMs, max } = options ?? {};
  return rateLimit({
    windowMs: windowMs || 15 * 60 * 1000, // 15 minutes
    max: max ?? 100, // only null/undefined fall back to the default
    message: {
      error: '请求过于频繁,请稍后再试',
      code: 'RATE_LIMIT_EXCEEDED'
    }
  });
};

/**
 * Express error-handling middleware.
 *
 * Logs the error, then answers with
 * - HTTP 400 and the error's message/code/platform/timestamp for a CrawlerError,
 * - HTTP 500 and a generic payload for anything else (the original message is
 *   only included when NODE_ENV is 'development').
 * Both payloads carry `success: false`. If the response has already started,
 * the error is handed to the next error handler instead of writing a second
 * response.
 *
 * @param {Error} err - Error object
 * @param {Request} req - Request object
 * @param {Response} res - Response object
 * @param {Function} next - Next middleware
 */
const errorHandler = (err, req, res, next) => {
  console.error('Error:', err);

  // Headers are already on the wire: status and body can no longer be set here.
  if (res.headersSent) {
    return next(err);
  }

  if (err instanceof CrawlerError) {
    return res.status(400).json({
      success: false,
      error: err.message,
      code: err.code,
      platform: err.platform,
      timestamp: err.timestamp
    });
  }

  // Any other kind of error
  res.status(500).json({
    success: false,
    error: '服务器内部错误',
    code: 'INTERNAL_SERVER_ERROR',
    message: process.env.NODE_ENV === 'development' ? err.message : undefined
  });
};

/**
 * Request validation middleware.
 *
 * Requires the `url` and `platform` query parameters. Both must be plain
 * strings (repeated or nested query parameters arrive as arrays/objects), and
 * `url` must parse as an absolute http(s) URL. Other schemes such as `file:`
 * are rejected because the URL is supplied by the client.
 * Responds with HTTP 400 and an error code on failure, otherwise calls `next()`.
 *
 * @param {Request} req - Request object
 * @param {Response} res - Response object
 * @param {Function} next - Next middleware
 */
const validateRequest = (req, res, next) => {
  const { url, platform } = req.query;

  if (!url) {
    return res.status(400).json({
      error: '缺少必要参数: url',
      code: 'MISSING_PARAMETER'
    });
  }

  if (!platform) {
    return res.status(400).json({
      error: '缺少必要参数: platform',
      code: 'MISSING_PARAMETER'
    });
  }

  if (typeof platform !== 'string') {
    return res.status(400).json({
      error: '无效的参数: platform',
      code: 'INVALID_PARAMETER'
    });
  }

  const invalidUrl = () => res.status(400).json({
    error: '无效的URL格式',
    code: 'INVALID_URL'
  });

  if (typeof url !== 'string') {
    return invalidUrl();
  }

  let parsedUrl;
  try {
    parsedUrl = new URL(url);
  } catch {
    return invalidUrl();
  }

  // Only web pages may be crawled; new URL() alone also accepts file:, data:, javascript:, ...
  if (parsedUrl.protocol !== 'http:' && parsedUrl.protocol !== 'https:') {
    return invalidUrl();
  }

  next();
};

module.exports = {
createRateLimiter,
errorHandler,
validateRequest
};

+ 51
- 0
src/routes/crawler.js Переглянути файл

@@ -0,0 +1,51 @@
const express = require('express');
const router = express.Router();
const CrawlerFactory = require('../services/crawlerFactory');
const { createRateLimiter, validateRequest } = require('../middlewares/crawlerMiddleware');
const config = require('../config/crawler.config');

// Length of the rate-limit window: 15 minutes.
const RATE_LIMIT_WINDOW_MS = 15 * 60 * 1000;

// Rate limiter for the product endpoint: at most 100 requests per IP per window.
const rateLimiter = createRateLimiter({
  windowMs: RATE_LIMIT_WINDOW_MS,
  max: 100
});

// Query-string flags are enabled only by the literal text "true".
const isEnabled = (flag) => flag === 'true';

/**
 * Fetches product information.
 * GET /api/product/info
 *
 * Creates a crawler for the requested platform, crawls the given URL and
 * responds with `{ success: true, data }`. Any failure is forwarded to the
 * error-handling middleware through `next`.
 */
async function handleProductInfo(req, res, next) {
  try {
    const { url, platform, needScreenshot, includeAllSkus } = req.query;
    // Create the crawler instance for this platform
    const crawler = CrawlerFactory.createCrawler(platform, config[platform]);
    // Crawl the product page
    const data = await crawler.crawl(
      url,
      isEnabled(needScreenshot),
      isEnabled(includeAllSkus)
    );
    res.json({ success: true, data });
  } catch (error) {
    next(error);
  }
}

/**
 * Lists the supported platforms.
 * GET /api/platforms
 */
function handlePlatforms(req, res) {
  const data = CrawlerFactory.getSupportedPlatforms();
  res.json({ success: true, data });
}

router.get('/product/info', rateLimiter, validateRequest, handleProductInfo);
router.get('/platforms', handlePlatforms);

module.exports = router;

+ 43
- 62
src/server.js Переглянути файл

@@ -1,8 +1,14 @@
// 加载环境变量
require('dotenv').config();

const express = require('express');
const path = require('path');
const cors = require('cors');
const helmet = require('helmet');
const crawlerService = require('./services/crawlerService');
const morgan = require('morgan');
const crawlerRoutes = require('./routes/crawler');
const { errorHandler } = require('./middlewares/crawlerMiddleware');
const envConfig = require('./config/env.config');

// 处理 punycode 弃用警告
process.removeAllListeners('warning');
@@ -21,11 +27,10 @@ app.use(helmet({
}));

// CORS配置
app.use(cors({
origin: '*', // 允许所有来源访问,生产环境建议设置具体的域名
methods: ['GET', 'POST', 'PUT', 'DELETE', 'OPTIONS'],
allowedHeaders: ['Content-Type', 'Authorization']
}));
app.use(cors(envConfig.cors));

// 日志中间件
app.use(morgan(envConfig.logging.format));

// 解析 JSON 请求体
app.use(express.json());
@@ -57,42 +62,16 @@ app.use((req, res, next) => {
next();
});

// 商品信息API路由
app.get('/api/product', async (req, res) => {
try {
const { url, needScreenshot = false, includeAllSkus = false } = req.query;
console.log('收到商品信息请求:', { url, needScreenshot, includeAllSkus });
if (!url) {
console.log('请求缺少URL参数');
return res.status(400).json({
success: false,
message: '请提供商品URL',
example: '/api/product?url=https://www.amazon.co.jp/dp/XXXXX'
});
}
// 路由
app.use('/api', crawlerRoutes);

console.log('开始爬取商品信息:', url);
const data = await crawlerService.crawlProductInfo(url, needScreenshot === 'true', includeAllSkus === 'true');
console.log('爬取完成:', data);
res.json({
success: true,
data
});
} catch (error) {
console.error('获取商品信息失败:', error);
res.status(500).json({
success: false,
message: '获取商品信息失败',
error: error.message
});
}
});

// 添加健康检查端点
// 健康检查端点
app.get('/health', (req, res) => {
res.json({ status: 'ok', timestamp: new Date().toISOString() });
res.json({
status: 'ok',
timestamp: new Date().toISOString(),
environment: process.env.NODE_ENV || 'development'
});
});

// 404 处理
@@ -105,30 +84,32 @@ app.use((req, res) => {
});

// 错误处理中间件
app.use((err, req, res, next) => {
console.error('服务器错误:', err);
res.status(500).json({
success: false,
message: '服务器内部错误',
error: process.env.NODE_ENV === 'development' ? err.message : undefined
});
});

// 设置端口
const PORT = process.env.PORT || 8991;
const HOST = '0.0.0.0'; // 监听所有网络接口
app.use(errorHandler);

// 启动服务器
app.listen(PORT, HOST, () => {
const server = app.listen(envConfig.port, envConfig.host, () => {
const localIP = getLocalIP();
console.log('服务器配置:');
console.log(`- 监听地址: ${HOST}:${PORT}`);
console.log(`- 本地访问: http://localhost:${PORT}`);
console.log(`- 局域网访问: http://${localIP}:${PORT}`);
console.log(`- 健康检查: http://${localIP}:${PORT}/health`);
console.log('\nAPI 使用示例:');
console.log('http://localhost:8991/api/product?url=https://www.amazon.co.jp/dp/XXXXX');
console.log('可选参数:');
console.log('- needScreenshot=true');
console.log('- includeAllSkus=true');
console.log(`- 环境: ${process.env.NODE_ENV || 'development'}`);
console.log(`- 监听地址: ${envConfig.host}:${envConfig.port}`);
console.log(`- 本地访问: http://localhost:${envConfig.port}`);
console.log(`- 局域网访问: http://${localIP}:${envConfig.port}`);
console.log(`- 健康检查: http://${localIP}:${envConfig.port}/health`);
});

// Graceful shutdown: on SIGTERM or SIGINT, close the HTTP server and exit with
// code 0 once the close callback runs.
const registerShutdown = (signal) => {
  process.on(signal, () => {
    console.log(`收到 ${signal} 信号,准备关闭服务器`);
    server.close(() => {
      console.log('服务器已关闭');
      process.exit(0);
    });
  });
};

registerShutdown('SIGTERM');
registerShutdown('SIGINT');

+ 43
- 0
src/services/crawlerFactory.js Переглянути файл

@@ -0,0 +1,43 @@
const AmazonCrawler = require('./crawlers/amazon/AmazonCrawler');
const CrawlerError = require('./errors/CrawlerError');

/**
 * Crawler factory.
 * Creates crawler instances for the supported platforms.
 */
class CrawlerFactory {
  /**
   * Creates a crawler instance.
   *
   * The platform name is matched case-insensitively. A value that is not a
   * string (missing, or an array/object produced by a repeated query
   * parameter) is reported as an unsupported platform instead of failing with
   * a TypeError. A null/undefined config is replaced by an empty object.
   *
   * @param {string} platform - Platform name
   * @param {Object} config - Configuration
   * @returns {BaseCrawler} Crawler instance
   * @throws {CrawlerError} Unsupported platform
   */
  static createCrawler(platform, config = {}) {
    const platformKey = typeof platform === 'string' ? platform.toLowerCase() : '';
    switch (platformKey) {
      case 'amazon':
        return new AmazonCrawler(config ?? {});
      // Crawlers for further platforms (e.g. rakuten, yahoo) are added as new cases here.
      default:
        throw new CrawlerError(
          `不支持的平台: ${String(platform)}`,
          'UNSUPPORTED_PLATFORM',
          platform
        );
    }
  }

  /**
   * Returns the list of supported platforms.
   * @returns {string[]} Platform names
   */
  static getSupportedPlatforms() {
    return ['amazon']; // extend together with the switch in createCrawler
  }
}

module.exports = CrawlerFactory;

+ 441
- 0
src/services/crawlers/amazon/AmazonCrawler.js Переглянути файл

@@ -0,0 +1,441 @@
const BaseCrawler = require('../base/BaseCrawler');
const CrawlerError = require('../../errors/CrawlerError');
const path = require('path');
const fs = require('fs');
const fsPromises = require('fs').promises;
const axios = require('axios');
const FormData = require('form-data');

/**
 * Amazon crawler.
 *
 * Opens a product page in the browser provided by BaseCrawler and extracts
 * title, price (minus a detected coupon amount), ASIN and URL, optionally for
 * every variant combination and optionally with an uploaded full-page
 * screenshot.
 */
class AmazonCrawler extends BaseCrawler {
  /**
   * @param {Object} [config] - Platform configuration. `selectors`, `timeouts`,
   *   `retry` and `common.upload` are read; anything missing falls back to the
   *   built-in defaults below.
   */
  constructor(config) {
    super(config);
    // Tolerate a missing config so `new AmazonCrawler()` works with defaults.
    const settings = config ?? {};
    // Built-in selectors; entries supplied via config.selectors override them.
    this.selectors = {
      title: '#productTitle',
      price: 'span.a-price > span.a-offscreen',
      coupon: '.a-declarative[data-action="a-modal"], .couponLabelText',
      variants: '.a-cardui-body #twister-plus-inline-twister > .a-section',
      ...settings.selectors
    };
    this.timeouts = settings.timeouts || {
      pageLoad: 60000, // page load timeout (ms)
      elementWait: 10000, // element wait timeout (ms)
      networkIdle: 5000 // network idle timeout (ms)
    };
    this.retryConfig = settings.retry || {
      maxAttempts: 3, // maximum number of navigation attempts
      delay: 2000 // delay between attempts (ms)
    };
    this.uploadConfig = settings.common?.upload || {
      url: 'https://apibase.sohomall.jp/uploaders',
      scene: 'goods',
      timeout: 30000
    };
  }

  /**
   * Creates the screenshot directory (`screenshots` under the working directory).
   * @returns {Promise<string>} Path of the screenshot directory
   * @throws {CrawlerError} SCREENSHOT_DIR_ERROR when the directory cannot be created
   */
  async createScreenshotDir() {
    const dir = path.join(process.cwd(), 'screenshots');
    try {
      await fsPromises.mkdir(dir, { recursive: true });
    } catch (error) {
      if (error.code !== 'EEXIST') {
        throw new CrawlerError('创建截图目录失败', 'SCREENSHOT_DIR_ERROR', 'amazon', error);
      }
    }
    return dir;
  }

  /**
   * Uploads an image file to the configured upload endpoint.
   * @param {string} imagePath - Path of the image file
   * @returns {Promise<string>} URL of the uploaded image (`response.data.url`)
   * @throws {CrawlerError} IMAGE_UPLOAD_ERROR on an HTTP error, a transport
   *   error or a response without `url`
   */
  async uploadImage(imagePath) {
    try {
      const formData = new FormData();
      formData.append('file', fs.createReadStream(imagePath));
      formData.append('scene', this.uploadConfig.scene);

      const response = await axios.post(this.uploadConfig.url, formData, {
        // getHeaders() already supplies the multipart content type including
        // its boundary; no separate boundary-less Content-Type is added.
        headers: formData.getHeaders(),
        timeout: this.uploadConfig.timeout
      });

      if (!response.data || !response.data.url) {
        throw new Error('上传响应格式错误');
      }

      return response.data.url;
    } catch (error) {
      if (error.response) {
        throw new CrawlerError(
          `图片上传失败: ${error.response.status} ${error.response.statusText}`,
          'IMAGE_UPLOAD_ERROR',
          'amazon',
          error
        );
      }
      throw new CrawlerError('图片上传失败', 'IMAGE_UPLOAD_ERROR', 'amazon', error);
    }
  }

  /**
   * Navigates to a URL, retrying on failure.
   * Between attempts it waits `retryConfig.delay` ms and restarts the browser.
   * @param {string} url - Target URL
   * @returns {Promise<void>}
   * @throws {CrawlerError} NAVIGATION_ERROR after `retryConfig.maxAttempts` failed attempts
   */
  async navigateWithRetry(url) {
    let lastError;
    for (let attempt = 1; attempt <= this.retryConfig.maxAttempts; attempt++) {
      try {
        await this.page.goto(url, {
          waitUntil: 'networkidle',
          timeout: this.timeouts.pageLoad
        });
        return;
      } catch (error) {
        lastError = error;
        console.log(`导航尝试 ${attempt}/${this.retryConfig.maxAttempts} 失败:`, error.message);
        if (attempt < this.retryConfig.maxAttempts) {
          console.log(`等待 ${this.retryConfig.delay}ms 后重试...`);
          await new Promise(resolve => setTimeout(resolve, this.retryConfig.delay));
          // Start over with a fresh browser
          await this.closeBrowser();
          await this.initBrowser();
        }
      }
    }
    throw new CrawlerError(
      `页面导航失败,已重试 ${this.retryConfig.maxAttempts} 次`,
      'NAVIGATION_ERROR',
      'amazon',
      lastError
    );
  }

  /**
   * Waits for an element to appear.
   * @param {string} selector - CSS selector
   * @returns {Promise<void>}
   * @throws {CrawlerError} ELEMENT_WAIT_ERROR when the wait fails or times out
   */
  async waitForElement(selector) {
    try {
      await this.page.waitForSelector(selector, {
        timeout: this.timeouts.elementWait
      });
    } catch (error) {
      throw new CrawlerError(
        `等待元素超时: ${selector}`,
        'ELEMENT_WAIT_ERROR',
        'amazon',
        error
      );
    }
  }

  /**
   * Parses a price text into an integer amount.
   * @param {string} priceText - Price text
   * @returns {Promise<number>} Parsed price, or null when the text is empty or has no digits
   * @throws {CrawlerError} PRICE_PARSE_ERROR when parsing throws
   */
  async parsePrice(priceText) {
    try {
      if (!priceText) return null;
      // Strip currency symbols and whitespace
      priceText = priceText.replace(/[¥JP¥\s]/g, '');
      // When point information follows, keep only the price part
      if (priceText.includes('ポイント')) {
        priceText = priceText.split('ポイント')[0].trim();
      }
      // Extract the digit group (with thousands separators)
      const match = priceText.match(/([\d,]+)/);
      if (!match) return null;
      // Always parse as decimal
      return parseInt(match[1].replace(/,/g, ''), 10);
    } catch (error) {
      throw new CrawlerError('价格解析失败', 'PRICE_PARSE_ERROR', 'amazon', error);
    }
  }

  /**
   * Detects a coupon on the page and reads its amount.
   * Best effort: every failure is logged and results in 0 rather than an error.
   * @returns {Promise<number>} Coupon amount, 0 when there is none or it cannot be read
   */
  async handleCoupon() {
    try {
      let couponValue = 0;
      // Look for the coupon element
      const couponTrigger = await this.page.$(this.selectors.coupon);
      if (!couponTrigger) {
        return 0; // no coupon
      }

      try {
        // Click the coupon trigger and give the modal time to render
        await couponTrigger.click();
        await this.page.waitForTimeout(1000);

        // Read the coupon label
        const couponText = await this.page.$eval('.couponLabelText', el => el.textContent.trim());
        // Parse the coupon amount
        const match = couponText.match(/¥\s*([\d,]+)/);
        if (match) {
          couponValue = parseInt(match[1].replace(/,/g, ''), 10);
        }

        // Try to close the modal
        try {
          await this.page.click('button.a-modal-close', { timeout: 2000 });
        } catch (closeError) {
          // No close button found: fall back to the Escape key
          await this.page.keyboard.press('Escape');
        }

        // Give the modal time to disappear
        await this.page.waitForTimeout(500);
      } catch (clickError) {
        console.log('优惠券点击或处理失败:', clickError.message);
        // Try Escape to dismiss a modal that may have opened
        try {
          await this.page.keyboard.press('Escape');
        } catch (escError) {
          console.log('ESC键关闭失败:', escError.message);
        }
      }

      return couponValue;
    } catch (error) {
      console.log('优惠券处理失败:', error.message);
      return 0; // report "no coupon" instead of throwing
    }
  }

  /**
   * Reads the product title.
   * @returns {Promise<string>} Product title
   * @throws {CrawlerError} TITLE_GET_ERROR when the title cannot be read
   */
  async getTitle() {
    try {
      return await this.page.$eval(this.selectors.title, el => el.textContent.trim());
    } catch (error) {
      throw new CrawlerError('获取标题失败', 'TITLE_GET_ERROR', 'amazon', error);
    }
  }

  /**
   * Reads the product SKU (the 10-character id after `/dp/` in the page URL).
   * @returns {Promise<string>} SKU, or null when the URL does not contain one
   * @throws {CrawlerError} SKU_GET_ERROR when the URL cannot be read
   */
  async getSku() {
    try {
      const url = this.page.url();
      return url.match(/\/dp\/([A-Z0-9]{10})/)?.[1] || null;
    } catch (error) {
      throw new CrawlerError('获取SKU失败', 'SKU_GET_ERROR', 'amazon', error);
    }
  }

  /**
   * Collects the variant buttons of the page.
   * @returns {Promise<Array>} One array of button handles per variant group;
   *   groups without buttons are omitted
   * @throws {CrawlerError} VARIANTS_GET_ERROR when the variant section cannot be read
   */
  async getVariants() {
    try {
      await this.page.waitForSelector(this.selectors.variants);
      const groupEls = await this.page.$$(this.selectors.variants);
      const groups = [];
      for (const groupEl of groupEls) {
        const btns = await groupEl.$$('.a-button-inner .a-button-input');
        if (btns.length) groups.push(btns);
      }
      return groups;
    } catch (error) {
      throw new CrawlerError('获取变体信息失败', 'VARIANTS_GET_ERROR', 'amazon', error);
    }
  }

  /**
   * Reads the information of the currently displayed SKU.
   * @returns {Promise<Object>} `{ title, price, sku, url, remark }`; `price`
   *   is the coupon-reduced amount formatted with `toLocaleString()`
   * @throws {CrawlerError} SKU_INFO_GET_ERROR when reading fails or title,
   *   price or SKU is missing
   */
  async getSingleSkuInfo() {
    try {
      // Wait until the page has finished loading
      await this.page.waitForLoadState('networkidle');
      // Wait for the title element
      await this.waitForElement(this.selectors.title);
      // Detect a coupon
      const couponValue = await this.handleCoupon();
      // Extract the product data inside the page
      const info = await this.page.evaluate(({ selectors, couponValue }) => {
        const title = document.querySelector(selectors.title)?.textContent.trim() || null;
        let priceText = document.querySelector(selectors.price)?.textContent.trim() || null;
        // When point information follows, keep only the price part
        if (priceText?.includes('ポイント')) {
          priceText = priceText.split('ポイント')[0].trim();
        }
        // Parse the price (decimal) and subtract the coupon
        const price = priceText
          ? parseInt(priceText.replace(/[¥JP¥\s,]/g, ''), 10) - couponValue
          : null;
        const url = window.location.href;
        const asin = url.match(/\/dp\/([A-Z0-9]{10})/)?.[1] || null;
        return {
          title,
          price: price ? price.toLocaleString() : null,
          sku: asin,
          url,
          remark: couponValue > 0 ? `Original Price: JP¥${priceText} Coupon Price: JP¥${couponValue}` : null
        };
      }, { selectors: this.selectors, couponValue });

      // Validate the required fields
      if (!info.title || !info.price || !info.sku) {
        throw new Error('商品信息不完整');
      }

      return info;
    } catch (error) {
      throw new CrawlerError('获取SKU信息失败', 'SKU_INFO_GET_ERROR', 'amazon', error);
    }
  }

  /**
   * Reads the information of every variant combination.
   * Clicks through the cartesian product of all variant groups and reads the
   * SKU information after each combination is selected.
   * @returns {Promise<Array>} One SKU info object per combination, each with a
   *   `variants` array of button labels
   * @throws {CrawlerError} ALL_SKU_INFO_GET_ERROR on any failure
   */
  async getAllSkuInfo() {
    try {
      const groups = await this.getVariants();
      if (!groups.length) return [await this.getSingleSkuInfo()];

      // Build the cartesian product of the groups
      const cartesian = (arr1, arr2) => arr1.flatMap(a => arr2.map(b => [...a, b]));
      let combos = groups[0].map(b => [b]);
      for (let i = 1; i < groups.length; i++) {
        combos = cartesian(combos, groups[i]);
      }

      const results = [];
      for (const combo of combos) {
        // Click the button of each dimension in turn
        for (const btn of combo) {
          await btn.click();
          await this.page.waitForLoadState('networkidle');
        }
        // Read the data of the selected combination
        const info = await this.getSingleSkuInfo();
        // getAttribute() returns a Promise, so it must be awaited before the
        // fallback to `title` can take effect.
        info.variants = await Promise.all(
          combo.map(async btn =>
            (await btn.getAttribute('aria-label')) || (await btn.getAttribute('title'))
          )
        );
        results.push(info);
      }
      return results;
    } catch (error) {
      throw new CrawlerError('获取所有SKU信息失败', 'ALL_SKU_INFO_GET_ERROR', 'amazon', error);
    }
  }

  /**
   * Main entry point: crawls the product information.
   * The query string of `url` is dropped before navigating. Screenshot
   * failures are logged and do not fail the crawl; the temporary screenshot
   * file is removed whether or not the upload succeeded. The browser is
   * always closed at the end.
   * @param {string} url - Product URL
   * @param {boolean} needScreenshot - Whether a screenshot is wanted
   * @param {boolean} includeAllSkus - Whether all SKUs are wanted
   * @returns {Promise<Array>} Array of product info objects
   * @throws {CrawlerError} CRAWL_ERROR wrapping the underlying failure
   */
  async crawl(url, needScreenshot = false, includeAllSkus = false) {
    try {
      await this.initBrowser();
      // Page timeouts
      this.page.setDefaultTimeout(this.timeouts.elementWait);
      this.page.setDefaultNavigationTimeout(this.timeouts.pageLoad);
      // Request interception
      await this.page.route('**/*', route => {
        const resourceType = route.request().resourceType();
        // Block fonts and media only; images are still loaded
        if (['font', 'media'].includes(resourceType)) {
          route.abort();
        } else {
          route.continue();
        }
      });

      // Navigate to the product page
      await this.navigateWithRetry(url.split('?')[0]);

      const data = includeAllSkus
        ? await this.getAllSkuInfo()
        : [await this.getSingleSkuInfo()];

      if (needScreenshot) {
        let shot = null;
        try {
          const dir = await this.createScreenshotDir();
          const filename = `${Date.now()}.png`;
          shot = path.join(dir, filename);
          // Wait until the page has finished loading
          await this.page.waitForLoadState('networkidle');
          // Capture the full page
          await this.page.screenshot({
            path: shot,
            fullPage: true,
            timeout: this.timeouts.elementWait
          });
          // Upload the image and attach its URL to every result
          const imageUrl = await this.uploadImage(shot);
          data.forEach(item => {
            item.screenshotUrl = imageUrl;
          });
        } catch (error) {
          console.error('截图处理失败:', error);
          // A failed screenshot does not fail the crawl
        } finally {
          // Remove the temporary file even when the capture or upload failed
          if (shot) {
            try {
              await fsPromises.unlink(shot);
            } catch (error) {
              // ENOENT: the screenshot was never written, nothing to clean up
              if (error.code !== 'ENOENT') {
                console.error('删除临时截图文件失败:', error);
              }
            }
          }
        }
      }

      return data;
    } catch (error) {
      throw new CrawlerError('商品信息抓取失败', 'CRAWL_ERROR', 'amazon', error);
    } finally {
      await this.closeBrowser();
    }
  }
}

module.exports = AmazonCrawler;

+ 107
- 0
src/services/crawlers/base/BaseCrawler.js Переглянути файл

@@ -0,0 +1,107 @@
const { chromium } = require('playwright');
const path = require('path');
const fs = require('fs').promises;

/**
 * Base crawler.
 * Holds the browser lifecycle shared by all crawlers and declares the
 * operations every platform crawler has to implement.
 */
class BaseCrawler {
  /**
   * @param {Object} config - Crawler configuration, stored as `this.config`
   */
  constructor(config) {
    Object.assign(this, {
      config,
      browser: null,
      context: null,
      page: null
    });
  }

  /**
   * Launches headless Chromium and opens one page in a new ja-JP context.
   * Sets `this.browser`, `this.context` and `this.page`.
   * @returns {Promise<void>}
   */
  async initBrowser() {
    const launchOptions = {
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox']
    };
    const contextOptions = {
      locale: 'ja-JP',
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    };

    this.browser = await chromium.launch(launchOptions);
    this.context = await this.browser.newContext(contextOptions);
    this.page = await this.context.newPage();
  }

  /**
   * Closes the browser, if one is open, and clears the browser, context and
   * page references.
   * @returns {Promise<void>}
   */
  async closeBrowser() {
    if (!this.browser) {
      return;
    }
    await this.browser.close();
    Object.assign(this, { browser: null, context: null, page: null });
  }

  /**
   * Ensures the `screenshots` directory exists under the working directory.
   * @returns {Promise<string>} Path of the screenshot directory
   */
  async createScreenshotDir() {
    const screenshotDir = path.join(process.cwd(), 'screenshots');
    await fs.mkdir(screenshotDir, { recursive: true });
    return screenshotDir;
  }

  /**
   * Crawls the product information. Must be overridden.
   * @param {string} url - Product URL
   * @returns {Promise<Object>} Product information
   */
  async crawl(url) {
    throw new Error('Method not implemented');
  }

  /**
   * Parses a price text. Must be overridden.
   * @param {string} priceText - Price text
   * @returns {Promise<number>} Parsed price
   */
  async parsePrice(priceText) {
    throw new Error('Method not implemented');
  }

  /**
   * Handles a coupon. Must be overridden.
   * @returns {Promise<number>} Coupon amount
   */
  async handleCoupon() {
    throw new Error('Method not implemented');
  }

  /**
   * Reads the product title. Must be overridden.
   * @returns {Promise<string>} Product title
   */
  async getTitle() {
    throw new Error('Method not implemented');
  }

  /**
   * Reads the product SKU. Must be overridden.
   * @returns {Promise<string>} Product SKU
   */
  async getSku() {
    throw new Error('Method not implemented');
  }

  /**
   * Reads the product variants. Must be overridden.
   * @returns {Promise<Array>} Variant information
   */
  async getVariants() {
    throw new Error('Method not implemented');
  }
}

module.exports = BaseCrawler;

+ 41
- 0
src/services/errors/CrawlerError.js Переглянути файл

@@ -0,0 +1,41 @@
/**
 * Error type raised by the crawlers.
 * Carries an error code, the platform name, the underlying error (if any) and
 * the ISO timestamp of its creation.
 * @extends Error
 */
class CrawlerError extends Error {
  /**
   * @param {string} message - Error message
   * @param {string} code - Error code
   * @param {string} platform - Platform name
   * @param {Error} [originalError] - Underlying error
   */
  constructor(message, code, platform, originalError = null) {
    super(message);
    Object.assign(this, {
      name: 'CrawlerError',
      code,
      platform,
      originalError,
      timestamp: new Date().toISOString()
    });
  }

  /**
   * Converts the error into a plain object.
   * The underlying error is reduced to its message and stack, or null when
   * there is none.
   * @returns {Object} Error information
   */
  toJSON() {
    const { name, message, code, platform, timestamp, stack, originalError } = this;

    let original = null;
    if (originalError) {
      original = {
        message: originalError.message,
        stack: originalError.stack
      };
    }

    return {
      name,
      message,
      code,
      platform,
      timestamp,
      stack,
      originalError: original
    };
  }
}

module.exports = CrawlerError;

Завантаження…
Відмінити
Зберегти