- 更新 package.json 和 package-lock.json,添加 axios、express-rate-limit、morgan 等依赖 - 修改入口文件为 src/server.js,优化启动脚本 - 新增环境配置和爬虫配置文件,支持不同环境的配置管理 - 实现请求限制和错误处理的中间件 - 创建爬虫工厂类,支持不同平台的爬虫实例化 - 实现亚马逊爬虫,支持商品信息抓取、价格解析和优惠券处理 - 更新 README 文档,增加平台参数说明和许可证信息master
@@ -76,6 +76,7 @@ npm start | |||
- 参数: | |||
- url: 商品URL(必填) | |||
- needScreenshot: 是否需要截图(可选,默认 false) | |||
- platform: 平台(必填,目前仅支持 amazon) | |||
## 开发规范 | |||
@@ -102,4 +103,4 @@ npm test | |||
## 许可证 | |||
MIT License | |||
ISC License |
@@ -3,11 +3,17 @@ | |||
"version": "1.0.0", | |||
"description": "商品信息爬虫服务", | |||
"author": "lizhuang", | |||
"main": "src/app.js", | |||
"main": "src/server.js", | |||
"scripts": { | |||
"start": "node src/server.js", | |||
"dev": "nodemon src/app.js", | |||
"test": "jest" | |||
"start": "cross-env NODE_ENV=production node src/server.js", | |||
"dev": "cross-env NODE_ENV=development nodemon src/server.js", | |||
"test": "jest", | |||
"test:watch": "jest --watch", | |||
"test:coverage": "jest --coverage", | |||
"lint": "eslint src/**/*.js", | |||
"lint:fix": "eslint src/**/*.js --fix", | |||
"format": "prettier --write \"src/**/*.js\"", | |||
"prepare": "husky install" | |||
}, | |||
"keywords": [ | |||
"crawler", | |||
@@ -16,21 +22,41 @@ | |||
], | |||
"license": "ISC", | |||
"dependencies": { | |||
"axios": "^1.9.0", | |||
"cors": "^2.8.5", | |||
"dotenv": "^16.4.5", | |||
"express": "^4.18.2", | |||
"express-rate-limit": "^7.1.5", | |||
"form-data": "^4.0.2", | |||
"helmet": "^7.1.0", | |||
"morgan": "^1.10.0", | |||
"node-fetch": "^2.7.0", | |||
"playwright": "^1.42.1", | |||
"punycode": "^2.3.1", | |||
"winston": "^3.11.0" | |||
}, | |||
"devDependencies": { | |||
"cross-env": "^7.0.3", | |||
"eslint": "^8.57.0", | |||
"husky": "^9.0.11", | |||
"jest": "^29.7.0", | |||
"nodemon": "^3.1.0" | |||
"lint-staged": "^15.2.2", | |||
"nodemon": "^3.1.0", | |||
"prettier": "^3.2.5" | |||
}, | |||
"engines": { | |||
"node": ">=22.0.0" | |||
}, | |||
"lint-staged": { | |||
"*.js": [ | |||
"eslint --fix", | |||
"prettier --write" | |||
] | |||
}, | |||
"jest": { | |||
"testEnvironment": "node", | |||
"coverageDirectory": "coverage", | |||
"collectCoverageFrom": [ | |||
"src/**/*.js" | |||
] | |||
} | |||
} |
@@ -0,0 +1,73 @@ | |||
/** | |||
* 爬虫配置 | |||
*/ | |||
module.exports = { | |||
// 通用配置 | |||
common: { | |||
screenshotDir: 'screenshots', | |||
upload: { | |||
url: 'https://apibase.sohomall.jp/uploaders', | |||
scene: 'goods', | |||
timeout: 30000 | |||
}, | |||
browser: { | |||
headless: true, | |||
args: ['--no-sandbox', '--disable-setuid-sandbox'] | |||
}, | |||
page: { | |||
locale: 'ja-JP', | |||
userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' | |||
} | |||
}, | |||
// 亚马逊配置 | |||
amazon: { | |||
selectors: { | |||
title: '#productTitle', | |||
price: 'span.a-price > span.a-offscreen', | |||
coupon: '.a-declarative[data-action="a-modal"], .couponLabelText', | |||
variants: '.a-cardui-body #twister-plus-inline-twister > .a-section' | |||
}, | |||
timeouts: { | |||
pageLoad: 60000, | |||
elementWait: 10000, | |||
networkIdle: 5000 | |||
}, | |||
retry: { | |||
maxAttempts: 3, | |||
delay: 2000 | |||
}, | |||
headers: { | |||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', | |||
'Accept-Language': 'ja-JP,ja;q=0.9,en-US;q=0.8,en;q=0.7' | |||
} | |||
}, | |||
// 乐天配置(预留) | |||
rakuten: { | |||
selectors: {}, | |||
timeouts: { | |||
pageLoad: 5000, | |||
elementWait: 1000, | |||
networkIdle: 500 | |||
}, | |||
retry: { | |||
maxAttempts: 3, | |||
delay: 1000 | |||
} | |||
}, | |||
// 雅虎配置(预留) | |||
yahoo: { | |||
selectors: {}, | |||
timeouts: { | |||
pageLoad: 5000, | |||
elementWait: 1000, | |||
networkIdle: 500 | |||
}, | |||
retry: { | |||
maxAttempts: 3, | |||
delay: 1000 | |||
} | |||
} | |||
}; |
@@ -0,0 +1,62 @@ | |||
/** | |||
* 环境配置 | |||
*/ | |||
const config = { | |||
// 开发环境配置 | |||
development: { | |||
port: 8991, | |||
host: '0.0.0.0', | |||
cors: { | |||
origin: '*', | |||
methods: ['GET', 'POST', 'PUT', 'DELETE', 'OPTIONS'], | |||
allowedHeaders: ['Content-Type', 'Authorization'] | |||
}, | |||
rateLimit: { | |||
windowMs: 15 * 60 * 1000, // 15分钟 | |||
max: 100 // 限制每个IP 15分钟内最多100个请求 | |||
}, | |||
browser: { | |||
headless: true, | |||
args: ['--no-sandbox', '--disable-setuid-sandbox'] | |||
}, | |||
logging: { | |||
level: 'debug', | |||
format: 'dev' | |||
} | |||
}, | |||
// 生产环境配置 | |||
production: { | |||
port: process.env.PORT || 8991, | |||
host: '0.0.0.0', | |||
cors: { | |||
origin: process.env.ALLOWED_ORIGINS?.split(',') || '*', | |||
methods: ['GET', 'POST'], | |||
allowedHeaders: ['Content-Type', 'Authorization'] | |||
}, | |||
rateLimit: { | |||
windowMs: 15 * 60 * 1000, | |||
max: 50 // 生产环境限制更严格 | |||
}, | |||
browser: { | |||
headless: true, | |||
args: [ | |||
'--no-sandbox', | |||
'--disable-setuid-sandbox', | |||
'--disable-dev-shm-usage', | |||
'--disable-accelerated-2d-canvas', | |||
'--disable-gpu' | |||
] | |||
}, | |||
logging: { | |||
level: 'info', | |||
format: 'combined' | |||
} | |||
} | |||
}; | |||
// 获取当前环境 | |||
const env = process.env.NODE_ENV || 'development'; | |||
// 导出当前环境的配置 | |||
module.exports = config[env]; |
@@ -0,0 +1,87 @@ | |||
const rateLimit = require('express-rate-limit'); | |||
const CrawlerError = require('../services/errors/CrawlerError'); | |||
/**
 * Build a rate-limiting middleware with express-rate-limit.
 *
 * Missing options fall back to 100 requests per 15-minute window. Defaults are
 * applied only when an option is null/undefined, so an explicit 0 is passed
 * through instead of being silently replaced by the default.
 *
 * @param {Object} [options] - Limiter options
 * @param {number} [options.windowMs] - Window length in milliseconds
 * @param {number} [options.max] - Maximum requests per window
 * @returns {Function} Express middleware
 */
const createRateLimiter = (options = {}) => {
  const { windowMs, max } = options ?? {};

  return rateLimit({
    windowMs: windowMs ?? 15 * 60 * 1000, // 15 minutes
    max: max ?? 100,
    message: {
      error: '请求过于频繁,请稍后再试',
      code: 'RATE_LIMIT_EXCEEDED'
    }
  });
};
/**
 * Express error-handling middleware.
 *
 * CrawlerError instances are answered with HTTP 400 and their code, platform
 * and timestamp; every other error is answered with HTTP 500. The underlying
 * message of a non-crawler error is only included when NODE_ENV is
 * "development".
 *
 * @param {Error} err - Error passed down the middleware chain
 * @param {Request} req - Express request
 * @param {Response} res - Express response
 * @param {Function} next - Next middleware
 */
const errorHandler = (err, req, res, next) => {
  console.error('Error:', err);

  // A response that has already started cannot be replaced; hand the error to
  // Express' default handler so the connection is closed properly.
  if (res.headersSent) {
    return next(err);
  }

  if (err instanceof CrawlerError) {
    return res.status(400).json({
      success: false,
      error: err.message,
      code: err.code,
      platform: err.platform,
      timestamp: err.timestamp
    });
  }

  // Any other kind of error.
  return res.status(500).json({
    success: false,
    error: '服务器内部错误',
    code: 'INTERNAL_SERVER_ERROR',
    message: process.env.NODE_ENV === 'development' ? err.message : undefined
  });
};
/**
 * Request validation middleware for the crawl endpoint.
 *
 * Requires `url` and `platform` query parameters, both plain strings, and
 * requires `url` to be an absolute http(s) URL. The URL is later opened in a
 * browser, so other schemes (file:, javascript:, data:, ...) are rejected
 * here. Every failure is answered with HTTP 400 and a machine-readable code.
 *
 * @param {Request} req - Express request
 * @param {Response} res - Express response
 * @param {Function} next - Next middleware, called only when validation passes
 */
const validateRequest = (req, res, next) => {
  const { url, platform } = req.query;
  const reject = (error, code) => res.status(400).json({ success: false, error, code });

  if (!url) {
    return reject('缺少必要参数: url', 'MISSING_PARAMETER');
  }
  if (!platform) {
    return reject('缺少必要参数: platform', 'MISSING_PARAMETER');
  }

  // Repeated or nested query parameters arrive as arrays/objects.
  if (typeof url !== 'string') {
    return reject('参数格式错误: url', 'INVALID_PARAMETER');
  }
  if (typeof platform !== 'string') {
    return reject('参数格式错误: platform', 'INVALID_PARAMETER');
  }

  let parsed;
  try {
    parsed = new URL(url);
  } catch (err) {
    return reject('无效的URL格式', 'INVALID_URL');
  }

  // Only web pages may be crawled.
  if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
    return reject('无效的URL格式', 'INVALID_URL');
  }

  return next();
};
module.exports = { | |||
createRateLimiter, | |||
errorHandler, | |||
validateRequest | |||
}; |
@@ -0,0 +1,51 @@ | |||
const express = require('express'); | |||
const router = express.Router(); | |||
const CrawlerFactory = require('../services/crawlerFactory'); | |||
const { createRateLimiter, validateRequest } = require('../middlewares/crawlerMiddleware'); | |||
const config = require('../config/crawler.config'); | |||
// Rate limiter for the crawl endpoint: at most 100 requests per IP per 15 minutes.
const rateLimiter = createRateLimiter({
  windowMs: 15 * 60 * 1000,
  max: 100
});

/**
 * Fetch product information.
 * GET /product/info on this router.
 *
 * Query parameters: url, platform, needScreenshot ("true" to enable),
 * includeAllSkus ("true" to enable). Errors are forwarded to the error
 * middleware through next().
 */
router.get('/product/info', rateLimiter, validateRequest, async (req, res, next) => {
  try {
    const { url, platform, needScreenshot, includeAllSkus } = req.query;

    // The factory matches the platform case-insensitively, so the config
    // lookup uses the lower-cased name too. "common" is the shared section,
    // not a platform, and an own-property check keeps inherited keys such as
    // "constructor" from being treated as config.
    const platformName = String(platform);
    const platformKey = platformName.toLowerCase();
    const platformConfig =
      platformKey !== 'common' && Object.hasOwn(config, platformKey) ? config[platformKey] : {};

    // Create the crawler; the shared section is passed along because the
    // crawler reads its upload settings from `config.common`.
    const crawler = CrawlerFactory.createCrawler(platformName, {
      ...platformConfig,
      common: config.common
    });

    // Crawl the product page.
    const data = await crawler.crawl(
      url,
      needScreenshot === 'true',
      includeAllSkus === 'true'
    );

    res.json({
      success: true,
      data
    });
  } catch (error) {
    next(error);
  }
});

/**
 * List the supported platforms.
 * GET /platforms on this router.
 */
router.get('/platforms', (req, res) => {
  res.json({
    success: true,
    data: CrawlerFactory.getSupportedPlatforms()
  });
});
module.exports = router; |
@@ -1,8 +1,14 @@ | |||
// 加载环境变量 | |||
require('dotenv').config(); | |||
const express = require('express'); | |||
const path = require('path'); | |||
const cors = require('cors'); | |||
const helmet = require('helmet'); | |||
const crawlerService = require('./services/crawlerService'); | |||
const morgan = require('morgan'); | |||
const crawlerRoutes = require('./routes/crawler'); | |||
const { errorHandler } = require('./middlewares/crawlerMiddleware'); | |||
const envConfig = require('./config/env.config'); | |||
// 处理 punycode 弃用警告 | |||
process.removeAllListeners('warning'); | |||
@@ -21,11 +27,10 @@ app.use(helmet({ | |||
})); | |||
// CORS配置 | |||
app.use(cors({ | |||
origin: '*', // 允许所有来源访问,生产环境建议设置具体的域名 | |||
methods: ['GET', 'POST', 'PUT', 'DELETE', 'OPTIONS'], | |||
allowedHeaders: ['Content-Type', 'Authorization'] | |||
})); | |||
app.use(cors(envConfig.cors)); | |||
// 日志中间件 | |||
app.use(morgan(envConfig.logging.format)); | |||
// 解析 JSON 请求体 | |||
app.use(express.json()); | |||
@@ -57,42 +62,16 @@ app.use((req, res, next) => { | |||
next(); | |||
}); | |||
// 商品信息API路由 | |||
app.get('/api/product', async (req, res) => { | |||
try { | |||
const { url, needScreenshot = false, includeAllSkus = false } = req.query; | |||
console.log('收到商品信息请求:', { url, needScreenshot, includeAllSkus }); | |||
if (!url) { | |||
console.log('请求缺少URL参数'); | |||
return res.status(400).json({ | |||
success: false, | |||
message: '请提供商品URL', | |||
example: '/api/product?url=https://www.amazon.co.jp/dp/XXXXX' | |||
}); | |||
} | |||
// 路由 | |||
app.use('/api', crawlerRoutes); | |||
console.log('开始爬取商品信息:', url); | |||
const data = await crawlerService.crawlProductInfo(url, needScreenshot === 'true', includeAllSkus === 'true'); | |||
console.log('爬取完成:', data); | |||
res.json({ | |||
success: true, | |||
data | |||
}); | |||
} catch (error) { | |||
console.error('获取商品信息失败:', error); | |||
res.status(500).json({ | |||
success: false, | |||
message: '获取商品信息失败', | |||
error: error.message | |||
}); | |||
} | |||
}); | |||
// 添加健康检查端点 | |||
// 健康检查端点 | |||
app.get('/health', (req, res) => { | |||
res.json({ status: 'ok', timestamp: new Date().toISOString() }); | |||
res.json({ | |||
status: 'ok', | |||
timestamp: new Date().toISOString(), | |||
environment: process.env.NODE_ENV || 'development' | |||
}); | |||
}); | |||
// 404 处理 | |||
@@ -105,30 +84,32 @@ app.use((req, res) => { | |||
}); | |||
// 错误处理中间件 | |||
app.use((err, req, res, next) => { | |||
console.error('服务器错误:', err); | |||
res.status(500).json({ | |||
success: false, | |||
message: '服务器内部错误', | |||
error: process.env.NODE_ENV === 'development' ? err.message : undefined | |||
}); | |||
}); | |||
// 设置端口 | |||
const PORT = process.env.PORT || 8991; | |||
const HOST = '0.0.0.0'; // 监听所有网络接口 | |||
app.use(errorHandler); | |||
// 启动服务器 | |||
app.listen(PORT, HOST, () => { | |||
const server = app.listen(envConfig.port, envConfig.host, () => { | |||
const localIP = getLocalIP(); | |||
console.log('服务器配置:'); | |||
console.log(`- 监听地址: ${HOST}:${PORT}`); | |||
console.log(`- 本地访问: http://localhost:${PORT}`); | |||
console.log(`- 局域网访问: http://${localIP}:${PORT}`); | |||
console.log(`- 健康检查: http://${localIP}:${PORT}/health`); | |||
console.log('\nAPI 使用示例:'); | |||
console.log('http://localhost:8991/api/product?url=https://www.amazon.co.jp/dp/XXXXX'); | |||
console.log('可选参数:'); | |||
console.log('- needScreenshot=true'); | |||
console.log('- includeAllSkus=true'); | |||
console.log(`- 环境: ${process.env.NODE_ENV || 'development'}`); | |||
console.log(`- 监听地址: ${envConfig.host}:${envConfig.port}`); | |||
console.log(`- 本地访问: http://localhost:${envConfig.port}`); | |||
console.log(`- 局域网访问: http://${localIP}:${envConfig.port}`); | |||
console.log(`- 健康检查: http://${localIP}:${envConfig.port}/health`); | |||
}); | |||
// Graceful shutdown: on SIGTERM or SIGINT stop accepting connections, then
// exit with status 0 once the HTTP server reports it has closed.
for (const signal of ['SIGTERM', 'SIGINT']) {
  process.on(signal, () => {
    console.log(`收到 ${signal} 信号,准备关闭服务器`);
    server.close(() => {
      console.log('服务器已关闭');
      process.exit(0);
    });
  });
}
@@ -0,0 +1,43 @@ | |||
const AmazonCrawler = require('./crawlers/amazon/AmazonCrawler'); | |||
const CrawlerError = require('./errors/CrawlerError'); | |||
// Registry of supported platforms. Each entry builds the crawler lazily, so
// adding a platform here updates both createCrawler and getSupportedPlatforms.
const CRAWLER_BUILDERS = Object.freeze({
  amazon: (config) => new AmazonCrawler(config)
  // Further platforms can be registered here, e.g.
  // rakuten: (config) => new RakutenCrawler(config),
  // yahoo: (config) => new YahooCrawler(config),
});

/**
 * Crawler factory.
 * Creates the crawler instance for a given platform.
 */
class CrawlerFactory {
  /**
   * Create a crawler instance.
   * @param {string} platform - Platform name (matched case-insensitively)
   * @param {Object} config - Crawler configuration
   * @returns {BaseCrawler} Crawler instance
   * @throws {CrawlerError} UNSUPPORTED_PLATFORM when the platform is unknown
   *   or is not a string
   */
  static createCrawler(platform, config = {}) {
    // A non-string platform (e.g. an array from a repeated query parameter)
    // is reported as unsupported rather than crashing on toLowerCase().
    const key = typeof platform === 'string' ? platform.toLowerCase() : null;

    if (key === null || !Object.hasOwn(CRAWLER_BUILDERS, key)) {
      throw new CrawlerError(
        `不支持的平台: ${String(platform)}`,
        'UNSUPPORTED_PLATFORM',
        platform
      );
    }

    return CRAWLER_BUILDERS[key](config);
  }

  /**
   * List the supported platforms.
   * @returns {string[]} Platform names
   */
  static getSupportedPlatforms() {
    return Object.keys(CRAWLER_BUILDERS);
  }
}
module.exports = CrawlerFactory; |
@@ -0,0 +1,441 @@ | |||
const BaseCrawler = require('../base/BaseCrawler'); | |||
const CrawlerError = require('../../errors/CrawlerError'); | |||
const path = require('path'); | |||
const fs = require('fs'); | |||
const fsPromises = require('fs').promises; | |||
const axios = require('axios'); | |||
const FormData = require('form-data'); | |||
/**
 * Amazon crawler.
 *
 * Drives a Playwright page (created by BaseCrawler.initBrowser) to read the
 * title, price, ASIN and coupon of a product page, optionally for every
 * variant combination, and can upload a full-page screenshot.
 */
class AmazonCrawler extends BaseCrawler {
  /**
   * @param {Object} [config] - Optional overrides: `selectors`, `timeouts`,
   *   `retry` and `common.upload`. Missing keys fall back to built-in defaults.
   */
  constructor(config) {
    const settings = config ?? {};
    super(settings);

    this.selectors = {
      title: '#productTitle',
      price: 'span.a-price > span.a-offscreen',
      coupon: '.a-declarative[data-action="a-modal"], .couponLabelText',
      variants: '.a-cardui-body #twister-plus-inline-twister > .a-section',
      ...settings.selectors
    };
    this.timeouts = {
      pageLoad: 60000, // navigation timeout (ms)
      elementWait: 10000, // element wait timeout (ms)
      networkIdle: 5000, // network idle timeout (ms)
      ...settings.timeouts
    };
    this.retryConfig = {
      maxAttempts: 3, // maximum navigation attempts
      delay: 2000, // delay between attempts (ms)
      ...settings.retry
    };
    this.uploadConfig = {
      url: 'https://apibase.sohomall.jp/uploaders',
      scene: 'goods',
      timeout: 30000,
      ...settings.common?.upload
    };
  }

  /**
   * Create the screenshot directory under the current working directory.
   * @returns {Promise<string>} Directory path
   * @throws {CrawlerError} SCREENSHOT_DIR_ERROR when the directory cannot be created
   */
  async createScreenshotDir() {
    const dir = path.join(process.cwd(), 'screenshots');
    try {
      await fsPromises.mkdir(dir, { recursive: true });
    } catch (error) {
      if (error.code !== 'EEXIST') {
        throw new CrawlerError('创建截图目录失败', 'SCREENSHOT_DIR_ERROR', 'amazon', error);
      }
    }
    return dir;
  }

  /**
   * Upload an image file to the configured upload endpoint.
   * @param {string} imagePath - Path of the image file
   * @returns {Promise<string>} URL reported by the upload endpoint
   * @throws {CrawlerError} IMAGE_UPLOAD_ERROR on any failure
   */
  async uploadImage(imagePath) {
    try {
      const formData = new FormData();
      formData.append('file', fs.createReadStream(imagePath));
      formData.append('scene', this.uploadConfig.scene);

      const response = await axios.post(this.uploadConfig.url, formData, {
        // Use the form's own headers untouched: they carry the multipart
        // boundary, which a hand-written Content-Type would drop.
        headers: formData.getHeaders(),
        timeout: this.uploadConfig.timeout
      });

      if (!response.data || !response.data.url) {
        throw new Error('上传响应格式错误');
      }
      return response.data.url;
    } catch (error) {
      if (error.response) {
        throw new CrawlerError(
          `图片上传失败: ${error.response.status} ${error.response.statusText}`,
          'IMAGE_UPLOAD_ERROR',
          'amazon',
          error
        );
      }
      throw new CrawlerError('图片上传失败', 'IMAGE_UPLOAD_ERROR', 'amazon', error);
    }
  }

  /**
   * Apply the page-level settings every crawl relies on: default timeouts and
   * a request filter that blocks fonts and media (images are still loaded).
   * Must be called again whenever the browser is re-created.
   * @returns {Promise<void>}
   */
  async configurePage() {
    this.page.setDefaultTimeout(this.timeouts.elementWait);
    this.page.setDefaultNavigationTimeout(this.timeouts.pageLoad);

    await this.page.route('**/*', (route) => {
      const resourceType = route.request().resourceType();
      return ['font', 'media'].includes(resourceType) ? route.abort() : route.continue();
    });
  }

  /**
   * Navigate to a URL, retrying with a fresh browser on failure.
   * @param {string} url - Target URL
   * @returns {Promise<void>}
   * @throws {CrawlerError} NAVIGATION_ERROR after the last failed attempt
   */
  async navigateWithRetry(url) {
    const { maxAttempts, delay } = this.retryConfig;
    let lastError;

    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
      try {
        await this.page.goto(url, {
          waitUntil: 'networkidle',
          timeout: this.timeouts.pageLoad
        });
        return;
      } catch (error) {
        lastError = error;
        console.log(`导航尝试 ${attempt}/${maxAttempts} 失败:`, error.message);

        if (attempt < maxAttempts) {
          console.log(`等待 ${delay}ms 后重试...`);
          await new Promise((resolve) => setTimeout(resolve, delay));

          // Start over with a fresh browser; the new page has none of the
          // previous page's timeouts or routes, so apply them again.
          await this.closeBrowser();
          await this.initBrowser();
          await this.configurePage();
        }
      }
    }

    throw new CrawlerError(
      `页面导航失败,已重试 ${maxAttempts} 次`,
      'NAVIGATION_ERROR',
      'amazon',
      lastError
    );
  }

  /**
   * Wait for an element to appear.
   * @param {string} selector - CSS selector
   * @returns {Promise<void>}
   * @throws {CrawlerError} ELEMENT_WAIT_ERROR when the wait fails
   */
  async waitForElement(selector) {
    try {
      await this.page.waitForSelector(selector, {
        timeout: this.timeouts.elementWait
      });
    } catch (error) {
      throw new CrawlerError(
        `等待元素超时: ${selector}`,
        'ELEMENT_WAIT_ERROR',
        'amazon',
        error
      );
    }
  }

  /**
   * Parse a price string such as "¥1,234" into an integer.
   * @param {string} priceText - Price text
   * @returns {Promise<number|null>} Price, or null when no number is found
   * @throws {CrawlerError} PRICE_PARSE_ERROR on unexpected input
   */
  async parsePrice(priceText) {
    try {
      if (!priceText) return null;

      // Strip currency symbols and whitespace.
      let text = priceText.replace(/[¥JP¥\s]/g, '');

      // When point information follows the price, keep only the price part.
      if (text.includes('ポイント')) {
        text = text.split('ポイント')[0].trim();
      }

      const match = text.match(/([\d,]+)/);
      if (!match) return null;

      const value = Number.parseInt(match[1].replace(/,/g, ''), 10);
      return Number.isNaN(value) ? null : value;
    } catch (error) {
      throw new CrawlerError('价格解析失败', 'PRICE_PARSE_ERROR', 'amazon', error);
    }
  }

  /**
   * Open the coupon pop-up, if the page has one, and read the coupon amount.
   * Best effort: every failure is logged and reported as 0.
   * @returns {Promise<number>} Coupon amount, 0 when none was found
   */
  async handleCoupon() {
    try {
      let couponValue = 0;

      const couponTrigger = await this.page.$(this.selectors.coupon);
      if (!couponTrigger) {
        return 0; // no coupon on this page
      }

      try {
        await couponTrigger.click();
        // Give the pop-up time to render completely.
        await this.page.waitForTimeout(1000);

        const couponText = await this.page.$eval('.couponLabelText', (el) => el.textContent.trim());

        const match = couponText.match(/¥\s*([\d,]+)/);
        if (match) {
          couponValue = Number.parseInt(match[1].replace(/,/g, ''), 10);
        }

        // Close the pop-up; fall back to Escape when there is no close button.
        try {
          await this.page.click('button.a-modal-close', { timeout: 2000 });
        } catch (closeError) {
          await this.page.keyboard.press('Escape');
        }

        // Let the pop-up disappear.
        await this.page.waitForTimeout(500);
      } catch (clickError) {
        console.log('优惠券点击或处理失败:', clickError.message);
        // Try to dismiss a pop-up that may have been left open.
        try {
          await this.page.keyboard.press('Escape');
        } catch (escError) {
          console.log('ESC键关闭失败:', escError.message);
        }
      }

      return couponValue;
    } catch (error) {
      console.log('优惠券处理失败:', error.message);
      return 0; // report "no coupon" instead of failing the crawl
    }
  }

  /**
   * Read the product title.
   * @returns {Promise<string>} Product title
   * @throws {CrawlerError} TITLE_GET_ERROR
   */
  async getTitle() {
    try {
      return await this.page.$eval(this.selectors.title, (el) => el.textContent.trim());
    } catch (error) {
      throw new CrawlerError('获取标题失败', 'TITLE_GET_ERROR', 'amazon', error);
    }
  }

  /**
   * Extract the ASIN from the current page URL (the 10-character id after /dp/).
   * @returns {Promise<string|null>} ASIN, or null when the URL has none
   * @throws {CrawlerError} SKU_GET_ERROR
   */
  async getSku() {
    try {
      const url = this.page.url();
      return url.match(/\/dp\/([A-Z0-9]{10})/)?.[1] || null;
    } catch (error) {
      throw new CrawlerError('获取SKU失败', 'SKU_GET_ERROR', 'amazon', error);
    }
  }

  /**
   * Collect the variant buttons, grouped by variant dimension.
   * @returns {Promise<Array>} One array of button handles per dimension;
   *   empty when the variant container never appears
   * @throws {CrawlerError} VARIANTS_GET_ERROR on any other failure
   */
  async getVariants() {
    try {
      try {
        await this.page.waitForSelector(this.selectors.variants);
      } catch (waitError) {
        // A timeout means the product has no variant container; report
        // "no variants" so callers can fall back to the single-SKU path.
        if (waitError?.name === 'TimeoutError') {
          return [];
        }
        throw waitError;
      }

      const groupEls = await this.page.$$(this.selectors.variants);
      const groups = [];
      for (const groupEl of groupEls) {
        const btns = await groupEl.$$('.a-button-inner .a-button-input');
        if (btns.length) groups.push(btns);
      }
      return groups;
    } catch (error) {
      throw new CrawlerError('获取变体信息失败', 'VARIANTS_GET_ERROR', 'amazon', error);
    }
  }

  /**
   * Read title, price, ASIN and URL of the variant currently shown.
   * The price has the coupon amount subtracted and is formatted with
   * toLocaleString() in the page.
   * @returns {Promise<Object>} { title, price, sku, url, remark }
   * @throws {CrawlerError} SKU_INFO_GET_ERROR when anything fails or a
   *   required field is missing
   */
  async getSingleSkuInfo() {
    try {
      await this.page.waitForLoadState('networkidle');
      await this.waitForElement(this.selectors.title);

      const couponValue = await this.handleCoupon();

      // Runs inside the page, so it may only use the values passed in.
      const info = await this.page.evaluate(({ selectors, couponValue }) => {
        const title = document.querySelector(selectors.title)?.textContent.trim() || null;
        let priceText = document.querySelector(selectors.price)?.textContent.trim() || null;

        // Drop trailing point information.
        if (priceText?.includes('ポイント')) {
          priceText = priceText.split('ポイント')[0].trim();
        }

        const price = priceText
          ? Number.parseInt(priceText.replace(/[¥JP¥\s,]/g, ''), 10) - couponValue
          : null;
        const url = window.location.href;
        const asin = url.match(/\/dp\/([A-Z0-9]{10})/)?.[1] || null;

        return {
          title,
          price: price ? price.toLocaleString() : null,
          sku: asin,
          url,
          remark: couponValue > 0 ? `Original Price: JP¥${priceText} Coupon Price: JP¥${couponValue}` : null
        };
      }, { selectors: this.selectors, couponValue });

      if (!info.title || !info.price || !info.sku) {
        throw new Error('商品信息不完整');
      }

      return info;
    } catch (error) {
      throw new CrawlerError('获取SKU信息失败', 'SKU_INFO_GET_ERROR', 'amazon', error);
    }
  }

  /**
   * Read the information of every variant combination by clicking through
   * the cartesian product of the variant buttons.
   * @returns {Promise<Array>} One info object per combination, each with a
   *   `variants` array of button labels
   * @throws {CrawlerError} ALL_SKU_INFO_GET_ERROR
   */
  async getAllSkuInfo() {
    try {
      const groups = await this.getVariants();
      if (!groups.length) return [await this.getSingleSkuInfo()];

      // Cartesian product of the button groups.
      const cartesian = (arr1, arr2) => arr1.flatMap((a) => arr2.map((b) => [...a, b]));
      let combos = groups[0].map((b) => [b]);
      for (let i = 1; i < groups.length; i++) {
        combos = cartesian(combos, groups[i]);
      }

      // getAttribute() returns a promise, so it must be awaited before the
      // fallback to `title` can take effect.
      const readLabel = async (btn) =>
        (await btn.getAttribute('aria-label')) || (await btn.getAttribute('title'));

      // NOTE(review): the handles were collected before any click; if a click
      // re-renders the variant area they may no longer be attached. Confirm
      // against the live page.
      const results = [];
      for (const combo of combos) {
        // Click the button of each dimension in turn.
        for (const btn of combo) {
          await btn.click();
          await this.page.waitForLoadState('networkidle');
        }

        const info = await this.getSingleSkuInfo();
        info.variants = await Promise.all(combo.map(readLabel));
        results.push(info);
      }

      return results;
    } catch (error) {
      throw new CrawlerError('获取所有SKU信息失败', 'ALL_SKU_INFO_GET_ERROR', 'amazon', error);
    }
  }

  /**
   * Take a full-page screenshot, upload it and remove the temporary file.
   * Best effort: failures are logged and reported as null.
   * @returns {Promise<string|null>} Uploaded image URL, or null on failure
   */
  async captureScreenshot() {
    let shot = null;
    try {
      const dir = await this.createScreenshotDir();
      // Random suffix so two crawls in the same millisecond do not collide.
      const filename = `${Date.now()}-${Math.random().toString(36).slice(2, 8)}.png`;
      shot = path.join(dir, filename);

      await this.page.waitForLoadState('networkidle');
      await this.page.screenshot({
        path: shot,
        fullPage: true,
        timeout: this.timeouts.elementWait
      });

      return await this.uploadImage(shot);
    } catch (error) {
      console.error('截图处理失败:', error);
      return null; // a failed screenshot must not fail the crawl
    } finally {
      // Remove the temporary file whether or not the upload succeeded.
      if (shot) {
        try {
          await fsPromises.rm(shot, { force: true });
        } catch (error) {
          console.error('删除临时截图文件失败:', error);
        }
      }
    }
  }

  /**
   * Entry point: crawl a product page.
   * @param {string} url - Product URL (its query string is dropped)
   * @param {boolean} needScreenshot - Whether to attach a screenshot URL
   * @param {boolean} includeAllSkus - Whether to crawl every variant combination
   * @returns {Promise<Array>} Product info objects
   * @throws {CrawlerError} CRAWL_ERROR wrapping the underlying failure
   */
  async crawl(url, needScreenshot = false, includeAllSkus = false) {
    try {
      await this.initBrowser();
      await this.configurePage();

      await this.navigateWithRetry(url.split('?')[0]);

      const data = includeAllSkus
        ? await this.getAllSkuInfo()
        : [await this.getSingleSkuInfo()];

      if (needScreenshot) {
        const imageUrl = await this.captureScreenshot();
        if (imageUrl) {
          for (const item of data) {
            item.screenshotUrl = imageUrl;
          }
        }
      }

      return data;
    } catch (error) {
      throw new CrawlerError('商品信息抓取失败', 'CRAWL_ERROR', 'amazon', error);
    } finally {
      await this.closeBrowser();
    }
  }
}
module.exports = AmazonCrawler; |
@@ -0,0 +1,107 @@ | |||
const { chromium } = require('playwright'); | |||
const path = require('path'); | |||
const fs = require('fs').promises; | |||
/**
 * Base crawler.
 *
 * Owns the Playwright browser / context / page lifecycle shared by every
 * platform crawler and declares the methods a concrete crawler must
 * implement.
 */
class BaseCrawler {
  /**
   * @param {Object} config - Crawler configuration. Optional `common.browser`,
   *   `common.page` and `common.screenshotDir` override the built-in defaults.
   */
  constructor(config) {
    this.config = config;
    this.browser = null;
    this.context = null;
    this.page = null;
  }

  /**
   * Launch Chromium and open a fresh context and page.
   * If the context or page cannot be created the browser is closed again, so
   * a failed initialisation leaves nothing running.
   * @returns {Promise<void>}
   */
  async initBrowser() {
    const common = this.config?.common ?? {};

    this.browser = await chromium.launch({
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox'],
      ...common.browser
    });

    try {
      this.context = await this.browser.newContext({
        locale: 'ja-JP',
        userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        ...common.page
      });
      this.page = await this.context.newPage();
    } catch (error) {
      await this.closeBrowser();
      throw error;
    }
  }

  /**
   * Close the browser, if one is open, and drop all handles.
   * The handles are cleared even when close() rejects, so a later call never
   * operates on a dead browser.
   * @returns {Promise<void>}
   */
  async closeBrowser() {
    const browser = this.browser;
    if (!browser) {
      return;
    }
    try {
      await browser.close();
    } finally {
      this.browser = null;
      this.context = null;
      this.page = null;
    }
  }

  /**
   * Create the screenshot directory under the current working directory.
   * @returns {Promise<string>} Directory path
   */
  async createScreenshotDir() {
    const dir = path.join(process.cwd(), this.config?.common?.screenshotDir ?? 'screenshots');
    await fs.mkdir(dir, { recursive: true });
    return dir;
  }

  /**
   * Crawl a product page. Must be implemented by subclasses.
   * @param {string} url - Product URL
   * @returns {Promise<Object>} Product information
   */
  async crawl(url) {
    throw new Error('Method not implemented');
  }

  /**
   * Parse a price string. Must be implemented by subclasses.
   * @param {string} priceText - Price text
   * @returns {Promise<number>} Parsed price
   */
  async parsePrice(priceText) {
    throw new Error('Method not implemented');
  }

  /**
   * Handle coupons. Must be implemented by subclasses.
   * @returns {Promise<number>} Coupon amount
   */
  async handleCoupon() {
    throw new Error('Method not implemented');
  }

  /**
   * Read the product title. Must be implemented by subclasses.
   * @returns {Promise<string>} Product title
   */
  async getTitle() {
    throw new Error('Method not implemented');
  }

  /**
   * Read the product SKU. Must be implemented by subclasses.
   * @returns {Promise<string>} Product SKU
   */
  async getSku() {
    throw new Error('Method not implemented');
  }

  /**
   * Read the product variants. Must be implemented by subclasses.
   * @returns {Promise<Array>} Variant information
   */
  async getVariants() {
    throw new Error('Method not implemented');
  }
}
module.exports = BaseCrawler; |
@@ -0,0 +1,41 @@ | |||
/**
 * Error type raised by the crawlers.
 * Carries a machine-readable code, the platform it came from, the error that
 * caused it (if any) and the ISO timestamp of its creation.
 * @extends Error
 */
class CrawlerError extends Error {
  /**
   * @param {string} message - Error message
   * @param {string} code - Error code
   * @param {string} platform - Platform name
   * @param {Error} [originalError] - Underlying error, if any
   */
  constructor(message, code, platform, originalError = null) {
    super(message);
    Object.assign(this, {
      name: 'CrawlerError',
      code,
      platform,
      originalError,
      timestamp: new Date().toISOString()
    });
  }

  /**
   * Plain-object representation used when the error is serialised to JSON.
   * @returns {Object} Error details, including the stack and a summary of
   *   the underlying error
   */
  toJSON() {
    const { name, message, code, platform, timestamp, stack, originalError } = this;

    let original = null;
    if (originalError) {
      original = {
        message: originalError.message,
        stack: originalError.stack
      };
    }

    return { name, message, code, platform, timestamp, stack, originalError: original };
  }
}
module.exports = CrawlerError; |