- 新增项目结构,包括主要文件和目录 - 添加 .gitignore 文件以排除不必要的文件 - 创建 package.json 和 package-lock.json,定义项目依赖 - 实现基本的 Express 应用,配置中间件和路由 - 添加爬虫服务逻辑,支持商品信息抓取和截图功能 - 完善 README 文档,提供项目功能、安装说明和使用示例master
@@ -76,3 +76,37 @@ typings/ | |||
# FuseBox cache | |||
.fusebox/ | |||
# 环境配置 | |||
.env.local | |||
.env.*.local | |||
# 运行时数据 | |||
screenshots/ | |||
# IDE | |||
.idea/ | |||
.vscode/ | |||
*.swp | |||
*.swo | |||
# 操作系统 | |||
.DS_Store | |||
Thumbs.db | |||
# 临时文件 | |||
*.tmp | |||
*.tmp.* | |||
*.tmp.*.* | |||
*.tmp.*.*.* | |||
# 缓存 | |||
.cache | |||
# 日志 | |||
logs | |||
.cursorrules | |||
@@ -1,2 +1,105 @@ | |||
# crawling-service | |||
一个基于 Node.js 和 Playwright 的商品信息爬虫服务。 | |||
## 功能特性 | |||
- 🚀 提供 RESTful API 接口 | |||
- 🔍 支持商品信息爬取 | |||
- 📸 支持商品截图功能 | |||
- 🛡️ 内置安全防护 | |||
- 📝 完整的日志记录 | |||
- 🔄 模块化设计 | |||
## 技术栈 | |||
- Node.js (v22.x) | |||
- Express.js - Web 框架 | |||
- Playwright - 浏览器自动化 | |||
- Winston - 日志管理 | |||
- Jest - 单元测试 | |||
- Helmet - 安全中间件 | |||
- CORS - 跨域支持 | |||
## 项目结构 | |||
``` | |||
src/ | |||
├── config/ # 配置文件 | |||
├── controllers/ # 控制器 | |||
├── middlewares/ # 中间件 | |||
├── routes/ # 路由 | |||
├── services/ # 业务逻辑 | |||
├── utils/ # 工具函数 | |||
├── app.js # 应用入口 | |||
└── server.js # 服务器配置 | |||
``` | |||
## 安装说明 | |||
1. 确保已安装 Node.js v22.x | |||
2. 克隆项目 | |||
```bash | |||
git clone http://dev.sohomall.jp:3000/lizhuang/crawling-service.git | |||
cd crawling-service | |||
``` | |||
3. 安装依赖 | |||
```bash | |||
npm install | |||
``` | |||
4. 配置环境变量 | |||
```bash | |||
cp .env.example .env | |||
# 编辑 .env 文件,配置必要的环境变量 | |||
``` | |||
## 运行服务 | |||
开发环境: | |||
```bash | |||
npm run dev | |||
``` | |||
生产环境: | |||
```bash | |||
npm start | |||
``` | |||
## API 接口 | |||
### 商品信息查询 | |||
- 请求方式:GET | |||
- 接口路径:`/api/product`
- 参数: | |||
- url: 商品URL(必填) | |||
- needScreenshot: 是否需要截图(可选,默认 false)
- includeAllSkus: 是否抓取全部 SKU 组合(可选,默认 false)
## 开发规范 | |||
- 遵循 SOLID 原则 | |||
- 函数职责单一 | |||
- 完整的异常处理 | |||
- 清晰的代码注释 | |||
- 统一的代码风格 | |||
## 测试 | |||
运行测试: | |||
```bash | |||
npm test | |||
``` | |||
## 贡献指南 | |||
1. Fork 项目 | |||
2. 创建特性分支 | |||
3. 提交变更 | |||
4. 推送到分支 | |||
5. 创建 Pull Request | |||
## 许可证 | |||
MIT License |
@@ -0,0 +1,36 @@ | |||
{ | |||
"name": "crawling-service", | |||
"version": "1.0.0", | |||
"description": "商品信息爬虫服务", | |||
"author": "lizhuang", | |||
"main": "src/app.js", | |||
"scripts": { | |||
"start": "node src/server.js", | |||
"dev": "nodemon src/app.js", | |||
"test": "jest" | |||
}, | |||
"keywords": [ | |||
"crawler", | |||
"playwright", | |||
"express" | |||
], | |||
"license": "ISC", | |||
"dependencies": { | |||
"cors": "^2.8.5", | |||
"dotenv": "^16.4.5", | |||
"express": "^4.18.2", | |||
"form-data": "^4.0.2", | |||
"helmet": "^7.1.0", | |||
"node-fetch": "^2.7.0", | |||
"playwright": "^1.42.1", | |||
"punycode": "^2.3.1", | |||
"winston": "^3.11.0" | |||
}, | |||
"devDependencies": { | |||
"jest": "^29.7.0", | |||
"nodemon": "^3.1.0" | |||
}, | |||
"engines": { | |||
"node": ">=22.0.0" | |||
} | |||
} |
@@ -0,0 +1,28 @@ | |||
const express = require('express'); | |||
const cors = require('cors'); | |||
const helmet = require('helmet'); | |||
const { errorHandler } = require('./middlewares/errorHandler'); | |||
const routes = require('./routes'); | |||
// Build the Express application instance.
const app = express();

// Global middleware, applied in this order: security headers, CORS,
// JSON body parsing, URL-encoded body parsing.
const globalMiddleware = [
  helmet(),
  cors(),
  express.json(),
  express.urlencoded({ extended: true }),
];
for (const middleware of globalMiddleware) {
  app.use(middleware);
}

// All API routes live under the /api prefix.
app.use('/api', routes);

// The error handler is registered last so it sees errors from everything above.
app.use(errorHandler);

// Start listening. NOTE: this runs whenever the module is loaded, including
// when it is required by another module.
const PORT = process.env.PORT || 8991;
const announceStartup = () => {
  console.log(`服务器运行在 http://localhost:${PORT}`);
};
app.listen(PORT, announceStartup);

module.exports = app;
@@ -0,0 +1,45 @@ | |||
const crawlerService = require('../services/crawlerService'); | |||
/**
 * Crawler controller: translates HTTP requests into crawler-service calls.
 */
class CrawlerController {
  /**
   * Report whether a query value is an absolute http(s) URL string.
   * Repeated query keys arrive as arrays and other schemes (file:, ftp:, ...)
   * must never reach the headless browser, so both are rejected here.
   * @param {unknown} value - Raw query-string value.
   * @returns {boolean} True only for a parseable http: or https: URL string.
   */
  static #isHttpUrl(value) {
    if (typeof value !== 'string') {
      return false;
    }
    try {
      const { protocol } = new URL(value);
      return protocol === 'http:' || protocol === 'https:';
    } catch {
      return false;
    }
  }

  /**
   * Send the controller's standard 400 response body.
   * @param {Response} res - Express response object.
   * @param {string} message - Human-readable reason.
   * @returns {Response} The response, for chaining/returning.
   */
  static #badRequest(res, message) {
    return res.status(400).json({
      success: false,
      error: {
        message,
        statusCode: 400
      }
    });
  }

  /**
   * Fetch product information for the URL given in the query string.
   *
   * Query parameters:
   *  - url: product page URL (required, http/https only)
   *  - needScreenshot | screenshot: 'true' to capture a screenshot
   *  - includeAllSkus | allSkus: 'true' to crawl every SKU combination
   * The first name of each pair matches the README and the standalone server;
   * the second is the legacy name this controller originally read.
   *
   * @param {Request} req - Express request object.
   * @param {Response} res - Express response object.
   * @param {NextFunction} next - Next middleware; receives any thrown error.
   */
  async getProductInfo(req, res, next) {
    try {
      const { query } = req;
      const { url } = query;

      // Parameter validation.
      if (!url) {
        return CrawlerController.#badRequest(res, '商品URL是必需的');
      }
      if (!CrawlerController.#isHttpUrl(url)) {
        return CrawlerController.#badRequest(res, '商品URL必须是有效的 http(s) 地址');
      }

      // Flags are passed as the literal string 'true'; anything else is false.
      const wantsScreenshot = (query.needScreenshot ?? query.screenshot) === 'true';
      const wantsAllSkus = (query.includeAllSkus ?? query.allSkus) === 'true';

      const productInfo = await crawlerService.crawlProductInfo(
        url,
        wantsScreenshot,
        wantsAllSkus
      );

      res.json({
        success: true,
        data: productInfo
      });
    } catch (error) {
      next(error);
    }
  }
}
module.exports = new CrawlerController(); |
@@ -0,0 +1,25 @@ | |||
/**
 * Global Express error-handling middleware.
 *
 * Logs the error and answers with a JSON envelope
 * `{ success: false, error: { message, statusCode[, stack] } }`.
 * The stack trace is included only when NODE_ENV is 'development'.
 *
 * @param {Error} err - The error passed to next() or thrown by a handler.
 * @param {Request} req - Express request object.
 * @param {Response} res - Express response object.
 * @param {NextFunction} next - Next middleware; used only to delegate when
 *   the response has already started.
 */
const errorHandler = (err, req, res, next) => {
  console.error('错误:', err);

  // Once headers are on the wire a new status/body cannot be written;
  // hand the error to Express' default handler, which closes the connection.
  if (res.headersSent) {
    return next(err);
  }

  // Only trust err.statusCode when it is a real HTTP error code; anything
  // else (missing, non-numeric, 2xx/3xx) falls back to 500.
  const candidate = Number(err?.statusCode);
  const isHttpErrorCode = Number.isInteger(candidate) && candidate >= 400 && candidate <= 599;
  const statusCode = isHttpErrorCode ? candidate : 500;
  const message = err?.message || '服务器内部错误';

  res.status(statusCode).json({
    success: false,
    error: {
      message,
      statusCode,
      ...(process.env.NODE_ENV === 'development' && { stack: err?.stack })
    }
  });
};
module.exports = { errorHandler }; |
@@ -0,0 +1,12 @@ | |||
const express = require('express');
const router = express.Router();
const crawlerController = require('../controllers/crawlerController');

/**
 * @route GET /api/product
 * @desc Fetch product information for the URL given in the query string.
 * @access Public
 */
// Call through the controller instance rather than passing the bare method:
// a detached method reference loses `this`, which would break the handler as
// soon as it touches instance state.
router.get('/product', (req, res, next) => crawlerController.getProductInfo(req, res, next));

module.exports = router;
@@ -0,0 +1,134 @@ | |||
const express = require('express'); | |||
const path = require('path'); | |||
const cors = require('cors'); | |||
const helmet = require('helmet'); | |||
const crawlerService = require('./services/crawlerService'); | |||
// Suppress the punycode deprecation warning: drop every existing 'warning'
// listener, then install one that prints all warnings except that one.
process.removeAllListeners('warning');
process.on('warning', (warning) => {
  const isPunycodeDeprecation =
    warning.name === 'DeprecationWarning' && warning.message.includes('punycode');
  if (!isPunycodeDeprecation) {
    console.warn(warning);
  }
});
const app = express();

// Security headers; the resource policy is relaxed so other origins may load
// responses from this server.
const helmetOptions = {
  crossOriginResourcePolicy: { policy: "cross-origin" }
};
app.use(helmet(helmetOptions));

// CORS: every origin is allowed. Restrict `origin` to known domains in production.
const corsOptions = {
  origin: '*',
  methods: ['GET', 'POST', 'PUT', 'DELETE', 'OPTIONS'],
  allowedHeaders: ['Content-Type', 'Authorization']
};
app.use(cors(corsOptions));

// Body parsers: JSON first, then URL-encoded forms.
app.use(express.json());
app.use(express.urlencoded({ extended: true }));

// Serve the ./screenshots directory (relative to the working directory) as static files.
const screenshotsDir = path.join(process.cwd(), 'screenshots');
app.use('/screenshots', express.static(screenshotsDir));
// 获取本机IP地址 | |||
const os = require('os'); | |||
/**
 * Find the first non-internal IPv4 address among the machine's network interfaces.
 * @returns {string} That address, or '0.0.0.0' when no such interface exists.
 */
function getLocalIP() {
  const candidates = Object.values(os.networkInterfaces()).flat();
  // Loopback/internal and non-IPv4 entries are skipped.
  const external = candidates.find((iface) => iface.family === 'IPv4' && !iface.internal);
  return external ? external.address : '0.0.0.0';
}
// Request logging middleware: prints an ISO timestamp, the HTTP method and the URL.
app.use((req, _res, next) => {
  const timestamp = new Date().toISOString();
  console.log(`${timestamp} - ${req.method} ${req.url}`);
  next();
});
/**
 * Report whether a query value is an absolute http(s) URL string.
 * The value is handed to a headless browser, so arrays (repeated query keys)
 * and non-web schemes such as file: are refused.
 * @param {unknown} value - Raw query-string value.
 * @returns {boolean} True only for a parseable http: or https: URL string.
 */
const isHttpUrl = (value) => {
  if (typeof value !== 'string') {
    return false;
  }
  try {
    const { protocol } = new URL(value);
    return protocol === 'http:' || protocol === 'https:';
  } catch {
    return false;
  }
};

// Product information API route.
// Query: url (required), needScreenshot / includeAllSkus ('true' to enable).
app.get('/api/product', async (req, res) => {
  const example = '/api/product?url=https://www.amazon.co.jp/dp/XXXXX';
  try {
    const { url, needScreenshot = false, includeAllSkus = false } = req.query;
    console.log('收到商品信息请求:', { url, needScreenshot, includeAllSkus });

    if (!url) {
      console.log('请求缺少URL参数');
      return res.status(400).json({
        success: false,
        message: '请提供商品URL',
        example
      });
    }

    // Reject malformed input with 400 instead of letting the crawler fail with 500.
    if (!isHttpUrl(url)) {
      console.log('请求的URL参数无效');
      return res.status(400).json({
        success: false,
        message: '商品URL必须是有效的 http(s) 地址',
        example
      });
    }

    console.log('开始爬取商品信息:', url);
    const data = await crawlerService.crawlProductInfo(url, needScreenshot === 'true', includeAllSkus === 'true');
    console.log('爬取完成:', data);

    res.json({
      success: true,
      data
    });
  } catch (error) {
    console.error('获取商品信息失败:', error);
    res.status(500).json({
      success: false,
      message: '获取商品信息失败',
      error: error.message
    });
  }
});
// Health-check endpoint: always answers ok with the current ISO timestamp.
app.get('/health', (_req, res) => {
  const timestamp = new Date().toISOString();
  res.json({ status: 'ok', timestamp });
});

// 404 fallback for any request no earlier route or middleware handled.
app.use((req, res) => {
  const body = {
    success: false,
    message: '接口不存在',
    path: req.path
  };
  res.status(404).json(body);
});
// Error-handling middleware: logs the error and answers 500 with a JSON body.
// The error message is exposed only when NODE_ENV is 'development'.
app.use((err, req, res, next) => {
  console.error('服务器错误:', err);

  // If the response has already started, a new status/body cannot be sent;
  // delegate to Express' default handler, which terminates the connection.
  if (res.headersSent) {
    return next(err);
  }

  res.status(500).json({
    success: false,
    message: '服务器内部错误',
    error: process.env.NODE_ENV === 'development' ? err.message : undefined
  });
});
// Port comes from the environment, falling back to 8991.
const PORT = process.env.PORT || 8991;
const HOST = '0.0.0.0'; // listen on every network interface

// Start the server and print how to reach it.
app.listen(PORT, HOST, () => {
  const localIP = getLocalIP();
  console.log('服务器配置:');
  console.log(`- 监听地址: ${HOST}:${PORT}`);
  console.log(`- 本地访问: http://localhost:${PORT}`);
  console.log(`- 局域网访问: http://${localIP}:${PORT}`);
  console.log(`- 健康检查: http://${localIP}:${PORT}/health`);
  console.log('\nAPI 使用示例:');
  // Use the effective port so the example stays correct when PORT is overridden.
  console.log(`http://localhost:${PORT}/api/product?url=https://www.amazon.co.jp/dp/XXXXX`);
  console.log('可选参数:');
  console.log('- needScreenshot=true');
  console.log('- includeAllSkus=true');
});
@@ -0,0 +1,196 @@ | |||
const { chromium } = require('playwright'); | |||
const path = require('path'); | |||
const fs = require('fs').promises; | |||
const FormData = require('form-data'); | |||
const fetch = require('node-fetch'); | |||
/**
 * Amazon product information crawler built on Playwright.
 *
 * The public entry point is crawlProductInfo(); the remaining methods are the
 * steps it is composed of and stay public for backward compatibility.
 */
class CrawlerService {
  /**
   * Pages that already forward their browser console to the Node console.
   * getSingleSkuInfo() runs once per SKU combination on the same page, so
   * without this guard each call would stack another listener and every
   * browser log line would be printed multiple times.
   */
  static #consoleHookedPages = new WeakSet();

  constructor() {
    // Endpoint that receives screenshot uploads (multipart POST).
    this.UPLOAD_URL = 'https://apibase.sohomall.jp/uploaders?scene=goods';
  }

  /**
   * Validate the caller-supplied URL and drop its query string.
   *
   * Only absolute http(s) URLs are accepted so that schemes such as file:
   * can never be opened (and screenshotted) by the headless browser.
   * NOTE(review): hosts are not restricted; add an allow-list if the service
   * is reachable by untrusted clients.
   *
   * @param {unknown} url - URL as received from the caller.
   * @returns {string} The URL with everything from the first '?' removed.
   * @throws {Error} With statusCode 400 when the URL is not a valid http(s) URL.
   */
  static #toNavigableUrl(url) {
    const invalid = (cause) =>
      Object.assign(new Error('商品URL必须是有效的 http(s) 地址', { cause }), { statusCode: 400 });

    if (typeof url !== 'string') {
      throw invalid();
    }
    const withoutQuery = url.split('?')[0];
    let parsed;
    try {
      parsed = new URL(withoutQuery);
    } catch (error) {
      throw invalid(error);
    }
    if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
      throw invalid();
    }
    return withoutQuery;
  }

  /**
   * Launch a headless Chromium instance.
   * @returns {Promise<import('playwright').Browser>} The launched browser.
   */
  async initBrowser() {
    return await chromium.launch({ headless: true });
  }

  /**
   * Ensure ./screenshots (relative to the working directory) exists.
   * @returns {Promise<string>} Absolute path of the directory.
   */
  async createScreenshotDir() {
    const dir = path.join(process.cwd(), 'screenshots');
    await fs.mkdir(dir, { recursive: true });
    return dir;
  }

  /**
   * Read title, price, ASIN and URL of the SKU currently shown on the page.
   * A coupon amount found on the page is subtracted from the price.
   *
   * @param {import('playwright').Page} page - Page already on a product URL.
   * @returns {Promise<{title: ?string, price: ?string, sku: ?string, url: string, remark: string}>}
   *   `price` is null when no price element is present.
   */
  async getSingleSkuInfo(page) {
    // Forward browser console output to the Node console, once per page.
    if (!CrawlerService.#consoleHookedPages.has(page)) {
      CrawlerService.#consoleHookedPages.add(page);
      page.on('console', msg => console.log('Browser Console:', msg.text()));
    }
    await page.waitForTimeout(500);

    let couponValue = 0;
    const couponTrigger = await page.$('.a-declarative[data-action="a-modal"], .couponLabelText');
    if (couponTrigger) {
      try {
        await couponTrigger.click();
        await page.waitForTimeout(500);
      } catch {
        // Best effort: the trigger may be hidden or detached; the label is still read below.
      }
      try {
        const couponText = await page.$eval('.couponLabelText', el => el.textContent.trim());
        // Accept both the half-width (U+00A5) and full-width (U+FFE5) yen sign.
        const couponMatch = couponText.match(/[\u00A5\uFFE5]\s*(\d[\d,]*)/);
        couponValue = couponMatch ? Number.parseInt(couponMatch[1].replace(/,/g, ''), 10) : 0;
        console.log('Found coupon value:', couponValue);
      } catch {
        // Best effort: no coupon label on the page means no coupon (value stays 0).
      }
      // Try to close the modal; fall back to Escape when the close button is unavailable.
      try { await page.click('button.a-modal-close', { timeout: 1000 }); } catch { await page.keyboard.press('Escape'); }
    }

    // Everything inside this callback runs in the browser, so it must be self-contained.
    return await page.evaluate(couponValue => {
      const textOf = selector => document.querySelector(selector)?.textContent.trim() || null;

      const title = textOf('#productTitle');
      let priceText = textOf('span.a-price > span.a-offscreen')
        || textOf('#priceblock_dealprice')
        || textOf('#priceblock_saleprice')
        || textOf('#priceblock_ourprice');

      if (priceText != null) {
        // Drop a trailing loyalty-points suffix, then the currency marker.
        if (priceText.includes('ポイント')) priceText = priceText.split('ポイント')[0].trim();
        // "JP" is optional and must be removed together with the yen sign;
        // stripping the bare sign first would leave a stray "JP" behind.
        priceText = priceText.replace(/(?:JP)?[\u00A5\uFFE5]/, '');
      }

      const digits = priceText?.match(/\d[\d,]*/);
      let priceVal = digits ? Number.parseInt(digits[0].replace(/,/g, ''), 10) : null;
      if (priceVal != null) priceVal -= couponValue;
      console.log('priceText', priceText);
      console.log('priceVal', priceVal);
      console.log('couponValue', couponValue);

      // Fall back to the raw text (possibly null) when no number could be parsed.
      const price = priceVal != null ? `${priceVal.toLocaleString()}` : priceText;
      const url = window.location.href;
      const asin = url.match(/\/dp\/([A-Z0-9]{10})/)?.[1] || null;
      return { title, price, sku: asin, url, remark: `Original Price: JP¥${priceText} Coupon Price: JP¥${couponValue}` };
    }, couponValue);
  }

  /**
   * Collect information for every SKU combination (Cartesian product of the
   * variant groups) by clicking through the variant buttons.
   *
   * Falls back to the single SKU currently shown when the page has no variant
   * selector or the selector contains no buttons.
   * NOTE(review): the button handles are captured once up front; if the page
   * re-renders the selector after a click they may go stale — verify on live pages.
   *
   * @param {import('playwright').Page} page - Page already on a product URL.
   * @returns {Promise<object[]>} One entry per combination, each with a `variants` array.
   */
  async getAllSkuInfo(page) {
    const groupSelector = '.a-cardui-body #twister-plus-inline-twister > .a-section';

    // Wait for the SKU groups to load. Products without variants never render
    // them, so a timeout means "single SKU", not failure.
    try {
      await page.waitForSelector(groupSelector);
    } catch (error) {
      if (error?.name !== 'TimeoutError') throw error;
      return [await this.getSingleSkuInfo(page)];
    }

    const groups = [];
    for (const groupEl of await page.$$(groupSelector)) {
      const buttons = await groupEl.$$('.a-button-inner .a-button-input');
      if (buttons.length) groups.push(buttons);
    }
    if (!groups.length) return [await this.getSingleSkuInfo(page)];

    // Cartesian product: one button from each group per combination.
    const combos = groups.reduce(
      (partial, buttons) => partial.flatMap(combo => buttons.map(btn => [...combo, btn])),
      [[]]
    );

    const results = [];
    for (const combo of combos) {
      // Select the combination one dimension at a time.
      for (const btn of combo) {
        await btn.click();
        await page.waitForLoadState('networkidle');
      }
      const info = await this.getSingleSkuInfo(page);
      // getAttribute() is async: it must be awaited before `||`, otherwise the
      // always-truthy Promise hides the title fallback.
      info.variants = await Promise.all(
        combo.map(async btn => (await btn.getAttribute('aria-label')) || (await btn.getAttribute('title')))
      );
      results.push(info);
    }
    return results;
  }

  /**
   * Read an image file and encode it as a PNG data URL.
   * @param {string} imagePath - Path of the image file.
   * @returns {Promise<?string>} `data:image/png;base64,...`, or null when the file cannot be read.
   */
  async convertImageToBase64(imagePath) {
    try {
      const imageBuffer = await fs.readFile(imagePath);
      return `data:image/png;base64,${imageBuffer.toString('base64')}`;
    } catch (error) {
      console.error('转换图片到base64失败:', error);
      return null;
    }
  }

  /**
   * Upload an image file to UPLOAD_URL as multipart form data.
   * @param {string} imagePath - Path of the image file.
   * @returns {Promise<?string>} The uploaded image URL, or null when the upload
   *   fails or the response carries no `url` field.
   */
  async uploadImage(imagePath) {
    try {
      const form = new FormData();
      form.append('file', await fs.readFile(imagePath), {
        filename: path.basename(imagePath),
        contentType: 'image/png'
      });

      const response = await fetch(this.UPLOAD_URL, {
        method: 'POST',
        body: form
      });

      if (!response.ok) {
        throw new Error(`上传失败: ${response.statusText}`);
      }

      const result = await response.json();
      // Assumes the server responds with a top-level `url` field — TODO confirm.
      return result?.url ?? null;
    } catch (error) {
      console.error('上传图片失败:', error);
      return null;
    }
  }

  /**
   * Take a full-page screenshot, upload it, and always remove the temp file.
   * @param {import('playwright').Page} page - Page to capture.
   * @returns {Promise<?string>} Uploaded image URL, or null when the upload failed.
   */
  async #captureAndUpload(page) {
    const dir = await this.createScreenshotDir();
    // The random suffix keeps concurrent requests in the same millisecond from
    // overwriting (and then deleting) each other's file.
    const uniqueSuffix = Math.random().toString(36).slice(2, 10);
    const shot = path.join(dir, `${Date.now()}-${uniqueSuffix}.png`);
    try {
      await page.screenshot({ path: shot, fullPage: true });
      return await this.uploadImage(shot);
    } finally {
      try {
        // force: a missing file (screenshot never written) is not an error.
        await fs.rm(shot, { force: true });
      } catch (error) {
        console.error('删除临时截图文件失败:', error);
      }
    }
  }

  /**
   * Main entry point: crawl product information from a product page.
   *
   * @param {string} url - Product URL; its query string is dropped before navigation.
   * @param {boolean} [needScreenshot=false] - Capture and upload a full-page
   *   screenshot; every result then carries `screenshotUrl` (null if the upload failed).
   * @param {boolean} [includeAllSkus=false] - Crawl every SKU combination
   *   instead of only the one shown initially.
   * @returns {Promise<object[]>} One entry per crawled SKU.
   * @throws {Error} With statusCode 400 for a non-http(s) URL; navigation and
   *   scraping errors propagate unchanged.
   */
  async crawlProductInfo(url, needScreenshot = false, includeAllSkus = false) {
    // Validate before launching anything so bad input costs no browser start.
    const targetUrl = CrawlerService.#toNavigableUrl(url);

    const browser = await this.initBrowser();
    try {
      const context = await browser.newContext({ locale: 'ja-JP', userAgent: 'Mozilla/5.0' });
      const page = await context.newPage();
      await page.goto(targetUrl, { waitUntil: 'networkidle' });

      const data = includeAllSkus
        ? await this.getAllSkuInfo(page)
        : [await this.getSingleSkuInfo(page)];

      if (needScreenshot) {
        const imageUrl = await this.#captureAndUpload(page);
        for (const item of data) {
          item.screenshotUrl = imageUrl;
        }
      }
      return data;
    } finally {
      // Always release the browser, even when navigation or scraping threw;
      // a close failure is logged so it cannot mask the original error.
      try {
        await browser.close();
      } catch (error) {
        console.error('关闭浏览器失败:', error);
      }
    }
  }
}
module.exports = new CrawlerService(); |