Browse Source

feat: 初始化商品信息爬虫服务项目

- 新增项目结构,包括主要文件和目录
- 添加 .gitignore 文件以排除不必要的文件
- 创建 package.json 和 package-lock.json,定义项目依赖
- 实现基本的 Express 应用,配置中间件和路由
- 添加爬虫服务逻辑,支持商品信息抓取和截图功能
- 完善 README 文档,提供项目功能、安装说明和使用示例
master
lizhuang 1 month ago
parent
commit
7586c25fb5
10 changed files with 5821 additions and 0 deletions
  1. 34
    0
      .gitignore
  2. 103
    0
      README.md
  3. 5208
    0
      package-lock.json
  4. 36
    0
      package.json
  5. 28
    0
      src/app.js
  6. 45
    0
      src/controllers/crawlerController.js
  7. 25
    0
      src/middlewares/errorHandler.js
  8. 12
    0
      src/routes/index.js
  9. 134
    0
      src/server.js
  10. 196
    0
      src/services/crawlerService.js

+ 34
- 0
.gitignore View File

@@ -76,3 +76,37 @@ typings/
# FuseBox cache
.fusebox/

# 环境配置
.env.local
.env.*.local

# 运行时数据
screenshots/

# IDE
.idea/
.vscode/
*.swp
*.swo

# 操作系统
.DS_Store
Thumbs.db


# 临时文件
*.tmp
*.tmp.*
*.tmp.*.*
*.tmp.*.*.*

# 缓存
.cache

# 日志
logs


.cursorrules



+ 103
- 0
README.md View File

@@ -1,2 +1,105 @@
# crawling-service

一个基于 Node.js 和 Playwright 的商品信息爬虫服务。

## 功能特性

- 🚀 提供 RESTful API 接口
- 🔍 支持商品信息爬取
- 📸 支持商品截图功能
- 🛡️ 内置安全防护
- 📝 完整的日志记录
- 🔄 模块化设计

## 技术栈

- Node.js (v22.x)
- Express.js - Web 框架
- Playwright - 浏览器自动化
- Winston - 日志管理
- Jest - 单元测试
- Helmet - 安全中间件
- CORS - 跨域支持

## 项目结构

```
src/
├── config/ # 配置文件
├── controllers/ # 控制器
├── middlewares/ # 中间件
├── routes/ # 路由
├── services/ # 业务逻辑
├── utils/ # 工具函数
├── app.js # 应用入口
└── server.js # 服务器配置
```

## 安装说明

1. 确保已安装 Node.js v22.x
2. 克隆项目
```bash
git clone http://dev.sohomall.jp:3000/lizhuang/crawling-service.git
cd crawling-service
```

3. 安装依赖
```bash
npm install
```

4. 配置环境变量
```bash
cp .env.example .env
# 编辑 .env 文件,配置必要的环境变量
```

## 运行服务

开发环境:
```bash
npm run dev
```

生产环境:
```bash
npm start
```

## API 接口

### 商品信息查询

- 请求方式:GET
- 接口路径:`/api/product`
- 参数:
  - `url`: 商品URL(必填)
  - `needScreenshot`: 是否需要截图(可选,默认 false)
  - `includeAllSkus`: 是否抓取全部 SKU 组合(可选,默认 false)

## 开发规范

- 遵循 SOLID 原则
- 函数职责单一
- 完整的异常处理
- 清晰的代码注释
- 统一的代码风格

## 测试

运行测试:
```bash
npm test
```

## 贡献指南

1. Fork 项目
2. 创建特性分支
3. 提交变更
4. 推送到分支
5. 创建 Pull Request

## 许可证

MIT License

+ 5208
- 0
package-lock.json
File diff suppressed because it is too large
View File


+ 36
- 0
package.json View File

@@ -0,0 +1,36 @@
{
"name": "crawling-service",
"version": "1.0.0",
"description": "商品信息爬虫服务",
"author": "lizhuang",
"main": "src/app.js",
"scripts": {
"start": "node src/server.js",
"dev": "nodemon src/app.js",
"test": "jest"
},
"keywords": [
"crawler",
"playwright",
"express"
],
"license": "ISC",
"dependencies": {
"cors": "^2.8.5",
"dotenv": "^16.4.5",
"express": "^4.18.2",
"form-data": "^4.0.2",
"helmet": "^7.1.0",
"node-fetch": "^2.7.0",
"playwright": "^1.42.1",
"punycode": "^2.3.1",
"winston": "^3.11.0"
},
"devDependencies": {
"jest": "^29.7.0",
"nodemon": "^3.1.0"
},
"engines": {
"node": ">=22.0.0"
}
}

+ 28
- 0
src/app.js View File

@@ -0,0 +1,28 @@
const express = require('express');
const cors = require('cors');
const helmet = require('helmet');
const { errorHandler } = require('./middlewares/errorHandler');
const routes = require('./routes');

// Create the Express application.
const app = express();

// Middleware: security headers, CORS, then JSON and URL-encoded body parsing.
app.use(helmet());
app.use(cors());
app.use(express.json());
app.use(express.urlencoded({ extended: true }));

// All API routes are mounted under /api.
app.use('/api', routes);

// Error-handling middleware is registered last so it sees errors from the routes.
app.use(errorHandler);

// Start listening only when this file is the process entry point. The app is
// also exported, and requiring it (for example from a test) must not open a port.
const PORT = process.env.PORT || 8991;
if (require.main === module) {
  app.listen(PORT, () => {
    console.log(`服务器运行在 http://localhost:${PORT}`);
  });
}

module.exports = app;

+ 45
- 0
src/controllers/crawlerController.js View File

@@ -0,0 +1,45 @@
const crawlerService = require('../services/crawlerService');

/**
 * Crawler controller: HTTP handlers that expose the crawler service.
 */
class CrawlerController {
  /**
   * Check that a value is an absolute http(s) URL string.
   * @param {unknown} value - Raw query-string value.
   * @returns {boolean} True only for strings that parse as http: or https: URLs.
   */
  static isHttpUrl(value) {
    if (typeof value !== 'string') {
      return false;
    }
    try {
      const { protocol } = new URL(value);
      return protocol === 'http:' || protocol === 'https:';
    } catch {
      // new URL() throws on anything that is not an absolute URL.
      return false;
    }
  }

  /**
   * Handle a product-information request.
   *
   * Query parameters:
   *  - url (required): product page URL, must be http(s).
   *  - screenshot / needScreenshot: the string 'true' enables the screenshot.
   *  - allSkus / includeAllSkus: the string 'true' crawls every SKU combination.
   *
   * NOTE: this method is handed to the router as a bare function reference, so
   * it must not rely on `this`.
   *
   * @param {Request} req - Request object.
   * @param {Response} res - Response object.
   * @param {NextFunction} next - Next middleware; receives any crawl error.
   */
  async getProductInfo(req, res, next) {
    try {
      const { url } = req.query;
      // Accept both the short names and the names used by README / server.js.
      const screenshot = req.query.screenshot ?? req.query.needScreenshot;
      const allSkus = req.query.allSkus ?? req.query.includeAllSkus;

      // Parameter validation.
      if (!url) {
        return res.status(400).json({
          success: false,
          error: {
            message: '商品URL是必需的',
            statusCode: 400
          }
        });
      }

      // Repeated or nested query params arrive as arrays/objects, and other
      // schemes (file:, chrome:, ...) must never reach the headless browser.
      if (!CrawlerController.isHttpUrl(url)) {
        return res.status(400).json({
          success: false,
          error: {
            message: '商品URL必须是有效的 http(s) 地址',
            statusCode: 400
          }
        });
      }

      // Delegate to the crawler service.
      const productInfo = await crawlerService.crawlProductInfo(
        url,
        screenshot === 'true',
        allSkus === 'true'
      );

      res.json({
        success: true,
        data: productInfo
      });
    } catch (error) {
      next(error);
    }
  }
}

module.exports = new CrawlerController();

+ 25
- 0
src/middlewares/errorHandler.js View File

@@ -0,0 +1,25 @@
/**
 * Global Express error-handling middleware.
 *
 * Logs the error and replies with a JSON body of the form
 * `{ success: false, error: { message, statusCode[, stack] } }`.
 * The stack is included only when NODE_ENV is 'development'.
 *
 * @param {Error} err - Error passed to next() or thrown by a handler.
 * @param {Request} req - Request object.
 * @param {Response} res - Response object.
 * @param {NextFunction} next - Next middleware; used to delegate once the response has started.
 */
const errorHandler = (err, req, res, next) => {
  console.error('错误:', err);

  // Once headers are on the wire a JSON error body can no longer be sent;
  // hand the error to Express's default handler, which aborts the response.
  if (res.headersSent) {
    return next(err);
  }

  // Only use the error's statusCode when it is a real HTTP error status;
  // anything else (missing, 2xx, garbage) becomes 500.
  const candidate = Number(err.statusCode);
  const statusCode = Number.isInteger(candidate) && candidate >= 400 && candidate <= 599
    ? candidate
    : 500;
  const message = err.message || '服务器内部错误';

  res.status(statusCode).json({
    success: false,
    error: {
      message,
      statusCode,
      ...(process.env.NODE_ENV === 'development' && { stack: err.stack })
    }
  });
};

module.exports = { errorHandler };

+ 12
- 0
src/routes/index.js View File

@@ -0,0 +1,12 @@
const express = require('express');
const router = express.Router();
const crawlerController = require('../controllers/crawlerController');

/**
 * @route GET /api/product
 * @desc Fetch product information.
 * @access Public
 */
// Call the method on the controller instead of passing a bare reference, so
// the handler keeps its `this` binding if the controller ever relies on it.
router.get('/product', (req, res, next) => crawlerController.getProductInfo(req, res, next));

module.exports = router;

+ 134
- 0
src/server.js View File

@@ -0,0 +1,134 @@
const express = require('express');
const path = require('path');
const cors = require('cors');
const helmet = require('helmet');
const crawlerService = require('./services/crawlerService');

// Silence the punycode deprecation warning while still printing every other
// process warning. The existing 'warning' listeners are removed first so this
// filter is the only one that runs.
function isPunycodeDeprecation(warning) {
  return warning.name === 'DeprecationWarning' && warning.message.includes('punycode');
}

process.removeAllListeners('warning');
process.on('warning', (warning) => {
  if (!isPunycodeDeprecation(warning)) {
    console.warn(warning);
  }
});

// The single Express application instance for this server.
const app = express();

// Security headers; the resource policy is relaxed to cross-origin.
const helmetOptions = {
  crossOriginResourcePolicy: { policy: 'cross-origin' },
};
app.use(helmet(helmetOptions));

// CORS: every origin is allowed here; use specific domains in production.
const corsOptions = {
  origin: '*',
  methods: ['GET', 'POST', 'PUT', 'DELETE', 'OPTIONS'],
  allowedHeaders: ['Content-Type', 'Authorization'],
};
app.use(cors(corsOptions));

// Body parsers: JSON first, then URL-encoded forms.
const jsonParser = express.json();
const formParser = express.urlencoded({ extended: true });
app.use(jsonParser);
app.use(formParser);

// Serve files from <cwd>/screenshots under the /screenshots path.
const screenshotsDir = path.join(process.cwd(), 'screenshots');
app.use('/screenshots', express.static(screenshotsDir));

// 获取本机IP地址
const os = require('os');
/**
 * Find the first non-internal IPv4 address among this machine's network
 * interfaces, scanning them in the order reported by os.networkInterfaces().
 *
 * @returns {string} The address found, or '0.0.0.0' when there is none.
 */
function getLocalIP() {
  const interfaces = os.networkInterfaces();
  const external = Object.keys(interfaces)
    .flatMap((name) => interfaces[name])
    .find((iface) => iface.family === 'IPv4' && !iface.internal);
  return external ? external.address : '0.0.0.0';
}

// Request logging middleware: prints an ISO timestamp, the HTTP method and the
// URL of each request that reaches it, then passes control on.
app.use(function logRequest(req, res, next) {
  const timestamp = new Date().toISOString();
  console.log(`${timestamp} - ${req.method} ${req.url}`);
  next();
});

// Product information API route.
//
// Query parameters:
//  - url (required): product page URL; must be an absolute http(s) URL.
//  - needScreenshot: the string 'true' enables the screenshot.
//  - includeAllSkus: the string 'true' crawls every SKU combination.
//
// Replies 400 for a missing or invalid url and 500 when the crawl itself fails.
app.get('/api/product', async (req, res) => {
  try {
    const { url, needScreenshot = false, includeAllSkus = false } = req.query;
    console.log('收到商品信息请求:', { url, needScreenshot, includeAllSkus });
    if (!url) {
      console.log('请求缺少URL参数');
      return res.status(400).json({
        success: false,
        message: '请提供商品URL',
        example: '/api/product?url=https://www.amazon.co.jp/dp/XXXXX'
      });
    }

    // Repeated or nested query params arrive as arrays/objects, and other
    // schemes (file:, chrome:, ...) must never reach the headless browser.
    let parsedUrl = null;
    if (typeof url === 'string') {
      try {
        parsedUrl = new URL(url);
      } catch {
        parsedUrl = null; // not an absolute URL
      }
    }
    if (!parsedUrl || !['http:', 'https:'].includes(parsedUrl.protocol)) {
      console.log('请求的URL参数无效');
      return res.status(400).json({
        success: false,
        message: '商品URL必须是有效的 http(s) 地址',
        example: '/api/product?url=https://www.amazon.co.jp/dp/XXXXX'
      });
    }

    console.log('开始爬取商品信息:', url);
    const data = await crawlerService.crawlProductInfo(url, needScreenshot === 'true', includeAllSkus === 'true');
    console.log('爬取完成:', data);
    res.json({
      success: true,
      data
    });
  } catch (error) {
    console.error('获取商品信息失败:', error);
    res.status(500).json({
      success: false,
      message: '获取商品信息失败',
      error: error.message
    });
  }
});

// Health-check endpoint: replies with a fixed status and the current time.
app.get('/health', (req, res) => {
  const timestamp = new Date().toISOString();
  res.json({ status: 'ok', timestamp });
});

// 404 handler: reached when no route registered above matched the request.
app.use((req, res) => {
  const body = {
    success: false,
    message: '接口不存在',
    path: req.path,
  };
  res.status(404).json(body);
});

// Error-handling middleware: logs the error and replies with a generic 500.
// The error message is exposed only when NODE_ENV is 'development'.
app.use((err, req, res, next) => {
  console.error('服务器错误:', err);

  // Once headers are on the wire a JSON body can no longer be sent; delegate
  // to Express's default handler, which aborts the response.
  if (res.headersSent) {
    return next(err);
  }

  res.status(500).json({
    success: false,
    message: '服务器内部错误',
    error: process.env.NODE_ENV === 'development' ? err.message : undefined
  });
});

// Port comes from the PORT environment variable, falling back to 8991.
const PORT = process.env.PORT || 8991;
const HOST = '0.0.0.0'; // listen on every network interface

// Start the server and print how to reach it.
app.listen(PORT, HOST, () => {
  const localIP = getLocalIP();
  console.log('服务器配置:');
  console.log(`- 监听地址: ${HOST}:${PORT}`);
  console.log(`- 本地访问: http://localhost:${PORT}`);
  console.log(`- 局域网访问: http://${localIP}:${PORT}`);
  console.log(`- 健康检查: http://${localIP}:${PORT}/health`);
  console.log('\nAPI 使用示例:');
  // Use the effective port so the example stays correct when PORT is overridden.
  console.log(`http://localhost:${PORT}/api/product?url=https://www.amazon.co.jp/dp/XXXXX`);
  console.log('可选参数:');
  console.log('- needScreenshot=true');
  console.log('- includeAllSkus=true');
});

+ 196
- 0
src/services/crawlerService.js View File

@@ -0,0 +1,196 @@
const { chromium } = require('playwright');
const path = require('path');
const fs = require('fs').promises;
const FormData = require('form-data');
const fetch = require('node-fetch');

/**
 * Amazon product-information crawler service (Playwright).
 *
 * Launches a headless Chromium per request, scrapes title / price / ASIN for
 * the current SKU or for every SKU combination, and can optionally take a
 * full-page screenshot that is uploaded to UPLOAD_URL.
 */
class CrawlerService {
  constructor() {
    // Endpoint that screenshots are uploaded to.
    this.UPLOAD_URL = 'https://apibase.sohomall.jp/uploaders?scene=goods';
    // Pages whose console output is already being forwarded, so the listener
    // is attached once per page rather than once per getSingleSkuInfo call.
    this.consoleHookedPages = new WeakSet();
  }

  /**
   * Launch a headless Chromium instance.
   * @returns {Promise<import('playwright').Browser>} The launched browser.
   */
  async initBrowser() {
    return await chromium.launch({ headless: true });
  }

  /**
   * Ensure the <cwd>/screenshots directory exists.
   * @returns {Promise<string>} Absolute path of the directory.
   */
  async createScreenshotDir() {
    const dir = path.join(process.cwd(), 'screenshots');
    await fs.mkdir(dir, { recursive: true });
    return dir;
  }

  /**
   * Reject anything that is not an absolute http(s) URL string, so that
   * file:, chrome: and similar schemes never reach the browser.
   * @param {unknown} url - Candidate URL.
   * @throws {Error} With `statusCode` 400 when the value is not acceptable.
   */
  assertHttpUrl(url) {
    let parsed = null;
    if (typeof url === 'string') {
      try {
        parsed = new URL(url);
      } catch {
        parsed = null; // not an absolute URL
      }
    }
    if (!parsed || (parsed.protocol !== 'http:' && parsed.protocol !== 'https:')) {
      const error = new Error('商品URL必须是有效的 http(s) 地址');
      error.statusCode = 400;
      throw error;
    }
  }

  /**
   * Scrape the SKU currently shown on the page, subtracting any coupon value
   * found from the price.
   *
   * @param {import('playwright').Page} page - Page already showing the product.
   * @returns {Promise<{title: ?string, price: ?string, sku: ?string, url: string, remark: string}>}
   *   `price` is the coupon-adjusted amount formatted with toLocaleString(),
   *   the stripped price text when no number could be parsed, or null when
   *   no price element was found.
   */
  async getSingleSkuInfo(page) {
    // Forward browser console output to the Node console, once per page.
    if (!this.consoleHookedPages.has(page)) {
      this.consoleHookedPages.add(page);
      page.on('console', msg => console.log('Browser Console:', msg.text()));
    }
    await page.waitForTimeout(500);

    let couponValue = 0;
    const couponTrigger = await page.$('.a-declarative[data-action="a-modal"], .couponLabelText');
    if (couponTrigger) {
      try {
        await couponTrigger.click();
        await page.waitForTimeout(500);
      } catch {
        // Best effort: the trigger may not be clickable; the label is still read below.
      }
      try {
        const couponText = await page.$eval('.couponLabelText', el => el.textContent.trim());
        // Accept both the half-width (U+00A5) and full-width (U+FFE5) yen sign.
        const m = couponText.match(/[¥￥]\s*([\d,]+)/);
        couponValue = m ? Number.parseInt(m[1].replace(/,/g, ''), 10) : 0;
        console.log('Found coupon value:', couponValue);
      } catch {
        // No coupon label on the page: keep couponValue at 0.
      }
      // Try to close the modal; fall back to Escape when there is no close button.
      try { await page.click('button.a-modal-close', { timeout: 1000 }); } catch { await page.keyboard.press('Escape'); }
    }

    // This callback runs inside the browser page and can only use its argument.
    return await page.evaluate(discount => {
      const textOf = selector => document.querySelector(selector)?.textContent.trim() || null;
      const title = textOf('#productTitle');
      let priceText = textOf('span.a-price > span.a-offscreen')
        || textOf('#priceblock_dealprice')
        || textOf('#priceblock_saleprice')
        || textOf('#priceblock_ourprice');
      if (priceText?.includes('ポイント')) priceText = priceText.split('ポイント')[0].trim();
      // Strip the currency marker. 'JP¥' has to be matched together with the
      // yen sign, otherwise a stray 'JP' is left behind. Skipped when no price
      // element was found (priceText is null).
      if (priceText != null) priceText = priceText.replace(/(?:JP)?[¥￥]/, '');
      const digits = priceText?.match(/\d[\d,]*/);
      let priceVal = digits ? Number.parseInt(digits[0].replace(/,/g, ''), 10) : null;
      if (priceVal != null) priceVal -= discount;
      console.log('priceText', priceText);
      console.log('priceVal', priceVal);
      console.log('couponValue', discount);
      const price = priceVal != null ? `${priceVal.toLocaleString()}` : priceText;
      const url = window.location.href;
      const asin = url.match(/\/dp\/([A-Z0-9]{10})/)?.[1] || null;
      return { title, price, sku: asin, url, remark: `Original Price: JP¥${priceText} Coupon Price: JP¥${discount}` };
    }, couponValue);
  }

  /**
   * Scrape every SKU combination (Cartesian product of the variant groups).
   *
   * Falls back to the single currently displayed SKU when the page has no
   * variant selector.
   *
   * @param {import('playwright').Page} page - Page already showing the product.
   * @returns {Promise<Array<object>>} One entry per combination, each with a `variants` array.
   */
  async getAllSkuInfo(page) {
    const groupSelector = '.a-cardui-body #twister-plus-inline-twister > .a-section';
    // Wait for the SKU groups. A timeout means the product has no variant
    // selector, which is the single-SKU case rather than a failure.
    try {
      await page.waitForSelector(groupSelector);
    } catch (error) {
      if (error?.name !== 'TimeoutError') throw error;
      return [await this.getSingleSkuInfo(page)];
    }

    const groupEls = await page.$$(groupSelector);
    const groups = [];
    for (const groupEl of groupEls) {
      const btns = await groupEl.$$('.a-button-inner .a-button-input');
      if (btns.length) groups.push(btns);
    }
    if (!groups.length) return [await this.getSingleSkuInfo(page)];

    // Build the Cartesian product of the button groups.
    const cartesian = (arr1, arr2) => arr1.flatMap(a => arr2.map(b => [...a, b]));
    let combos = groups[0].map(b => [b]);
    for (let i = 1; i < groups.length; i++) combos = cartesian(combos, groups[i]);

    const results = [];
    for (const combo of combos) {
      // Click the button of each dimension in turn.
      // NOTE(review): the handles were collected before any click; if the page
      // re-renders the selector after a click they may go stale — confirm.
      for (const btn of combo) {
        await btn.click();
        await page.waitForLoadState('networkidle');
      }
      const info = await this.getSingleSkuInfo(page);
      // getAttribute() returns a Promise, so it must be awaited before the
      // fallback to `title` can take effect.
      info.variants = await Promise.all(
        combo.map(async btn => (await btn.getAttribute('aria-label')) || (await btn.getAttribute('title')))
      );
      results.push(info);
    }
    return results;
  }

  /**
   * Read an image file and encode it as a PNG data URL.
   * @param {string} imagePath - Path of the image file.
   * @returns {Promise<?string>} The data URL, or null when the file cannot be read.
   */
  async convertImageToBase64(imagePath) {
    try {
      const imageBuffer = await fs.readFile(imagePath);
      return `data:image/png;base64,${imageBuffer.toString('base64')}`;
    } catch (error) {
      console.error('转换图片到base64失败:', error);
      return null;
    }
  }

  /**
   * Upload an image file to UPLOAD_URL as multipart form data.
   * @param {string} imagePath - Path of the image file.
   * @returns {Promise<?string>} The `url` field of the JSON response, or null
   *   when the upload fails or the response carries no `url`.
   */
  async uploadImage(imagePath) {
    try {
      const form = new FormData();
      form.append('file', await fs.readFile(imagePath), {
        filename: path.basename(imagePath),
        contentType: 'image/png'
      });

      const response = await fetch(this.UPLOAD_URL, {
        method: 'POST',
        body: form
      });

      if (!response.ok) {
        throw new Error(`上传失败: ${response.statusText}`);
      }

      const result = await response.json();
      // Assumes the server response carries a `url` field — TODO confirm.
      return result.url ?? null;
    } catch (error) {
      console.error('上传图片失败:', error);
      return null;
    }
  }

  /**
   * Main entry point: crawl product information for a URL.
   *
   * @param {string} url - Product page URL (http or https); the query string is dropped before navigating.
   * @param {boolean} [needScreenshot=false] - Take a full-page screenshot and upload it.
   * @param {boolean} [includeAllSkus=false] - Crawl every SKU combination instead of the displayed one.
   * @returns {Promise<Array<object>>} One entry per SKU; each gets `screenshotUrl` when a screenshot was requested.
   * @throws {Error} With `statusCode` 400 for a non-http(s) URL; navigation and scraping errors propagate.
   */
  async crawlProductInfo(url, needScreenshot = false, includeAllSkus = false) {
    this.assertHttpUrl(url);

    const browser = await this.initBrowser();
    // try/finally guarantees the browser is closed even when navigation or
    // scraping throws; otherwise every failed request leaks a Chromium process.
    try {
      const context = await browser.newContext({ locale: 'ja-JP', userAgent: 'Mozilla/5.0' });
      const page = await context.newPage();
      await page.goto(url.split('?')[0], { waitUntil: 'networkidle' });

      const data = includeAllSkus
        ? await this.getAllSkuInfo(page)
        : [await this.getSingleSkuInfo(page)];

      if (needScreenshot) {
        const dir = await this.createScreenshotDir();
        // The random suffix keeps concurrent requests in the same millisecond
        // from writing to the same temporary file.
        const filename = `${Date.now()}-${Math.random().toString(36).slice(2, 10)}.png`;
        const shot = path.join(dir, filename);
        await page.screenshot({ path: shot, fullPage: true });
        // Upload the image and attach the resulting URL to every SKU entry.
        const imageUrl = await this.uploadImage(shot);
        data.forEach(item => {
          item.screenshotUrl = imageUrl;
        });

        // Remove the temporary file.
        try {
          await fs.unlink(shot);
        } catch (error) {
          console.error('删除临时截图文件失败:', error);
        }
      }

      return data;
    } finally {
      await browser.close();
    }
  }
}

module.exports = new CrawlerService();

Loading…
Cancel
Save