Nelze vybrat více než 25 témat Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.

AmazonCrawler.js 13KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441
  1. const BaseCrawler = require('../base/BaseCrawler');
  2. const CrawlerError = require('../../errors/CrawlerError');
  3. const path = require('path');
  4. const fs = require('fs');
  5. const fsPromises = require('fs').promises;
  6. const axios = require('axios');
  7. const FormData = require('form-data');
  8. /**
  9. * 亚马逊爬虫实现类
  10. */
  11. class AmazonCrawler extends BaseCrawler {
  12. constructor(config) {
  13. super(config);
  14. this.selectors = {
  15. title: '#productTitle',
  16. price: 'span.a-price > span.a-offscreen',
  17. coupon: '.a-declarative[data-action="a-modal"], .couponLabelText',
  18. variants: '.a-cardui-body #twister-plus-inline-twister > .a-section'
  19. };
  20. this.timeouts = config.timeouts || {
  21. pageLoad: 60000, // 页面加载超时时间
  22. elementWait: 10000, // 元素等待超时时间
  23. networkIdle: 5000 // 网络空闲超时时间
  24. };
  25. this.retryConfig = config.retry || {
  26. maxAttempts: 3, // 最大重试次数
  27. delay: 2000 // 重试延迟时间
  28. };
  29. this.uploadConfig = config.common?.upload || {
  30. url: 'https://apibase.sohomall.jp/uploaders',
  31. scene: 'goods',
  32. timeout: 600000
  33. };
  34. }
  35. /**
  36. * 创建截图目录
  37. * @returns {Promise<string>} 截图目录路径
  38. */
  39. async createScreenshotDir() {
  40. const dir = path.join(process.cwd(), 'screenshots');
  41. try {
  42. await fsPromises.mkdir(dir, { recursive: true });
  43. } catch (error) {
  44. if (error.code !== 'EEXIST') {
  45. throw new CrawlerError('创建截图目录失败', 'SCREENSHOT_DIR_ERROR', 'amazon', error);
  46. }
  47. }
  48. return dir;
  49. }
  50. /**
  51. * 上传图片到服务器
  52. * @param {string} imagePath - 图片路径
  53. * @returns {Promise<string>} 图片URL
  54. */
  55. async uploadImage(imagePath) {
  56. try {
  57. const formData = new FormData();
  58. formData.append('file', fs.createReadStream(imagePath));
  59. formData.append('scene', this.uploadConfig.scene);
  60. const response = await axios.post(this.uploadConfig.url, formData, {
  61. headers: {
  62. ...formData.getHeaders(),
  63. 'Content-Type': 'multipart/form-data'
  64. },
  65. timeout: this.uploadConfig.timeout
  66. });
  67. if (!response.data || !response.data.url) {
  68. throw new Error('上传响应格式错误');
  69. }
  70. return response.data.url;
  71. } catch (error) {
  72. if (error.response) {
  73. throw new CrawlerError(
  74. `图片上传失败: ${error.response.status} ${error.response.statusText}`,
  75. 'IMAGE_UPLOAD_ERROR',
  76. 'amazon',
  77. error
  78. );
  79. }
  80. throw new CrawlerError('图片上传失败', 'IMAGE_UPLOAD_ERROR', 'amazon', error);
  81. }
  82. }
  83. /**
  84. * 带重试的页面导航
  85. * @param {string} url - 目标URL
  86. * @returns {Promise<void>}
  87. */
  88. async navigateWithRetry(url) {
  89. let lastError;
  90. for (let attempt = 1; attempt <= this.retryConfig.maxAttempts; attempt++) {
  91. try {
  92. await this.page.goto(url, {
  93. waitUntil: 'networkidle',
  94. timeout: this.timeouts.pageLoad
  95. });
  96. return;
  97. } catch (error) {
  98. lastError = error;
  99. console.log(`导航尝试 ${attempt}/${this.retryConfig.maxAttempts} 失败:`, error.message);
  100. if (attempt < this.retryConfig.maxAttempts) {
  101. console.log(`等待 ${this.retryConfig.delay}ms 后重试...`);
  102. await new Promise(resolve => setTimeout(resolve, this.retryConfig.delay));
  103. // 重新初始化浏览器
  104. await this.closeBrowser();
  105. await this.initBrowser();
  106. }
  107. }
  108. }
  109. throw new CrawlerError(
  110. `页面导航失败,已重试 ${this.retryConfig.maxAttempts} 次`,
  111. 'NAVIGATION_ERROR',
  112. 'amazon',
  113. lastError
  114. );
  115. }
  116. /**
  117. * 等待元素出现
  118. * @param {string} selector - 选择器
  119. * @returns {Promise<void>}
  120. */
  121. async waitForElement(selector) {
  122. try {
  123. await this.page.waitForSelector(selector, {
  124. timeout: this.timeouts.elementWait
  125. });
  126. } catch (error) {
  127. throw new CrawlerError(
  128. `等待元素超时: ${selector}`,
  129. 'ELEMENT_WAIT_ERROR',
  130. 'amazon',
  131. error
  132. );
  133. }
  134. }
  135. /**
  136. * 解析价格
  137. * @param {string} priceText - 价格文本
  138. * @returns {Promise<number>} 解析后的价格
  139. */
  140. async parsePrice(priceText) {
  141. try {
  142. if (!priceText) return null;
  143. // 移除货币符号和空格
  144. priceText = priceText.replace(/[¥JP¥\s]/g, '');
  145. // 如果包含积分信息,只取价格部分
  146. if (priceText.includes('ポイント')) {
  147. priceText = priceText.split('ポイント')[0].trim();
  148. }
  149. // 提取数字部分
  150. const match = priceText.match(/([\d,]+)/);
  151. if (!match) return null;
  152. // 转换价格
  153. return parseInt(match[1].replace(/,/g, ''));
  154. } catch (error) {
  155. throw new CrawlerError('价格解析失败', 'PRICE_PARSE_ERROR', 'amazon', error);
  156. }
  157. }
  158. /**
  159. * 处理优惠券
  160. * @returns {Promise<number>} 优惠券金额
  161. */
  162. async handleCoupon() {
  163. try {
  164. let couponValue = 0;
  165. // 等待优惠券元素出现
  166. const couponTrigger = await this.page.$(this.selectors.coupon);
  167. if (!couponTrigger) {
  168. return 0; // 没有优惠券,直接返回0
  169. }
  170. try {
  171. // 点击优惠券按钮
  172. await couponTrigger.click();
  173. await this.page.waitForTimeout(1000); // 增加等待时间,确保弹窗完全显示
  174. // 等待优惠券文本出现
  175. const couponText = await this.page.$eval('.couponLabelText', el => el.textContent.trim());
  176. // 解析优惠券金额
  177. const match = couponText.match(/¥\s*([\d,]+)/);
  178. if (match) {
  179. couponValue = parseInt(match[1].replace(/,/g, ''));
  180. }
  181. // 尝试关闭弹窗
  182. try {
  183. await this.page.click('button.a-modal-close', { timeout: 2000 });
  184. } catch (closeError) {
  185. // 如果找不到关闭按钮,尝试按ESC键
  186. await this.page.keyboard.press('Escape');
  187. }
  188. // 等待弹窗消失
  189. await this.page.waitForTimeout(500);
  190. } catch (clickError) {
  191. console.log('优惠券点击或处理失败:', clickError.message);
  192. // 如果点击失败,尝试按ESC键关闭可能的弹窗
  193. try {
  194. await this.page.keyboard.press('Escape');
  195. } catch (escError) {
  196. console.log('ESC键关闭失败:', escError.message);
  197. }
  198. }
  199. return couponValue;
  200. } catch (error) {
  201. console.log('优惠券处理失败:', error.message);
  202. return 0; // 发生错误时返回0,而不是抛出异常
  203. }
  204. }
  205. /**
  206. * 获取商品标题
  207. * @returns {Promise<string>} 商品标题
  208. */
  209. async getTitle() {
  210. try {
  211. return await this.page.$eval(this.selectors.title, el => el.textContent.trim());
  212. } catch (error) {
  213. throw new CrawlerError('获取标题失败', 'TITLE_GET_ERROR', 'amazon', error);
  214. }
  215. }
  216. /**
  217. * 获取商品SKU
  218. * @returns {Promise<string>} 商品SKU
  219. */
  220. async getSku() {
  221. try {
  222. const url = this.page.url();
  223. return url.match(/\/dp\/([A-Z0-9]{10})/)?.[1] || null;
  224. } catch (error) {
  225. throw new CrawlerError('获取SKU失败', 'SKU_GET_ERROR', 'amazon', error);
  226. }
  227. }
  228. /**
  229. * 获取商品变体信息
  230. * @returns {Promise<Array>} 变体信息数组
  231. */
  232. async getVariants() {
  233. try {
  234. await this.page.waitForSelector(this.selectors.variants);
  235. const groupEls = await this.page.$$(this.selectors.variants);
  236. const groups = [];
  237. for (const groupEl of groupEls) {
  238. const btns = await groupEl.$$('.a-button-inner .a-button-input');
  239. if (btns.length) groups.push(btns);
  240. }
  241. return groups;
  242. } catch (error) {
  243. throw new CrawlerError('获取变体信息失败', 'VARIANTS_GET_ERROR', 'amazon', error);
  244. }
  245. }
  246. /**
  247. * 获取单个SKU信息
  248. * @returns {Promise<Object>} SKU信息
  249. */
  250. async getSingleSkuInfo() {
  251. try {
  252. // 等待页面加载完成
  253. await this.page.waitForLoadState('networkidle');
  254. // 等待标题元素出现
  255. await this.waitForElement(this.selectors.title);
  256. // 处理优惠券
  257. const couponValue = await this.handleCoupon();
  258. // 获取商品信息
  259. const info = await this.page.evaluate(({ selectors, couponValue }) => {
  260. const title = document.querySelector(selectors.title)?.textContent.trim() || null;
  261. let priceText = document.querySelector(selectors.price)?.textContent.trim() || null;
  262. // 处理价格文本
  263. if (priceText?.includes('ポイント')) {
  264. priceText = priceText.split('ポイント')[0].trim();
  265. }
  266. // 解析价格
  267. const price = priceText ? parseInt(priceText.replace(/[¥JP¥\s,]/g, '')) - couponValue : null;
  268. const url = window.location.href;
  269. const asin = url.match(/\/dp\/([A-Z0-9]{10})/)?.[1] || null;
  270. return {
  271. title,
  272. price: price ? price.toLocaleString() : null,
  273. sku: asin,
  274. url,
  275. remark: couponValue > 0 ? `Original Price: JP¥${priceText} Coupon Price: JP¥${couponValue}` : null
  276. };
  277. }, { selectors: this.selectors, couponValue });
  278. // 验证必要信息
  279. if (!info.title || !info.price || !info.sku) {
  280. throw new Error('商品信息不完整');
  281. }
  282. return info;
  283. } catch (error) {
  284. throw new CrawlerError('获取SKU信息失败', 'SKU_INFO_GET_ERROR', 'amazon', error);
  285. }
  286. }
  287. /**
  288. * 获取所有SKU组合信息
  289. * @returns {Promise<Array>} SKU信息数组
  290. */
  291. async getAllSkuInfo() {
  292. try {
  293. const groups = await this.getVariants();
  294. if (!groups.length) return [await this.getSingleSkuInfo()];
  295. // 生成笛卡尔积组合
  296. const cartesian = (arr1, arr2) => arr1.flatMap(a => arr2.map(b => [...a, b]));
  297. let combos = groups[0].map(b => [b]);
  298. for (let i = 1; i < groups.length; i++) {
  299. combos = cartesian(combos, groups[i]);
  300. }
  301. const results = [];
  302. for (const combo of combos) {
  303. // 依次点击每个维度按钮
  304. for (const btn of combo) {
  305. await btn.click();
  306. await this.page.waitForLoadState('networkidle');
  307. }
  308. // 获取当前组合信息
  309. const info = await this.getSingleSkuInfo();
  310. info.variants = await Promise.all(
  311. combo.map(btn => btn.getAttribute('aria-label') || btn.getAttribute('title'))
  312. );
  313. results.push(info);
  314. }
  315. return results;
  316. } catch (error) {
  317. throw new CrawlerError('获取所有SKU信息失败', 'ALL_SKU_INFO_GET_ERROR', 'amazon', error);
  318. }
  319. }
  320. /**
  321. * 主方法:抓取商品信息
  322. * @param {string} url - 商品URL
  323. * @param {boolean} needScreenshot - 是否需要截图
  324. * @param {boolean} includeAllSkus - 是否包含所有SKU
  325. * @returns {Promise<Array>} 商品信息数组
  326. */
  327. async crawl(url, needScreenshot = false, includeAllSkus = false) {
  328. try {
  329. await this.initBrowser();
  330. // 设置页面超时
  331. this.page.setDefaultTimeout(this.timeouts.elementWait);
  332. this.page.setDefaultNavigationTimeout(this.timeouts.pageLoad);
  333. // 设置请求拦截
  334. await this.page.route('**/*', route => {
  335. const resourceType = route.request().resourceType();
  336. // 只阻止字体和媒体资源,允许加载图片
  337. if (['font', 'media'].includes(resourceType)) {
  338. route.abort();
  339. } else {
  340. route.continue();
  341. }
  342. });
  343. // 导航到目标页面
  344. await this.navigateWithRetry(url.split('?')[0]);
  345. const data = includeAllSkus
  346. ? await this.getAllSkuInfo()
  347. : [await this.getSingleSkuInfo()];
  348. if (needScreenshot) {
  349. try {
  350. const dir = await this.createScreenshotDir();
  351. const filename = `${Date.now()}.png`;
  352. const shot = path.join(dir, filename);
  353. // 等待页面完全加载
  354. await this.page.waitForLoadState('networkidle');
  355. // 截取全页面
  356. await this.page.screenshot({
  357. path: shot,
  358. fullPage: true,
  359. timeout: this.timeouts.elementWait
  360. });
  361. // 上传图片并获取URL
  362. const imageUrl = await this.uploadImage(shot);
  363. // 更新数据,添加图片URL
  364. data.forEach(item => {
  365. item.screenshotUrl = imageUrl;
  366. });
  367. // 删除临时文件
  368. try {
  369. await fsPromises.unlink(shot);
  370. } catch (error) {
  371. console.error('删除临时截图文件失败:', error);
  372. }
  373. } catch (error) {
  374. console.error('截图处理失败:', error);
  375. // 截图失败不影响主流程
  376. }
  377. }
  378. return data;
  379. } catch (error) {
  380. throw new CrawlerError('商品信息抓取失败', 'CRAWL_ERROR', 'amazon', error);
  381. } finally {
  382. await this.closeBrowser();
  383. }
  384. }
  385. }
  386. module.exports = AmazonCrawler;