You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

AmazonCrawler.js 13KB


  1. const BaseCrawler = require("../base/BaseCrawler");
  2. const CrawlerError = require("../../errors/CrawlerError");
  3. const path = require("path");
  4. const fs = require("fs");
  5. const fsPromises = require("fs").promises;
  6. const axios = require("axios");
  7. const FormData = require("form-data");
  /**
   * Amazon crawler implementation.
   *
   * Uses the page provided by BaseCrawler (`this.page`) to read the title,
   * price, coupon and point data of a single product page, and can optionally
   * take a full-page screenshot and upload it.
   */
  class AmazonCrawler extends BaseCrawler {
    /**
     * @param {Object} config - Crawler configuration, forwarded to BaseCrawler.
     * @param {Object} [config.timeouts] - Overrides for `pageLoad`, `elementWait`, `networkIdle` (ms).
     * @param {Object} [config.retry] - Overrides for `maxAttempts` and `delay` (ms).
     * @param {Object} [config.common] - Shared settings; `common.upload` overrides `url`, `scene`, `timeout`.
     */
    constructor(config) {
      super(config);

      this.selectors = {
        title: "#productTitle",
        price: "span.a-price > span.a-offscreen",
        coupon: '.a-declarative[data-action="a-modal"], .couponLabelText',
        variants: ".a-cardui-body #twister-plus-inline-twister > .a-section",
        point: "#points_feature_div .a-color-price",
      };

      // Each group is merged over its defaults so that a partial override
      // (e.g. only `pageLoad`) does not leave the remaining keys undefined.
      this.timeouts = {
        pageLoad: 60000, // page load timeout
        elementWait: 10000, // element wait timeout
        networkIdle: 5000, // network idle timeout
        ...config.timeouts,
      };
      this.retryConfig = {
        maxAttempts: 1, // maximum number of navigation attempts
        delay: 2000, // pause between attempts
        ...config.retry,
      };
      this.uploadConfig = {
        url: "https://apibase.sohomall.jp/uploaders",
        scene: "digital-yy",
        timeout: 600000,
        ...config.common?.upload,
      };
    }

    /**
     * Create the screenshot directory (`<cwd>/screenshots`) if it is missing.
     * @returns {Promise<string>} Path of the screenshot directory.
     * @throws {CrawlerError} SCREENSHOT_DIR_ERROR when the directory cannot be created.
     */
    async createScreenshotDir() {
      const dir = path.join(process.cwd(), "screenshots");
      try {
        await fsPromises.mkdir(dir, { recursive: true });
      } catch (error) {
        if (error.code !== "EEXIST") {
          throw new CrawlerError(
            "创建截图目录失败",
            "SCREENSHOT_DIR_ERROR",
            "amazon",
            error
          );
        }
      }
      return dir;
    }

    /**
     * Upload an image file to the configured upload endpoint.
     * @param {string} imagePath - Path of the image on disk.
     * @returns {Promise<string>} URL of the uploaded image (`response.data.url`).
     * @throws {CrawlerError} IMAGE_UPLOAD_ERROR on any request or response failure.
     */
    async uploadImage(imagePath) {
      try {
        const formData = new FormData();
        formData.append("file", fs.createReadStream(imagePath));
        formData.append("scene", this.uploadConfig.scene);

        const response = await axios.post(this.uploadConfig.url, formData, {
          // getHeaders() already yields "multipart/form-data; boundary=...".
          // Setting a bare Content-Type next to it would drop the boundary and
          // make the multipart body unparseable for the server.
          headers: formData.getHeaders(),
          timeout: this.uploadConfig.timeout,
        });

        const imageUrl = response.data?.url;
        if (!imageUrl) {
          throw new Error("上传响应格式错误");
        }
        return imageUrl;
      } catch (error) {
        // Include the HTTP status when the server actually answered.
        const message = error.response
          ? `图片上传失败: ${error.response.status} ${error.response.statusText}`
          : "图片上传失败";
        throw new CrawlerError(message, "IMAGE_UPLOAD_ERROR", "amazon", error);
      }
    }

    /**
     * Apply the crawler's timeouts and request filtering to the current page.
     * Must be re-run whenever the page is recreated.
     * @returns {Promise<void>}
     */
    async setupAmazonPage() {
      this.page.setDefaultTimeout(this.timeouts.elementWait);
      this.page.setDefaultNavigationTimeout(this.timeouts.pageLoad);

      // Block only fonts and media; images stay enabled.
      const blockedTypes = ["font", "media"];
      await this.page.route("**/*", (route) =>
        blockedTypes.includes(route.request().resourceType())
          ? route.abort()
          : route.continue()
      );
    }

    /**
     * Navigate to a URL, retrying up to `retryConfig.maxAttempts` times.
     * @param {string} url - Target URL.
     * @returns {Promise<void>}
     * @throws {CrawlerError} NAVIGATION_ERROR when every attempt fails.
     */
    async navigateWithRetry(url) {
      const { maxAttempts, delay } = this.retryConfig;
      let lastError;

      for (let attempt = 1; attempt <= maxAttempts; attempt++) {
        try {
          await this.page.goto(url, {
            waitUntil: "networkidle",
            timeout: this.timeouts.pageLoad,
          });
          return;
        } catch (error) {
          lastError = error;
          console.log(`导航尝试 ${attempt}/${maxAttempts} 失败:`, error.message);

          if (attempt < maxAttempts) {
            console.log(`等待 ${delay}ms 后重试...`);
            await new Promise((resolve) => setTimeout(resolve, delay));

            // Restart the browser before the next attempt.
            await this.closeBrowser();
            await this.initBrowser();
            // NOTE(review): initBrowser() is assumed to replace this.page, so
            // the timeouts and route filter must be applied again — confirm
            // against BaseCrawler.
            await this.setupAmazonPage();
          }
        }
      }

      throw new CrawlerError(
        `页面导航失败,已重试 ${maxAttempts} 次`,
        "NAVIGATION_ERROR",
        "amazon",
        lastError
      );
    }

    /**
     * Wait until an element matching the selector appears.
     * @param {string} selector - CSS selector.
     * @returns {Promise<void>}
     * @throws {CrawlerError} ELEMENT_WAIT_ERROR when the wait fails or times out.
     */
    async waitForElement(selector) {
      try {
        await this.page.waitForSelector(selector, {
          timeout: this.timeouts.elementWait,
        });
      } catch (error) {
        throw new CrawlerError(
          `等待元素超时: ${selector}`,
          "ELEMENT_WAIT_ERROR",
          "amazon",
          error
        );
      }
    }

    /**
     * Parse a displayed price into an integer amount.
     * @param {string} priceText - Price text, e.g. "¥1,980" or "￥1,980".
     * @returns {Promise<number|null>} Parsed amount, or null when the text is
     *   empty or contains no digits.
     * @throws {CrawlerError} PRICE_PARSE_ERROR when the input cannot be processed.
     */
    async parsePrice(priceText) {
      try {
        if (!priceText) return null;

        // Strip currency markers (half- and full-width yen, "JP") and whitespace.
        let cleaned = priceText.replace(/[¥￥JP\s]/g, "");

        // When point information is appended, keep only the price part.
        if (cleaned.includes("ポイント")) {
          cleaned = cleaned.split("ポイント")[0].trim();
        }

        // First run of digits, allowing thousands separators.
        const match = cleaned.match(/\d[\d,]*/);
        if (!match) return null;

        return parseInt(match[0].replace(/,/g, ""), 10);
      } catch (error) {
        throw new CrawlerError(
          "价格解析失败",
          "PRICE_PARSE_ERROR",
          "amazon",
          error
        );
      }
    }

    /**
     * Read the point amount shown on the page.
     * @returns {Promise<number>} Point amount; 0 when there is no point element,
     *   no number in its text, or reading fails (failures are logged, not thrown).
     */
    async handlePoint() {
      try {
        const pointEl = await this.page.$(this.selectors.point);
        if (!pointEl) {
          return 0; // no points on this page
        }

        const pointText = await this.page.$eval(this.selectors.point, (el) =>
          el.textContent.trim()
        );

        // Accept thousands separators and return a real number, so a value
        // such as "1,234" is neither truncated nor left as a string.
        const match = pointText.match(/\d[\d,]*/);
        return match ? parseInt(match[0].replace(/,/g, ""), 10) : 0;
      } catch (error) {
        console.log("积分处理失败:", error.message);
        return 0; // best effort: points are optional
      }
    }

    /**
     * Read the coupon amount by opening the coupon trigger on the page.
     * @returns {Promise<number>} Coupon amount in yen; 0 when there is no coupon
     *   or it cannot be read (failures are logged, not thrown).
     */
    async handleCoupon() {
      try {
        let couponValue = 0;

        const couponTrigger = await this.page.$(this.selectors.coupon);
        if (!couponTrigger) {
          return 0; // no coupon on this page
        }

        try {
          // Click the trigger and give the modal time to render.
          await couponTrigger.click();
          await this.page.waitForTimeout(1000);

          const couponText = await this.page.$eval(".couponLabelText", (el) =>
            el.textContent.trim()
          );

          // Amount after a half- or full-width yen sign.
          const match = couponText.match(/[¥￥]\s*([\d,]+)/);
          if (match) {
            couponValue = parseInt(match[1].replace(/,/g, ""), 10);
          }

          // Close the modal; fall back to Escape when the button is missing.
          try {
            await this.page.click("button.a-modal-close", { timeout: 2000 });
          } catch (closeError) {
            await this.page.keyboard.press("Escape");
          }

          // Let the modal disappear.
          await this.page.waitForTimeout(500);
        } catch (clickError) {
          console.log("没有优惠券", clickError.message);
          // A modal may still be open; try to dismiss it.
          try {
            await this.page.keyboard.press("Escape");
          } catch (escError) {
            console.log("ESC键关闭失败:", escError.message);
          }
        }

        return couponValue;
      } catch (error) {
        console.log("优惠券处理失败:", error.message);
        return 0; // best effort: coupons are optional
      }
    }

    /**
     * Get the product title.
     * @returns {Promise<string>} Trimmed title text.
     * @throws {CrawlerError} TITLE_GET_ERROR when the title cannot be read.
     */
    async getTitle() {
      try {
        return await this.page.$eval(this.selectors.title, (el) =>
          el.textContent.trim()
        );
      } catch (error) {
        throw new CrawlerError(
          "获取标题失败",
          "TITLE_GET_ERROR",
          "amazon",
          error
        );
      }
    }

    /**
     * Get the product SKU (the 10-character id after `/dp/` in the page URL).
     * @returns {Promise<string|null>} The id, or null when the URL has none.
     * @throws {CrawlerError} SKU_GET_ERROR when the URL cannot be read.
     */
    async getSku() {
      try {
        const url = this.page.url();
        return url.match(/\/dp\/([A-Z0-9]{10})/)?.[1] || null;
      } catch (error) {
        throw new CrawlerError("获取SKU失败", "SKU_GET_ERROR", "amazon", error);
      }
    }

    /**
     * Collect the variant buttons on the page, grouped by variant section.
     * @returns {Promise<Array>} One array of button element handles per section
     *   that contains at least one button.
     * @throws {CrawlerError} VARIANTS_GET_ERROR when the sections cannot be read.
     */
    async getVariants() {
      try {
        await this.page.waitForSelector(this.selectors.variants);
        const groupEls = await this.page.$$(this.selectors.variants);

        const groups = [];
        for (const groupEl of groupEls) {
          const btns = await groupEl.$$(".a-button-inner .a-button-input");
          if (btns.length) groups.push(btns);
        }
        return groups;
      } catch (error) {
        throw new CrawlerError(
          "获取变体信息失败",
          "VARIANTS_GET_ERROR",
          "amazon",
          error
        );
      }
    }

    /**
     * Read title, net price, SKU and URL of the product currently displayed.
     * The net price is the listed price minus coupon and point amounts.
     * @returns {Promise<Object>} `{ title, price, sku, url }`; `price` is a
     *   locale-formatted string.
     * @throws {CrawlerError} SKU_INFO_GET_ERROR when reading fails or any of
     *   title, price or sku is missing.
     */
    async getSingleSkuInfo() {
      try {
        // Wait for the page to settle and for the title to exist.
        await this.page.waitForLoadState("networkidle");
        await this.waitForElement(this.selectors.title);

        const couponValue = await this.handleCoupon();
        const pointValue = await this.handlePoint();

        // The callback runs in the page context, so it must be self-contained.
        const info = await this.page.evaluate(
          ({ selectors, couponValue, pointValue }) => {
            const title =
              document.querySelector(selectors.title)?.textContent.trim() || null;
            let priceText =
              document.querySelector(selectors.price)?.textContent.trim() || null;

            // Keep only the price part when point text is appended.
            if (priceText?.includes("ポイント")) {
              priceText = priceText.split("ポイント")[0].trim();
            }

            // Extract the digits instead of stripping known symbols, so a
            // full-width yen sign or other prefix cannot turn the price
            // into NaN.
            const digits = priceText?.replace(/\s/g, "").match(/\d[\d,]*/)?.[0];
            const price = digits
              ? parseInt(digits.replace(/,/g, ""), 10) - couponValue - pointValue
              : null;

            const url = window.location.href;
            const asin = url.match(/\/dp\/([A-Z0-9]{10})/)?.[1] || null;

            return {
              title,
              price: price ? price.toLocaleString() : null,
              sku: asin,
              url,
            };
          },
          { selectors: this.selectors, couponValue, pointValue }
        );

        // All three fields are required.
        if (!info.title || !info.price || !info.sku) {
          throw new Error("商品信息不完整");
        }
        return info;
      } catch (error) {
        throw new CrawlerError(
          "获取SKU信息失败",
          "SKU_INFO_GET_ERROR",
          "amazon",
          error
        );
      }
    }

    /**
     * Main entry point: crawl one product page.
     * @param {string} url - Product URL; the query string is dropped before navigation.
     * @param {boolean} needScreenshot - Whether to take and upload a screenshot.
     * @returns {Promise<Array>} Array with one product info object, carrying
     *   `screenshotUrl` when the screenshot succeeded.
     * @throws {CrawlerError} CRAWL_ERROR when crawling fails.
     */
    async crawl(url, needScreenshot = false) {
      try {
        await this.initBrowser();
        await this.setupAmazonPage();

        await this.navigateWithRetry(url.split("?")[0]);

        // Only the currently selected SKU is read.
        const data = [await this.getSingleSkuInfo()];

        if (needScreenshot) {
          // A screenshot failure must not fail the crawl.
          try {
            const dir = await this.createScreenshotDir();
            const shot = path.join(dir, `${Date.now()}.png`);

            try {
              await this.page.waitForLoadState("networkidle");
              await this.page.screenshot({
                path: shot,
                fullPage: true,
                timeout: this.timeouts.elementWait,
              });

              const imageUrl = await this.uploadImage(shot);
              data.forEach((item) => {
                item.screenshotUrl = imageUrl;
              });
            } finally {
              // Remove the temporary file even when the upload failed.
              try {
                await fsPromises.unlink(shot);
              } catch (error) {
                // ENOENT only means the screenshot was never written.
                if (error.code !== "ENOENT") {
                  console.error("删除临时截图文件失败:", error);
                }
              }
            }
          } catch (error) {
            console.error("截图处理失败:", error);
          }
        }

        return data;
      } catch (error) {
        throw new CrawlerError(
          "商品信息抓取失败",
          "CRAWL_ERROR",
          "amazon",
          error
        );
      } finally {
        await this.closeBrowser();
      }
    }
  }
  // CommonJS export: the module's value is the AmazonCrawler class itself.
  431. module.exports = AmazonCrawler;