- 将爬虫配置中的截图选项添加到配置文件中,包含视口尺寸、图片质量和格式,以增强截图功能的灵活性
- 在亚马逊爬虫中引入截图配置,确保在截图时使用配置中的选项,提高代码的可维护性和一致性
- 修改商品信息示例中的请求超时时间,从 30 秒增加到 60 秒(注:代码中的实际取值由 10000 * 30 改为 10000 * 60),提升请求的稳定性
- 该更改旨在提高爬虫的功能性和代码的可读性,符合 SOLID 原则和命名规范

master
@@ -11,10 +11,10 @@ const localClient = new ProductApiClient({ | |||
// Settings for the server-side API client: base URL, request timeout and default paging params.
// NOTE(review): this listing is a rendered diff, so removed and added lines both appear below
// (`timeout` and `pageSize` are each listed twice). If evaluated as-is, the later duplicate key wins.
const serverClient = { | |||
// baseURL: "http://192.168.1.107:8080", // local | |||
baseURL: "https://digital.sohomall.jp/prod-api", // public network | |||
timeout: 10000 * 30, // = 300000; originally labeled "30 seconds" - NOTE(review): if the unit is milliseconds this is 300 s, confirm | |||
timeout: 10000 * 60, // = 600000; originally labeled "60 seconds" - NOTE(review): if the unit is milliseconds this is 600 s (10 min); 1000 * 60 would be 60 s, confirm intent | |||
params: { | |||
pageNum: 1, | |||
pageSize: 500, | |||
pageSize: 1000, | |||
}, | |||
}; | |||
@@ -16,6 +16,20 @@ module.exports = { | |||
}, | |||
page: { | |||
locale: 'ja-JP', | |||
}, | |||
// 截图配置 | |||
screenshot: { | |||
// 视口尺寸 | |||
viewport: { | |||
width: 1920, | |||
height: 1080 | |||
}, | |||
// 截图选项 | |||
options: { | |||
fullPage: false, | |||
quality: 60, // 图片质量 0-100 | |||
type: 'png' // 图片格式: 'png' | 'jpeg' | |||
} | |||
} | |||
}, | |||
@@ -33,6 +33,16 @@ class AmazonCrawler extends BaseCrawler { | |||
scene: "digital-yy", | |||
timeout: 600000, | |||
}; | |||
// 截图配置 | |||
this.screenshotConfig = { | |||
// 截图选项 | |||
options: { | |||
fullPage: false, | |||
quality: 60, // 图片质量 0-100 | |||
type: "jpeg", | |||
}, | |||
}; | |||
} | |||
/** | |||
@@ -56,6 +66,24 @@ class AmazonCrawler extends BaseCrawler { | |||
return dir; | |||
} | |||
/** | |||
* 设置页面视口尺寸 | |||
* @param {number} width - 宽度 | |||
* @param {number} height - 高度 | |||
* @returns {Promise<void>} | |||
*/ | |||
async setViewportSize(width, height) { | |||
try { | |||
await this.page.setViewportSize({ width, height }); | |||
} catch (error) { | |||
throw new CrawlerError( | |||
"设置视口尺寸失败", | |||
"VIEWPORT_SIZE_ERROR", | |||
"amazon", | |||
error | |||
); | |||
} | |||
} | |||
/** | |||
* 上传图片到服务器 | |||
* @param {string} imagePath - 图片路径 | |||
@@ -79,6 +107,8 @@ class AmazonCrawler extends BaseCrawler { | |||
throw new Error("上传响应格式错误"); | |||
} | |||
console.log(response.data.url); | |||
return response.data.url; | |||
} catch (error) { | |||
if (error.response) { | |||
@@ -440,10 +470,18 @@ class AmazonCrawler extends BaseCrawler { | |||
// 等待页面完全加载 | |||
await this.page.waitForLoadState("networkidle"); | |||
// 获取截图配置 | |||
const screenshotOptions = this.screenshotConfig.options || { | |||
fullPage: false, | |||
}; | |||
// 等待页面加载完成 | |||
await this.page.waitForTimeout(500); | |||
// 截取全页面 | |||
await this.page.screenshot({ | |||
path: shot, | |||
fullPage: true, | |||
...screenshotOptions, | |||
timeout: this.timeouts.elementWait, | |||
}); | |||
@@ -19,6 +19,10 @@ class BaseCrawler { | |||
* @returns {Promise<void>} | |||
*/ | |||
async initBrowser() { | |||
// 获取截图配置 | |||
const screenshotConfig = this.config?.common?.screenshot || {}; | |||
const viewportConfig = screenshotConfig.viewport || { width: 1920, height: 1080 }; | |||
this.browser = await chromium.launch({ | |||
headless: false, | |||
args: ["--no-sandbox", "--disable-setuid-sandbox"], | |||
@@ -27,6 +31,8 @@ class BaseCrawler { | |||
locale: "ja-JP", | |||
userAgent: | |||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", | |||
// 使用配置中的视口尺寸 | |||
viewport: viewportConfig | |||
}); | |||
this.page = await this.context.newPage(); | |||
} |