-
Notifications
You must be signed in to change notification settings - Fork 2
/
scrape_cpu_coolers.js
130 lines (116 loc) · 5.28 KB
/
scrape_cpu_coolers.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
const { ZenRows } = require("zenrows");
const fs = require("fs");
const { parse } = require("node-html-parser");
// Pool of ZenRows API keys; currently empty.
const apiKeys = [];
// Category listing page that the scraper starts from.
const baseURL = "https://pcpartpicker.com/products/cpu-cooler/";
// CSV file that scraped rows are appended to.
const outputFile = "cpu_coolers_detailed.csv";

// Seed the CSV with its header row, but only when the file does not exist yet,
// so rows collected by an earlier run are never overwritten.
if (!fs.existsSync(outputFile)) {
  const columns = [
    "Name",
    "Image URL",
    "Product URL",
    "Price",
    "Manufacturer",
    "Model",
    "Part #",
    "Fan RPM",
    "Noise Level",
    "Color",
    "Height",
    "CPU Socket",
    "Water Cooled",
    "Fanless",
    "Specs Number",
  ];
  fs.writeFileSync(outputFile, `${columns.join(",")}\n`);
}
/**
 * Downloads a single product page through ZenRows and extracts the cooler
 * specifications listed in its specs section.
 *
 * @param {string} url - Product page URL to fetch.
 * @param {string} apiKey - Key handed to the ZenRows client constructor.
 * @returns {Promise<object>} An object holding the recognised spec values
 *   (empty strings for specs that were not present) plus `specsNum`, the
 *   number of spec groups found. An empty object is returned when the specs
 *   section is missing or when fetching/parsing throws.
 */
async function fetchCPUCoolerDetails(url, apiKey) {
  const zenrows = new ZenRows(apiKey);

  // Maps the heading text of a spec group to the property that stores it.
  const fieldByTitle = new Map([
    ["Manufacturer", "manufacturer"],
    ["Model", "model"],
    ["Part #", "partNumber"],
    ["Fan RPM", "fanRPM"],
    ["Noise Level", "noiseLevel"],
    ["Color", "color"],
    ["Height", "height"],
    ["CPU Socket", "cpuSocket"],
    ["Water Cooled", "waterCooled"],
    ["Fanless", "fanless"],
  ]);

  try {
    const { data: html } = await zenrows.get(url, {
      "js_render": "true",
    });
    const page = parse(html);
    const specsBlock = page.querySelector('#product-page > div.main-wrapper.xs-col-12 > div.wrapper.wrapper__pageContent > section > div > div.main-content.col.xs-col-12.md-col-9.lg-col-9 > div.block.xs-block.md-hide.specs');
    if (!specsBlock) {
      console.error(`Specs section not found for ${url}`);
      return {};
    }

    const groups = specsBlock.querySelectorAll("div.group");
    const result = {
      manufacturer: "",
      model: "",
      partNumber: "",
      fanRPM: "",
      noiseLevel: "",
      color: "",
      height: "",
      cpuSocket: "",
      waterCooled: "",
      fanless: "",
      specsNum: groups.length,
    };

    for (const group of groups) {
      const heading = group.querySelector("h3")?.textContent.trim();
      // A spec is either a single paragraph or a bullet list; list items are
      // only consulted when there is no non-empty paragraph text.
      const paragraphText = group.querySelector("div > p")?.textContent.trim();
      const specValue = paragraphText
        || Array.from(group.querySelectorAll("div > ul > li"), (item) => item.textContent.trim()).join(', ');

      const field = fieldByTitle.get(heading);
      if (field !== undefined) {
        result[field] = specValue;
      }
    }

    return result;
  } catch (error) {
    console.error(`Error fetching details for ${url}: ${error.message}`);
    return {};
  }
}
/**
 * Quotes one value for a CSV record: the value is stringified, embedded
 * double quotes are doubled, and the result is wrapped in double quotes.
 *
 * @param {*} value - Value to serialise.
 * @returns {string} The quoted CSV field.
 */
function escapeCsvField(value) {
  return `"${String(value).replace(/"/g, '""')}"`;
}

/**
 * Joins already-resolved field values into one newline-terminated CSV record.
 *
 * @param {Array<*>} fields - Field values in column order.
 * @returns {string} The CSV record, including the trailing "\n".
 */
function formatCsvRow(fields) {
  return `${fields.map((field) => escapeCsvField(field)).join(",")}\n`;
}

/**
 * Scrapes one page of the CPU cooler listing, fetches the detail page of every
 * listed product, and appends one CSV record per product to `outputFile`.
 *
 * Errors are logged rather than thrown, so the returned promise resolves even
 * when the page could not be scraped. Rows are written in one append after the
 * whole page has been processed.
 *
 * @param {number} pageNumber - Listing page to scrape; appended to `baseURL`
 *   as a `#page=` fragment.
 * @returns {Promise<void>}
 */
async function scrapePage(pageNumber) {
  // FIXME(security): an API key is committed in this file. Prefer supplying it
  // through the ZENROWS_API_KEY environment variable and rotate the literal
  // below; it is kept only so existing invocations keep working unchanged.
  const apiKey = process.env.ZENROWS_API_KEY || "71be0ed3280206ab0354db216ee47c0df7bcafe8";
  const client = new ZenRows(apiKey);
  const url = `${baseURL}#page=${pageNumber}`;
  let csvContent = "";
  try {
    // NOTE(review): presumably "js_render" enables headless rendering and
    // "wait" is a delay in milliseconds — confirm against the ZenRows docs.
    const { data } = await client.get(url, {
      "js_render": "true",
      "wait": "4000"
    });
    const root = parse(data);
    const rows = root.querySelectorAll("#category_content > tr");
    for (const row of rows) {
      // Missing cells fall back to 'N/A' instead of leaking "undefined" into the CSV.
      const name = row.querySelector('td.td__name > a > div.td__nameWrapper > p')?.innerText.trim() || 'N/A';
      const imageUrl = row.querySelector('td.td__name > a > div.td__imageWrapper > div > img')?.getAttribute('src') || 'N/A';
      const href = row.querySelector('td.td__name > a')?.getAttribute('href');
      const productUrl = href ? `https://pcpartpicker.com${href}` : 'N/A';
      // The price cell text is cut at 'Add' (whatever follows it is discarded);
      // an absent or empty price is reported as 'N/A'.
      const price = row.querySelector('td.td__price')?.innerText.trim().split('Add')[0].trim() || 'N/A';
      // Without a product link there is no detail page to request.
      const details = href ? await fetchCPUCoolerDetails(productUrl, apiKey) : {};
      csvContent += formatCsvRow([
        name,
        imageUrl,
        productUrl,
        price,
        details.manufacturer || 'N/A',
        details.model || 'N/A',
        details.partNumber || 'N/A',
        details.fanRPM || 'N/A',
        details.noiseLevel || 'N/A',
        details.color || 'N/A',
        details.height || 'N/A',
        details.cpuSocket || 'N/A',
        details.waterCooled || 'N/A',
        details.fanless || 'N/A',
        // `??` keeps a genuine count of 0; only a missing count becomes 'N/A'.
        details.specsNum ?? 'N/A',
      ]);
    }
    fs.appendFileSync(outputFile, csvContent, 'utf8'); // Ensure proper encoding
    console.log(`Page ${pageNumber} scraped successfully.`);
  } catch (error) {
    console.error(`Failed to scrape page ${pageNumber}: ${error.message}`);
  }
}
// Entry point: scrape the configured range of listing pages one at a time,
// waiting for each page to finish before the next one starts.
(async () => {
  const firstPage = 12;
  const lastPage = 13; // Adjust the last page number as necessary
  let page = firstPage;
  while (page <= lastPage) {
    await scrapePage(page);
    page += 1;
  }
  console.log("Data has been written to CSV file.");
})();