-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawlRestaurantsInCities.js
100 lines (78 loc) · 2.59 KB
/
crawlRestaurantsInCities.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
// puppeteer-extra exposes a `use()` hook so plugins can be registered on the
// puppeteer instance before any browser is launched.
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
// Register the stealth plugin with its default configuration.
puppeteer.use(StealthPlugin());
const AdblockerPlugin = require("puppeteer-extra-plugin-adblocker");
// Configure the adblocker plugin with tracker blocking switched on.
const adblocker = AdblockerPlugin({
blockTrackers: true,
});
puppeteer.use(adblocker);
// Cluster runs the crawl tasks queued by the main routine further down.
const { Cluster } = require("puppeteer-cluster");
// fs is used to read the list of listing URLs from disk.
const fs = require("fs");
// Project-local module; the main routine destructures `insertManyRestaurants`
// from the awaited result of `initDatabase()`.
const { initDatabase } = require("./database/mongoConnector");
/**
 * Last-resort reporter for promise rejections that nothing else handled.
 * Prints a banner line followed by a dump of the rejection reason.
 *
 * @param {*} reason - The value the promise was rejected with.
 * @param {Promise} _promise - The rejected promise (not used).
 */
function reportUnhandledRejection(reason, _promise) {
  console.log("=== UNHANDLED REJECTION ===");
  console.dir(reason);
}

process.on("unhandledRejection", reportUnhandledRejection);
// Origin that is prepended to the relative restaurant paths before storing them.
const baseURL = "https://www.tripadvisor.de";

// Selectors
const nextButtonSelector = ".nav.next";
const resultSelector = "[data-test-attribute=typeahead-results] > a";
const restaurantLinkSelector = 'a[href^="/Restaurant_"]';

// Results
let errors = [];

// Listing URLs to crawl, one per line of the input file.
// Lines are split on both LF and CRLF, trimmed, and blank lines are dropped.
// This replaces an unconditional `pop()` of the last element, which lost a
// real URL whenever the file did not end with a trailing newline.
const restaurantUrls = fs
  .readFileSync("data/restaurantsListUrls.txt", "utf8")
  .split(/\r?\n/)
  .map((line) => line.trim())
  .filter((line) => line !== "");
/**
 * Collects the restaurant detail paths linked from the page currently loaded.
 *
 * Every anchor matching `restaurantLinkSelector` is inspected and the part of
 * its `href` from "/Restaurant_" up to and including ".html" is returned.
 * Anchors whose `href` contains no such part are skipped instead of throwing,
 * so one unexpected link cannot abort the crawl of the whole page.
 *
 * @param {{ $$eval: Function }} page - Puppeteer page (or compatible object).
 * @returns {Promise<string[]>} Matched paths, in document order.
 */
const getRestaurantUrls = (page) =>
  // The callback is executed inside the browser page, so it must be
  // self-contained: the regex literal has to live inside it.
  page.$$eval(restaurantLinkSelector, (nodes) =>
    nodes
      .map((node) => node.href.match(/\/Restaurant_.*\.html/))
      .filter((match) => match !== null)
      .map((match) => match[0])
  );
/**
 * Extracts the geo identifier ("g" followed by digits) from a listing URL.
 *
 * The identifier is taken from the first "-g<digits>-" segment of the URL.
 * NOTE(review): assumes listing URLs look like ".../Restaurants-g187323-Berlin.html";
 * confirm against the contents of data/restaurantsListUrls.txt.
 *
 * @param {string} listUrl - URL of a restaurant listing page.
 * @returns {string} The identifier without surrounding hyphens, e.g. "g187323".
 * @throws {Error} If the URL contains no such segment.
 */
const extractCityId = (listUrl) => {
  const match = listUrl.match(/-(g\d+)-/);
  if (!match) {
    throw new Error(`Could not extract a city id from ${listUrl}`);
  }
  return match[1];
};

/**
 * Returns the URL behind the "next page" control, or null when there is none.
 *
 * `$$eval` is used instead of `$eval` because `$eval` throws when nothing
 * matches the selector, which would fail the task on the last result page.
 *
 * @param {object} page - Puppeteer page.
 * @returns {Promise<string|null>} Next page URL, or null if the control is
 *   missing or carries no href.
 */
const getNextPageUrl = (page) =>
  page.$$eval(nextButtonSelector, (nodes) =>
    nodes.length > 0 && nodes[0].href ? nodes[0].href : null
  );

/**
 * Entry point: launches a puppeteer-cluster, crawls every listing URL in
 * `restaurantUrls` page by page, and stores the restaurant URLs found for each
 * listing through `insertManyRestaurants`.
 */
(async () => {
  const cluster = await Cluster.launch({
    concurrency: Cluster.CONCURRENCY_BROWSER,
    maxConcurrency: 2,
    monitor: true,
    // Evaluates to 7,200,000 (two hours if interpreted as milliseconds).
    timeout: 3600 * 2000,
    puppeteer,
    puppeteerOptions: {
      // Evaluates to 600,000.
      timeout: 600 * 1000,
      args: ["--disable-dev-shm-usage"],
    },
  });

  try {
    const { insertManyRestaurants } = await initDatabase();

    await cluster.task(async ({ page, data: url }) => {
      const cityId = extractCityId(url);
      // A Set de-duplicates restaurants that are linked more than once.
      const restaurants = new Set();

      // Walk the listing page by page until no further "next" link exists.
      // Every page, including the first one, is restricted to links that
      // contain the city id of the listing being crawled.
      let pageUrl = url;
      while (pageUrl) {
        await page.goto(pageUrl);
        const urls = await getRestaurantUrls(page);
        urls
          .filter((item) => item.includes(cityId))
          .forEach((item) => restaurants.add(item));
        pageUrl = await getNextPageUrl(page);
      }

      // Awaited so that a failing insert surfaces as a task error instead of
      // being lost.
      await insertManyRestaurants(
        [...restaurants].map((item) => ({ url: `${baseURL}${item}` }))
      );
    });

    cluster.on("taskerror", (err, data, willRetry) => {
      if (willRetry) {
        console.warn(
          `Encountered an error while crawling ${data}. ${err.message}\nThis job will be retried`
        );
      } else {
        console.error(`Failed to crawl ${data}: ${err.message}`);
      }
    });

    for (const url of restaurantUrls) {
      cluster.queue(url);
    }

    await cluster.idle();
  } finally {
    // Always shut the browsers down, even if setup or crawling failed.
    await cluster.close();
  }
})().catch((err) => {
  console.error(`Crawl aborted: ${err.message}`);
  process.exitCode = 1;
});