-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.js
124 lines (103 loc) · 3.72 KB
/
scrape.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import puppeteer from "puppeteer";
import fs from "fs";
import xlsx from "xlsx";
import chalk from "chalk";
import nanospinner, { createSpinner } from "nanospinner";
import { exit } from "process";
// JSON file holding the offers scraped by the previous run; it is rewritten on every run.
const json_file = 'old_data.json';
// Excel workbook that receives the offers scraped by the current run.
const xlsx_file = 'internships.xlsx';

// Offers loaded from json_file. Stays undefined when the file is missing or
// unreadable, which getNewOffers reports as "old offers file not found".
let old_data;
try {
  // Read synchronously so the previous snapshot is fully loaded before any
  // later code can use it or overwrite json_file.
  old_data = JSON.parse(fs.readFileSync(json_file, 'utf-8'));
} catch (err) {
  // A missing file is expected on the first run, so stay silent for ENOENT.
  // Anything else (corrupt JSON, permission problems) is worth a warning, but
  // must not crash the scraper.
  if (err.code !== 'ENOENT') {
    console.warn(`Ignoring unreadable ${json_file}: ${err.message}`);
  }
}
const spinner = createSpinner('Scraping the web for internships...').start();

// Entry point: scrapes every result page of the IAESTE internship listing,
// then saves the offers to json_file and xlsx_file and prints the offers that
// were not present in old_data.
(async () => {
  const browser = await puppeteer.launch();
  const internships = [];
  try {
    const page = await browser.newPage();
    const url = 'https://iaeste.org/internships?utf8=%E2%9C%93&ref_no=&discipline%5B%5D=11&internship_type=open&sort=deadline_at';
    try {
      await page.goto(url);
    } catch (error) {
      spinner.error({text: chalk.redBright("Error loading the page. Check log file.")});
      // Print the underlying error so the failure can actually be diagnosed.
      console.error(error);
      // Signal failure to the caller; returning (instead of exiting here)
      // lets the finally block close the browser first.
      process.exitCode = 1;
      return;
    }

    // The pagination link text (minus its first two characters) is used as
    // the number of pages; 3 is the fallback when the link is not found.
    const pageCountText = await page.evaluate(() => {
      const element = document.querySelector('body > main > div:nth-child(5) > div > div.pagination-wrapper > a');
      return element ? element.innerText.substring(2) : 3;
    });
    // Convert explicitly instead of relying on implicit string/number
    // comparison; a non-numeric or infinite value means "first page only".
    const parsedPageCount = Number(pageCountText);
    const lastPage = Number.isFinite(parsedPageCount) ? parsedPageCount : 1;

    let currPage = 1;
    while (true) {
      // This callback runs inside the browser page, so it must not reference
      // any variable from the Node.js scope.
      const offers = await page.evaluate(() =>
        Array.from(document.querySelectorAll('section.card__body')).map((card) => {
          const ref = card.querySelector('div > span').innerText;
          return {
            ref,
            duration: card.querySelector('article > div:nth-child(3) > div > div:nth-child(1) > div').innerText,
            within: card.querySelector('article > div:nth-child(4) > div').innerText,
            expiration: card.querySelector('article > div:nth-child(3) > div > div.hide--large > time > span').innerText,
            salary: card.querySelector('article > div:nth-child(5) > div').innerText,
            link: 'https://iaeste.org/internships/' + String(ref).toLowerCase(),
            title: card.querySelector('div.card__title').innerText,
          };
        })
      );
      internships.push(...offers);

      currPage += 1;
      if (currPage > lastPage) {
        break;
      }
      await page.goto(`https://iaeste.org/internships?discipline%5B%5D=11&internship_type=open&page=${currPage}&ref_no=&sort=deadline_at`);
    }

    spinner.success({text: chalk.bold.cyan("Scraped " + internships.length + " internships.\n")});
  } finally {
    // Always release the browser, whether scraping succeeded or failed.
    await browser.close();
  }

  writeToFile(internships);
  extractToSheet(internships);
  getNewOffers(old_data, internships);
})().catch((error) => {
  // Anything not handled above (launch failure, a card missing an expected
  // element, export errors) ends up here instead of as an unhandled rejection.
  spinner.error({text: chalk.redBright("Unexpected error. See details below.")});
  console.error(error);
  process.exitCode = 1;
});
/**
 * Exports the given records to the Excel file named by `xlsx_file`.
 *
 * The records are converted with `xlsx.utils.json_to_sheet` and stored in a
 * single sheet called 'IAESTE internships'.
 *
 * @param {Object[]} arr - Records to export.
 */
function extractToSheet(arr) {
  const workbook = xlsx.utils.book_new();
  const worksheet = xlsx.utils.json_to_sheet(arr);

  xlsx.utils.book_append_sheet(workbook, worksheet, 'IAESTE internships');
  xlsx.writeFile(workbook, xlsx_file);
}
/**
 * Serializes `arr` as JSON and writes it asynchronously to `json_file`.
 * A write failure is logged to the console and otherwise ignored.
 *
 * @param {*} arr - Value to serialize (the scraped offers).
 */
function writeToFile(arr) {
  const serialized = JSON.stringify(arr);

  const reportFailure = (err) => {
    if (!err) {
      return;
    }
    console.log(err);
  };

  fs.writeFile(json_file, serialized, reportFailure);
}
/**
 * Prints every offer in `newer` whose `ref` does not appear in `older`.
 *
 * @param {Object[]|null|undefined} older - Offers from the previous run. When
 *   null or undefined, a notice is printed and nothing is compared.
 * @param {Object[]} newer - Offers from the current run.
 */
function getNewOffers(older, newer) {
  // `== null` deliberately matches both null and undefined.
  if (older == null) {
    console.log(chalk.red('Cannot detect newer offers. Old offers file not found. Will start looking on the next execution.'));
    return;
  }

  for (const offer of newer) {
    const alreadyKnown = existsIn(offer.ref, older);
    if (alreadyKnown) {
      continue;
    }
    printNewOffer(offer);
  }
}
/**
 * Prints one offer to the console: a red-background "New Offer:" banner
 * followed by each field on its own line with a yellow label.
 *
 * @param {Object} offer - Offer with `ref`, `duration`, `within`,
 *   `expiration`, `salary`, `link` and `title` properties.
 */
function printNewOffer(offer) {
  // [label, property] pairs, in the order they are printed.
  const labelledFields = [
    ['\nreference: ', 'ref'],
    ['\nduration: ', 'duration'],
    ['\nwithin: ', 'within'],
    ['\nexpiration: ', 'expiration'],
    ['\nsalary: ', 'salary'],
    ['\nlink: ', 'link'],
    ['\ntitle: ', 'title'],
  ];

  let message = chalk.bgRed('\nNew Offer:');
  for (const [label, key] of labelledFields) {
    message += chalk.yellow(label) + offer[key];
  }

  console.log(message);
}
/**
 * Tells whether any element of `array` has a `ref` equal to the given one.
 *
 * @param {*} ref - Reference to look for.
 * @param {Object[]} array - Elements to search; each is read via `.ref`.
 * @returns {boolean} true when a matching element exists, false otherwise.
 */
function existsIn(ref, array) {
  const total = array.length;
  let position = 0;
  let found = false;

  while (!found && position < total) {
    // Loose equality is kept on purpose so that matching behaves as before
    // (e.g. a numeric ref still matches its string form).
    found = array[position].ref == ref;
    position += 1;
  }

  return found;
}