-
Notifications
You must be signed in to change notification settings - Fork 2
/
scrape_api.sh
executable file
·34 lines (25 loc) · 1.07 KB
/
scrape_api.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
#!/bin/bash
# this is a constant in every api request so i've added it as a variable
base="https://archive.org/services/search/v1/scrape?fields=title&q="
# this one determines what to look for
search=$1
# curl -s $base$search > raw.json
# scrape first 5000 or less entries
wget -O raw.json -q -i $base$search
# insert links to a text file
jq -r '.items[].identifier' raw.json >> $search.txt
# copy the cursor if it exist to keep requesting next enteries in the api
jq -r '.cursor' raw.json | grep -v "null" > cursor.txt
# just a string that needs to be present before the cursor
pre="&cursor="
# make the scrape command easier to read by storing the cursor to a variable
cursor="$(cat cursor.txt)"
# while loop, keep looking at the file and scrape the api until it finds less than 5000 entries in which case there won't be a need to scrape the cursor further
while [[ $(jq -r '.total' raw.json) > 5000 ]]
do
sleep 1
wget -O raw.json -q -i $base$search$pre$cursor
jq -r '.items[].identifier' raw.json >> $search.txt
jq -r '.cursor' raw.json | grep -v "null" > cursor.txt
sleep 1
done