fetch.py (forked from MarkEEaton/open-journal-matcher)
""" loop through the issns, gather abstracts and wite to abstracts/ """
import json
from time import sleep

import requests
from bs4 import BeautifulSoup


def fetch(issn):
    """Query the DOAJ API for up to 100 recent articles for one ISSN."""
    base_url = "https://doaj.org/api/v1/search/articles/issn%3A"
    pagesize = "?pageSize=100&sort=year%3Adesc"
    data = requests.get(base_url + issn + pagesize)
    # idx and issns are module-level names set in the __main__ block below
    print(
        "fetching data for "
        + issn
        + ". "
        + str(idx + 1)
        + "/"
        + str(len(issns))
        + ". status: "
        + str(data.status_code)
    )
    try:
        articles = data.json().get("results", [])
    except ValueError:  # response body was not valid JSON
        articles = []
    if data.status_code == 429:
        # too many requests: wait, then retry this ISSN
        sleep(10)
        print("rate limited, retrying " + issn)
        articles = fetch(issn)
    return articles


def parse(articles):
    """Concatenate the plain-text abstracts from a list of DOAJ article records."""
    abstracts = ""
    print("Number of articles: " + str(len(articles)))
    # skip journals with too few articles to be useful
    if len(articles) <= 10:
        return abstracts
    for article in articles:
        try:
            abstract = article["bibjson"]["abstract"]
            # strip any HTML markup from the abstract text
            abstract = BeautifulSoup(abstract, "lxml").text
            abstracts = abstracts + abstract
        except KeyError:
            # some articles have no abstract
            pass
    return abstracts


if __name__ == "__main__":
    # the ISSN list is stored as a JSON array of strings
    with open("issnlist-June2020.txt") as issnfile:
        issns = json.loads(issnfile.read())
    for idx, issn in enumerate(issns):
        articles = fetch(issn)
        abstracts = parse(articles)
        # only write a file when at least one abstract was collected
        if abstracts != "":
            with open("abstracts-June2020/" + issn + ".txt", "w") as abstractfile:
                abstractfile.write(json.dumps(abstracts))