scrape_arxiv.py
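"""
Scrape yesterday's arXiv astro-ph listings and flag papers whose abstracts
mention exoplanet atmospheres together with high-resolution spectroscopy
keywords. Any flagged papers are written to a dated CSV under data/.
"""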
import arxivscraper
import pandas as pd
from datetime import date, timedelta

# Every one of these keywords must appear in the abstract...
necessary_keywords = ['exoplanet', 'atmosphere']
# ...together with at least one of these high-resolution spectroscopy terms.
keywords = ['high-resolution', 'high resolution', 'hi-res', 'high-dispersion',
            'high dispersion', ' HRS ', ' HDS ']


def check_hires_row(row):
    """
    Checks whether a paper might be a high-resolution spectroscopy paper
    based on what's in its abstract.
    """
    abstract = row['abstract']
    hires = 0
    necessary_hires = 0
    for keyword in necessary_keywords:
        if keyword in abstract:
            necessary_hires += 1
    for keyword in keywords:
        if keyword in abstract:
            hires += 1
    return necessary_hires == len(necessary_keywords) and hires >= 1


def check_hires(frame):
    """Adds a boolean 'hires?' column flagging candidate papers."""
    hires = []
    for i, row in frame.iterrows():
        hires += [check_hires_row(row)]
    frame['hires?'] = hires


if __name__ == '__main__':
    # Scrape yesterday's astro-ph submissions whose abstracts mention "exoplanet".
    today = date.today().strftime("%Y-%m-%d")
    yesterday = (date.today() - timedelta(days=1)).strftime("%Y-%m-%d")
    scraper = arxivscraper.Scraper(
        category='physics:astro-ph',
        date_from=yesterday, date_until=today,
        filters={'abstract': ['exoplanet']})
    output = scraper.scrape()
    print(output)

    # Build a DataFrame, flag potential high-resolution papers, and save any hits.
    cols = ('id', 'title', 'categories', 'abstract', 'doi', 'created', 'updated',
            'authors', 'affiliation', 'url')
    df = pd.DataFrame(output, columns=cols)
    check_hires(df)
    hires_frame = df[df['hires?'] == True]
    if len(hires_frame) != 0:
        hires_frame.to_csv(f'data/potential_hires_papers_{yesterday}.csv')