# Extract, Transform, Load script
#
# Read each json file from the Crossref dump, build a Solr document for every
# record it contains, and load the documents into Solr
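#
# Example invocation (illustrative paths, assuming a Solr core named
# 'crossref' running locally):
#
#   python ETL_Crossref.py /data/crossref compressed crossref-dump.tar.gz http://localhost:8983/solr/crossref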
__author__ = 'Gabriele Pisciotta'

from tqdm import tqdm
import json
import pysolr
import time
from os import listdir
from os.path import isfile, join
import re
import tarfile
import argparse

# Get the list of files inside the dir, sorted numerically by name
def get_files_in_dir(path):
    list_of_files = [f for f in listdir(path) if isfile(join(path, f))]
    list_of_files.sort(key=lambda f: int(re.sub(r'\D', '', f)))
    return list_of_files
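# For example (illustrative filenames), ['10.json', '2.json'] sorts as
# ['2.json', '10.json']: the key strips every non-digit and compares what is
# left as an integer, whereas a plain string sort would put '10.json' first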

# Read the json content of a file
def read_json_file(f):
    with open(f) as data_file:
        data = json.load(data_file)
    return data

# Extract a single reference string from the metadata of a record
def extract_string_from_metadata(content):
    text = ""
    if 'author' in content and len(content['author']) > 0:
        for a in content['author']:
            if 'given' in a and 'family' in a:
                text = "".join([text, a['given'], " ", a['family'], ", "])
    if 'title' in content and len(content['title']) > 0:
        text = "".join([text, content['title'][0], ", "])
    if 'short-container-title' in content and len(content['short-container-title']) > 0:
        text = "".join([text, content['short-container-title'][0], ", "])
    # Crossref stores dates under the hyphenated key 'date-parts'
    if 'issued' in content and 'date-parts' in content['issued'] and len(content['issued']['date-parts']) > 0:
        dates = "".join([str(x) + " " for x in content['issued']['date-parts'][0]])
        text = "".join([text, dates, ", "])
    elif 'published-print' in content and 'date-parts' in content['published-print'] and len(content['published-print']['date-parts'][0]) > 0:
        dates = "".join([str(x) + " " for x in content['published-print']['date-parts'][0]])
        text = "".join([text, dates, ", "])
    if 'volume' in content:
        text = "".join([text, content['volume'], ", "])
    if 'issue' in content:
        text = "".join([text, content['issue'], ", "])
    if 'page' in content:
        text = "".join([text, content['page'], ", "])
    if 'DOI' in content:
        text = "".join([text, content['DOI'].lower()])
    return text
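# For an illustrative record such as
#   {'author': [{'given': 'Ada', 'family': 'Lovelace'}], 'title': ['Notes'], 'DOI': '10.1/X'}
# this returns 'Ada Lovelace, Notes, 10.1/x' (the DOI is lowercased so that it
# matches the document id used below)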

def crossref_ETL(source, start_path, dump_filename, solr_address):
    start = time.time()
    try:
        solr = pysolr.Solr(solr_address, always_commit=True, timeout=1000)
        solr.ping()
        print("Connection established to Solr")
    except Exception:
        print("Can't establish a connection to Solr")
        exit()
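    # Note: always_commit=True tells pysolr to commit after every add() call;
    # this keeps the index consistent at each step, at the cost of some speed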
    json_file = 0
    doc_in_json_file = 0
    if source == 'path':
        inpath = start_path
        file_list = get_files_in_dir(inpath)
    else:
        print("Extracting Crossref dump... This may take a while.")
        crossref_dump_compressed = tarfile.open(dump_filename, 'r:gz')
        file_list = crossref_dump_compressed.getmembers()

    # For each file in the Crossref dump
    for f in tqdm(file_list):
        if source == 'compressed':
            # Extract a single file from the dump
            f = crossref_dump_compressed.extractfile(f)
            # Some members of the archive are not regular files (e.g.
            # directories): extractfile() returns None for those, so skip them
            if f is None:
                continue
            content = json.loads(f.read().decode())
        else:
            # Read the file as a json object
            content = read_json_file(join(inpath, f))
        # Each element in the json object is a single document
        elements = []
        # Iterate through all the elements in the chunk
        for element in content['items']:
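            # Note: the Solr core is assumed to accept the fields 'id',
            # 'bibref' and 'original' (declared in its schema, or added on the
            # fly when the core runs in schemaless mode)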
            # For each document, create the structure of the object that will be loaded in Solr
            doc = {
                "id": element['DOI'].lower(),
                "bibref": extract_string_from_metadata(element),
                "original": json.dumps(element)
            }
            elements.append(doc)
            doc_in_json_file += 1
        # Upload the list of documents to Solr
        solr.add(elements)
        json_file += 1
    end = time.time()
    print("Loaded {} docs from {} files. ETL time: {}s".format(doc_in_json_file, json_file, (end - start)))


if __name__ == '__main__':
    # All four arguments are positional, hence required: argparse ignores
    # 'default' on plain positionals, so suggested values live in the help text
    parser = argparse.ArgumentParser()
    parser.add_argument("start_path", help="Working path (used when source is 'path')")
    parser.add_argument("source", choices=['path', 'compressed'],
                        help="Kind of source. Can be 'path' or 'compressed'")
    parser.add_argument("dump_filename", help="Compressed Crossref dump (used when source is 'compressed')")
    parser.add_argument("solr_address", help="Solr core address, e.g. http://localhost:8983/solr/crossref")
    args = parser.parse_args()
    crossref_ETL(source=args.source, start_path=args.start_path,
                 dump_filename=args.dump_filename, solr_address=args.solr_address)
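
# Once loaded, documents can be retrieved from the core with a query such as
# (illustrative DOI):
#   pysolr.Solr('http://localhost:8983/solr/crossref').search('id:"10.1/x"')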