-
Notifications
You must be signed in to change notification settings - Fork 2
/
ris2sql
executable file
·309 lines (284 loc) · 12.1 KB
/
ris2sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
#!/usr/bin/env python3
# Build a structure like so (or better, stream it, avoiding memory pressure):
# {
# 1, [
# {"TY", "JOUR"},
# {"TI", "#1 mine remembered"},
# {"AU", "Stonebanks, Roger"},
# {"KW", "History"}
# ]
# 2, [ [] ]
# }
#
# Then iterate over all of the objects, dumping output into appropriate files:
#
# for rec in record:
# if
import os
import sys
class RISParser:
# We expect each of these fields to appear only once for a given citation
core_map = {
"AB": "abstract",
"DA": "pub_date",
"DP": "pub_database",
"SP": "start_page",
"EP": "end_page",
"LA": "language",
"CN": "call_number",
"CY": "pub_place",
"M3": "work_type",
"PB": "publisher",
"PY": "pub_year",
"TY": "doc_type", # type of the cited document
"TI": "title",
"T2": "title2",
"T3": "title3",
"J2": "alternate_title", # often abbrev. journal or book title
"ST": "short_title", # often abbrev. journal or book title
"DO": "doi",
"UR": "url",
"L2": "local_url",
"SN": "isbn_issn",
"ET": "edition",
"M1": "id_number",
"IS": "issue_number",
"VL": "volume",
"Y2": "access_date",
}
# multiple authors / editors per citation
# see http://refdb.sourceforge.net/manual/ch07.html#sect1-ris-format
# but Zotero exports "series_editor" as A2 and "editor" as A3, argh
author_map = {
"AU": "author",
"A3": "editor",
"A2": "series_editor",
"A4": "translator",
"A5": "contributor",
}
skip_keys = ("L1", "L4")
def __init__(
self,
ris_f="cawls.ris",
core_f="core.sql",
authors="authors.sql",
kw_f="keywords.sql",
notes_f="notes.sql",
):
self.rec = 0
self.core = {}
self.init_core()
self.ris_f = open(ris_f, "r")
self.core_f = open(core_f, "w")
self.authors_f = open(authors, "w")
self.kw_f = open(kw_f, "w")
self.notes_f = open(notes_f, "w")
self.sql_header_core()
self.sql_header_authors()
self.sql_header_keywords()
self.sql_header_notes()
self.sql_core_adds = [
r"""UPDATE citations
SET url = regexp_replace(url, '\.librweb.laurentian.ca', '')
WHERE url ~ 'librweb';""",
"UPDATE citations SET isbn_issn = regexp_replace(isbn_issn, '^(\d{4})(\d{4})$', '\\1-\\2') WHERE isbn_issn ~ '^\d{8}$';",
"ALTER TABLE citations ADD COLUMN series TEXT;",
"ALTER TABLE citations ADD COLUMN pub_title TEXT;",
"ALTER TABLE citations ADD COLUMN kw_tsv TSVECTOR;",
"ALTER TABLE citations ADD COLUMN merged_with TEXT;",
r"""UPDATE citations SET pub_date = NULL WHERE pub_date !~ '^\d{4}';""",
"DROP FUNCTION IF EXISTS ris_search(TEXT);",
"DROP TYPE IF EXISTS search_result CASCADE;",
"CREATE TYPE search_result AS (citation INT, title TEXT, pub_year INT, doc_type TEXT, rank REAL);",
r"""CREATE OR REPLACE FUNCTION ris_filter(att TEXT, val TEXT)
RETURNS SETOF search_result AS $$
DECLARE
sql TEXT := 'SELECT id, title, substring(pub_date from 1 for 4)::INT AS pub_year, doc_type, 0::REAL FROM citations
WHERE ' || quote_ident(att) || ' = $2';
BEGIN
RETURN QUERY EXECUTE sql USING att, val;
END;
$$ LANGUAGE PLPGSQL;""",
r"""CREATE OR REPLACE FUNCTION author_citations(author INT)
RETURNS SETOF search_result AS $$
SELECT DISTINCT c.id, c.title, substring(c.pub_date from 1 for 4)::INT AS pub_year, doc_type, 0::real
FROM citations c
INNER JOIN authored_v a ON c.id = a.citation
WHERE a.id = author
ORDER BY 2 ASC
$$ LANGUAGE SQL;""",
r"""UPDATE citations
SET title2 = 'Relations Industrielles / Industrial Relations'
WHERE title2 IN ('Relations Industrielles', 'Relations industrielles');""",
r"""UPDATE citations
SET title2 = 'Labour / Le Travail'
WHERE title2 IN ('Labour/Le Travail');""",
r"""UPDATE citations SET language = 'English'
WHERE language IN ('eng', 'en', 'en_ca', 'en-US', 'EN', 'En', 'English: en-CA');""",
r"""UPDATE citations SET language = 'English'
WHERE language IS NULL;""",
r"""UPDATE citations SET language = 'French'
WHERE language IN ('fr', 'fre');""",
r"""UPDATE citations SET language = 'Spanish'
WHERE language IN ('sp');""",
r"""UPDATE citations SET language = 'English, French'
WHERE language IN ('en fr', 'eng; fre', 'en/fr');""",
r"""UPDATE citations SET language = 'French, English'
WHERE language IN ('fr en');""",
r"""UPDATE citations SET language = 'French, English, Spanish'
WHERE language IN ('fr en sp');""",
r"""UPDATE citations
SET publisher = 'Canadian Committee on Labour History'
WHERE publisher = 'Committee on Canadian Labour History';""",
r"""UPDATE citations SET series = title2
WHERE doc_type IN ('BOOK', 'RPRT');""",
r"""UPDATE citations SET series = title3
WHERE doc_type IN ('CHAP');""",
r"""UPDATE citations SET pub_title = title2
WHERE doc_type IN ('JOUR', 'CHAP', 'NEWS', 'MGZN', 'ELEC');""",
r"""UPDATE citations SET pub_title = title
WHERE doc_type IN ('BOOK');""",
r"""UPDATE citations SET kw_tsv = setweight(to_tsvector('english', title), 'A')
|| setweight(to_tsvector('english', COALESCE(abstract, '')), 'C');""",
r"""UPDATE citations SET kw_tsv = kw_tsv || setweight(to_tsvector('english', COALESCE(
(SELECT STRING_AGG(notes, ' ') FROM citation_notes WHERE citation = citations.id), '')), 'C');""",
r"""UPDATE citations SET kw_tsv = kw_tsv || setweight(to_tsvector('english', COALESCE(
(SELECT STRING_AGG(keywords, ' ') FROM citation_keywords WHERE citation = citations.id), '')), 'B');""",
r"""UPDATE citations SET kw_tsv = kw_tsv || setweight(to_tsvector('english', COALESCE(
(SELECT STRING_AGG(author_name, ' ') FROM authored_v WHERE citation = citations.id), '')), 'B');""",
"CREATE INDEX kw_tsv_index ON citations USING GIN(kw_tsv);",
r"""CREATE OR REPLACE FUNCTION ris_search(query TEXT) RETURNS SETOF search_result AS $$
SELECT id, title, substring(pub_date from 1 for 4)::INT AS pub_year, doc_type, ts_rank_cd(kw_tsv, plainto_tsquery(query))
FROM citations
WHERE kw_tsv @@ plainto_tsquery(query)
AND merged_with IS NULL
ORDER BY 3 DESC$$ LANGUAGE SQL;""",
]
self.sql_author_adds = [
r"""CREATE TABLE authors (id SERIAL, author_name TEXT);"""
r"""INSERT INTO authors (author_name)
SELECT DISTINCT author_name FROM cites_to_authors;""",
r"""UPDATE cites_to_authors SET author = (
SELECT id FROM authors
WHERE cites_to_authors.author_name = authors.author_name
);""",
r"""ALTER TABLE cites_to_authors DROP COLUMN author_name;""",
r"""CREATE VIEW authored_v AS
SELECT citation, id, author_type, author_name
FROM authors INNER JOIN cites_to_authors ON author = id;""",
]
def init_core(self):
self.authors = []
self.keywords = []
self.notes = []
self.lastkey = ""
self.lastval = ""
for v in RISParser.core_map.values():
self.core[v] = None
def sql_header_core(self, table="citations"):
sql = "DROP TABLE IF EXISTS %s; CREATE TABLE %s(id INTEGER, " % (table, table)
for key in sorted(self.core_map.keys()):
sql = sql + "%s TEXT, " % (self.core_map[key])
sql = sql[0:-2] + ");\n"
sql = sql + "COPY %s (id, " % (table)
for key in sorted(self.core_map.keys()):
sql = sql + "%s, " % (self.core_map[key])
sql = sql[0:-2] + ") FROM STDIN;\n"
self.core_f.write(sql)
def sql_header_authors(self, table="authors"):
sql = r"""
DROP TABLE IF EXISTS cites_to_authors;
CREATE TABLE cites_to_authors(citation INTEGER,
author_type TEXT, author INT, author_name TEXT);
COPY cites_to_authors(citation, author_type, author_name) FROM STDIN;
"""
self.authors_f.write(sql)
def sql_header_keywords(self, table="keywords"):
sql = (
"DROP TABLE IF EXISTS citation_keywords;\n"
+ "CREATE TABLE citation_keywords(id SERIAL, citation INTEGER, "
+ "keywords TEXT);\n"
+ "COPY citation_keywords(citation, keywords) FROM STDIN;\n"
)
self.kw_f.write(sql)
def sql_header_notes(self, table="notes"):
sql = (
"DROP TABLE IF EXISTS citation_notes;\n"
+ "CREATE TABLE citation_notes(id SERIAL, citation INTEGER, "
+ "notes TEXT);\n"
+ "COPY citation_notes(citation, notes) FROM STDIN;\n"
)
self.notes_f.write(sql)
def close_sql(self):
for f in (self.core_f, self.authors_f, self.kw_f, self.notes_f):
f.write("\\.\n")
for line in self.sql_core_adds:
self.core_f.write(line)
self.core_f.write("\n")
for line in self.sql_author_adds:
self.authors_f.write(line)
self.authors_f.write("\n")
def parse(self, line):
if len(line) == 1:
pass
elif line[0:-1] == "ER - ":
self.store()
self.printem()
elif line[2:6] == " - ":
self.store()
self.lastkey = line[0:2]
self.lastval = line[6:].strip()
else:
self.lastval = "\\n".join((self.lastval, line[0:].strip()))
def printem(self):
self.write_core()
self.write_authors()
self.write_keywords()
self.write_notes()
self.init_core()
self.rec += 1
def write_core(self):
self.core_f.write("%d" % (self.rec))
for key in sorted(self.core_map.keys()):
val = self.core[self.core_map[key]]
if not val:
val = "\\N"
val = val.replace("\t", " ")
self.core_f.write("\t%s" % (val))
self.core_f.write("\n")
def write_authors(self):
for x, y in self.authors:
self.authors_f.write("%d\t%s\t%s\n" % (self.rec, x, y))
def write_keywords(self):
for x in self.keywords:
self.kw_f.write("%d\t%s\n" % (self.rec, x))
def write_notes(self):
for x in self.notes:
self.notes_f.write("%d\t%s\n" % (self.rec, x))
def store(self):
if self.lastkey in RISParser.core_map:
self.core[RISParser.core_map[self.lastkey]] = self.lastval
elif self.lastkey in RISParser.author_map:
self.authors.append(([RISParser.author_map[self.lastkey], self.lastval]))
elif self.lastkey == "KW":
self.keywords.append(self.lastval)
elif self.lastkey == "N1":
self.notes.append(self.lastval)
elif self.lastkey in RISParser.skip_keys:
pass
elif self.lastkey:
print("Undefined key: ", self.lastkey)
def parsefile(self):
with self.ris_f as f:
for line in f:
self.parse(line)
self.close_sql()
if __name__ == "__main__":
foo = RISParser(
ris_f="cawls.ris",
core_f="core.sql",
authors="authors.sql",
kw_f="keywords.sql",
notes_f="notes.sql",
)
foo.parsefile()