ris2sql

#!/usr/bin/env python3

# Build a structure like so (or better, stream it, avoiding memory pressure):
# {
#   1, [
#         {"TY", "JOUR"},
#         {"TI", "#1 mine remembered"},
#         {"AU", "Stonebanks, Roger"},
#         {"KW", "History"}
#      ]
#   2, [ [] ]
# }
#
# Then iterate over all of the objects, dumping output into appropriate files:
#
# for rec in record:
#     if

import os
import sys


class RISParser:

    # We expect each of these fields to appear only once for a given citation
    core_map = {
        "AB": "abstract",
        "DA": "pub_date",
        "DP": "pub_database",
        "SP": "start_page",
        "EP": "end_page",
        "LA": "language",
        "CN": "call_number",
        "CY": "pub_place",
        "M3": "work_type",
        "PB": "publisher",
        "PY": "pub_year",
        "TY": "doc_type",  # type of the cited document
        "TI": "title",
        "T2": "title2",
        "T3": "title3",
        "J2": "alternate_title",  # often abbrev. journal or book title
        "ST": "short_title",  # often abbrev. journal or book title
        "DO": "doi",
        "UR": "url",
        "L2": "local_url",
        "SN": "isbn_issn",
        "ET": "edition",
        "M1": "id_number",
        "IS": "issue_number",
        "VL": "volume",
        "Y2": "access_date",
    }

    # multiple authors / editors per citation
    # see http://refdb.sourceforge.net/manual/ch07.html#sect1-ris-format
    # but Zotero exports "series_editor" as A2 and "editor" as A3, argh
    author_map = {
        "AU": "author",
        "A3": "editor",
        "A2": "series_editor",
        "A4": "translator",
        "A5": "contributor",
    }

    skip_keys = ("L1", "L4")

    def __init__(
        self,
        ris_f="cawls.ris",
        core_f="core.sql",
        authors="authors.sql",
        kw_f="keywords.sql",
        notes_f="notes.sql",
    ):
        self.rec = 0
        self.core = {}
        self.init_core()
        self.ris_f = open(ris_f, "r")
        self.core_f = open(core_f, "w")
        self.authors_f = open(authors, "w")
        self.kw_f = open(kw_f, "w")
        self.notes_f = open(notes_f, "w")
        self.sql_header_core()
        self.sql_header_authors()
        self.sql_header_keywords()
        self.sql_header_notes()
        self.sql_core_adds = [
            r"""UPDATE citations
                SET url = regexp_replace(url, '\.librweb.laurentian.ca', '')
                WHERE url ~ 'librweb';""",
            "UPDATE citations SET isbn_issn = regexp_replace(isbn_issn, '^(\d{4})(\d{4})$', '\\1-\\2') WHERE isbn_issn ~ '^\d{8}$';",
            "ALTER TABLE citations ADD COLUMN series TEXT;",
            "ALTER TABLE citations ADD COLUMN pub_title TEXT;",
            "ALTER TABLE citations ADD COLUMN kw_tsv TSVECTOR;",
            "ALTER TABLE citations ADD COLUMN merged_with TEXT;",
            r"""UPDATE citations SET pub_date = NULL WHERE pub_date !~ '^\d{4}';""",
            "DROP FUNCTION IF EXISTS ris_search(TEXT);",
            "DROP TYPE IF EXISTS search_result CASCADE;",
            "CREATE TYPE search_result AS (citation INT, title TEXT, pub_year INT, doc_type TEXT, rank REAL);",
            r"""CREATE OR REPLACE FUNCTION ris_filter(att TEXT, val TEXT)
                RETURNS SETOF search_result AS $$
                DECLARE
                    sql TEXT := 'SELECT id, title, substring(pub_date from 1 for 4)::INT AS pub_year, doc_type, 0::REAL FROM citations
                        WHERE ' || quote_ident(att) || ' = $2';
                BEGIN
                    RETURN QUERY EXECUTE sql USING att, val;
                END;
                $$ LANGUAGE PLPGSQL;""",
            r"""CREATE OR REPLACE FUNCTION author_citations(author INT)
                RETURNS SETOF search_result AS $$
                    SELECT DISTINCT c.id, c.title, substring(c.pub_date from 1 for 4)::INT AS pub_year, doc_type, 0::real
                    FROM citations c
                        INNER JOIN authored_v a ON c.id = a.citation
                    WHERE a.id = author
                    ORDER BY 2 ASC
                $$ LANGUAGE SQL;""",
            r"""UPDATE citations
                SET title2 = 'Relations Industrielles / Industrial Relations'
                WHERE title2 IN ('Relations Industrielles', 'Relations industrielles');""",
            r"""UPDATE citations
                SET title2 = 'Labour / Le Travail'
                WHERE title2 IN ('Labour/Le Travail');""",
            r"""UPDATE citations SET language = 'English'
                WHERE language IN ('eng', 'en', 'en_ca', 'en-US', 'EN', 'En', 'English: en-CA');""",
            r"""UPDATE citations SET language = 'English'
                WHERE language IS NULL;""",
            r"""UPDATE citations SET language = 'French'
                WHERE language IN ('fr', 'fre');""",
            r"""UPDATE citations SET language = 'Spanish'
                WHERE language IN ('sp');""",
            r"""UPDATE citations SET language = 'English, French'
                WHERE language IN ('en fr', 'eng; fre', 'en/fr');""",
            r"""UPDATE citations SET language = 'French, English'
                WHERE language IN ('fr en');""",
            r"""UPDATE citations SET language = 'French, English, Spanish'
                WHERE language IN ('fr en sp');""",
            r"""UPDATE citations
                SET publisher = 'Canadian Committee on Labour History'
                WHERE publisher = 'Committee on Canadian Labour History';""",
            r"""UPDATE citations SET series = title2
                    WHERE doc_type IN ('BOOK', 'RPRT');""",
            r"""UPDATE citations SET series = title3
                    WHERE doc_type IN ('CHAP');""",
            r"""UPDATE citations SET pub_title = title2
                    WHERE doc_type IN ('JOUR', 'CHAP', 'NEWS', 'MGZN', 'ELEC');""",
            r"""UPDATE citations SET pub_title = title
                    WHERE doc_type IN ('BOOK');""",
            r"""UPDATE citations SET kw_tsv = setweight(to_tsvector('english', title), 'A')
                 || setweight(to_tsvector('english', COALESCE(abstract, '')), 'C');""",
            r"""UPDATE citations SET kw_tsv = kw_tsv || setweight(to_tsvector('english', COALESCE(
                 (SELECT STRING_AGG(notes, ' ') FROM citation_notes WHERE citation = citations.id), '')), 'C');""",
            r"""UPDATE citations SET kw_tsv = kw_tsv || setweight(to_tsvector('english', COALESCE(
                 (SELECT STRING_AGG(keywords, ' ') FROM citation_keywords WHERE citation = citations.id), '')), 'B');""",
            r"""UPDATE citations SET kw_tsv = kw_tsv || setweight(to_tsvector('english', COALESCE(
                 (SELECT STRING_AGG(author_name, ' ') FROM authored_v WHERE citation = citations.id), '')), 'B');""",
            "CREATE INDEX kw_tsv_index ON citations USING GIN(kw_tsv);",
            r"""CREATE OR REPLACE FUNCTION ris_search(query TEXT) RETURNS SETOF search_result AS $$
                 SELECT id, title, substring(pub_date from 1 for 4)::INT AS pub_year, doc_type, ts_rank_cd(kw_tsv, plainto_tsquery(query))
                 FROM citations
                WHERE kw_tsv @@ plainto_tsquery(query)
                 AND merged_with IS NULL
                ORDER BY 3 DESC$$ LANGUAGE SQL;""",
        ]
        self.sql_author_adds = [
            r"""CREATE TABLE authors (id SERIAL, author_name TEXT);"""
            r"""INSERT INTO authors (author_name)
                    SELECT DISTINCT author_name FROM cites_to_authors;""",
            r"""UPDATE cites_to_authors SET author = (
                    SELECT id FROM authors
                    WHERE cites_to_authors.author_name = authors.author_name
                );""",
            r"""ALTER TABLE cites_to_authors DROP COLUMN author_name;""",
            r"""CREATE VIEW authored_v AS
                    SELECT citation, id, author_type, author_name
                    FROM authors INNER JOIN cites_to_authors ON author = id;""",
        ]

    def init_core(self):
        self.authors = []
        self.keywords = []
        self.notes = []
        self.lastkey = ""
        self.lastval = ""
        for v in RISParser.core_map.values():
            self.core[v] = None

    def sql_header_core(self, table="citations"):
        sql = "DROP TABLE IF EXISTS %s; CREATE TABLE %s(id INTEGER, " % (table, table)
        for key in sorted(self.core_map.keys()):
            sql = sql + "%s TEXT, " % (self.core_map[key])
        sql = sql[0:-2] + ");\n"

        sql = sql + "COPY %s (id, " % (table)
        for key in sorted(self.core_map.keys()):
            sql = sql + "%s, " % (self.core_map[key])
        sql = sql[0:-2] + ") FROM STDIN;\n"
        self.core_f.write(sql)

    def sql_header_authors(self, table="authors"):
        sql = r"""
            DROP TABLE IF EXISTS cites_to_authors;
            CREATE TABLE cites_to_authors(citation INTEGER,
                author_type TEXT, author INT, author_name TEXT);
            COPY cites_to_authors(citation, author_type, author_name) FROM STDIN;
        """
        self.authors_f.write(sql)

    def sql_header_keywords(self, table="keywords"):
        sql = (
            "DROP TABLE IF EXISTS citation_keywords;\n"
            + "CREATE TABLE citation_keywords(id SERIAL, citation INTEGER, "
            + "keywords TEXT);\n"
            + "COPY citation_keywords(citation, keywords) FROM STDIN;\n"
        )
        self.kw_f.write(sql)

    def sql_header_notes(self, table="notes"):
        sql = (
            "DROP TABLE IF EXISTS citation_notes;\n"
            + "CREATE TABLE citation_notes(id SERIAL, citation INTEGER, "
            + "notes TEXT);\n"
            + "COPY citation_notes(citation, notes) FROM STDIN;\n"
        )
        self.notes_f.write(sql)

    def close_sql(self):
        for f in (self.core_f, self.authors_f, self.kw_f, self.notes_f):
            f.write("\\.\n")
        for line in self.sql_core_adds:
            self.core_f.write(line)
            self.core_f.write("\n")
        for line in self.sql_author_adds:
            self.authors_f.write(line)
            self.authors_f.write("\n")

    def parse(self, line):
        if len(line) == 1:
            pass
        elif line[0:-1] == "ER  - ":
            self.store()
            self.printem()
        elif line[2:6] == "  - ":
            self.store()
            self.lastkey = line[0:2]
            self.lastval = line[6:].strip()
        else:
            self.lastval = "\\n".join((self.lastval, line[0:].strip()))

    def printem(self):
        self.write_core()
        self.write_authors()
        self.write_keywords()
        self.write_notes()
        self.init_core()
        self.rec += 1

    def write_core(self):
        self.core_f.write("%d" % (self.rec))
        for key in sorted(self.core_map.keys()):
            val = self.core[self.core_map[key]]
            if not val:
                val = "\\N"
            val = val.replace("\t", " ")
            self.core_f.write("\t%s" % (val))
        self.core_f.write("\n")

    def write_authors(self):
        for x, y in self.authors:
            self.authors_f.write("%d\t%s\t%s\n" % (self.rec, x, y))

    def write_keywords(self):
        for x in self.keywords:
            self.kw_f.write("%d\t%s\n" % (self.rec, x))

    def write_notes(self):
        for x in self.notes:
            self.notes_f.write("%d\t%s\n" % (self.rec, x))

    def store(self):
        if self.lastkey in RISParser.core_map:
            self.core[RISParser.core_map[self.lastkey]] = self.lastval
        elif self.lastkey in RISParser.author_map:
            self.authors.append(([RISParser.author_map[self.lastkey], self.lastval]))
        elif self.lastkey == "KW":
            self.keywords.append(self.lastval)
        elif self.lastkey == "N1":
            self.notes.append(self.lastval)
        elif self.lastkey in RISParser.skip_keys:
            pass
        elif self.lastkey:
            print("Undefined key: ", self.lastkey)

    def parsefile(self):
        with self.ris_f as f:
            for line in f:
                self.parse(line)
        self.close_sql()


if __name__ == "__main__":
    foo = RISParser(
        ris_f="cawls.ris",
        core_f="core.sql",
        authors="authors.sql",
        kw_f="keywords.sql",
        notes_f="notes.sql",
    )
    foo.parsefile()