From 29d2d198feb39ff48efb87ee9e041b9b50591709 Mon Sep 17 00:00:00 2001 From: Fredrik Monsen Date: Mon, 4 Sep 2023 07:37:28 +0200 Subject: [PATCH 01/15] TT-1042: Read language codes for initialized files from .env + add additional language ISO 639-2 'nno' and undetermined 'und' --- .env.example | 4 ++++ .../data/txt/info_page_keywords.json | 10 ++++++--- metadata_extract/data/txt/labels.json | 18 +++++++-------- metadata_extract/data/txt/stopwords.json | 14 ++++++++++-- metadata_extract/init_files.py | 22 ++++++++++--------- src/settings.py | 1 + 6 files changed, 45 insertions(+), 24 deletions(-) diff --git a/.env.example b/.env.example index 61fc792..9978e92 100644 --- a/.env.example +++ b/.env.example @@ -3,6 +3,10 @@ MAX_FILE_SIZE_MB=123 ENVIRONMENT=local DIFF_FILES_FOLDER=/path/to/diff/files +# A comma separated list of ISO 639-2 language codes +# Languages must be present in the metadata_extract.data.txt files, otherwise they will be skipped on a per file basis +LANGUAGE=eng,nob + # To use a authority registry database, specify either... # ... 
the path to a SQLite file diff --git a/metadata_extract/data/txt/info_page_keywords.json b/metadata_extract/data/txt/info_page_keywords.json index 213150e..2d9ace3 100644 --- a/metadata_extract/data/txt/info_page_keywords.json +++ b/metadata_extract/data/txt/info_page_keywords.json @@ -1,17 +1,21 @@ { - "en": [ + "eng": [ "year", "date", "publish", "isbn", "issn", "project no.", "title", "author", "number of pages", "summary", "abstract", "©", "subject words", "keywords", "country", "county", "report", "employer", "availability", "contact person", "publication type" ], - "no": [ + "nob": [ "år", "dato", "utgivelsesår", "utgiver", "utgitt", "isbn", "issn", "prosjektnummer", "prosjektnr.", "tittel", "forfatter", "antallsider", "antall sider", "sidetall", "sammendrag", "©", "opphavsrett", "rettighetshaver", "emneord", "nøkkelord", "stikkord", "land", "fylke", "rapport", "godkjennere", "redaktører", "oppdragsgiver", "tilgjengelighet", "kontaktperson", "oppdrag", "referanse", "publikasjonstype", "publiseringstype", "sitering", "signatur" ], - "nn": [ + "nno": [ "forfattar" + ], + "nor": [], + "und": [ + "isbn", "issn", "©" ] } \ No newline at end of file diff --git a/metadata_extract/data/txt/labels.json b/metadata_extract/data/txt/labels.json index 018960d..e05381c 100644 --- a/metadata_extract/data/txt/labels.json +++ b/metadata_extract/data/txt/labels.json @@ -1,5 +1,5 @@ { - "en": { + "eng": { "author": ["authors", "author(s)", "author"], "photo": ["photographer", "photo", "illustration"], "publisher": ["published by", "publisher"], @@ -7,7 +7,7 @@ "bindingWords": ["and"], "report": ["report"] }, - "no": { + "nob": { "author": ["forfatter(e)", "forfattere", "forfatter", "skrevet av"], "photo": ["fotograf", "foto", "illustrasjon", "bilde", "fotomontasje"], "publisher": ["utgiver", "utgivere", "utgitt av"], @@ -15,12 +15,12 @@ "bindingWords": ["og"], "report": ["rapport"] }, - "nn": { - "author": ["forfattar(ar)", "forfattarar", "forfattar"], - "photo": [], - 
"publisher": ["utgjevar", "utgjevarar"], - "reportType": [], - "bindingWords": [], - "report": [] + "nno": { + "author": ["forfattar(ar)", "forfattarar", "forfattar", "skrive av"], + "photo": ["fotograf", "foto", "illustrasjon", "bilete", "fotomontasje"], + "publisher": ["utgjevar", "utgjevarar", "utgivar", "utgivarar", "gitt ut av"], + "reportType": ["årsrapport", "evaluering", "rettleiar", "undersøking"], + "bindingWords": ["og"], + "report": ["rapport"] } } \ No newline at end of file diff --git a/metadata_extract/data/txt/stopwords.json b/metadata_extract/data/txt/stopwords.json index 2375dae..30c7b30 100644 --- a/metadata_extract/data/txt/stopwords.json +++ b/metadata_extract/data/txt/stopwords.json @@ -1,5 +1,5 @@ { - "no": [ + "nob": [ "akademi", "universitet", "direktorat", "direktoratet", "forsvar", "militær", "forsker", "seniorforsker", "forskningsleder", "forskningssjef", "professor", "høgskole", "høyskole", "konsulent", "kommunal", "kommune", "kongelig", "kongelige", "politikk", "politiker", "politi", "politiet", "myndighet", "myndigheter", "overvåking", @@ -9,7 +9,17 @@ "lærer", "byrå", "assistent", "assisterende", "organisasjon", "senter", "departement", "avdeling", "fylkesmann", "institutt", "internasjonal", "verden", "program", "rapport", "vitenskap" ], - "en": [ + "nno": [ + "akademi", "universitet", "direktorat", "direktoratet", "forsvar", "militær", "forskar", "seniorforskar", + "forskingsleiar", "forskingssjef", "professor", "høgskule", "konsulent", "kommunal", "kommune", + "kongeleg", "kongelege", "politikk", "politikar", "politi", "politiet", "styresmakt", "styresmakter", "overvaking", + "forsking", "forsvaret", "europa", "amerika", "fylke", "kontor", "seniorrådgivar", "rådgivar", + "rådgivarar", "seniorrådgivarar", "prosjekt", "prosjektleiar", "direktør", "laboratorium", "stortinget", "system", + "vitskap", "seksjon", "skule", "fakultet", "norge", "norsk", "seniorforskar", "skandinavia", "strategisk", + "lærer", "byrå", "assistent", 
"assisterande", "organisasjon", "senter", "departement", "avdeling", "fylkesmann", + "institutt", "internasjonal", "verda", "program", "rapport", "vitskap" + ], + "eng": [ "academy", "university", "director", "directorate", "defence", "military", "researcher", "senior researcher", "research leader", "research manager", "professor", "college", "consultant", "consulting", "municipal", "municipality", "royal", "politics", "politician", "police", "authority", "authorities", "surveillance", "research", "europe", diff --git a/metadata_extract/init_files.py b/metadata_extract/init_files.py index 8ffe688..03c6b49 100644 --- a/metadata_extract/init_files.py +++ b/metadata_extract/init_files.py @@ -1,9 +1,10 @@ # pylint: disable=missing-module-docstring import json - from importlib.resources import files from typing import Type, Any +from src.settings import get_settings + class InitFiles: """ Class for loading the files in the metadata_extract.data directory, @@ -29,6 +30,7 @@ def __new__(cls: Type['InitFiles'], *args: Any, **kwargs: Any) -> 'InitFiles': def __init__(self) -> None: if InitFiles._initialized: return + self.languages = get_settings().LANGUAGES.split(',') self.info_page_keywords = self.__init_info_page_keywords() self.stopwords = self.__init_stopwords() self.labels = self.__init_labels() @@ -55,31 +57,31 @@ def __init_info_page_keywords(self) -> list[str]: keywords = [] with files("metadata_extract.data").joinpath("txt/info_page_keywords.json").open() as file: languages = json.load(file) - for lang in languages: - keywords.extend(languages[lang]) + [keywords.extend(languages[lang]) for lang in self.languages if lang in languages] return keywords def __init_stopwords(self) -> list[str]: stopwords = [] with files("metadata_extract.data").joinpath("txt/stopwords.json").open() as file: languages = json.load(file) - for lang in languages: - stopwords.extend(languages[lang]) + [stopwords.extend(languages[lang]) for lang in self.languages if lang in languages] return 
stopwords def __init_labels(self) -> dict[str, str]: labels = {} with files("metadata_extract.data").joinpath("txt/labels.json").open() as file: languages = json.load(file) - for lang in languages: - for key in languages[lang]: - if key not in labels: - labels[key] = "" - labels[key] += "|" + "|".join(languages[lang][key]) + [self.__get_labels_from_lang(labels, lang, languages) for lang in self.languages if lang in languages] for key in labels: labels[key] = labels[key].lstrip("|").rstrip("|") return labels + def __get_labels_from_lang(self, labels: dict, lang, languages) -> None: + for key in languages[lang]: + if key not in labels: + labels[key] = "" + labels[key] += "|" + "|".join(languages[lang][key]) + def __init_doc_type_mapping(self) -> dict[str, str]: doc_type_mapping = {} with files("metadata_extract.data")\ diff --git a/src/settings.py b/src/settings.py index cf6e510..4dd9102 100644 --- a/src/settings.py +++ b/src/settings.py @@ -14,6 +14,7 @@ class Settings(BaseSettings): MOUNT_FOLDER: str = "" MAX_FILE_SIZE_MB: int = 0 ENVIRONMENT: str = "local" + LANGUAGES: str = "" REGISTRY_FILE: str = "" REGISTRY_HOST: str = "" REGISTRY_USER: str = "" From 42f04e12f90767a2291a4202ec0deb5adfbc1f3f Mon Sep 17 00:00:00 2001 From: Fredrik Monsen Date: Mon, 4 Sep 2023 08:06:47 +0200 Subject: [PATCH 02/15] Fix lint errors --- metadata_extract/init_files.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/metadata_extract/init_files.py b/metadata_extract/init_files.py index 03c6b49..96523b9 100644 --- a/metadata_extract/init_files.py +++ b/metadata_extract/init_files.py @@ -57,26 +57,31 @@ def __init_info_page_keywords(self) -> list[str]: keywords = [] with files("metadata_extract.data").joinpath("txt/info_page_keywords.json").open() as file: languages = json.load(file) - [keywords.extend(languages[lang]) for lang in self.languages if lang in languages] + for lang in filter(lambda x: x in languages, self.languages): + 
keywords.extend(languages[lang]) return keywords def __init_stopwords(self) -> list[str]: stopwords = [] with files("metadata_extract.data").joinpath("txt/stopwords.json").open() as file: languages = json.load(file) - [stopwords.extend(languages[lang]) for lang in self.languages if lang in languages] + for lang in filter(lambda x: x in languages, self.languages): + stopwords.extend(languages[lang]) return stopwords def __init_labels(self) -> dict[str, str]: - labels = {} + labels: dict[str, str] = {} with files("metadata_extract.data").joinpath("txt/labels.json").open() as file: - languages = json.load(file) - [self.__get_labels_from_lang(labels, lang, languages) for lang in self.languages if lang in languages] + languages: dict[str, Any] = json.load(file) + for lang in filter(lambda x: x in languages, self.languages): + self.__get_labels_from_lang(labels, lang, languages) for key in labels: labels[key] = labels[key].lstrip("|").rstrip("|") return labels - def __get_labels_from_lang(self, labels: dict, lang, languages) -> None: + def __get_labels_from_lang( + self, labels: dict[str, str], lang: str, languages: dict[str, Any] + ) -> None: for key in languages[lang]: if key not in labels: labels[key] = "" From e0b95f3ef8ba4cbcd61260a2846e3cb84aed6d16 Mon Sep 17 00:00:00 2001 From: Fredrik Monsen Date: Mon, 4 Sep 2023 10:04:32 +0200 Subject: [PATCH 03/15] Add pytest-env --- .github/workflows/lint_and_test.yml | 2 +- test/pytest.ini | 3 +++ test/requirements.txt | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) create mode 100644 test/pytest.ini diff --git a/.github/workflows/lint_and_test.yml b/.github/workflows/lint_and_test.yml index 37efdb6..2637416 100644 --- a/.github/workflows/lint_and_test.yml +++ b/.github/workflows/lint_and_test.yml @@ -23,4 +23,4 @@ jobs: - name: Type-check run: mypy metadata_extract src diff main.py - name: Running tests - run: python -m pytest --cov=metadata_extract + run: python -m pytest -c test/pytest.ini --cov=metadata_extract diff 
--git a/test/pytest.ini b/test/pytest.ini new file mode 100644 index 0000000..2ddc476 --- /dev/null +++ b/test/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +env = + LANGUAGES=nob,nno,nor,eng \ No newline at end of file diff --git a/test/requirements.txt b/test/requirements.txt index e34eb7a..589518b 100644 --- a/test/requirements.txt +++ b/test/requirements.txt @@ -8,3 +8,4 @@ types-dateparser==1.1.4.9 types-requests==2.31.0.1 types-Markdown==3.4.2.9 pylint==2.17.4 +pytest-env==1.0.1 From 9f3b3388cfc99e3af6b81b83c13c89e1845a5977 Mon Sep 17 00:00:00 2001 From: Fredrik Monsen Date: Wed, 6 Sep 2023 07:37:33 +0200 Subject: [PATCH 04/15] Fill out nynorsk word lists --- .../data/txt/info_page_keywords.json | 14 ++++++++------ metadata_extract/data/txt/stopwords.json | 18 +++++++++--------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/metadata_extract/data/txt/info_page_keywords.json b/metadata_extract/data/txt/info_page_keywords.json index 2d9ace3..8cb8522 100644 --- a/metadata_extract/data/txt/info_page_keywords.json +++ b/metadata_extract/data/txt/info_page_keywords.json @@ -6,15 +6,17 @@ ], "nob": [ "år", "dato", "utgivelsesår", "utgiver", "utgitt", "isbn", "issn", "prosjektnummer", "prosjektnr.", "tittel", - "forfatter", "antallsider", "antall sider", "sidetall", "sammendrag", "©", "opphavsrett", - "rettighetshaver", "emneord", "nøkkelord", "stikkord", "land", "fylke", "rapport", "godkjennere", "redaktører", - "oppdragsgiver", "tilgjengelighet", "kontaktperson", "oppdrag", "referanse", "publikasjonstype", "publiseringstype", - "sitering", "signatur" + "forfatter", "antallsider", "antall sider", "sidetall", "sammendrag", "©", "opphavsrett", "rettighetshaver", + "emneord", "nøkkelord", "stikkord", "land", "fylke", "rapport", "godkjennere", "redaktører", "oppdragsgiver", + "tilgjengelighet", "kontaktperson", "oppdrag", "referanse", "publikasjonstype", "publiseringstype", "sitering", + "signatur" ], "nno": [ - "forfattar" + "år", "dato", "utgivingsår", 
"utgivar", "gitt ut", "isbn", "issn", "prosjektnummer", "prosjektnr.", "tittel", + "forfattar", "talet på sider", "sidetal", "samandrag", "©", "opphavsrett", "rettshavar", "emneord", "nøkkelord", + "stikkord", "land", "fylke", "rapport", "godkjennere", "redaktørar", "oppdragsgivar", "tilgjengelegheit", + "kontaktperson", "oppdrag", "referanse", "publikasjonstype", "publiseringstype", "sitering", "signatur" ], - "nor": [], "und": [ "isbn", "issn", "©" ] diff --git a/metadata_extract/data/txt/stopwords.json b/metadata_extract/data/txt/stopwords.json index 30c7b30..4d9eba3 100644 --- a/metadata_extract/data/txt/stopwords.json +++ b/metadata_extract/data/txt/stopwords.json @@ -1,4 +1,13 @@ { + "eng": [ + "academy", "university", "director", "directorate", "defence", "military", "researcher", "senior researcher", + "research leader", "research manager", "professor", "college", "consultant", "consulting", "municipal", "municipality", + "royal", "politics", "politician", "police", "authority", "authorities", "surveillance", "research", "europe", + "america", "county", "office", "senior advisor", "advisor", "senior advisors", "project", "project manager", + "laboratory", "parliament", "system", "science", "section", "school", "faculty", "norway", "norwegian", + "scandinavia", "strategic", "teacher", "agency", "assistant", "organization", "center", "department", "division", + "governor", "institute", "international", "world", "policy", "program", "report", "science", "ministry", "ministries" + ], "nob": [ "akademi", "universitet", "direktorat", "direktoratet", "forsvar", "militær", "forsker", "seniorforsker", "forskningsleder", "forskningssjef", "professor", "høgskole", "høyskole", "konsulent", "kommunal", "kommune", @@ -18,14 +27,5 @@ "vitskap", "seksjon", "skule", "fakultet", "norge", "norsk", "seniorforskar", "skandinavia", "strategisk", "lærer", "byrå", "assistent", "assisterande", "organisasjon", "senter", "departement", "avdeling", "fylkesmann", "institutt", 
"internasjonal", "verda", "program", "rapport", "vitskap" - ], - "eng": [ - "academy", "university", "director", "directorate", "defence", "military", "researcher", "senior researcher", - "research leader", "research manager", "professor", "college", "consultant", "consulting", "municipal", "municipality", - "royal", "politics", "politician", "police", "authority", "authorities", "surveillance", "research", "europe", - "america", "county", "office", "senior advisor", "advisor", "senior advisors", "project", "project manager", - "laboratory", "parliament", "system", "science", "section", "school", "faculty", "norway", "norwegian", - "scandinavia", "strategic", "teacher", "agency", "assistant", "organization", "center", "department", "division", - "governor", "institute", "international", "world", "policy", "program", "report", "science", "ministry", "ministries" ] } \ No newline at end of file From c4cd7c92271eea29a689c575ce4dc912e1f415f3 Mon Sep 17 00:00:00 2001 From: fredrikmonsen <31658585+fredrikmonsen@users.noreply.github.com> Date: Thu, 7 Sep 2023 08:18:04 +0200 Subject: [PATCH 05/15] Apply suggestions from code review Co-authored-by: pierrebeauguitte --- .env.example | 2 +- metadata_extract/data/txt/info_page_keywords.json | 10 +++++----- test/pytest.ini | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.env.example b/.env.example index 9978e92..a354002 100644 --- a/.env.example +++ b/.env.example @@ -5,7 +5,7 @@ DIFF_FILES_FOLDER=/path/to/diff/files # A comma separated list of ISO 639-2 language codes # Languages must be present in the metadata_extract.data.txt files, otherwise they will be skipped on a per file basis -LANGUAGE=eng,nob +LANGUAGES=eng,nob # To use a authority registry database, specify either... 
diff --git a/metadata_extract/data/txt/info_page_keywords.json b/metadata_extract/data/txt/info_page_keywords.json index 8cb8522..9042073 100644 --- a/metadata_extract/data/txt/info_page_keywords.json +++ b/metadata_extract/data/txt/info_page_keywords.json @@ -1,19 +1,19 @@ { "eng": [ - "year", "date", "publish", "isbn", "issn", "project no.", "title", "author", "number of pages", "summary", - "abstract", "©", "subject words", "keywords", "country", "county", "report", "employer", "availability", + "year", "date", "publish", "project no.", "title", "author", "number of pages", "summary", + "abstract", "subject words", "keywords", "country", "county", "report", "employer", "availability", "contact person", "publication type" ], "nob": [ "år", "dato", "utgivelsesår", "utgiver", "utgitt", "isbn", "issn", "prosjektnummer", "prosjektnr.", "tittel", - "forfatter", "antallsider", "antall sider", "sidetall", "sammendrag", "©", "opphavsrett", "rettighetshaver", + "forfatter", "antallsider", "antall sider", "sidetall", "sammendrag", "opphavsrett", "rettighetshaver", "emneord", "nøkkelord", "stikkord", "land", "fylke", "rapport", "godkjennere", "redaktører", "oppdragsgiver", "tilgjengelighet", "kontaktperson", "oppdrag", "referanse", "publikasjonstype", "publiseringstype", "sitering", "signatur" ], "nno": [ - "år", "dato", "utgivingsår", "utgivar", "gitt ut", "isbn", "issn", "prosjektnummer", "prosjektnr.", "tittel", - "forfattar", "talet på sider", "sidetal", "samandrag", "©", "opphavsrett", "rettshavar", "emneord", "nøkkelord", + "år", "dato", "utgivingsår", "utgivar", "gitt ut", "prosjektnummer", "prosjektnr.", "tittel", + "forfattar", "talet på sider", "sidetal", "samandrag", "opphavsrett", "rettshavar", "emneord", "nøkkelord", "stikkord", "land", "fylke", "rapport", "godkjennere", "redaktørar", "oppdragsgivar", "tilgjengelegheit", "kontaktperson", "oppdrag", "referanse", "publikasjonstype", "publiseringstype", "sitering", "signatur" ], diff --git a/test/pytest.ini 
b/test/pytest.ini index 2ddc476..3d49122 100644 --- a/test/pytest.ini +++ b/test/pytest.ini @@ -1,3 +1,3 @@ [pytest] env = - LANGUAGES=nob,nno,nor,eng \ No newline at end of file + LANGUAGES=nob,nno,eng \ No newline at end of file From 1c27537eff6d0add49acc17732ed97e00ce7706e Mon Sep 17 00:00:00 2001 From: Fredrik Monsen Date: Fri, 8 Sep 2023 13:38:03 +0200 Subject: [PATCH 06/15] Rewrite singleton InitFiles -> ResourceLoader with static methods --- metadata_extract/author_name.py | 8 +- metadata_extract/finder.py | 8 +- metadata_extract/infopage.py | 11 +-- metadata_extract/init_files.py | 95 ------------------------ metadata_extract/meteor.py | 7 +- metadata_extract/page.py | 2 +- metadata_extract/resource_loader.py | 96 ++++++++++++++++++++++++ metadata_extract/text.py | 111 ++++++++++++++++++++-------- src/util.py | 8 +- test/test_infopage.py | 2 + 10 files changed, 205 insertions(+), 143 deletions(-) delete mode 100644 metadata_extract/init_files.py create mode 100644 metadata_extract/resource_loader.py diff --git a/metadata_extract/author_name.py b/metadata_extract/author_name.py index 227513e..6ac9d22 100644 --- a/metadata_extract/author_name.py +++ b/metadata_extract/author_name.py @@ -33,7 +33,7 @@ def get_author_names(text_block: str) -> Optional[list[str]]: def remove_multi_capital_letter(authors: list[str]) -> list[str]: """Remove all names with multiple sequential capital letters""" return [author for author in authors - if not text.DOUBLE_CAPITAL_LETTER_PATTERN.search(author)] + if not text.double_capital_letter_pattern().search(author)] def remove_non_author_name(name: str) -> Optional[str]: @@ -44,7 +44,7 @@ def remove_non_author_name(name: str) -> Optional[str]: def remove_parenthesis(author_text: str) -> str: """Remove all parenthesis and text inside them""" - parenthesis_match = text.PARENTHESIS_PATTERN.findall(author_text) + parenthesis_match = text.parenthesis_pattern().findall(author_text) if parenthesis_match: for match in parenthesis_match: 
author_text = author_text.replace(match, "") @@ -53,7 +53,7 @@ def remove_parenthesis(author_text: str) -> str: def match_text_name_regex(author_text: str) -> Optional[str]: """Match all names in text to name_pattern regular expression""" - name_match = text.NAME_PATTERN.findall(author_text) + name_match = text.name_pattern().findall(author_text) if name_match: author_text = ", ".join(name_match) return author_text @@ -82,7 +82,7 @@ def is_probable_name_block(text_block: str) -> bool: text_block = text.substitute_special_char_and_binding(text_block) # Step 2: Check if the block and the match have the same length - match_block = "".join(text.NAME_PATTERN.findall(text_block)) + match_block = "".join(text.name_pattern().findall(text_block)) return len(text_block) == len(match_block) diff --git a/metadata_extract/finder.py b/metadata_extract/finder.py index c75f7aa..aa09250 100644 --- a/metadata_extract/finder.py +++ b/metadata_extract/finder.py @@ -100,7 +100,7 @@ def find_publisher(self) -> None: Returns after first value is found.""" for number, page in self.doc.pages.items(): for line in page.split('\n'): - match = text.PUBLISHER_LABEL.match(line) + match = text.publisher_label().match(line) if match is not None: value = line[match.span()[1]:].strip() if value != '': @@ -134,7 +134,7 @@ def find_author(self) -> None: if found_title and isinstance(found_title, str) \ and author_name.name_exists_in_title(found_title, author): continue - if any(keyword in author.lower() for keyword in text.STOPWORDS): + if any(keyword in author.lower() for keyword in text.stopwords()): continue if author_name.is_all_caps_spaced(author): continue @@ -189,10 +189,10 @@ def get_author_from_info(self) -> None: # TODO: Seems to only fetch first author before comma from pdfinfo if self.doc.pdfinfo['author']: author = self.doc.pdfinfo['author'] - name_match = text.NAME_PATTERN.findall(author) + name_match = text.name_pattern().findall(author) for match in name_match: found_on_page = 
text.find_in_pages(match, self.doc.pages) - if any(keyword in author.lower() for keyword in text.STOPWORDS): + if any(keyword in author.lower() for keyword in text.stopwords()): continue if found_on_page > 0: candidate = Candidate(author_name.create_author_dict(match), diff --git a/metadata_extract/infopage.py b/metadata_extract/infopage.py index 75e2d44..905ace1 100644 --- a/metadata_extract/infopage.py +++ b/metadata_extract/infopage.py @@ -4,9 +4,9 @@ from typing import Optional, TypedDict from fitz import Document -from metadata_extract.init_files import InitFiles from . import text from .author_name import get_author_names +from .resource_loader import ResourceLoader from .page import Page, TextBlock @@ -30,9 +30,6 @@ class InfoPage(Page): HASKEYWORD = 4 KEYWORDFONT = 8 - # TODO: add nynorsk / samisk words - keywords = InitFiles().get_info_page_keywords() - @staticmethod def find_page_number(pages: dict[int, str]) -> int: """Looks for the info page, based on a keyword list. @@ -42,7 +39,7 @@ def find_page_number(pages: dict[int, str]) -> int: scores: dict[int, int] = {} for page in pages: score = 0 - for k in InfoPage.keywords: + for k in ResourceLoader.get_info_page_keywords(): if k in pages[page].lower(): score += 1 scores[page] = score @@ -53,7 +50,7 @@ def find_page_number(pages: dict[int, str]) -> int: @staticmethod def keyword_appears_in(string: str) -> bool: - for k in InfoPage.keywords: + for k in ResourceLoader.get_info_page_keywords(): if k in string.lower(): return True return False @@ -136,7 +133,7 @@ def find_publisher(self) -> Optional[str]: def find_author(self) -> Optional[list[str]]: author_block = None for block in self.text_blocks: - if text.AUTHOR_LABEL.match(block.text.lower()): + if text.author_label().match(block.text.lower()): author_block = block break if author_block: diff --git a/metadata_extract/init_files.py b/metadata_extract/init_files.py deleted file mode 100644 index 96523b9..0000000 --- a/metadata_extract/init_files.py +++ 
/dev/null @@ -1,95 +0,0 @@ -# pylint: disable=missing-module-docstring -import json -from importlib.resources import files -from typing import Type, Any - -from src.settings import get_settings - - -class InitFiles: - """ Class for loading the files in the metadata_extract.data directory, - which are applied to regular expressions in the metadata_extract.text module - and in the keyword list of the metadata.infopage module. - - The class is implemented as a singleton, so that the files are only read once. - - The application requires the following files used in this class to run: - - txt/info_page_keywords.json - - txt/stopwords.json - - txt/labels.json - - txt/doc_type_mapping_no_en.json - """ - _instance = None - _initialized = False - - def __new__(cls: Type['InitFiles'], *args: Any, **kwargs: Any) -> 'InitFiles': - if not cls._instance: - cls._instance = super(InitFiles, cls).__new__(cls, *args, **kwargs) - return cls._instance - - def __init__(self) -> None: - if InitFiles._initialized: - return - self.languages = get_settings().LANGUAGES.split(',') - self.info_page_keywords = self.__init_info_page_keywords() - self.stopwords = self.__init_stopwords() - self.labels = self.__init_labels() - self.doc_type_mapping = self.__init_doc_type_mapping() - InitFiles._initialized = True - - @staticmethod - def get_info_page_keywords() -> list[str]: - return InitFiles().info_page_keywords - - @staticmethod - def get_stopwords() -> list[str]: - return InitFiles().stopwords - - @staticmethod - def get_labels() -> dict[str, str]: - return InitFiles().labels - - @staticmethod - def get_doc_type_mapping() -> dict[str, str]: - return InitFiles().doc_type_mapping - - def __init_info_page_keywords(self) -> list[str]: - keywords = [] - with files("metadata_extract.data").joinpath("txt/info_page_keywords.json").open() as file: - languages = json.load(file) - for lang in filter(lambda x: x in languages, self.languages): - keywords.extend(languages[lang]) - return keywords - - def 
__init_stopwords(self) -> list[str]: - stopwords = [] - with files("metadata_extract.data").joinpath("txt/stopwords.json").open() as file: - languages = json.load(file) - for lang in filter(lambda x: x in languages, self.languages): - stopwords.extend(languages[lang]) - return stopwords - - def __init_labels(self) -> dict[str, str]: - labels: dict[str, str] = {} - with files("metadata_extract.data").joinpath("txt/labels.json").open() as file: - languages: dict[str, Any] = json.load(file) - for lang in filter(lambda x: x in languages, self.languages): - self.__get_labels_from_lang(labels, lang, languages) - for key in labels: - labels[key] = labels[key].lstrip("|").rstrip("|") - return labels - - def __get_labels_from_lang( - self, labels: dict[str, str], lang: str, languages: dict[str, Any] - ) -> None: - for key in languages[lang]: - if key not in labels: - labels[key] = "" - labels[key] += "|" + "|".join(languages[lang][key]) - - def __init_doc_type_mapping(self) -> dict[str, str]: - doc_type_mapping = {} - with files("metadata_extract.data")\ - .joinpath("txt/doc_type_mapping_no_en.json").open() as file: - doc_type_mapping = json.load(file) - return doc_type_mapping diff --git a/metadata_extract/meteor.py b/metadata_extract/meteor.py index b9f41ae..3bca63e 100644 --- a/metadata_extract/meteor.py +++ b/metadata_extract/meteor.py @@ -2,6 +2,8 @@ from typing import Optional + +from .resource_loader import ResourceLoader from .registry import PublisherRegistry from .meteor_document import MeteorDocument from .metadata import Results @@ -17,8 +19,11 @@ class Meteor: and return the best ones as a Results object (TypedDict, JSON-serializable) """ - def __init__(self) -> None: + def __init__(self, languages: Optional[list[str]] = None) -> None: self.registry: Optional[PublisherRegistry] = None + ResourceLoader.load_info_page_keywords(languages) + ResourceLoader.load_stopwords(languages) + ResourceLoader.load_labels(languages) def set_registry(self, registry: 
PublisherRegistry) -> None: self.registry = registry diff --git a/metadata_extract/page.py b/metadata_extract/page.py index f3bec01..2d9ee56 100644 --- a/metadata_extract/page.py +++ b/metadata_extract/page.py @@ -129,7 +129,7 @@ def find_isxn(self, identifier: str) -> list[ValueAndContext]: def find_publisher_block(self) -> Optional[TextBlock]: publisher_block = None for block in self.text_blocks: - if text.PUBLISHER_LABEL.match(block.text.lower()): + if text.publisher_label().match(block.text.lower()): publisher_block = block break return publisher_block diff --git a/metadata_extract/resource_loader.py b/metadata_extract/resource_loader.py new file mode 100644 index 0000000..dcfdacf --- /dev/null +++ b/metadata_extract/resource_loader.py @@ -0,0 +1,96 @@ +# pylint: disable=missing-module-docstring +import json +from importlib.resources import files +from typing import Any, Optional + + +class ResourceLoader: + """ Class for loading resource files in the metadata_extract.data directory, + which are applied to regular expressions in the metadata_extract.text module + and in the keyword list of the metadata.infopage module. 
+ + The application requires the following files used in this class to run: + - txt/info_page_keywords.json + - txt/stopwords.json + - txt/labels.json + - txt/doc_type_mapping_no_en.json + """ + __info_page_keywords: list[str] + __stopwords: list[str] + __labels: dict[str, Any] + __doc_type_mapping: dict[str, str] + + __lang_info_page_keywords: dict[str, list[str]] + __lang_stopwords: dict[str, list[str]] + __lang_labels: dict[str, Any] + + @staticmethod + def load_info_page_keywords(selected_languages: Optional[list[str]] = None) -> None: + with files("metadata_extract.data").joinpath("txt/info_page_keywords.json").open() as file: + info_page_keywords = json.load(file) + ResourceLoader.__lang_info_page_keywords = {} + if selected_languages: + for lang in filter( + lambda x: x in info_page_keywords, selected_languages + ): + ResourceLoader.__lang_info_page_keywords[lang] = info_page_keywords[lang] + else: + ResourceLoader.__lang_info_page_keywords = info_page_keywords + + @staticmethod + def load_stopwords(selected_languages: Optional[list[str]] = None) -> None: + with files("metadata_extract.data").joinpath("txt/stopwords.json").open() as file: + stopwords = json.load(file) + ResourceLoader.__lang_stopwords = {} + if selected_languages: + for lang in filter(lambda x: x in stopwords, selected_languages): + ResourceLoader.__lang_stopwords[lang] = stopwords[lang] + else: + ResourceLoader.__lang_stopwords = stopwords + + @staticmethod + def load_labels(selected_languages: Optional[list[str]] = None) -> None: + with files("metadata_extract.data").joinpath("txt/labels.json").open() as file: + labels = json.load(file) + ResourceLoader.__lang_labels = {} + if selected_languages: + for lang in filter(lambda x: x in labels, selected_languages): + ResourceLoader.__lang_labels[lang] = labels[lang] + else: + ResourceLoader.__lang_labels = labels + + @staticmethod + def get_info_page_keywords() -> list[str]: + keywords = [] + for lang in 
ResourceLoader.__lang_info_page_keywords: + keywords.extend(ResourceLoader.__lang_info_page_keywords[lang]) + ResourceLoader.__info_page_keywords = keywords + return ResourceLoader.__info_page_keywords + + @staticmethod + def get_stopwords() -> list[str]: + stopwords = [] + for lang in ResourceLoader.__lang_stopwords: + stopwords.extend(ResourceLoader.__lang_stopwords[lang]) + ResourceLoader.__stopwords = stopwords + return ResourceLoader.__stopwords + + @staticmethod + def get_labels() -> dict[str, str]: + labels: dict[str, str] = {} + for lang in ResourceLoader.__lang_labels: + for key in ResourceLoader.__lang_labels[lang]: + if key not in labels: + labels[key] = "" + labels[key] += "|" + "|".join(ResourceLoader.__lang_labels[lang][key]) + for key in labels: + labels[key] = labels[key].lstrip("|").rstrip("|") + ResourceLoader.__labels = labels + return ResourceLoader.__labels + + @staticmethod + def get_doc_type_mapping() -> dict[str, str]: + with files("metadata_extract.data") \ + .joinpath("txt/doc_type_mapping_no_en.json").open() as file: + ResourceLoader.__doc_type_mapping = json.load(file) + return ResourceLoader.__doc_type_mapping diff --git a/metadata_extract/text.py b/metadata_extract/text.py index 89518a7..aa0086a 100644 --- a/metadata_extract/text.py +++ b/metadata_extract/text.py @@ -1,10 +1,11 @@ """Text module, containing methods and logic dealing with strings and regexes.""" import re -from typing import Optional +from typing import Optional, Pattern + import regex -from metadata_extract.init_files import InitFiles +from metadata_extract.resource_loader import ResourceLoader class ValueAndContext: @@ -17,29 +18,79 @@ def append_to_context(self, extra_context: str) -> None: self.context = (self.context or '') + extra_context -files = InitFiles() -LABELS = files.get_labels() -STOPWORDS = files.get_stopwords() +def labels() -> dict[str, str]: + return ResourceLoader.get_labels() + + +def stopwords() -> list[str]: + return ResourceLoader.get_stopwords() 
+ + +def info_page_keywords() -> list[str]: + return ResourceLoader.get_info_page_keywords() + ISXN_PATTERN = { 'ISSN': re.compile(r"\D(\d{4}[–-][\dX]{4})\D"), 'ISBN': re.compile(r"\D([\d–-]{13,17})\D") } -REPORT_PATTERN = re.compile(fr'^(\w+)\W({LABELS["report"]})\W', re.IGNORECASE) -TYPE_PATTERN_1 = re.compile(fr'\b({LABELS["reportType"]})\b', re.IGNORECASE) -TYPE_PATTERN_2 = re.compile(r'\bNOU\b') -DOC_TYPE_MAPPING = files.get_doc_type_mapping() -PUBLISHER_LABEL = re.compile(fr'({LABELS["publisher"]}):?', re.IGNORECASE) -NO_LETTERS_PATTERN = re.compile(r'^[\W\d]+$') -AUTHOR_LABEL = re.compile(fr'({LABELS["author"]}):?', re.IGNORECASE) -NAME_PATTERN = regex.compile(r"\b[^\P{Lu}][^\P{Ll}]*[-|‐]?[^\P{Lu}]?[^\P{Ll}’]*\.?" + - r"(?: [^\P{Lu}][^\P{Ll}’]*[-|‐]?[^\P{Lu}]?[^\P{Ll}]*\.?)+\b(?! *\()") -PARENTHESIS_PATTERN = re.compile(r"\(.*?\)") -DOUBLE_CAPITAL_LETTER_PATTERN = re.compile(r"\b[A-Z]{2,}\b") -BINDING_WORD_PATTERN = re.compile(fr'\b(?:{LABELS["bindingWords"]})\b|&|,') -SPECIAL_CHAR_AND_BINDING_PATTERN = re.compile(fr'[;:,.]|({LABELS["bindingWords"]})\b|&+') -NON_ALPHANUMERIC_PATTERN = re.compile(r"\W+") -PHOTOGRAPH_LABEL = re.compile(fr'\b({LABELS["photo"]})\b', re.IGNORECASE) + + +def report_pattern() -> Pattern[str]: + return re.compile(fr'^(\w+)\W({labels()["report"]})\W', re.IGNORECASE) + + +def type_pattern_1() -> Pattern[str]: + return re.compile(fr'\b({labels()["reportType"]})\b', re.IGNORECASE) + + +def type_pattern_2() -> Pattern[str]: + return re.compile(r'\bNOU\b') + + +def doc_type_mapping() -> dict[str, str]: + return ResourceLoader.get_doc_type_mapping() + + +def publisher_label() -> Pattern[str]: + return re.compile(fr'({labels()["publisher"]}):?', re.IGNORECASE) + + +def no_letters_pattern() -> Pattern[str]: + return re.compile(r'^[\W\d]+$') + + +def author_label() -> Pattern[str]: + return re.compile(fr'({labels()["author"]}):?', re.IGNORECASE) + + +def name_pattern() -> regex.regex.Pattern[str]: + return 
regex.compile(r"\b[^\P{Lu}][^\P{Ll}]*[-|‐]?[^\P{Lu}]?[^\P{Ll}’]*\.?" + + r"(?: [^\P{Lu}][^\P{Ll}’]*[-|‐]?[^\P{Lu}]?[^\P{Ll}]*\.?)+\b(?! *\()") + + +def parenthesis_pattern() -> Pattern[str]: + return re.compile(r"\(.*?\)") + + +def double_capital_letter_pattern() -> Pattern[str]: + return re.compile(r"\b[A-Z]{2,}\b") + + +def binding_word_pattern() -> Pattern[str]: + return re.compile(fr'\b(?:{labels()["bindingWords"]})\b|&|,') + + +def special_char_and_binding_pattern() -> Pattern[str]: + return re.compile(fr'[;:,.]|({labels()["bindingWords"]})\b|&+') + + +def non_alphanumeric_pattern() -> Pattern[str]: + return re.compile(r"\W+") + + +def photograph_label() -> Pattern[str]: + return re.compile(fr'\b({labels()["photo"]})\b', re.IGNORECASE) def find_in_pages(title: str, pages: dict[int, str], max_pages: int = 3) -> int: @@ -63,7 +114,7 @@ def find_isxn(identifier: str, text: str) -> Optional[ValueAndContext]: def find_report_prefix(text: str) -> Optional[str]: - match = REPORT_PATTERN.search(text + '.') + match = report_pattern().search(text + '.') if match: prefix = match.group(1) if prefix.isupper(): @@ -72,20 +123,20 @@ def find_report_prefix(text: str) -> Optional[str]: def find_doc_type(page_text: str) -> Optional[str]: - match = TYPE_PATTERN_1.search(page_text) + match = type_pattern_1().search(page_text) if match: doc_type = match.group(1).lower() - if doc_type in DOC_TYPE_MAPPING: - return DOC_TYPE_MAPPING[doc_type] + if doc_type in doc_type_mapping(): + return doc_type_mapping()[doc_type] return doc_type - match = TYPE_PATTERN_2.search(page_text) + match = type_pattern_2().search(page_text) if match: return "nou" return None def has_no_letters(text: str) -> bool: - return bool(NO_LETTERS_PATTERN.match(text)) + return bool(no_letters_pattern().match(text)) def clean_whitespace(text: str) -> str: @@ -93,16 +144,16 @@ def clean_whitespace(text: str) -> str: def split_on_binding_word(text: str) -> list[str]: - return BINDING_WORD_PATTERN.split(text) + return 
binding_word_pattern().split(text) def substitute_special_char_and_binding(text: str) -> str: - return SPECIAL_CHAR_AND_BINDING_PATTERN.sub('', text).replace(" ", " ").strip() + return special_char_and_binding_pattern().sub('', text).replace(" ", " ").strip() def substitute_non_alphanumeric(text: str) -> str: - return NON_ALPHANUMERIC_PATTERN.sub(' ', text).strip() + return non_alphanumeric_pattern().sub(' ', text).strip() def has_non_author_keywords(text: str) -> bool: - return bool(re.search(PHOTOGRAPH_LABEL, text)) + return bool(re.search(photograph_label(), text)) diff --git a/src/util.py b/src/util.py index 8326056..6ea8398 100644 --- a/src/util.py +++ b/src/util.py @@ -23,7 +23,7 @@ class Utils: """Helper methods for API endpoints""" def __init__(self) -> None: - self.meteor = Meteor() + self.meteor = Meteor(languages=Utils.get_languages()) if get_settings().REGISTRY_FILE: self.meteor.set_registry( PublisherRegistry(registry_file=get_settings().REGISTRY_FILE) @@ -40,6 +40,12 @@ def __init__(self) -> None: ) ) + @staticmethod + def get_languages() -> list[str] | None: + if len(get_settings().LANGUAGES) == 0: + return None + return get_settings().LANGUAGES.split(',') + @staticmethod def get_environment_prefix() -> str: return "/meteor" if get_settings().ENVIRONMENT in ["stage", "prod"] else "" diff --git a/test/test_infopage.py b/test/test_infopage.py index ba73f6b..b4573a4 100644 --- a/test/test_infopage.py +++ b/test/test_infopage.py @@ -3,8 +3,10 @@ from metadata_extract.infopage import InfoPage from metadata_extract.meteor_document import MeteorDocument +from metadata_extract.resource_loader import ResourceLoader doc = MeteorDocument('test/resources/report.pdf') +ResourceLoader.load_info_page_keywords(["und", "eng", "nob", "nno"]) infopagenr = InfoPage.find_page_number(doc.pages) From 82c3222490206cd197a9cc0ddf3c3ebe830aa9c2 Mon Sep 17 00:00:00 2001 From: Fredrik Monsen Date: Mon, 11 Sep 2023 09:04:08 +0200 Subject: [PATCH 07/15] Change und language code to 
mul --- .env.example | 3 ++- metadata_extract/data/txt/info_page_keywords.json | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.env.example b/.env.example index a354002..275083c 100644 --- a/.env.example +++ b/.env.example @@ -4,8 +4,9 @@ ENVIRONMENT=local DIFF_FILES_FOLDER=/path/to/diff/files # A comma separated list of ISO 639-2 language codes +# Include 'mul' to catch keywords present in multiple languages such as 'isbn' or 'issn' # Languages must be present in the metadata_extract.data.txt files, otherwise they will be skipped on a per file basis -LANGUAGES=eng,nob +LANGUAGES=mul,eng,nob # To use a authority registry database, specify either... diff --git a/metadata_extract/data/txt/info_page_keywords.json b/metadata_extract/data/txt/info_page_keywords.json index 9042073..9bd1c62 100644 --- a/metadata_extract/data/txt/info_page_keywords.json +++ b/metadata_extract/data/txt/info_page_keywords.json @@ -17,7 +17,7 @@ "stikkord", "land", "fylke", "rapport", "godkjennere", "redaktørar", "oppdragsgivar", "tilgjengelegheit", "kontaktperson", "oppdrag", "referanse", "publikasjonstype", "publiseringstype", "sitering", "signatur" ], - "und": [ + "mul": [ "isbn", "issn", "©" ] } \ No newline at end of file From 7a8aea4d559b634dc55316eebf704b542234e7f6 Mon Sep 17 00:00:00 2001 From: Fredrik Monsen Date: Tue, 12 Sep 2023 08:25:20 +0200 Subject: [PATCH 08/15] Apply suggestions --- .../data/txt/doc_type_mapping.json | 14 ++ .../data/txt/doc_type_mapping_no_en.json | 6 - metadata_extract/finder.py | 5 +- metadata_extract/meteor.py | 4 +- metadata_extract/resource_loader.py | 136 +++++++++++------- metadata_extract/text.py | 32 ++--- src/util.py | 2 +- test/test_infopage.py | 2 +- 8 files changed, 114 insertions(+), 87 deletions(-) create mode 100644 metadata_extract/data/txt/doc_type_mapping.json delete mode 100644 metadata_extract/data/txt/doc_type_mapping_no_en.json diff --git a/metadata_extract/data/txt/doc_type_mapping.json 
b/metadata_extract/data/txt/doc_type_mapping.json new file mode 100644 index 0000000..08c4464 --- /dev/null +++ b/metadata_extract/data/txt/doc_type_mapping.json @@ -0,0 +1,14 @@ +{ + "nob": { + "årsrapport": "annualReport", + "evaluering": "evaluation", + "veileder": "guidance", + "undersøkelse": "survey" + }, + "nno": { + "årsrapport": "annualReport", + "evaluering": "evaluation", + "veileiar": "guidance", + "undersøking": "survey" + } +} \ No newline at end of file diff --git a/metadata_extract/data/txt/doc_type_mapping_no_en.json b/metadata_extract/data/txt/doc_type_mapping_no_en.json deleted file mode 100644 index 3506d5a..0000000 --- a/metadata_extract/data/txt/doc_type_mapping_no_en.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "årsrapport": "annualReport", - "evaluering": "evaluation", - "veileder": "guidance", - "undersøkelse": "survey" -} \ No newline at end of file diff --git a/metadata_extract/finder.py b/metadata_extract/finder.py index aa09250..c3d8518 100644 --- a/metadata_extract/finder.py +++ b/metadata_extract/finder.py @@ -15,6 +15,7 @@ from .metadata import Metadata from .meteor_document import MeteorDocument from .registry import PublisherRegistry, RegistryType +from .resource_loader import ResourceLoader class CopyrightType(TypedDict): @@ -134,7 +135,7 @@ def find_author(self) -> None: if found_title and isinstance(found_title, str) \ and author_name.name_exists_in_title(found_title, author): continue - if any(keyword in author.lower() for keyword in text.stopwords()): + if any(keyword in author.lower() for keyword in ResourceLoader.get_stopwords()): continue if author_name.is_all_caps_spaced(author): continue @@ -192,7 +193,7 @@ def get_author_from_info(self) -> None: name_match = text.name_pattern().findall(author) for match in name_match: found_on_page = text.find_in_pages(match, self.doc.pages) - if any(keyword in author.lower() for keyword in text.stopwords()): + if any(keyword in author.lower() for keyword in ResourceLoader.get_stopwords()): 
continue if found_on_page > 0: candidate = Candidate(author_name.create_author_dict(match), diff --git a/metadata_extract/meteor.py b/metadata_extract/meteor.py index 3bca63e..635715d 100644 --- a/metadata_extract/meteor.py +++ b/metadata_extract/meteor.py @@ -21,9 +21,7 @@ class Meteor: def __init__(self, languages: Optional[list[str]] = None) -> None: self.registry: Optional[PublisherRegistry] = None - ResourceLoader.load_info_page_keywords(languages) - ResourceLoader.load_stopwords(languages) - ResourceLoader.load_labels(languages) + ResourceLoader.load(languages) def set_registry(self, registry: PublisherRegistry) -> None: self.registry = registry diff --git a/metadata_extract/resource_loader.py b/metadata_extract/resource_loader.py index dcfdacf..192a693 100644 --- a/metadata_extract/resource_loader.py +++ b/metadata_extract/resource_loader.py @@ -13,84 +13,116 @@ class ResourceLoader: - txt/info_page_keywords.json - txt/stopwords.json - txt/labels.json - - txt/doc_type_mapping_no_en.json + - txt/doc_type_mapping.json """ __info_page_keywords: list[str] __stopwords: list[str] __labels: dict[str, Any] __doc_type_mapping: dict[str, str] - __lang_info_page_keywords: dict[str, list[str]] - __lang_stopwords: dict[str, list[str]] - __lang_labels: dict[str, Any] + __lang_info_page_keywords: dict[str, list[str]] = {} + __lang_stopwords: dict[str, list[str]] = {} + __lang_labels: dict[str, Any] = {} + __lang_doc_type_mapping: dict[str, Any] = {} @staticmethod - def load_info_page_keywords(selected_languages: Optional[list[str]] = None) -> None: - with files("metadata_extract.data").joinpath("txt/info_page_keywords.json").open() as file: - info_page_keywords = json.load(file) - ResourceLoader.__lang_info_page_keywords = {} - if selected_languages: - for lang in filter( - lambda x: x in info_page_keywords, selected_languages - ): - ResourceLoader.__lang_info_page_keywords[lang] = info_page_keywords[lang] - else: - ResourceLoader.__lang_info_page_keywords = 
info_page_keywords - - @staticmethod - def load_stopwords(selected_languages: Optional[list[str]] = None) -> None: - with files("metadata_extract.data").joinpath("txt/stopwords.json").open() as file: - stopwords = json.load(file) - ResourceLoader.__lang_stopwords = {} - if selected_languages: - for lang in filter(lambda x: x in stopwords, selected_languages): - ResourceLoader.__lang_stopwords[lang] = stopwords[lang] - else: - ResourceLoader.__lang_stopwords = stopwords - - @staticmethod - def load_labels(selected_languages: Optional[list[str]] = None) -> None: - with files("metadata_extract.data").joinpath("txt/labels.json").open() as file: - labels = json.load(file) - ResourceLoader.__lang_labels = {} - if selected_languages: - for lang in filter(lambda x: x in labels, selected_languages): - ResourceLoader.__lang_labels[lang] = labels[lang] - else: - ResourceLoader.__lang_labels = labels + def load(selected_languages: Optional[list[str]] = None) -> None: + ResourceLoader.__load_info_page_keywords(selected_languages) + ResourceLoader.__load_stopwords(selected_languages) + ResourceLoader.__load_labels(selected_languages) + ResourceLoader.__load_doc_type_mapping(selected_languages) @staticmethod def get_info_page_keywords() -> list[str]: - keywords = [] - for lang in ResourceLoader.__lang_info_page_keywords: - keywords.extend(ResourceLoader.__lang_info_page_keywords[lang]) - ResourceLoader.__info_page_keywords = keywords return ResourceLoader.__info_page_keywords @staticmethod def get_stopwords() -> list[str]: - stopwords = [] - for lang in ResourceLoader.__lang_stopwords: - stopwords.extend(ResourceLoader.__lang_stopwords[lang]) - ResourceLoader.__stopwords = stopwords return ResourceLoader.__stopwords @staticmethod def get_labels() -> dict[str, str]: + return ResourceLoader.__labels + + @staticmethod + def get_doc_type_mapping() -> dict[str, str]: + return ResourceLoader.__doc_type_mapping + + @staticmethod + def __load_info_page_keywords(selected_languages: 
Optional[list[str]] = None) -> None: + if ResourceLoader.__lang_info_page_keywords: + return + with files("metadata_extract.data").joinpath("txt/info_page_keywords.json").open() as file: + keyword_data = json.load(file) + ResourceLoader.__lang_info_page_keywords = {} + if selected_languages: + for lang in filter( + lambda x: x in keyword_data, selected_languages + ): + ResourceLoader.__lang_info_page_keywords[lang] = keyword_data[lang] + else: + ResourceLoader.__lang_info_page_keywords = keyword_data + + keyword_list = [] + for lang, items in ResourceLoader.__lang_info_page_keywords.items(): + keyword_list.extend(items) + ResourceLoader.__info_page_keywords = keyword_list + + @staticmethod + def __load_stopwords(selected_languages: Optional[list[str]] = None) -> None: + if ResourceLoader.__lang_stopwords: + return + with files("metadata_extract.data").joinpath("txt/stopwords.json").open() as file: + stopwords_data = json.load(file) + ResourceLoader.__lang_stopwords = {} + if selected_languages: + for lang in filter(lambda x: x in stopwords_data, selected_languages): + ResourceLoader.__lang_stopwords[lang] = stopwords_data[lang] + else: + ResourceLoader.__lang_stopwords = stopwords_data + + stopwords_list = [] + for lang, items in ResourceLoader.__lang_stopwords.items(): + stopwords_list.extend(items) + ResourceLoader.__stopwords = stopwords_list + + @staticmethod + def __load_labels(selected_languages: Optional[list[str]] = None) -> None: + if ResourceLoader.__lang_labels: + return + with files("metadata_extract.data").joinpath("txt/labels.json").open() as file: + label_data = json.load(file) + ResourceLoader.__lang_labels = {} + if selected_languages: + for lang in filter(lambda x: x in label_data, selected_languages): + ResourceLoader.__lang_labels[lang] = label_data[lang] + else: + ResourceLoader.__lang_labels = label_data + labels: dict[str, str] = {} - for lang in ResourceLoader.__lang_labels: - for key in ResourceLoader.__lang_labels[lang]: + for lang, 
label_dict in ResourceLoader.__lang_labels.items(): + for key in label_dict: if key not in labels: labels[key] = "" labels[key] += "|" + "|".join(ResourceLoader.__lang_labels[lang][key]) for key in labels: labels[key] = labels[key].lstrip("|").rstrip("|") ResourceLoader.__labels = labels - return ResourceLoader.__labels @staticmethod - def get_doc_type_mapping() -> dict[str, str]: + def __load_doc_type_mapping(selected_languages: Optional[list[str]] = None) -> None: + if ResourceLoader.__lang_doc_type_mapping: + return with files("metadata_extract.data") \ - .joinpath("txt/doc_type_mapping_no_en.json").open() as file: - ResourceLoader.__doc_type_mapping = json.load(file) - return ResourceLoader.__doc_type_mapping + .joinpath("txt/doc_type_mapping.json").open() as file: + ResourceLoader.__lang_doc_type_mapping = json.load(file) + + doc_type_mapping: dict[str, str] = {} + if selected_languages: + for lang in filter( + lambda x: x in ResourceLoader.__lang_doc_type_mapping, selected_languages + ): + doc_type_mapping.update(ResourceLoader.__lang_doc_type_mapping[lang]) + else: + doc_type_mapping = ResourceLoader.__lang_doc_type_mapping + ResourceLoader.__doc_type_mapping = doc_type_mapping diff --git a/metadata_extract/text.py b/metadata_extract/text.py index aa0086a..88266b6 100644 --- a/metadata_extract/text.py +++ b/metadata_extract/text.py @@ -18,18 +18,10 @@ def append_to_context(self, extra_context: str) -> None: self.context = (self.context or '') + extra_context -def labels() -> dict[str, str]: +def __labels() -> dict[str, str]: return ResourceLoader.get_labels() -def stopwords() -> list[str]: - return ResourceLoader.get_stopwords() - - -def info_page_keywords() -> list[str]: - return ResourceLoader.get_info_page_keywords() - - ISXN_PATTERN = { 'ISSN': re.compile(r"\D(\d{4}[–-][\dX]{4})\D"), 'ISBN': re.compile(r"\D([\d–-]{13,17})\D") @@ -37,23 +29,19 @@ def info_page_keywords() -> list[str]: def report_pattern() -> Pattern[str]: - return 
re.compile(fr'^(\w+)\W({labels()["report"]})\W', re.IGNORECASE) + return re.compile(fr'^(\w+)\W({__labels()["report"]})\W', re.IGNORECASE) def type_pattern_1() -> Pattern[str]: - return re.compile(fr'\b({labels()["reportType"]})\b', re.IGNORECASE) + return re.compile(fr'\b({__labels()["reportType"]})\b', re.IGNORECASE) def type_pattern_2() -> Pattern[str]: return re.compile(r'\bNOU\b') -def doc_type_mapping() -> dict[str, str]: - return ResourceLoader.get_doc_type_mapping() - - def publisher_label() -> Pattern[str]: - return re.compile(fr'({labels()["publisher"]}):?', re.IGNORECASE) + return re.compile(fr'({__labels()["publisher"]}):?', re.IGNORECASE) def no_letters_pattern() -> Pattern[str]: @@ -61,7 +49,7 @@ def no_letters_pattern() -> Pattern[str]: def author_label() -> Pattern[str]: - return re.compile(fr'({labels()["author"]}):?', re.IGNORECASE) + return re.compile(fr'({__labels()["author"]}):?', re.IGNORECASE) def name_pattern() -> regex.regex.Pattern[str]: @@ -78,11 +66,11 @@ def double_capital_letter_pattern() -> Pattern[str]: def binding_word_pattern() -> Pattern[str]: - return re.compile(fr'\b(?:{labels()["bindingWords"]})\b|&|,') + return re.compile(fr'\b(?:{__labels()["bindingWords"]})\b|&|,') def special_char_and_binding_pattern() -> Pattern[str]: - return re.compile(fr'[;:,.]|({labels()["bindingWords"]})\b|&+') + return re.compile(fr'[;:,.]|({__labels()["bindingWords"]})\b|&+') def non_alphanumeric_pattern() -> Pattern[str]: @@ -90,7 +78,7 @@ def non_alphanumeric_pattern() -> Pattern[str]: def photograph_label() -> Pattern[str]: - return re.compile(fr'\b({labels()["photo"]})\b', re.IGNORECASE) + return re.compile(fr'\b({__labels()["photo"]})\b', re.IGNORECASE) def find_in_pages(title: str, pages: dict[int, str], max_pages: int = 3) -> int: @@ -126,8 +114,8 @@ def find_doc_type(page_text: str) -> Optional[str]: match = type_pattern_1().search(page_text) if match: doc_type = match.group(1).lower() - if doc_type in doc_type_mapping(): - return 
doc_type_mapping()[doc_type] + if doc_type in ResourceLoader.get_doc_type_mapping(): + return ResourceLoader.get_doc_type_mapping()[doc_type] return doc_type match = type_pattern_2().search(page_text) if match: diff --git a/src/util.py b/src/util.py index 6ea8398..315b918 100644 --- a/src/util.py +++ b/src/util.py @@ -42,7 +42,7 @@ def __init__(self) -> None: @staticmethod def get_languages() -> list[str] | None: - if len(get_settings().LANGUAGES) == 0: + if not get_settings().LANGUAGES: return None return get_settings().LANGUAGES.split(',') diff --git a/test/test_infopage.py b/test/test_infopage.py index b4573a4..c70f947 100644 --- a/test/test_infopage.py +++ b/test/test_infopage.py @@ -6,7 +6,7 @@ from metadata_extract.resource_loader import ResourceLoader doc = MeteorDocument('test/resources/report.pdf') -ResourceLoader.load_info_page_keywords(["und", "eng", "nob", "nno"]) +ResourceLoader.load(["und", "eng", "nob", "nno"]) infopagenr = InfoPage.find_page_number(doc.pages) From f2c71c3f823e991139c22ef94ccb660c086c0037 Mon Sep 17 00:00:00 2001 From: fredrikmonsen <31658585+fredrikmonsen@users.noreply.github.com> Date: Tue, 12 Sep 2023 08:33:10 +0200 Subject: [PATCH 09/15] Update test/test_infopage.py Co-authored-by: pierrebeauguitte --- test/test_infopage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_infopage.py b/test/test_infopage.py index c70f947..7aabb3f 100644 --- a/test/test_infopage.py +++ b/test/test_infopage.py @@ -6,7 +6,7 @@ from metadata_extract.resource_loader import ResourceLoader doc = MeteorDocument('test/resources/report.pdf') -ResourceLoader.load(["und", "eng", "nob", "nno"]) +ResourceLoader.load(["mul", "eng", "nob", "nno"]) infopagenr = InfoPage.find_page_number(doc.pages) From b66ed7e128aaf28ca06d6ca3731c292f5b12c44b Mon Sep 17 00:00:00 2001 From: Fredrik Monsen Date: Tue, 12 Sep 2023 08:35:58 +0200 Subject: [PATCH 10/15] Remove pytest-env dependency --- .github/workflows/lint_and_test.yml | 2 +- 
test/pytest.ini | 3 --- test/requirements.txt | 1 - 3 files changed, 1 insertion(+), 5 deletions(-) delete mode 100644 test/pytest.ini diff --git a/.github/workflows/lint_and_test.yml b/.github/workflows/lint_and_test.yml index 2637416..37efdb6 100644 --- a/.github/workflows/lint_and_test.yml +++ b/.github/workflows/lint_and_test.yml @@ -23,4 +23,4 @@ jobs: - name: Type-check run: mypy metadata_extract src diff main.py - name: Running tests - run: python -m pytest -c test/pytest.ini --cov=metadata_extract + run: python -m pytest --cov=metadata_extract diff --git a/test/pytest.ini b/test/pytest.ini deleted file mode 100644 index 3d49122..0000000 --- a/test/pytest.ini +++ /dev/null @@ -1,3 +0,0 @@ -[pytest] -env = - LANGUAGES=nob,nno,eng \ No newline at end of file diff --git a/test/requirements.txt b/test/requirements.txt index 589518b..e34eb7a 100644 --- a/test/requirements.txt +++ b/test/requirements.txt @@ -8,4 +8,3 @@ types-dateparser==1.1.4.9 types-requests==2.31.0.1 types-Markdown==3.4.2.9 pylint==2.17.4 -pytest-env==1.0.1 From 2d2122d454a55f6197351a9f0b4251a6857e2ceb Mon Sep 17 00:00:00 2001 From: Fredrik Monsen Date: Tue, 12 Sep 2023 13:55:23 +0200 Subject: [PATCH 11/15] Apply suggestions --- metadata_extract/resource_loader.py | 40 ++++----------- metadata_extract/text.py | 76 +++++++++++++++++++++++------ 2 files changed, 70 insertions(+), 46 deletions(-) diff --git a/metadata_extract/resource_loader.py b/metadata_extract/resource_loader.py index 192a693..24a4cf3 100644 --- a/metadata_extract/resource_loader.py +++ b/metadata_extract/resource_loader.py @@ -15,13 +15,10 @@ class ResourceLoader: - txt/labels.json - txt/doc_type_mapping.json """ - __info_page_keywords: list[str] - __stopwords: list[str] + __info_page_keywords: list[str] = [] + __stopwords: list[str] = [] __labels: dict[str, Any] __doc_type_mapping: dict[str, str] - - __lang_info_page_keywords: dict[str, list[str]] = {} - __lang_stopwords: dict[str, list[str]] = {} __lang_labels: dict[str, 
Any] = {} __lang_doc_type_mapping: dict[str, Any] = {} @@ -50,41 +47,24 @@ def get_doc_type_mapping() -> dict[str, str]: @staticmethod def __load_info_page_keywords(selected_languages: Optional[list[str]] = None) -> None: - if ResourceLoader.__lang_info_page_keywords: + if ResourceLoader.__info_page_keywords: return with files("metadata_extract.data").joinpath("txt/info_page_keywords.json").open() as file: keyword_data = json.load(file) - ResourceLoader.__lang_info_page_keywords = {} - if selected_languages: - for lang in filter( - lambda x: x in keyword_data, selected_languages - ): - ResourceLoader.__lang_info_page_keywords[lang] = keyword_data[lang] - else: - ResourceLoader.__lang_info_page_keywords = keyword_data - - keyword_list = [] - for lang, items in ResourceLoader.__lang_info_page_keywords.items(): - keyword_list.extend(items) - ResourceLoader.__info_page_keywords = keyword_list + for lang in keyword_data: + if selected_languages is None or lang in selected_languages: + ResourceLoader.__info_page_keywords.extend(keyword_data[lang]) @staticmethod def __load_stopwords(selected_languages: Optional[list[str]] = None) -> None: - if ResourceLoader.__lang_stopwords: + if ResourceLoader.__stopwords: return with files("metadata_extract.data").joinpath("txt/stopwords.json").open() as file: stopwords_data = json.load(file) - ResourceLoader.__lang_stopwords = {} - if selected_languages: - for lang in filter(lambda x: x in stopwords_data, selected_languages): - ResourceLoader.__lang_stopwords[lang] = stopwords_data[lang] - else: - ResourceLoader.__lang_stopwords = stopwords_data - stopwords_list = [] - for lang, items in ResourceLoader.__lang_stopwords.items(): - stopwords_list.extend(items) - ResourceLoader.__stopwords = stopwords_list + for lang in stopwords_data: + if selected_languages is None or lang in selected_languages: + ResourceLoader.__stopwords.extend(stopwords_data[lang]) @staticmethod def __load_labels(selected_languages: Optional[list[str]] = None) -> 
None: diff --git a/metadata_extract/text.py b/metadata_extract/text.py index 88266b6..eca05e0 100644 --- a/metadata_extract/text.py +++ b/metadata_extract/text.py @@ -22,63 +22,107 @@ def __labels() -> dict[str, str]: return ResourceLoader.get_labels() -ISXN_PATTERN = { +__PATTERNS: dict[str, Pattern[str]] = { 'ISSN': re.compile(r"\D(\d{4}[–-][\dX]{4})\D"), 'ISBN': re.compile(r"\D([\d–-]{13,17})\D") } +__NAME_PATTERN: dict[str, regex.regex.Pattern[str]] = {} + + +def issn_pattern() -> Pattern[str]: + if 'ISSN' not in __PATTERNS: + __PATTERNS['ISSN'] = re.compile(r"\D(\d{4}[–-][\dX]{4})\D") + return __PATTERNS['ISSN'] + + +def isbn_pattern() -> Pattern[str]: + if 'ISBN' not in __PATTERNS: + __PATTERNS['ISBN'] = re.compile(r"\D([\d–-]{13,17})\D") + return __PATTERNS['ISBN'] def report_pattern() -> Pattern[str]: - return re.compile(fr'^(\w+)\W({__labels()["report"]})\W', re.IGNORECASE) + if 'report' not in __PATTERNS: + __PATTERNS['report'] = re.compile(fr'^(\w+)\W({__labels()["report"]})\W', re.IGNORECASE) + return __PATTERNS['report'] def type_pattern_1() -> Pattern[str]: - return re.compile(fr'\b({__labels()["reportType"]})\b', re.IGNORECASE) + if 'type_pattern_1' not in __PATTERNS: + __PATTERNS['type_pattern_1'] = re.compile( + fr'\b({__labels()["reportType"]})\b', re.IGNORECASE + ) + return __PATTERNS['type_pattern_1'] def type_pattern_2() -> Pattern[str]: - return re.compile(r'\bNOU\b') + if 'type_pattern_2' not in __PATTERNS: + __PATTERNS['type_pattern_2'] = re.compile(r'\bNOU\b') + return __PATTERNS['type_pattern_2'] def publisher_label() -> Pattern[str]: - return re.compile(fr'({__labels()["publisher"]}):?', re.IGNORECASE) + if 'publisher' not in __PATTERNS: + __PATTERNS['publisher'] = re.compile(fr'({__labels()["publisher"]}):?', re.IGNORECASE) + return __PATTERNS['publisher'] def no_letters_pattern() -> Pattern[str]: - return re.compile(r'^[\W\d]+$') + if 'no_letters' not in __PATTERNS: + __PATTERNS['no_letters'] = re.compile(r'^[\W\d]+$') + return 
__PATTERNS['no_letters'] def author_label() -> Pattern[str]: - return re.compile(fr'({__labels()["author"]}):?', re.IGNORECASE) + if 'author' not in __PATTERNS: + __PATTERNS['author'] = re.compile(fr'({__labels()["author"]}):?', re.IGNORECASE) + return __PATTERNS['author'] def name_pattern() -> regex.regex.Pattern[str]: - return regex.compile(r"\b[^\P{Lu}][^\P{Ll}]*[-|‐]?[^\P{Lu}]?[^\P{Ll}’]*\.?" + - r"(?: [^\P{Lu}][^\P{Ll}’]*[-|‐]?[^\P{Lu}]?[^\P{Ll}]*\.?)+\b(?! *\()") + if 'name' not in __NAME_PATTERN: + __NAME_PATTERN['name'] = regex.compile( + r"\b[^\P{Lu}][^\P{Ll}]*[-|‐]?[^\P{Lu}]?[^\P{Ll}’]*\.?" + + r"(?: [^\P{Lu}][^\P{Ll}’]*[-|‐]?[^\P{Lu}]?[^\P{Ll}]*\.?)+\b(?! *\()") + return __NAME_PATTERN['name'] def parenthesis_pattern() -> Pattern[str]: - return re.compile(r"\(.*?\)") + if 'parenthesis' not in __PATTERNS: + __PATTERNS['parenthesis'] = re.compile(r"\(.*?\)") + return __PATTERNS['parenthesis'] def double_capital_letter_pattern() -> Pattern[str]: - return re.compile(r"\b[A-Z]{2,}\b") + if 'double_capital_letter' not in __PATTERNS: + __PATTERNS['double_capital_letter'] = re.compile(r"\b[A-Z]{2,}\b") + return __PATTERNS['double_capital_letter'] def binding_word_pattern() -> Pattern[str]: - return re.compile(fr'\b(?:{__labels()["bindingWords"]})\b|&|,') + if 'binding_word' not in __PATTERNS: + __PATTERNS['binding_word'] = re.compile(fr'\b(?:{__labels()["bindingWords"]})\b|&|,') + return __PATTERNS['binding_word'] def special_char_and_binding_pattern() -> Pattern[str]: - return re.compile(fr'[;:,.]|({__labels()["bindingWords"]})\b|&+') + if 'special_char_and_binding' not in __PATTERNS: + __PATTERNS['special_char_and_binding'] = re.compile( + fr'[;:,.]|({__labels()["bindingWords"]})\b|&+' + ) + return __PATTERNS['special_char_and_binding'] def non_alphanumeric_pattern() -> Pattern[str]: - return re.compile(r"\W+") + if 'non_alphanumeric' not in __PATTERNS: + __PATTERNS['non_alphanumeric'] = re.compile(r"\W+") + return __PATTERNS['non_alphanumeric'] def 
photograph_label() -> Pattern[str]: - return re.compile(fr'\b({__labels()["photo"]})\b', re.IGNORECASE) + if 'photograph' not in __PATTERNS: + __PATTERNS['photograph'] = re.compile(fr'\b({__labels()["photo"]})\b', re.IGNORECASE) + return __PATTERNS['photograph'] def find_in_pages(title: str, pages: dict[int, str], max_pages: int = 3) -> int: @@ -95,7 +139,7 @@ def find_in_pages(title: str, pages: dict[int, str], max_pages: int = 3) -> int: def find_isxn(identifier: str, text: str) -> Optional[ValueAndContext]: - match = ISXN_PATTERN[identifier].search("." + text + ".") + match = __PATTERNS[identifier].search("." + text + ".") if match: return ValueAndContext(re.sub('–', '-', match.group(1)), text.lower()) return None From 0957d8b8623066b1a55c6c9af644f9cc311ab007 Mon Sep 17 00:00:00 2001 From: Fredrik Monsen Date: Tue, 12 Sep 2023 15:33:04 +0200 Subject: [PATCH 12/15] Refactor to only use regex package instead of re --- metadata_extract/text.py | 84 ++++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 43 deletions(-) diff --git a/metadata_extract/text.py b/metadata_extract/text.py index eca05e0..3edcd0b 100644 --- a/metadata_extract/text.py +++ b/metadata_extract/text.py @@ -1,7 +1,6 @@ """Text module, containing methods and logic dealing with strings and regexes.""" -import re -from typing import Optional, Pattern +from typing import Optional import regex @@ -22,106 +21,105 @@ def __labels() -> dict[str, str]: return ResourceLoader.get_labels() -__PATTERNS: dict[str, Pattern[str]] = { - 'ISSN': re.compile(r"\D(\d{4}[–-][\dX]{4})\D"), - 'ISBN': re.compile(r"\D([\d–-]{13,17})\D") +__PATTERNS: dict[str, regex.regex.Pattern[str]] = { + 'ISSN': regex.compile(r"\D(\d{4}[–-][\dX]{4})\D"), + 'ISBN': regex.compile(r"\D([\d–-]{13,17})\D") } -__NAME_PATTERN: dict[str, regex.regex.Pattern[str]] = {} -def issn_pattern() -> Pattern[str]: +def issn_pattern() -> regex.regex.Pattern[str]: if 'ISSN' not in __PATTERNS: - __PATTERNS['ISSN'] = 
re.compile(r"\D(\d{4}[–-][\dX]{4})\D") + __PATTERNS['ISSN'] = regex.compile(r"\D(\d{4}[–-][\dX]{4})\D") return __PATTERNS['ISSN'] -def isbn_pattern() -> Pattern[str]: +def isbn_pattern() -> regex.regex.Pattern[str]: if 'ISBN' not in __PATTERNS: - __PATTERNS['ISBN'] = re.compile(r"\D([\d–-]{13,17})\D") + __PATTERNS['ISBN'] = regex.compile(r"\D([\d–-]{13,17})\D") return __PATTERNS['ISBN'] -def report_pattern() -> Pattern[str]: +def report_pattern() -> regex.regex.Pattern[str]: if 'report' not in __PATTERNS: - __PATTERNS['report'] = re.compile(fr'^(\w+)\W({__labels()["report"]})\W', re.IGNORECASE) + __PATTERNS['report'] = regex.compile(fr'^(\w+)\W({__labels()["report"]})\W(?i)') return __PATTERNS['report'] -def type_pattern_1() -> Pattern[str]: +def type_pattern_1() -> regex.regex.Pattern[str]: if 'type_pattern_1' not in __PATTERNS: - __PATTERNS['type_pattern_1'] = re.compile( - fr'\b({__labels()["reportType"]})\b', re.IGNORECASE + __PATTERNS['type_pattern_1'] = regex.compile( + fr'\b({__labels()["reportType"]})\b(?i)' ) return __PATTERNS['type_pattern_1'] -def type_pattern_2() -> Pattern[str]: +def type_pattern_2() -> regex.regex.Pattern[str]: if 'type_pattern_2' not in __PATTERNS: - __PATTERNS['type_pattern_2'] = re.compile(r'\bNOU\b') + __PATTERNS['type_pattern_2'] = regex.compile(r'\bNOU\b') return __PATTERNS['type_pattern_2'] -def publisher_label() -> Pattern[str]: +def publisher_label() -> regex.regex.Pattern[str]: if 'publisher' not in __PATTERNS: - __PATTERNS['publisher'] = re.compile(fr'({__labels()["publisher"]}):?', re.IGNORECASE) + __PATTERNS['publisher'] = regex.compile(fr'({__labels()["publisher"]}):?(?i)') return __PATTERNS['publisher'] -def no_letters_pattern() -> Pattern[str]: +def no_letters_pattern() -> regex.regex.Pattern[str]: if 'no_letters' not in __PATTERNS: - __PATTERNS['no_letters'] = re.compile(r'^[\W\d]+$') + __PATTERNS['no_letters'] = regex.compile(r'^[\W\d]+$') return __PATTERNS['no_letters'] -def author_label() -> Pattern[str]: +def 
author_label() -> regex.regex.Pattern[str]: if 'author' not in __PATTERNS: - __PATTERNS['author'] = re.compile(fr'({__labels()["author"]}):?', re.IGNORECASE) + __PATTERNS['author'] = regex.compile(fr'({__labels()["author"]}):?(?i)') return __PATTERNS['author'] def name_pattern() -> regex.regex.Pattern[str]: - if 'name' not in __NAME_PATTERN: - __NAME_PATTERN['name'] = regex.compile( + if 'name' not in __PATTERNS: + __PATTERNS['name'] = regex.compile( r"\b[^\P{Lu}][^\P{Ll}]*[-|‐]?[^\P{Lu}]?[^\P{Ll}’]*\.?" + r"(?: [^\P{Lu}][^\P{Ll}’]*[-|‐]?[^\P{Lu}]?[^\P{Ll}]*\.?)+\b(?! *\()") - return __NAME_PATTERN['name'] + return __PATTERNS['name'] -def parenthesis_pattern() -> Pattern[str]: +def parenthesis_pattern() -> regex.regex.Pattern[str]: if 'parenthesis' not in __PATTERNS: - __PATTERNS['parenthesis'] = re.compile(r"\(.*?\)") + __PATTERNS['parenthesis'] = regex.compile(r"\(.*?\)") return __PATTERNS['parenthesis'] -def double_capital_letter_pattern() -> Pattern[str]: +def double_capital_letter_pattern() -> regex.regex.Pattern[str]: if 'double_capital_letter' not in __PATTERNS: - __PATTERNS['double_capital_letter'] = re.compile(r"\b[A-Z]{2,}\b") + __PATTERNS['double_capital_letter'] = regex.compile(r"\b[A-Z]{2,}\b") return __PATTERNS['double_capital_letter'] -def binding_word_pattern() -> Pattern[str]: +def binding_word_pattern() -> regex.regex.Pattern[str]: if 'binding_word' not in __PATTERNS: - __PATTERNS['binding_word'] = re.compile(fr'\b(?:{__labels()["bindingWords"]})\b|&|,') + __PATTERNS['binding_word'] = regex.compile(fr'\b(?:{__labels()["bindingWords"]})\b|&|,') return __PATTERNS['binding_word'] -def special_char_and_binding_pattern() -> Pattern[str]: +def special_char_and_binding_pattern() -> regex.regex.Pattern[str]: if 'special_char_and_binding' not in __PATTERNS: - __PATTERNS['special_char_and_binding'] = re.compile( + __PATTERNS['special_char_and_binding'] = regex.compile( fr'[;:,.]|({__labels()["bindingWords"]})\b|&+' ) return 
__PATTERNS['special_char_and_binding'] -def non_alphanumeric_pattern() -> Pattern[str]: +def non_alphanumeric_pattern() -> regex.regex.Pattern[str]: if 'non_alphanumeric' not in __PATTERNS: - __PATTERNS['non_alphanumeric'] = re.compile(r"\W+") + __PATTERNS['non_alphanumeric'] = regex.compile(r"\W+") return __PATTERNS['non_alphanumeric'] -def photograph_label() -> Pattern[str]: +def photograph_label() -> regex.regex.Pattern[str]: if 'photograph' not in __PATTERNS: - __PATTERNS['photograph'] = re.compile(fr'\b({__labels()["photo"]})\b', re.IGNORECASE) + __PATTERNS['photograph'] = regex.compile(fr'\b({__labels()["photo"]})\b(?i)') return __PATTERNS['photograph'] @@ -130,9 +128,9 @@ def find_in_pages(title: str, pages: dict[int, str], max_pages: int = 3) -> int: Optional argument to stop the search after pages. Returns page number (starts at 1) if the title is found or 0 otherwise.""" - title_tokens = re.sub(r'\W+', ' ', title).strip() + title_tokens = regex.sub(r'\W+', ' ', title).strip() for page_number in range(min(len(pages), max_pages)): - page_tokens = re.sub(r'\W+', ' ', pages[page_number + 1]).strip() + page_tokens = regex.sub(r'\W+', ' ', pages[page_number + 1]).strip() if f' {title_tokens} ' in f' {page_tokens} ': return page_number + 1 return 0 @@ -141,7 +139,7 @@ def find_in_pages(title: str, pages: dict[int, str], max_pages: int = 3) -> int: def find_isxn(identifier: str, text: str) -> Optional[ValueAndContext]: match = __PATTERNS[identifier].search("." 
+ text + ".") if match: - return ValueAndContext(re.sub('–', '-', match.group(1)), text.lower()) + return ValueAndContext(regex.sub('–', '-', match.group(1)), text.lower()) return None @@ -172,7 +170,7 @@ def has_no_letters(text: str) -> bool: def clean_whitespace(text: str) -> str: - return re.sub(r'\s+', ' ', text).strip() + return regex.sub(r'\s+', ' ', text).strip() def split_on_binding_word(text: str) -> list[str]: @@ -188,4 +186,4 @@ def substitute_non_alphanumeric(text: str) -> str: def has_non_author_keywords(text: str) -> bool: - return bool(re.search(photograph_label(), text)) + return bool(regex.search(photograph_label(), text)) From 61b9058eb5a56d864e5436d0fcfc8a3f562a3f81 Mon Sep 17 00:00:00 2001 From: Fredrik Monsen Date: Tue, 12 Sep 2023 15:43:26 +0200 Subject: [PATCH 13/15] Remove unecessary class variables and simplify methods --- metadata_extract/resource_loader.py | 40 ++++++++++------------------- 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/metadata_extract/resource_loader.py b/metadata_extract/resource_loader.py index 24a4cf3..d254b95 100644 --- a/metadata_extract/resource_loader.py +++ b/metadata_extract/resource_loader.py @@ -17,10 +17,8 @@ class ResourceLoader: """ __info_page_keywords: list[str] = [] __stopwords: list[str] = [] - __labels: dict[str, Any] - __doc_type_mapping: dict[str, str] - __lang_labels: dict[str, Any] = {} - __lang_doc_type_mapping: dict[str, Any] = {} + __labels: dict[str, Any] = {} + __doc_type_mapping: dict[str, str] = {} @staticmethod def load(selected_languages: Optional[list[str]] = None) -> None: @@ -68,41 +66,31 @@ def __load_stopwords(selected_languages: Optional[list[str]] = None) -> None: @staticmethod def __load_labels(selected_languages: Optional[list[str]] = None) -> None: - if ResourceLoader.__lang_labels: + if ResourceLoader.__labels: return with files("metadata_extract.data").joinpath("txt/labels.json").open() as file: label_data = json.load(file) - ResourceLoader.__lang_labels = {} - 
if selected_languages: - for lang in filter(lambda x: x in label_data, selected_languages): - ResourceLoader.__lang_labels[lang] = label_data[lang] - else: - ResourceLoader.__lang_labels = label_data - labels: dict[str, str] = {} - for lang, label_dict in ResourceLoader.__lang_labels.items(): - for key in label_dict: - if key not in labels: - labels[key] = "" - labels[key] += "|" + "|".join(ResourceLoader.__lang_labels[lang][key]) + for lang in label_data: + if selected_languages is None or lang in selected_languages: + for key in label_data[lang]: + if key not in labels: + labels[key] = "" + labels[key] += "|" + "|".join(label_data[lang][key]) for key in labels: labels[key] = labels[key].lstrip("|").rstrip("|") ResourceLoader.__labels = labels @staticmethod def __load_doc_type_mapping(selected_languages: Optional[list[str]] = None) -> None: - if ResourceLoader.__lang_doc_type_mapping: + if ResourceLoader.__doc_type_mapping: return with files("metadata_extract.data") \ .joinpath("txt/doc_type_mapping.json").open() as file: - ResourceLoader.__lang_doc_type_mapping = json.load(file) + doc_type_mapping_data = json.load(file) doc_type_mapping: dict[str, str] = {} - if selected_languages: - for lang in filter( - lambda x: x in ResourceLoader.__lang_doc_type_mapping, selected_languages - ): - doc_type_mapping.update(ResourceLoader.__lang_doc_type_mapping[lang]) - else: - doc_type_mapping = ResourceLoader.__lang_doc_type_mapping + for lang in doc_type_mapping_data: + if selected_languages is None or lang in selected_languages: + doc_type_mapping.update(doc_type_mapping_data[lang]) ResourceLoader.__doc_type_mapping = doc_type_mapping From 16835350eda9c04ecea4e2f1b296381175709e2c Mon Sep 17 00:00:00 2001 From: Fredrik Monsen Date: Wed, 13 Sep 2023 09:19:47 +0200 Subject: [PATCH 14/15] Initialize regexes not depending on resource data on startup --- metadata_extract/resource_loader.py | 4 +-- metadata_extract/text.py | 46 +++++++++-------------------- 2 files changed, 16 
insertions(+), 34 deletions(-) diff --git a/metadata_extract/resource_loader.py b/metadata_extract/resource_loader.py index d254b95..07c6265 100644 --- a/metadata_extract/resource_loader.py +++ b/metadata_extract/resource_loader.py @@ -1,7 +1,7 @@ # pylint: disable=missing-module-docstring import json from importlib.resources import files -from typing import Any, Optional +from typing import Optional class ResourceLoader: @@ -17,7 +17,7 @@ class ResourceLoader: """ __info_page_keywords: list[str] = [] __stopwords: list[str] = [] - __labels: dict[str, Any] = {} + __labels: dict[str, str] = {} __doc_type_mapping: dict[str, str] = {} @staticmethod diff --git a/metadata_extract/text.py b/metadata_extract/text.py index 3edcd0b..dcb9522 100644 --- a/metadata_extract/text.py +++ b/metadata_extract/text.py @@ -23,22 +23,18 @@ def __labels() -> dict[str, str]: __PATTERNS: dict[str, regex.regex.Pattern[str]] = { 'ISSN': regex.compile(r"\D(\d{4}[–-][\dX]{4})\D"), - 'ISBN': regex.compile(r"\D([\d–-]{13,17})\D") + 'ISBN': regex.compile(r"\D([\d–-]{13,17})\D"), + 'type_pattern_2': regex.compile(r'\bNOU\b'), + 'no_letters_pattern': regex.compile(r'^[\W\d]+$'), + 'name_pattern': regex.compile( + r"\b[^\P{Lu}][^\P{Ll}]*[-|‐]?[^\P{Lu}]?[^\P{Ll}’]*\.?" + + r"(?: [^\P{Lu}][^\P{Ll}’]*[-|‐]?[^\P{Lu}]?[^\P{Ll}]*\.?)+\b(?! 
*\()"), + 'parenthesis_pattern': regex.compile(r"\(.*?\)"), + 'double_capital_letter_pattern': regex.compile(r"\b[A-Z]{2,}\b"), + 'non_alphanumeric_pattern': regex.compile(r"\W+") } -def issn_pattern() -> regex.regex.Pattern[str]: - if 'ISSN' not in __PATTERNS: - __PATTERNS['ISSN'] = regex.compile(r"\D(\d{4}[–-][\dX]{4})\D") - return __PATTERNS['ISSN'] - - -def isbn_pattern() -> regex.regex.Pattern[str]: - if 'ISBN' not in __PATTERNS: - __PATTERNS['ISBN'] = regex.compile(r"\D([\d–-]{13,17})\D") - return __PATTERNS['ISBN'] - - def report_pattern() -> regex.regex.Pattern[str]: if 'report' not in __PATTERNS: __PATTERNS['report'] = regex.compile(fr'^(\w+)\W({__labels()["report"]})\W(?i)') @@ -54,8 +50,6 @@ def type_pattern_1() -> regex.regex.Pattern[str]: def type_pattern_2() -> regex.regex.Pattern[str]: - if 'type_pattern_2' not in __PATTERNS: - __PATTERNS['type_pattern_2'] = regex.compile(r'\bNOU\b') return __PATTERNS['type_pattern_2'] @@ -66,9 +60,7 @@ def publisher_label() -> regex.regex.Pattern[str]: def no_letters_pattern() -> regex.regex.Pattern[str]: - if 'no_letters' not in __PATTERNS: - __PATTERNS['no_letters'] = regex.compile(r'^[\W\d]+$') - return __PATTERNS['no_letters'] + return __PATTERNS['no_letters_pattern'] def author_label() -> regex.regex.Pattern[str]: @@ -78,23 +70,15 @@ def author_label() -> regex.regex.Pattern[str]: def name_pattern() -> regex.regex.Pattern[str]: - if 'name' not in __PATTERNS: - __PATTERNS['name'] = regex.compile( - r"\b[^\P{Lu}][^\P{Ll}]*[-|‐]?[^\P{Lu}]?[^\P{Ll}’]*\.?" + - r"(?: [^\P{Lu}][^\P{Ll}’]*[-|‐]?[^\P{Lu}]?[^\P{Ll}]*\.?)+\b(?! 
*\()") - return __PATTERNS['name'] + return __PATTERNS['name_pattern'] def parenthesis_pattern() -> regex.regex.Pattern[str]: - if 'parenthesis' not in __PATTERNS: - __PATTERNS['parenthesis'] = regex.compile(r"\(.*?\)") - return __PATTERNS['parenthesis'] + return __PATTERNS['parenthesis_pattern'] def double_capital_letter_pattern() -> regex.regex.Pattern[str]: - if 'double_capital_letter' not in __PATTERNS: - __PATTERNS['double_capital_letter'] = regex.compile(r"\b[A-Z]{2,}\b") - return __PATTERNS['double_capital_letter'] + return __PATTERNS['double_capital_letter_pattern'] def binding_word_pattern() -> regex.regex.Pattern[str]: @@ -112,9 +96,7 @@ def special_char_and_binding_pattern() -> regex.regex.Pattern[str]: def non_alphanumeric_pattern() -> regex.regex.Pattern[str]: - if 'non_alphanumeric' not in __PATTERNS: - __PATTERNS['non_alphanumeric'] = regex.compile(r"\W+") - return __PATTERNS['non_alphanumeric'] + return __PATTERNS['non_alphanumeric_pattern'] def photograph_label() -> regex.regex.Pattern[str]: From b49756dae23c48b030cee440b66bb7ed8db9bc73 Mon Sep 17 00:00:00 2001 From: Fredrik Monsen Date: Thu, 14 Sep 2023 09:39:45 +0200 Subject: [PATCH 15/15] Apply suggestions --- src/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util.py b/src/util.py index 315b918..97d5d88 100644 --- a/src/util.py +++ b/src/util.py @@ -41,7 +41,7 @@ def __init__(self) -> None: ) @staticmethod - def get_languages() -> list[str] | None: + def get_languages() -> Optional[list[str]]: if not get_settings().LANGUAGES: return None return get_settings().LANGUAGES.split(',')