Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TT-1042: Read language codes for initialized files from .env #9

Merged
merged 15 commits into from
Sep 14, 2023
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@ MAX_FILE_SIZE_MB=123
ENVIRONMENT=local
DIFF_FILES_FOLDER=/path/to/diff/files

# A comma separated list of ISO 639-2 language codes
# Include 'mul' to catch keywords present in multiple languages such as 'isbn' or 'issn'
# Languages must be present in the JSON files under metadata_extract/data/txt, otherwise they will be skipped on a per-file basis
LANGUAGES=mul,eng,nob

# To use an authority registry database, specify either...

# ... the path to a SQLite file
Expand Down
8 changes: 4 additions & 4 deletions metadata_extract/author_name.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def get_author_names(text_block: str) -> Optional[list[str]]:
def remove_multi_capital_letter(authors: list[str]) -> list[str]:
"""Remove all names with multiple sequential capital letters"""
return [author for author in authors
if not text.DOUBLE_CAPITAL_LETTER_PATTERN.search(author)]
if not text.double_capital_letter_pattern().search(author)]


def remove_non_author_name(name: str) -> Optional[str]:
Expand All @@ -44,7 +44,7 @@ def remove_non_author_name(name: str) -> Optional[str]:

def remove_parenthesis(author_text: str) -> str:
"""Remove all parenthesis and text inside them"""
parenthesis_match = text.PARENTHESIS_PATTERN.findall(author_text)
parenthesis_match = text.parenthesis_pattern().findall(author_text)
if parenthesis_match:
for match in parenthesis_match:
author_text = author_text.replace(match, "")
Expand All @@ -53,7 +53,7 @@ def remove_parenthesis(author_text: str) -> str:

def match_text_name_regex(author_text: str) -> Optional[str]:
"""Match all names in text to name_pattern regular expression"""
name_match = text.NAME_PATTERN.findall(author_text)
name_match = text.name_pattern().findall(author_text)
if name_match:
author_text = ", ".join(name_match)
return author_text
Expand Down Expand Up @@ -82,7 +82,7 @@ def is_probable_name_block(text_block: str) -> bool:
text_block = text.substitute_special_char_and_binding(text_block)

# Step 2: Check if the block and the match have the same length
match_block = "".join(text.NAME_PATTERN.findall(text_block))
match_block = "".join(text.name_pattern().findall(text_block))
return len(text_block) == len(match_block)


Expand Down
14 changes: 14 additions & 0 deletions metadata_extract/data/txt/doc_type_mapping.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"nob": {
"årsrapport": "annualReport",
"evaluering": "evaluation",
"veileder": "guidance",
"undersøkelse": "survey"
},
"nno": {
"årsrapport": "annualReport",
"evaluering": "evaluation",
"rettleiar": "guidance",
"undersøking": "survey"
}
}
6 changes: 0 additions & 6 deletions metadata_extract/data/txt/doc_type_mapping_no_en.json

This file was deleted.

26 changes: 16 additions & 10 deletions metadata_extract/data/txt/info_page_keywords.json
Original file line number Diff line number Diff line change
@@ -1,17 +1,23 @@
{
"en": [
"year", "date", "publish", "isbn", "issn", "project no.", "title", "author", "number of pages", "summary",
"abstract", "©", "subject words", "keywords", "country", "county", "report", "employer", "availability",
"eng": [
"year", "date", "publish", "project no.", "title", "author", "number of pages", "summary",
"abstract", "subject words", "keywords", "country", "county", "report", "employer", "availability",
"contact person", "publication type"
],
"no": [
"nob": [
"år", "dato", "utgivelsesår", "utgiver", "utgitt", "isbn", "issn", "prosjektnummer", "prosjektnr.", "tittel",
"forfatter", "antallsider", "antall sider", "sidetall", "sammendrag", "©", "opphavsrett",
"rettighetshaver", "emneord", "nøkkelord", "stikkord", "land", "fylke", "rapport", "godkjennere", "redaktører",
"oppdragsgiver", "tilgjengelighet", "kontaktperson", "oppdrag", "referanse", "publikasjonstype", "publiseringstype",
"sitering", "signatur"
"forfatter", "antallsider", "antall sider", "sidetall", "sammendrag", "opphavsrett", "rettighetshaver",
"emneord", "nøkkelord", "stikkord", "land", "fylke", "rapport", "godkjennere", "redaktører", "oppdragsgiver",
"tilgjengelighet", "kontaktperson", "oppdrag", "referanse", "publikasjonstype", "publiseringstype", "sitering",
"signatur"
],
"nn": [
"forfattar"
"nno": [
"år", "dato", "utgivingsår", "utgivar", "gitt ut", "prosjektnummer", "prosjektnr.", "tittel",
"forfattar", "talet på sider", "sidetal", "samandrag", "opphavsrett", "rettshavar", "emneord", "nøkkelord",
"stikkord", "land", "fylke", "rapport", "godkjennere", "redaktørar", "oppdragsgivar", "tilgjengelegheit",
"kontaktperson", "oppdrag", "referanse", "publikasjonstype", "publiseringstype", "sitering", "signatur"
],
"mul": [
"isbn", "issn", "©"
]
}
18 changes: 9 additions & 9 deletions metadata_extract/data/txt/labels.json
Original file line number Diff line number Diff line change
@@ -1,26 +1,26 @@
{
"en": {
"eng": {
"author": ["authors", "author(s)", "author"],
"photo": ["photographer", "photo", "illustration"],
"publisher": ["published by", "publisher"],
"reportType": ["annual report", "evaluation", "guidance", "survey"],
"bindingWords": ["and"],
"report": ["report"]
},
"no": {
"nob": {
"author": ["forfatter(e)", "forfattere", "forfatter", "skrevet av"],
"photo": ["fotograf", "foto", "illustrasjon", "bilde", "fotomontasje"],
"publisher": ["utgiver", "utgivere", "utgitt av"],
"reportType": ["årsrapport", "evaluering", "veileder", "undersøkelse"],
"bindingWords": ["og"],
"report": ["rapport"]
},
"nn": {
"author": ["forfattar(ar)", "forfattarar", "forfattar"],
"photo": [],
"publisher": ["utgjevar", "utgjevarar"],
"reportType": [],
"bindingWords": [],
"report": []
"nno": {
"author": ["forfattar(ar)", "forfattarar", "forfattar", "skrive av"],
"photo": ["fotograf", "foto", "illustrasjon", "bilete", "fotomontasje"],
"publisher": ["utgjevar", "utgjevarar", "utgivar", "utgivarar", "gitt ut av"],
"reportType": ["årsrapport", "evaluering", "rettleiar", "undersøking"],
"bindingWords": ["og"],
"report": ["rapport"]
}
}
28 changes: 19 additions & 9 deletions metadata_extract/data/txt/stopwords.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
{
"no": [
"eng": [
"academy", "university", "director", "directorate", "defence", "military", "researcher", "senior researcher",
"research leader", "research manager", "professor", "college", "consultant", "consulting", "municipal", "municipality",
"royal", "politics", "politician", "police", "authority", "authorities", "surveillance", "research", "europe",
"america", "county", "office", "senior advisor", "advisor", "senior advisors", "project", "project manager",
"laboratory", "parliament", "system", "science", "section", "school", "faculty", "norway", "norwegian",
"scandinavia", "strategic", "teacher", "agency", "assistant", "organization", "center", "department", "division",
"governor", "institute", "international", "world", "policy", "program", "report", "science", "ministry", "ministries"
],
"nob": [
"akademi", "universitet", "direktorat", "direktoratet", "forsvar", "militær", "forsker", "seniorforsker",
"forskningsleder", "forskningssjef", "professor", "høgskole", "høyskole", "konsulent", "kommunal", "kommune",
"kongelig", "kongelige", "politikk", "politiker", "politi", "politiet", "myndighet", "myndigheter", "overvåking",
Expand All @@ -9,13 +18,14 @@
"lærer", "byrå", "assistent", "assisterende", "organisasjon", "senter", "departement", "avdeling", "fylkesmann",
"institutt", "internasjonal", "verden", "program", "rapport", "vitenskap"
],
"en": [
"academy", "university", "director", "directorate", "defence", "military", "researcher", "senior researcher",
"research leader", "research manager", "professor", "college", "consultant", "consulting", "municipal", "municipality",
"royal", "politics", "politician", "police", "authority", "authorities", "surveillance", "research", "europe",
"america", "county", "office", "senior advisor", "advisor", "senior advisors", "project", "project manager",
"laboratory", "parliament", "system", "science", "section", "school", "faculty", "norway", "norwegian",
"scandinavia", "strategic", "teacher", "agency", "assistant", "organization", "center", "department", "division",
"governor", "institute", "international", "world", "policy", "program", "report", "science", "ministry", "ministries"
"nno": [
"akademi", "universitet", "direktorat", "direktoratet", "forsvar", "militær", "forskar", "seniorforskar",
"forskingsleiar", "forskingssjef", "professor", "høgskule", "konsulent", "kommunal", "kommune",
"kongeleg", "kongelege", "politikk", "politikar", "politi", "politiet", "styresmakt", "styresmakter", "overvaking",
"forsking", "forsvaret", "europa", "amerika", "fylke", "kontor", "seniorrådgivar", "rådgivar",
"rådgivarar", "seniorrådgivarar", "prosjekt", "prosjektleiar", "direktør", "laboratorium", "stortinget", "system",
"vitskap", "seksjon", "skule", "fakultet", "norge", "norsk", "seniorforskar", "skandinavia", "strategisk",
"lærer", "byrå", "assistent", "assisterande", "organisasjon", "senter", "departement", "avdeling", "fylkesmann",
"institutt", "internasjonal", "verda", "program", "rapport", "vitskap"
]
}
9 changes: 5 additions & 4 deletions metadata_extract/finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from .metadata import Metadata
from .meteor_document import MeteorDocument
from .registry import PublisherRegistry, RegistryType
from .resource_loader import ResourceLoader


class CopyrightType(TypedDict):
Expand Down Expand Up @@ -100,7 +101,7 @@ def find_publisher(self) -> None:
Returns after first value is found."""
for number, page in self.doc.pages.items():
for line in page.split('\n'):
match = text.PUBLISHER_LABEL.match(line)
match = text.publisher_label().match(line)
if match is not None:
value = line[match.span()[1]:].strip()
if value != '':
Expand Down Expand Up @@ -134,7 +135,7 @@ def find_author(self) -> None:
if found_title and isinstance(found_title, str) \
and author_name.name_exists_in_title(found_title, author):
continue
if any(keyword in author.lower() for keyword in text.STOPWORDS):
if any(keyword in author.lower() for keyword in ResourceLoader.get_stopwords()):
continue
if author_name.is_all_caps_spaced(author):
continue
Expand Down Expand Up @@ -189,10 +190,10 @@ def get_author_from_info(self) -> None:
# TODO: Seems to only fetch first author before comma from pdfinfo
if self.doc.pdfinfo['author']:
author = self.doc.pdfinfo['author']
name_match = text.NAME_PATTERN.findall(author)
name_match = text.name_pattern().findall(author)
for match in name_match:
found_on_page = text.find_in_pages(match, self.doc.pages)
if any(keyword in author.lower() for keyword in text.STOPWORDS):
if any(keyword in author.lower() for keyword in ResourceLoader.get_stopwords()):
continue
if found_on_page > 0:
candidate = Candidate(author_name.create_author_dict(match),
Expand Down
11 changes: 4 additions & 7 deletions metadata_extract/infopage.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
from typing import Optional, TypedDict
from fitz import Document

from metadata_extract.init_files import InitFiles
from . import text
from .author_name import get_author_names
from .resource_loader import ResourceLoader
from .page import Page, TextBlock


Expand All @@ -30,9 +30,6 @@ class InfoPage(Page):
HASKEYWORD = 4
KEYWORDFONT = 8

# TODO: add nynorsk / samisk words
keywords = InitFiles().get_info_page_keywords()

@staticmethod
def find_page_number(pages: dict[int, str]) -> int:
"""Looks for the info page, based on a keyword list.
Expand All @@ -42,7 +39,7 @@ def find_page_number(pages: dict[int, str]) -> int:
scores: dict[int, int] = {}
for page in pages:
score = 0
for k in InfoPage.keywords:
for k in ResourceLoader.get_info_page_keywords():
if k in pages[page].lower():
score += 1
scores[page] = score
Expand All @@ -53,7 +50,7 @@ def find_page_number(pages: dict[int, str]) -> int:

@staticmethod
def keyword_appears_in(string: str) -> bool:
for k in InfoPage.keywords:
for k in ResourceLoader.get_info_page_keywords():
if k in string.lower():
return True
return False
Expand Down Expand Up @@ -136,7 +133,7 @@ def find_publisher(self) -> Optional[str]:
def find_author(self) -> Optional[list[str]]:
author_block = None
for block in self.text_blocks:
if text.AUTHOR_LABEL.match(block.text.lower()):
if text.author_label().match(block.text.lower()):
author_block = block
break
if author_block:
Expand Down
88 changes: 0 additions & 88 deletions metadata_extract/init_files.py

This file was deleted.

5 changes: 4 additions & 1 deletion metadata_extract/meteor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@


from typing import Optional

from .resource_loader import ResourceLoader
from .registry import PublisherRegistry
from .meteor_document import MeteorDocument
from .metadata import Results
Expand All @@ -17,8 +19,9 @@ class Meteor:
and return the best ones as a Results object (TypedDict, JSON-serializable)
"""

def __init__(self) -> None:
def __init__(self, languages: Optional[list[str]] = None) -> None:
self.registry: Optional[PublisherRegistry] = None
ResourceLoader.load(languages)

def set_registry(self, registry: PublisherRegistry) -> None:
self.registry = registry
Expand Down
2 changes: 1 addition & 1 deletion metadata_extract/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def find_isxn(self, identifier: str) -> list[ValueAndContext]:
def find_publisher_block(self) -> Optional[TextBlock]:
publisher_block = None
for block in self.text_blocks:
if text.PUBLISHER_LABEL.match(block.text.lower()):
if text.publisher_label().match(block.text.lower()):
publisher_block = block
break
return publisher_block
Expand Down
Loading