Skip to content

Commit

Permalink
Apply suggestions
Browse files Browse the repository at this point in the history
  • Loading branch information
fredrikmonsen committed Sep 12, 2023
1 parent 0e977d8 commit 2d5c4ec
Show file tree
Hide file tree
Showing 8 changed files with 114 additions and 87 deletions.
14 changes: 14 additions & 0 deletions metadata_extract/data/txt/doc_type_mapping.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"nob": {
"årsrapport": "annualReport",
"evaluering": "evaluation",
"veileder": "guidance",
"undersøkelse": "survey"
},
"nno": {
"årsrapport": "annualReport",
"evaluering": "evaluation",
"veileiar": "guidance",
"undersøking": "survey"
}
}
6 changes: 0 additions & 6 deletions metadata_extract/data/txt/doc_type_mapping_no_en.json

This file was deleted.

5 changes: 3 additions & 2 deletions metadata_extract/finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from .metadata import Metadata
from .meteor_document import MeteorDocument
from .registry import PublisherRegistry, RegistryType
from .resource_loader import ResourceLoader


class CopyrightType(TypedDict):
Expand Down Expand Up @@ -133,7 +134,7 @@ def find_author(self) -> None:
if found_title and isinstance(found_title, str) \
and author_name.name_exists_in_title(found_title, author):
continue
if any(keyword in author.lower() for keyword in text.stopwords()):
if any(keyword in author.lower() for keyword in ResourceLoader.get_stopwords()):
continue
if author_name.is_all_caps_spaced(author):
continue
Expand Down Expand Up @@ -191,7 +192,7 @@ def get_author_from_info(self) -> None:
name_match = text.name_pattern().findall(author)
for match in name_match:
found_on_page = text.find_in_pages(match, self.doc.pages)
if any(keyword in author.lower() for keyword in text.stopwords()):
if any(keyword in author.lower() for keyword in ResourceLoader.get_stopwords()):
continue
if found_on_page > 0:
candidate = Candidate(author_name.create_author_dict(match),
Expand Down
4 changes: 1 addition & 3 deletions metadata_extract/meteor.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,7 @@ class Meteor:

def __init__(self, languages: Optional[list[str]] = None) -> None:
self.registry: Optional[PublisherRegistry] = None
ResourceLoader.load_info_page_keywords(languages)
ResourceLoader.load_stopwords(languages)
ResourceLoader.load_labels(languages)
ResourceLoader.load(languages)

def set_registry(self, registry: PublisherRegistry) -> None:
self.registry = registry
Expand Down
136 changes: 84 additions & 52 deletions metadata_extract/resource_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,84 +13,116 @@ class ResourceLoader:
- txt/info_page_keywords.json
- txt/stopwords.json
- txt/labels.json
- txt/doc_type_mapping_no_en.json
- txt/doc_type_mapping.json
"""
__info_page_keywords: list[str]
__stopwords: list[str]
__labels: dict[str, Any]
__doc_type_mapping: dict[str, str]

__lang_info_page_keywords: dict[str, list[str]]
__lang_stopwords: dict[str, list[str]]
__lang_labels: dict[str, Any]
__lang_info_page_keywords: dict[str, list[str]] = {}
__lang_stopwords: dict[str, list[str]] = {}
__lang_labels: dict[str, Any] = {}
__lang_doc_type_mapping: dict[str, Any] = {}

@staticmethod
def load_info_page_keywords(selected_languages: Optional[list[str]] = None) -> None:
with files("metadata_extract.data").joinpath("txt/info_page_keywords.json").open() as file:
info_page_keywords = json.load(file)
ResourceLoader.__lang_info_page_keywords = {}
if selected_languages:
for lang in filter(
lambda x: x in info_page_keywords, selected_languages
):
ResourceLoader.__lang_info_page_keywords[lang] = info_page_keywords[lang]
else:
ResourceLoader.__lang_info_page_keywords = info_page_keywords

@staticmethod
def load_stopwords(selected_languages: Optional[list[str]] = None) -> None:
with files("metadata_extract.data").joinpath("txt/stopwords.json").open() as file:
stopwords = json.load(file)
ResourceLoader.__lang_stopwords = {}
if selected_languages:
for lang in filter(lambda x: x in stopwords, selected_languages):
ResourceLoader.__lang_stopwords[lang] = stopwords[lang]
else:
ResourceLoader.__lang_stopwords = stopwords

@staticmethod
def load_labels(selected_languages: Optional[list[str]] = None) -> None:
with files("metadata_extract.data").joinpath("txt/labels.json").open() as file:
labels = json.load(file)
ResourceLoader.__lang_labels = {}
if selected_languages:
for lang in filter(lambda x: x in labels, selected_languages):
ResourceLoader.__lang_labels[lang] = labels[lang]
else:
ResourceLoader.__lang_labels = labels
def load(selected_languages: Optional[list[str]] = None) -> None:
ResourceLoader.__load_info_page_keywords(selected_languages)
ResourceLoader.__load_stopwords(selected_languages)
ResourceLoader.__load_labels(selected_languages)
ResourceLoader.__load_doc_type_mapping(selected_languages)

@staticmethod
def get_info_page_keywords() -> list[str]:
keywords = []
for lang in ResourceLoader.__lang_info_page_keywords:
keywords.extend(ResourceLoader.__lang_info_page_keywords[lang])
ResourceLoader.__info_page_keywords = keywords
return ResourceLoader.__info_page_keywords

@staticmethod
def get_stopwords() -> list[str]:
stopwords = []
for lang in ResourceLoader.__lang_stopwords:
stopwords.extend(ResourceLoader.__lang_stopwords[lang])
ResourceLoader.__stopwords = stopwords
return ResourceLoader.__stopwords

@staticmethod
def get_labels() -> dict[str, str]:
return ResourceLoader.__labels

@staticmethod
def get_doc_type_mapping() -> dict[str, str]:
return ResourceLoader.__doc_type_mapping

@staticmethod
def __load_info_page_keywords(selected_languages: Optional[list[str]] = None) -> None:
    """Load the info-page keywords once and cache them on the class.

    Reads ``txt/info_page_keywords.json`` from the ``metadata_extract.data``
    package, keeps the entries of ``selected_languages`` (every language in
    the file when the argument is ``None`` or empty) and stores both the
    per-language dict and a flattened keyword list.

    The call is a no-op when keywords are already cached, so the language
    selection of the first load that yields data wins.
    """
    if ResourceLoader.__lang_info_page_keywords:
        return

    resource = files("metadata_extract.data").joinpath("txt/info_page_keywords.json")
    # JSON is UTF-8; never fall back to the platform's locale encoding.
    with resource.open(encoding="utf-8") as file:
        keyword_data = json.load(file)

    if selected_languages:
        # Requested languages that are missing from the file are ignored.
        per_language = {
            lang: keyword_data[lang]
            for lang in selected_languages
            if lang in keyword_data
        }
    else:
        per_language = keyword_data

    ResourceLoader.__lang_info_page_keywords = per_language
    ResourceLoader.__info_page_keywords = [
        keyword for items in per_language.values() for keyword in items
    ]

@staticmethod
def __load_stopwords(selected_languages: Optional[list[str]] = None) -> None:
    """Load the stopwords once and cache them on the class.

    Reads ``txt/stopwords.json`` from the ``metadata_extract.data`` package,
    keeps the entries of ``selected_languages`` (every language in the file
    when the argument is ``None`` or empty) and stores both the per-language
    dict and a flattened stopword list.

    The call is a no-op when stopwords are already cached, so the language
    selection of the first load that yields data wins.
    """
    if ResourceLoader.__lang_stopwords:
        return

    resource = files("metadata_extract.data").joinpath("txt/stopwords.json")
    # JSON is UTF-8; never fall back to the platform's locale encoding.
    with resource.open(encoding="utf-8") as file:
        stopwords_data = json.load(file)

    if selected_languages:
        # Requested languages that are missing from the file are ignored.
        per_language = {
            lang: stopwords_data[lang]
            for lang in selected_languages
            if lang in stopwords_data
        }
    else:
        per_language = stopwords_data

    ResourceLoader.__lang_stopwords = per_language
    ResourceLoader.__stopwords = [
        word for items in per_language.values() for word in items
    ]

@staticmethod
def __load_labels(selected_languages: Optional[list[str]] = None) -> None:
    """Load the label alternatives once and cache them on the class.

    Reads ``txt/labels.json`` from the ``metadata_extract.data`` package and
    keeps the entries of ``selected_languages`` (every language in the file
    when the argument is ``None`` or empty). For each label key the
    alternatives of all kept languages are combined into a single
    ``|``-separated string, with leading and trailing ``|`` removed.

    The call is a no-op when labels are already cached, so the language
    selection of the first load that yields data wins.
    """
    if ResourceLoader.__lang_labels:
        return

    resource = files("metadata_extract.data").joinpath("txt/labels.json")
    # JSON is UTF-8; never fall back to the platform's locale encoding.
    with resource.open(encoding="utf-8") as file:
        label_data = json.load(file)

    if selected_languages:
        # Requested languages that are missing from the file are ignored.
        per_language = {
            lang: label_data[lang]
            for lang in selected_languages
            if lang in label_data
        }
    else:
        per_language = label_data
    ResourceLoader.__lang_labels = per_language

    # Collect one "a|b|c" chunk per language and key, in language order.
    chunks: dict[str, list[str]] = {}
    for label_dict in per_language.values():
        for key, values in label_dict.items():
            chunks.setdefault(key, []).append("|".join(values))

    ResourceLoader.__labels = {
        key: "|".join(parts).strip("|") for key, parts in chunks.items()
    }

@staticmethod
def __load_doc_type_mapping(selected_languages: Optional[list[str]] = None) -> None:
    """Load the document-type mapping once and cache it on the class.

    Reads ``txt/doc_type_mapping.json`` from the ``metadata_extract.data``
    package. The file is keyed by language code; each value maps a
    document-type term to its type identifier. The per-language dicts of
    ``selected_languages`` (every language in the file when the argument is
    ``None`` or empty) are merged into one flat ``term -> identifier`` dict.
    When two languages define the same term, the later language wins.

    The call is a no-op when the mapping is already cached, so the language
    selection of the first load wins.
    """
    if ResourceLoader.__lang_doc_type_mapping:
        return

    resource = files("metadata_extract.data").joinpath("txt/doc_type_mapping.json")
    # The terms contain non-ASCII letters; JSON is UTF-8, so decode it as
    # such instead of relying on the platform's locale encoding.
    with resource.open(encoding="utf-8") as file:
        lang_mapping = json.load(file)
    ResourceLoader.__lang_doc_type_mapping = lang_mapping

    # Flatten in both cases: storing the language-keyed dict itself would
    # make every term lookup miss when no languages are selected.
    languages = selected_languages if selected_languages else list(lang_mapping)
    doc_type_mapping: dict[str, str] = {}
    for lang in languages:
        # Requested languages that are missing from the file are ignored.
        doc_type_mapping.update(lang_mapping.get(lang, {}))
    ResourceLoader.__doc_type_mapping = doc_type_mapping
32 changes: 10 additions & 22 deletions metadata_extract/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,50 +18,38 @@ def append_to_context(self, extra_context: str) -> None:
self.context = (self.context or '') + extra_context


def labels() -> dict[str, str]:
def __labels() -> dict[str, str]:
return ResourceLoader.get_labels()


def stopwords() -> list[str]:
return ResourceLoader.get_stopwords()


def info_page_keywords() -> list[str]:
return ResourceLoader.get_info_page_keywords()


ISXN_PATTERN = {
'ISSN': re.compile(r"\D(\d{4}[–-][\dX]{4})\D"),
'ISBN': re.compile(r"\D([\d–-]{13,17})\D")
}


def report_pattern() -> Pattern[str]:
return re.compile(fr'^(\w+)\W({labels()["report"]})\W', re.IGNORECASE)
return re.compile(fr'^(\w+)\W({__labels()["report"]})\W', re.IGNORECASE)


def type_pattern_1() -> Pattern[str]:
return re.compile(fr'\b({labels()["reportType"]})\b', re.IGNORECASE)
return re.compile(fr'\b({__labels()["reportType"]})\b', re.IGNORECASE)


def type_pattern_2() -> Pattern[str]:
return re.compile(r'\bNOU\b')


def doc_type_mapping() -> dict[str, str]:
return ResourceLoader.get_doc_type_mapping()


def publisher_label() -> Pattern[str]:
return re.compile(fr'({labels()["publisher"]}):?', re.IGNORECASE)
return re.compile(fr'({__labels()["publisher"]}):?', re.IGNORECASE)


def no_letters_pattern() -> Pattern[str]:
return re.compile(r'^[\W\d]+$')


def author_label() -> Pattern[str]:
return re.compile(fr'({labels()["author"]}):?', re.IGNORECASE)
return re.compile(fr'({__labels()["author"]}):?', re.IGNORECASE)


def name_pattern() -> regex.regex.Pattern[str]:
Expand All @@ -78,19 +66,19 @@ def double_capital_letter_pattern() -> Pattern[str]:


def binding_word_pattern() -> Pattern[str]:
return re.compile(fr'\b(?:{labels()["bindingWords"]})\b|&|,')
return re.compile(fr'\b(?:{__labels()["bindingWords"]})\b|&|,')


def special_char_and_binding_pattern() -> Pattern[str]:
return re.compile(fr'[;:,.]|({labels()["bindingWords"]})\b|&+')
return re.compile(fr'[;:,.]|({__labels()["bindingWords"]})\b|&+')


def non_alphanumeric_pattern() -> Pattern[str]:
return re.compile(r"\W+")


def photograph_label() -> Pattern[str]:
return re.compile(fr'\b({labels()["photo"]})\b', re.IGNORECASE)
return re.compile(fr'\b({__labels()["photo"]})\b', re.IGNORECASE)


def find_in_pages(title: str, pages: dict[int, str], max_pages: int = 3) -> int:
Expand Down Expand Up @@ -126,8 +114,8 @@ def find_doc_type(page_text: str) -> Optional[str]:
match = type_pattern_1().search(page_text)
if match:
doc_type = match.group(1).lower()
if doc_type in doc_type_mapping():
return doc_type_mapping()[doc_type]
if doc_type in ResourceLoader.get_doc_type_mapping():
return ResourceLoader.get_doc_type_mapping()[doc_type]
return doc_type
match = type_pattern_2().search(page_text)
if match:
Expand Down
2 changes: 1 addition & 1 deletion src/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def __init__(self) -> None:

@staticmethod
def get_languages() -> list[str] | None:
if len(get_settings().LANGUAGES) == 0:
if not get_settings().LANGUAGES:
return None
return get_settings().LANGUAGES.split(',')

Expand Down
2 changes: 1 addition & 1 deletion test/test_infopage.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from metadata_extract.resource_loader import ResourceLoader

doc = MeteorDocument('test/resources/report2.pdf')
ResourceLoader.load_info_page_keywords(["und", "eng", "nob", "nno"])
ResourceLoader.load(["und", "eng", "nob", "nno"])
infopagenr = InfoPage.find_page_number(doc.pages)


Expand Down

0 comments on commit 2d5c4ec

Please sign in to comment.