diff --git a/metadata_extract/data/txt/labels.json b/metadata_extract/data/txt/labels.json
index e05381c..5bbbc5c 100644
--- a/metadata_extract/data/txt/labels.json
+++ b/metadata_extract/data/txt/labels.json
@@ -5,7 +5,9 @@
         "publisher": ["published by", "publisher"],
         "reportType": ["annual report", "evaluation", "guidance", "survey"],
         "bindingWords": ["and"],
-        "report": ["report"]
+        "report": ["report"],
+        "e_isxn": ["digital", "pdf", "web"],
+        "p_isxn": ["paper", "print"]
     },
     "nob": {
         "author": ["forfatter(e)", "forfattere", "forfatter", "skrevet av"],
@@ -13,7 +15,9 @@
         "publisher": ["utgiver", "utgivere", "utgitt av"],
         "reportType": ["årsrapport", "evaluering", "veileder", "undersøkelse"],
         "bindingWords": ["og"],
-        "report": ["rapport"]
+        "report": ["rapport"],
+        "e_isxn": ["digital", "pdf", "elektroni", "net", "web"],
+        "p_isxn": ["papir", "tryk"]
     },
     "nno": {
         "author": ["forfattar(ar)", "forfattarar", "forfattar", "skrive av"],
@@ -21,6 +25,8 @@
         "publisher": ["utgjevar", "utgjevarar", "utgivar", "utgivarar", "gitt ut av"],
         "reportType": ["årsrapport", "evaluering", "rettleiar", "undersøking"],
         "bindingWords": ["og"],
-        "report": ["rapport"]
+        "report": ["rapport"],
+        "e_isxn": ["digital", "pdf", "elektroni", "net", "web"],
+        "p_isxn": ["papir", "tryk"]
     }
-}
\ No newline at end of file
+}
diff --git a/metadata_extract/metadata.py b/metadata_extract/metadata.py
index ed5cd1d..f1a855b 100644
--- a/metadata_extract/metadata.py
+++ b/metadata_extract/metadata.py
@@ -89,13 +89,7 @@ def choose_isxn(self, identifier: str) -> Optional[CandidateType]:
         isxn_values = {}
         for isxn in self.candidates[identifier]:
             if isxn.value not in isxn_values:
-                isxn_values[isxn.value] = 0
-            for word in ['digital', 'pdf', 'elektroni', 'net', 'web']:
-                if isxn.context and word in isxn.context:
-                    isxn_values[isxn.value] += 1
-            for word in ['paper', 'papir', 'tryk']:
-                if isxn.context and word in isxn.context:
-                    isxn_values[isxn.value] -= 1
+                isxn_values[isxn.value] = text.score_isxn_context(isxn.context)
 
         sorted_dict = sorted(isxn_values.items(), key=lambda x: -x[1])
         for k in sorted_dict:
diff --git a/metadata_extract/text.py b/metadata_extract/text.py
index dae59e5..5e7ed89 100644
--- a/metadata_extract/text.py
+++ b/metadata_extract/text.py
@@ -169,3 +169,17 @@ def substitute_non_alphanumeric(text: str) -> str:
 
 def has_non_author_keywords(text: str) -> bool:
     return bool(regex.search(photograph_label(), text))
+
+
+def score_isxn_context(context: Optional[str]) -> int:
+    """Score an ISXN context: digital hints add, print hints subtract.
+
+    Each match of the `e_isxn` labels, or of a standalone `e`, adds one
+    point; each match of the `p_isxn` labels, or of a standalone `p`,
+    subtracts one. A missing or empty context scores 0.
+    """
+    if not context:
+        return 0
+    # NOTE(review): assumes __labels() values are regex alternation strings - confirm
+    e_matches = regex.findall(fr'{__labels()["e_isxn"]}|\be\b', context)
+    p_matches = regex.findall(fr'{__labels()["p_isxn"]}|\bp\b', context)
+    return len(e_matches) - len(p_matches)