Skip to content

Commit

Permalink
fix: look for 'e'/'p' to choose electronic standard number (TT-944) (#23)
Browse files Browse the repository at this point in the history

* fix: Look for 'e'/'p' in ISXN context

* move isxn keywords to labels.json
  • Loading branch information
pierrebeauguitte authored Mar 8, 2024
1 parent e10414b commit 0655456
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 11 deletions.
14 changes: 10 additions & 4 deletions metadata_extract/data/txt/labels.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,28 @@
"publisher": ["published by", "publisher"],
"reportType": ["annual report", "evaluation", "guidance", "survey"],
"bindingWords": ["and"],
"report": ["report"]
"report": ["report"],
"e_isxn": ["digital", "pdf", "web"],
"p_isxn": ["paper", "print"]
},
"nob": {
"author": ["forfatter(e)", "forfattere", "forfatter", "skrevet av"],
"photo": ["fotograf", "foto", "illustrasjon", "bilde", "fotomontasje"],
"publisher": ["utgiver", "utgivere", "utgitt av"],
"reportType": ["årsrapport", "evaluering", "veileder", "undersøkelse"],
"bindingWords": ["og"],
"report": ["rapport"]
"report": ["rapport"],
"e_isxn": ["digital", "pdf", "elektroni", "net", "web"],
"p_isxn": ["papir", "tryk"]
},
"nno": {
"author": ["forfattar(ar)", "forfattarar", "forfattar", "skrive av"],
"photo": ["fotograf", "foto", "illustrasjon", "bilete", "fotomontasje"],
"publisher": ["utgjevar", "utgjevarar", "utgivar", "utgivarar", "gitt ut av"],
"reportType": ["årsrapport", "evaluering", "rettleiar", "undersøking"],
"bindingWords": ["og"],
"report": ["rapport"]
"report": ["rapport"],
"e_isxn": ["digital", "pdf", "elektroni", "net", "web"],
"p_isxn": ["papir", "tryk"]
}
}
}
8 changes: 1 addition & 7 deletions metadata_extract/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,13 +89,7 @@ def choose_isxn(self, identifier: str) -> Optional[CandidateType]:
isxn_values = {}
for isxn in self.candidates[identifier]:
if isxn.value not in isxn_values:
isxn_values[isxn.value] = 0
for word in ['digital', 'pdf', 'elektroni', 'net', 'web']:
if isxn.context and word in isxn.context:
isxn_values[isxn.value] += 1
for word in ['paper', 'papir', 'tryk']:
if isxn.context and word in isxn.context:
isxn_values[isxn.value] -= 1
isxn_values[isxn.value] = text.score_isxn_context(isxn.context)

sorted_dict = sorted(isxn_values.items(), key=lambda x: -x[1])
for k in sorted_dict:
Expand Down
11 changes: 11 additions & 0 deletions metadata_extract/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,3 +169,14 @@ def substitute_non_alphanumeric(text: str) -> str:

def has_non_author_keywords(text: str) -> bool:
    """Tell whether ``text`` matches the pattern from ``photograph_label()``.

    Args:
        text: The string to search.

    Returns:
        True when the pattern is found anywhere in ``text``, else False.
    """
    found = regex.search(photograph_label(), text)
    return found is not None


def score_isxn_context(context: Optional[str]) -> int:
    """Score the text surrounding an ISXN candidate.

    Every match of the "e_isxn" label pattern, or of a standalone ``e``,
    adds one to the score; every match of the "p_isxn" label pattern, or of
    a standalone ``p``, subtracts one.

    Args:
        context: Text found around the identifier; may be None or empty.

    Returns:
        The number of "e" matches minus the number of "p" matches, or 0 when
        there is no context to inspect.
    """
    # Nothing to inspect: neutral score.
    if not context:
        return 0

    # NOTE(review): the label entries are interpolated directly into the
    # pattern, so they are presumably regex alternation strings -- confirm
    # against the labels loader.
    electronic_pattern = fr'{__labels()["e_isxn"]}|\be\b'
    print_pattern = fr'{__labels()["p_isxn"]}|\bp\b'

    electronic_hits = regex.findall(electronic_pattern, context)
    print_hits = regex.findall(print_pattern, context)
    return len(electronic_hits) - len(print_hits)

0 comments on commit 0655456

Please sign in to comment.