Skip to content

Commit

Permalink
names: fix affiliations deduplication
Browse files (browse the repository at this point in the history)
  • Loading branch information
slint authored and jrcastro2 committed Dec 6, 2024
1 parent d62395f commit 449dc5b
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 6 deletions.
10 changes: 4 additions & 6 deletions invenio_vocabularies/contrib/names/datastreams.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -260,23 +260,21 @@ def _extract_affiliations(self, record):
employment.get("employment-summary", {}) for employment in employments
]

history = set()
for employment in employments:
terminated = employment.get("end-date")
org = employment["organization"]

if terminated or org["name"] in history:
if terminated:
continue

history.add(org["name"])
org = employment["organization"]
aff = {"name": org["name"]}

# Extract the org ID, to link to the affiliation vocabulary
aff_id = self._extract_affiliation_id(org)
if aff_id:
aff["id"] = aff_id

result.append(aff)
if aff not in result:
result.append(aff)
except Exception:
pass
return result
Expand Down
14 changes: 14 additions & 0 deletions tests/contrib/names/test_names_datastreams.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -240,6 +240,12 @@ def orcid_data():
"employments": {"affiliation-group": employments[0]}
},
},
"duplicate_affiliations": {
**base,
"activities-summary": {
"employments": {"affiliation-group": employments + employments},
},
},
},
}

Expand Down Expand Up @@ -267,6 +273,14 @@ def expected_from_xml():
**base,
"affiliations": [{"id": "01ggx4157", "name": "CERN"}],
},
"duplicate_affiliations": {
**base,
"affiliations": [
# Affiliations are deduplicated
{"id": "01ggx4157", "name": "CERN"},
{"name": "ACME Inc."},
],
},
}


Expand Down

0 comments on commit 449dc5b

Please sign in to comment.