From 9401dbd8649e16368d7fe661c976a56bc13fdac2 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Mon, 22 Jul 2024 10:44:25 +0200 Subject: [PATCH] copy of original version from huggingface.co/bigbio/drugprot --- dataset_builders/hf/drugprot/.gitattributes | 54 ++ dataset_builders/hf/drugprot/README.md | 48 ++ dataset_builders/hf/drugprot/bigbiohub.py | 592 ++++++++++++++++++ dataset_builders/hf/drugprot/drugprot.py | 260 ++++++++ .../hf/drugprot/test_drugprot.py | 263 ++++++++ 5 files changed, 1217 insertions(+) create mode 100644 dataset_builders/hf/drugprot/.gitattributes create mode 100644 dataset_builders/hf/drugprot/README.md create mode 100644 dataset_builders/hf/drugprot/bigbiohub.py create mode 100644 dataset_builders/hf/drugprot/drugprot.py create mode 100644 tests/dataset_builders/hf/drugprot/test_drugprot.py diff --git a/dataset_builders/hf/drugprot/.gitattributes b/dataset_builders/hf/drugprot/.gitattributes new file mode 100644 index 00000000..f4f3945b --- /dev/null +++ b/dataset_builders/hf/drugprot/.gitattributes @@ -0,0 +1,54 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.lz4 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +# Audio files - uncompressed +*.pcm filter=lfs diff=lfs merge=lfs -text +*.sam filter=lfs diff=lfs merge=lfs -text +*.raw filter=lfs diff=lfs merge=lfs -text +# Audio files - compressed +*.aac filter=lfs diff=lfs merge=lfs -text +*.flac filter=lfs diff=lfs merge=lfs -text +*.mp3 filter=lfs diff=lfs merge=lfs -text +*.ogg filter=lfs diff=lfs merge=lfs -text +*.wav filter=lfs diff=lfs merge=lfs -text +# Image files - uncompressed +*.bmp filter=lfs diff=lfs merge=lfs -text +*.gif filter=lfs diff=lfs merge=lfs -text +*.png filter=lfs diff=lfs merge=lfs -text +*.tiff filter=lfs diff=lfs merge=lfs -text +# Image files - compressed +*.jpg filter=lfs diff=lfs merge=lfs -text +*.jpeg filter=lfs diff=lfs merge=lfs -text +*.webp filter=lfs diff=lfs merge=lfs -text diff --git a/dataset_builders/hf/drugprot/README.md b/dataset_builders/hf/drugprot/README.md new file mode 100644 index 00000000..bb59f4b5 --- /dev/null +++ 
b/dataset_builders/hf/drugprot/README.md
@@ -0,0 +1,48 @@
+
+---
+language:
+- en
+bigbio_language:
+- English
+license: cc-by-4.0
+multilinguality: monolingual
+bigbio_license_shortname: CC_BY_4p0
+pretty_name: DrugProt
+homepage: https://biocreative.bioinformatics.udel.edu/tasks/biocreative-vii/track-1/
+bigbio_pubmed: True
+bigbio_public: True
+bigbio_tasks:
+- NAMED_ENTITY_RECOGNITION
+- RELATION_EXTRACTION
+---
+
+
+# Dataset Card for DrugProt
+
+## Dataset Description
+
+- **Homepage:** https://biocreative.bioinformatics.udel.edu/tasks/biocreative-vii/track-1/
+- **Pubmed:** True
+- **Public:** True
+- **Tasks:** NER, RE
+
+
+The DrugProt corpus consists of (a) expert-labelled chemical and gene mentions, and (b) all binary relationships
+between them corresponding to a specific set of biologically relevant relation types. The corpus was introduced
+in the context of BioCreative VII Track 1 (Text mining drug and chemical-protein interactions).
+
+
+
+## Citation Information
+
+```
+@inproceedings{miranda2021overview,
+  title={Overview of DrugProt BioCreative VII track: quality evaluation and large scale text mining of
+         drug-gene/protein relations},
+  author={Miranda, Antonio and Mehryary, Farrokh and Luoma, Jouni and Pyysalo, Sampo and Valencia, Alfonso
+          and Krallinger, Martin},
+  booktitle={Proceedings of the seventh BioCreative challenge evaluation workshop},
+  year={2021}
+}
+
+```
diff --git a/dataset_builders/hf/drugprot/bigbiohub.py b/dataset_builders/hf/drugprot/bigbiohub.py
new file mode 100644
index 00000000..a4792b4b
--- /dev/null
+++ b/dataset_builders/hf/drugprot/bigbiohub.py
@@ -0,0 +1,592 @@
+from collections import defaultdict
+from dataclasses import dataclass
+from enum import Enum
+import logging
+from pathlib import Path
+from types import SimpleNamespace
+from typing import TYPE_CHECKING, Dict, Iterable, List, Tuple
+
+import datasets
+
+if TYPE_CHECKING:
+    import bioc
+
+logger = logging.getLogger(__name__)
+
+
+BigBioValues = SimpleNamespace(NULL="<BB_NULL_STR>")
+
+
+@dataclass
+class BigBioConfig(datasets.BuilderConfig):
+    """BuilderConfig for BigBio."""
+
+    name: str = None
+    version: datasets.Version = None
+    description: str = None
+    schema: str = None
+    subset_id: str = None
+
+
+class Tasks(Enum):
+    NAMED_ENTITY_RECOGNITION = "NER"
+    NAMED_ENTITY_DISAMBIGUATION = "NED"
+    EVENT_EXTRACTION = "EE"
+    RELATION_EXTRACTION = "RE"
+    COREFERENCE_RESOLUTION = "COREF"
+    QUESTION_ANSWERING = "QA"
+    TEXTUAL_ENTAILMENT = "TE"
+    SEMANTIC_SIMILARITY = "STS"
+    TEXT_PAIRS_CLASSIFICATION = "TXT2CLASS"
+    PARAPHRASING = "PARA"
+    TRANSLATION = "TRANSL"
+    SUMMARIZATION = "SUM"
+    TEXT_CLASSIFICATION = "TXTCLASS"
+
+
+entailment_features = datasets.Features(
+    {
+        "id": datasets.Value("string"),
+        "premise": datasets.Value("string"),
+        "hypothesis": datasets.Value("string"),
+        "label": datasets.Value("string"),
+    }
+)
+
+pairs_features = datasets.Features(
+    {
+        "id": datasets.Value("string"),
+        "document_id": datasets.Value("string"),
+        "text_1": datasets.Value("string"),
+        "text_2": datasets.Value("string"),
+        "label": datasets.Value("string"),
+    }
+)
+
+qa_features = datasets.Features(
+    {
+        "id": datasets.Value("string"),
+        "question_id": datasets.Value("string"),
+        "document_id": datasets.Value("string"),
+        "question": datasets.Value("string"),
+        "type": datasets.Value("string"),
+        "choices": [datasets.Value("string")],
+        "context": datasets.Value("string"),
+        "answer": datasets.Sequence(datasets.Value("string")),
+    }
+)
+
+text_features = datasets.Features(
+    {
+        "id": 
datasets.Value("string"), + "document_id": datasets.Value("string"), + "text": datasets.Value("string"), + "labels": [datasets.Value("string")], + } +) + +text2text_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text_1": datasets.Value("string"), + "text_2": datasets.Value("string"), + "text_1_name": datasets.Value("string"), + "text_2_name": datasets.Value("string"), + } +) + +kb_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "passages": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "text": datasets.Sequence(datasets.Value("string")), + "offsets": datasets.Sequence([datasets.Value("int32")]), + } + ], + "entities": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "text": datasets.Sequence(datasets.Value("string")), + "offsets": datasets.Sequence([datasets.Value("int32")]), + "normalized": [ + { + "db_name": datasets.Value("string"), + "db_id": datasets.Value("string"), + } + ], + } + ], + "events": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + # refers to the text_bound_annotation of the trigger + "trigger": { + "text": datasets.Sequence(datasets.Value("string")), + "offsets": datasets.Sequence([datasets.Value("int32")]), + }, + "arguments": [ + { + "role": datasets.Value("string"), + "ref_id": datasets.Value("string"), + } + ], + } + ], + "coreferences": [ + { + "id": datasets.Value("string"), + "entity_ids": datasets.Sequence(datasets.Value("string")), + } + ], + "relations": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "arg1_id": datasets.Value("string"), + "arg2_id": datasets.Value("string"), + "normalized": [ + { + "db_name": datasets.Value("string"), + "db_id": datasets.Value("string"), + } + ], + } + ], + } +) + + +TASK_TO_SCHEMA = { + Tasks.NAMED_ENTITY_RECOGNITION.name: "KB", + Tasks.NAMED_ENTITY_DISAMBIGUATION.name: "KB", + Tasks.EVENT_EXTRACTION.name: "KB", + Tasks.RELATION_EXTRACTION.name: "KB", + Tasks.COREFERENCE_RESOLUTION.name: "KB", + Tasks.QUESTION_ANSWERING.name: "QA", + Tasks.TEXTUAL_ENTAILMENT.name: "TE", + Tasks.SEMANTIC_SIMILARITY.name: "PAIRS", + Tasks.TEXT_PAIRS_CLASSIFICATION.name: "PAIRS", + Tasks.PARAPHRASING.name: "T2T", + Tasks.TRANSLATION.name: "T2T", + Tasks.SUMMARIZATION.name: "T2T", + Tasks.TEXT_CLASSIFICATION.name: "TEXT", +} + +SCHEMA_TO_TASKS = defaultdict(set) +for task, schema in TASK_TO_SCHEMA.items(): + SCHEMA_TO_TASKS[schema].add(task) +SCHEMA_TO_TASKS = dict(SCHEMA_TO_TASKS) + +VALID_TASKS = set(TASK_TO_SCHEMA.keys()) +VALID_SCHEMAS = set(TASK_TO_SCHEMA.values()) + +SCHEMA_TO_FEATURES = { + "KB": kb_features, + "QA": qa_features, + "TE": entailment_features, + "T2T": text2text_features, + "TEXT": text_features, + "PAIRS": pairs_features, +} + + +def get_texts_and_offsets_from_bioc_ann(ann: "bioc.BioCAnnotation") -> Tuple: + + offsets = [(loc.offset, loc.offset + loc.length) for loc in ann.locations] + + text = ann.text + + if len(offsets) > 1: + i = 0 + texts = [] + for start, end in offsets: + chunk_len = end - start + texts.append(text[i : chunk_len + i]) + i += chunk_len + while i < len(text) and text[i] == " ": + i += 1 + else: + texts = [text] + + return offsets, texts + + +def remove_prefix(a: str, prefix: str) -> str: + if a.startswith(prefix): + a = a[len(prefix) :] + return a + + +def parse_brat_file( + txt_file: Path, + annotation_file_suffixes: List[str] = None, + parse_notes: bool 
= False, +) -> Dict: + """ + Parse a brat file into the schema defined below. + `txt_file` should be the path to the brat '.txt' file you want to parse, e.g. 'data/1234.txt' + Assumes that the annotations are contained in one or more of the corresponding '.a1', '.a2' or '.ann' files, + e.g. 'data/1234.ann' or 'data/1234.a1' and 'data/1234.a2'. + Will include annotator notes, when `parse_notes == True`. + brat_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text": datasets.Value("string"), + "text_bound_annotations": [ # T line in brat, e.g. type or event trigger + { + "offsets": datasets.Sequence([datasets.Value("int32")]), + "text": datasets.Sequence(datasets.Value("string")), + "type": datasets.Value("string"), + "id": datasets.Value("string"), + } + ], + "events": [ # E line in brat + { + "trigger": datasets.Value( + "string" + ), # refers to the text_bound_annotation of the trigger, + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "arguments": datasets.Sequence( + { + "role": datasets.Value("string"), + "ref_id": datasets.Value("string"), + } + ), + } + ], + "relations": [ # R line in brat + { + "id": datasets.Value("string"), + "head": { + "ref_id": datasets.Value("string"), + "role": datasets.Value("string"), + }, + "tail": { + "ref_id": datasets.Value("string"), + "role": datasets.Value("string"), + }, + "type": datasets.Value("string"), + } + ], + "equivalences": [ # Equiv line in brat + { + "id": datasets.Value("string"), + "ref_ids": datasets.Sequence(datasets.Value("string")), + } + ], + "attributes": [ # M or A lines in brat + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "ref_id": datasets.Value("string"), + "value": datasets.Value("string"), + } + ], + "normalizations": [ # N lines in brat + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "ref_id": datasets.Value("string"), + "resource_name": datasets.Value( + "string" + ), # Name of the resource, e.g. "Wikipedia" + "cuid": datasets.Value( + "string" + ), # ID in the resource, e.g. 534366 + "text": datasets.Value( + "string" + ), # Human readable description/name of the entity, e.g. "Barack Obama" + } + ], + ### OPTIONAL: Only included when `parse_notes == True` + "notes": [ # # lines in brat + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "ref_id": datasets.Value("string"), + "text": datasets.Value("string"), + } + ], + }, + ) + """ + + example = {} + example["document_id"] = txt_file.with_suffix("").name + with txt_file.open() as f: + example["text"] = f.read() + + # If no specific suffixes of the to-be-read annotation files are given - take standard suffixes + # for event extraction + if annotation_file_suffixes is None: + annotation_file_suffixes = [".a1", ".a2", ".ann"] + + if len(annotation_file_suffixes) == 0: + raise AssertionError( + "At least one suffix for the to-be-read annotation files should be given!" 
+ ) + + ann_lines = [] + for suffix in annotation_file_suffixes: + annotation_file = txt_file.with_suffix(suffix) + try: + with annotation_file.open() as f: + ann_lines.extend(f.readlines()) + except Exception: + continue + + example["text_bound_annotations"] = [] + example["events"] = [] + example["relations"] = [] + example["equivalences"] = [] + example["attributes"] = [] + example["normalizations"] = [] + + if parse_notes: + example["notes"] = [] + + for line in ann_lines: + line = line.strip() + if not line: + continue + + if line.startswith("T"): # Text bound + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["type"] = fields[1].split()[0] + ann["offsets"] = [] + span_str = remove_prefix(fields[1], (ann["type"] + " ")) + text = fields[2] + for span in span_str.split(";"): + start, end = span.split() + ann["offsets"].append([int(start), int(end)]) + + # Heuristically split text of discontiguous entities into chunks + ann["text"] = [] + if len(ann["offsets"]) > 1: + i = 0 + for start, end in ann["offsets"]: + chunk_len = end - start + ann["text"].append(text[i : chunk_len + i]) + i += chunk_len + while i < len(text) and text[i] == " ": + i += 1 + else: + ann["text"] = [text] + + example["text_bound_annotations"].append(ann) + + elif line.startswith("E"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + + ann["type"], ann["trigger"] = fields[1].split()[0].split(":") + + ann["arguments"] = [] + for role_ref_id in fields[1].split()[1:]: + argument = { + "role": (role_ref_id.split(":"))[0], + "ref_id": (role_ref_id.split(":"))[1], + } + ann["arguments"].append(argument) + + example["events"].append(ann) + + elif line.startswith("R"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["type"] = fields[1].split()[0] + + ann["head"] = { + "role": fields[1].split()[1].split(":")[0], + "ref_id": fields[1].split()[1].split(":")[1], + } + ann["tail"] = { + "role": fields[1].split()[2].split(":")[0], + "ref_id": fields[1].split()[2].split(":")[1], + } + + example["relations"].append(ann) + + # '*' seems to be the legacy way to mark equivalences, + # but I couldn't find any info on the current way + # this might have to be adapted dependent on the brat version + # of the annotation + elif line.startswith("*"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["ref_ids"] = fields[1].split()[1:] + + example["equivalences"].append(ann) + + elif line.startswith("A") or line.startswith("M"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + + info = fields[1].split() + ann["type"] = info[0] + ann["ref_id"] = info[1] + + if len(info) > 2: + ann["value"] = info[2] + else: + ann["value"] = "" + + example["attributes"].append(ann) + + elif line.startswith("N"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["text"] = fields[2] + + info = fields[1].split() + + ann["type"] = info[0] + ann["ref_id"] = info[1] + ann["resource_name"] = info[2].split(":")[0] + ann["cuid"] = info[2].split(":")[1] + example["normalizations"].append(ann) + + elif parse_notes and line.startswith("#"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["text"] = fields[2] if len(fields) == 3 else BigBioValues.NULL + + info = fields[1].split() + + ann["type"] = info[0] + ann["ref_id"] = info[1] + example["notes"].append(ann) + + return example + + +def brat_parse_to_bigbio_kb(brat_parse: Dict) -> Dict: + """ + Transform a brat parse (conforming to the standard brat schema) obtained with + 
`parse_brat_file` into a dictionary conforming to the `bigbio-kb` schema (as defined in ../schemas/kb.py) + :param brat_parse: + """ + + unified_example = {} + + # Prefix all ids with document id to ensure global uniqueness, + # because brat ids are only unique within their document + id_prefix = brat_parse["document_id"] + "_" + + # identical + unified_example["document_id"] = brat_parse["document_id"] + unified_example["passages"] = [ + { + "id": id_prefix + "_text", + "type": "abstract", + "text": [brat_parse["text"]], + "offsets": [[0, len(brat_parse["text"])]], + } + ] + + # get normalizations + ref_id_to_normalizations = defaultdict(list) + for normalization in brat_parse["normalizations"]: + ref_id_to_normalizations[normalization["ref_id"]].append( + { + "db_name": normalization["resource_name"], + "db_id": normalization["cuid"], + } + ) + + # separate entities and event triggers + unified_example["events"] = [] + non_event_ann = brat_parse["text_bound_annotations"].copy() + for event in brat_parse["events"]: + event = event.copy() + event["id"] = id_prefix + event["id"] + trigger = next( + tr + for tr in brat_parse["text_bound_annotations"] + if tr["id"] == event["trigger"] + ) + if trigger in non_event_ann: + non_event_ann.remove(trigger) + event["trigger"] = { + "text": trigger["text"].copy(), + "offsets": trigger["offsets"].copy(), + } + for argument in event["arguments"]: + argument["ref_id"] = id_prefix + argument["ref_id"] + + unified_example["events"].append(event) + + unified_example["entities"] = [] + anno_ids = [ref_id["id"] for ref_id in non_event_ann] + for ann in non_event_ann: + entity_ann = ann.copy() + entity_ann["id"] = id_prefix + entity_ann["id"] + entity_ann["normalized"] = ref_id_to_normalizations[ann["id"]] + unified_example["entities"].append(entity_ann) + + # massage relations + unified_example["relations"] = [] + skipped_relations = set() + for ann in brat_parse["relations"]: + if ( + ann["head"]["ref_id"] not in anno_ids + or ann["tail"]["ref_id"] not in anno_ids + ): + skipped_relations.add(ann["id"]) + continue + unified_example["relations"].append( + { + "arg1_id": id_prefix + ann["head"]["ref_id"], + "arg2_id": id_prefix + ann["tail"]["ref_id"], + "id": id_prefix + ann["id"], + "type": ann["type"], + "normalized": [], + } + ) + if len(skipped_relations) > 0: + example_id = brat_parse["document_id"] + logger.info( + f"Example:{example_id}: The `bigbio_kb` schema allows `relations` only between entities." + f" Skip (for now): " + f"{list(skipped_relations)}" + ) + + # get coreferences + unified_example["coreferences"] = [] + for i, ann in enumerate(brat_parse["equivalences"], start=1): + is_entity_cluster = True + for ref_id in ann["ref_ids"]: + if not ref_id.startswith("T"): # not textbound -> no entity + is_entity_cluster = False + elif ref_id not in anno_ids: # event trigger -> no entity + is_entity_cluster = False + if is_entity_cluster: + entity_ids = [id_prefix + i for i in ann["ref_ids"]] + unified_example["coreferences"].append( + {"id": id_prefix + str(i), "entity_ids": entity_ids} + ) + return unified_example diff --git a/dataset_builders/hf/drugprot/drugprot.py b/dataset_builders/hf/drugprot/drugprot.py new file mode 100644 index 00000000..6ea24d0d --- /dev/null +++ b/dataset_builders/hf/drugprot/drugprot.py @@ -0,0 +1,260 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+The DrugProt corpus consists of (a) expert-labelled chemical and gene mentions, and (b) all binary relationships
+between them corresponding to a specific set of biologically relevant relation types. The corpus was introduced
+in the context of BioCreative VII Track 1 (Text mining drug and chemical-protein interactions).
+
+For further information see:
+https://biocreative.bioinformatics.udel.edu/tasks/biocreative-vii/track-1/
+"""
+import collections
+from pathlib import Path
+from typing import Dict, Iterator, Tuple
+
+import datasets
+
+from .bigbiohub import kb_features
+from .bigbiohub import BigBioConfig
+from .bigbiohub import Tasks
+
+_LANGUAGES = ['English']
+_PUBMED = True
+_LOCAL = False
+_CITATION = """\
+@inproceedings{miranda2021overview,
+  title={Overview of DrugProt BioCreative VII track: quality evaluation and large scale text mining of \
+  drug-gene/protein relations},
+  author={Miranda, Antonio and Mehryary, Farrokh and Luoma, Jouni and Pyysalo, Sampo and Valencia, Alfonso \
+  and Krallinger, Martin},
+  booktitle={Proceedings of the seventh BioCreative challenge evaluation workshop},
+  year={2021}
+}
+"""
+
+_DATASETNAME = "drugprot"
+_DISPLAYNAME = "DrugProt"
+
+
+_DESCRIPTION = """\
+The DrugProt corpus consists of (a) expert-labelled chemical and gene mentions, and (b) all binary relationships \
+between them corresponding to a specific set of biologically relevant relation types.
+"""
+
+_HOMEPAGE = "https://biocreative.bioinformatics.udel.edu/tasks/biocreative-vii/track-1/"
+
+_LICENSE = 'Creative Commons Attribution 4.0 International'
+
+_URLS = {_DATASETNAME: "https://zenodo.org/record/5119892/files/drugprot-training-development-test-background.zip?download=1"}
+
+_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION]
+
+_SOURCE_VERSION = "1.0.2"
+_BIGBIO_VERSION = "1.0.0"
+
+
+class DrugProtDataset(datasets.GeneratorBasedBuilder):
+    """
+    The DrugProt corpus consists of (a) expert-labelled chemical and gene mentions, and \
+    (b) all binary relationships between them.
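+    Two configurations are provided: `drugprot_source` (the original TSV-based layout)
+    and `drugprot_bigbio_kb` (the unified BigBio KB schema).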
+ """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) + + BUILDER_CONFIGS = [ + BigBioConfig( + name="drugprot_source", + version=SOURCE_VERSION, + description="DrugProt source schema", + schema="source", + subset_id="drugprot", + ), + BigBioConfig( + name="drugprot_bigbio_kb", + version=BIGBIO_VERSION, + description="DrugProt BigBio schema", + schema="bigbio_kb", + subset_id="drugprot", + ), + ] + + DEFAULT_CONFIG_NAME = "drugprot_source" + + def _info(self): + if self.config.schema == "source": + features = datasets.Features( + { + "document_id": datasets.Value("string"), + "title": datasets.Value("string"), + "abstract": datasets.Value("string"), + "text": datasets.Value("string"), + "entities": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "text": datasets.Value("string"), + "offset": datasets.Sequence(datasets.Value("int32")), + } + ], + "relations": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "arg1_id": datasets.Value("string"), + "arg2_id": datasets.Value("string"), + } + ], + } + ) + + elif self.config.schema == "bigbio_kb": + features = kb_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=str(_LICENSE), + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + urls = _URLS[_DATASETNAME] + data_dir = Path(dl_manager.download_and_extract(urls)) + data_dir = data_dir / "drugprot-gs-training-development" + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"data_dir": data_dir, "split": "training"}, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={"data_dir": data_dir, "split": "development"}, + ), + ] + + def _generate_examples(self, data_dir: Path, split: str) -> Iterator[Tuple[str, Dict]]: + if self.config.name == "drugprot_source": + documents = self._read_source_examples(data_dir, split) + for document_id, document in documents.items(): + yield document_id, document + + elif self.config.name == "drugprot_bigbio_kb": + documents = self._read_source_examples(data_dir, split) + for document_id, document in documents.items(): + yield document_id, self._transform_source_to_kb(document) + + def _read_source_examples(self, input_dir: Path, split: str) -> Dict: + """ """ + split_dir = input_dir / split + abstracts_file = split_dir / f"drugprot_{split}_abstracs.tsv" + entities_file = split_dir / f"drugprot_{split}_entities.tsv" + relations_file = split_dir / f"drugprot_{split}_relations.tsv" + + document_to_entities = collections.defaultdict(list) + for line in entities_file.read_text().splitlines(): + columns = line.split("\t") + document_id = columns[0] + + document_to_entities[document_id].append( + { + "id": document_id + "_" + columns[1], + "type": columns[2], + "offset": [columns[3], columns[4]], + "text": columns[5], + } + ) + + document_to_relations = collections.defaultdict(list) + for line in relations_file.read_text().splitlines(): + columns = line.split("\t") + document_id = columns[0] + + document_relations = document_to_relations[document_id] + + document_relations.append( + { + "id": document_id + "_" + str(len(document_relations)), + "type": columns[1], + "arg1_id": document_id + "_" + columns[2][5:], + "arg2_id": document_id + "_" + columns[3][5:], + } + ) + + document_to_source = {} + for line in abstracts_file.read_text().splitlines(): + document_id, title, abstract = line.split("\t") + + 
document_to_source[document_id] = { + "document_id": document_id, + "title": title, + "abstract": abstract, + "text": " ".join([title, abstract]), + "entities": document_to_entities[document_id], + "relations": document_to_relations[document_id], + } + + return document_to_source + + def _transform_source_to_kb(self, source_document: Dict) -> Dict: + document_id = source_document["document_id"] + + offset = 0 + passages = [] + for text_field in ["title", "abstract"]: + text = source_document[text_field] + passages.append( + { + "id": document_id + "_" + text_field, + "type": text_field, + "text": [text], + "offsets": [[offset, offset + len(text)]], + } + ) + offset += len(text) + 1 + + entities = [ + { + "id": entity["id"], + "type": entity["type"], + "text": [entity["text"]], + "offsets": [entity["offset"]], + "normalized": [], + } + for entity in source_document["entities"] + ] + + relations = [ + { + "id": relation["id"], + "type": relation["type"], + "arg1_id": relation["arg1_id"], + "arg2_id": relation["arg2_id"], + "normalized": [], + } + for relation in source_document["relations"] + ] + + return { + "id": document_id, + "document_id": document_id, + "passages": passages, + "entities": entities, + "relations": relations, + "events": [], + "coreferences": [], + } diff --git a/tests/dataset_builders/hf/drugprot/test_drugprot.py b/tests/dataset_builders/hf/drugprot/test_drugprot.py new file mode 100644 index 00000000..4ed7e5aa --- /dev/null +++ b/tests/dataset_builders/hf/drugprot/test_drugprot.py @@ -0,0 +1,263 @@ +from typing import Any, Dict + +import datasets +import pytest + +from dataset_builders.hf.drugprot.drugprot import ( + DrugProtDataset, +) +from tests.dataset_builders.common import HF_BASE_PATH + +DATASET_NAME = "drugprot" +HF_DATASET_PATH = str(HF_BASE_PATH / DATASET_NAME) +SPLIT_NAMES = {"train", "validation"} +SPLIT_SIZES = {"train": 3500, "validation": 750} + + +@pytest.fixture(params=[config.name for config in DrugProtDataset.BUILDER_CONFIGS], scope="module") +def dataset_variant(request) -> str: + return request.param + + +@pytest.fixture(scope="module") +def hf_dataset(dataset_variant) -> datasets.DatasetDict: + return datasets.load_dataset(HF_DATASET_PATH, name=dataset_variant) + + +def test_hf_dataset(hf_dataset): + assert set(hf_dataset) == SPLIT_NAMES + split_sizes = {split_name: len(ds) for split_name, ds in hf_dataset.items()} + assert split_sizes == SPLIT_SIZES + + +@pytest.fixture(scope="module") +def hf_example(hf_dataset) -> Dict[str, Any]: + return hf_dataset["train"][0] + + +def test_hf_example(hf_example, dataset_variant): + if dataset_variant == "drugprot_source": + assert hf_example == { + "document_id": "17512723", + "title": "RDH12, a retinol dehydrogenase causing Leber's congenital amaurosis, is also involved in steroid metabolism.", + "abstract": "Three retinol dehydrogenases (RDHs) were tested for steroid converting abilities: human and murine RDH 12 and human RDH13. RDH12 is involved in retinal degeneration in Leber's congenital amaurosis (LCA). We show that murine Rdh12 and human RDH13 do not reveal activity towards the checked steroids, but that human type 12 RDH reduces dihydrotestosterone to androstanediol, and is thus also involved in steroid metabolism. Furthermore, we analyzed both expression and subcellular localization of these enzymes.", + "text": "RDH12, a retinol dehydrogenase causing Leber's congenital amaurosis, is also involved in steroid metabolism. 
Three retinol dehydrogenases (RDHs) were tested for steroid converting abilities: human and murine RDH 12 and human RDH13. RDH12 is involved in retinal degeneration in Leber's congenital amaurosis (LCA). We show that murine Rdh12 and human RDH13 do not reveal activity towards the checked steroids, but that human type 12 RDH reduces dihydrotestosterone to androstanediol, and is thus also involved in steroid metabolism. Furthermore, we analyzed both expression and subcellular localization of these enzymes.", + "entities": [ + { + "id": "17512723_T1", + "type": "CHEMICAL", + "text": "androstanediol", + "offset": [466, 480], + }, + { + "id": "17512723_T2", + "type": "CHEMICAL", + "text": "retinol", + "offset": [115, 122], + }, + { + "id": "17512723_T3", + "type": "CHEMICAL", + "text": "retinol", + "offset": [9, 16], + }, + { + "id": "17512723_T4", + "type": "GENE-Y", + "text": "human RDH13", + "offset": [219, 230], + }, + { + "id": "17512723_T5", + "type": "GENE-Y", + "text": "RDH12", + "offset": [232, 237], + }, + { + "id": "17512723_T6", + "type": "GENE-Y", + "text": "murine Rdh12", + "offset": [326, 338], + }, + { + "id": "17512723_T7", + "type": "GENE-Y", + "text": "human RDH13", + "offset": [343, 354], + }, + { + "id": "17512723_T8", + "type": "GENE-N", + "text": "RDHs", + "offset": [139, 143], + }, + { + "id": "17512723_T9", + "type": "GENE-Y", + "text": "human type 12 RDH", + "offset": [417, 434], + }, + { + "id": "17512723_T10", + "type": "GENE-N", + "text": "retinol dehydrogenases", + "offset": [115, 137], + }, + { + "id": "17512723_T11", + "type": "GENE-N", + "text": "human and murine RDH 12", + "offset": [191, 214], + }, + { + "id": "17512723_T12", + "type": "GENE-Y", + "text": "RDH12", + "offset": [0, 5], + }, + { + "id": "17512723_T13", + "type": "GENE-N", + "text": "retinol dehydrogenase", + "offset": [9, 30], + }, + ], + "relations": [ + { + "id": "17512723_0", + "type": "PRODUCT-OF", + "arg1_id": "17512723_T1", + "arg2_id": "17512723_T9", + } + ], + } + elif dataset_variant == "drugprot_bigbio_kb": + assert hf_example == { + "id": "17512723", + "document_id": "17512723", + "passages": [ + { + "id": "17512723_title", + "type": "title", + "text": [ + "RDH12, a retinol dehydrogenase causing Leber's congenital amaurosis, is also involved in steroid metabolism." + ], + "offsets": [[0, 108]], + }, + { + "id": "17512723_abstract", + "type": "abstract", + "text": [ + "Three retinol dehydrogenases (RDHs) were tested for steroid converting abilities: human and murine RDH 12 and human RDH13. RDH12 is involved in retinal degeneration in Leber's congenital amaurosis (LCA). We show that murine Rdh12 and human RDH13 do not reveal activity towards the checked steroids, but that human type 12 RDH reduces dihydrotestosterone to androstanediol, and is thus also involved in steroid metabolism. Furthermore, we analyzed both expression and subcellular localization of these enzymes." 
+ ], + "offsets": [[109, 618]], + }, + ], + "entities": [ + { + "id": "17512723_T1", + "type": "CHEMICAL", + "text": ["androstanediol"], + "offsets": [[466, 480]], + "normalized": [], + }, + { + "id": "17512723_T2", + "type": "CHEMICAL", + "text": ["retinol"], + "offsets": [[115, 122]], + "normalized": [], + }, + { + "id": "17512723_T3", + "type": "CHEMICAL", + "text": ["retinol"], + "offsets": [[9, 16]], + "normalized": [], + }, + { + "id": "17512723_T4", + "type": "GENE-Y", + "text": ["human RDH13"], + "offsets": [[219, 230]], + "normalized": [], + }, + { + "id": "17512723_T5", + "type": "GENE-Y", + "text": ["RDH12"], + "offsets": [[232, 237]], + "normalized": [], + }, + { + "id": "17512723_T6", + "type": "GENE-Y", + "text": ["murine Rdh12"], + "offsets": [[326, 338]], + "normalized": [], + }, + { + "id": "17512723_T7", + "type": "GENE-Y", + "text": ["human RDH13"], + "offsets": [[343, 354]], + "normalized": [], + }, + { + "id": "17512723_T8", + "type": "GENE-N", + "text": ["RDHs"], + "offsets": [[139, 143]], + "normalized": [], + }, + { + "id": "17512723_T9", + "type": "GENE-Y", + "text": ["human type 12 RDH"], + "offsets": [[417, 434]], + "normalized": [], + }, + { + "id": "17512723_T10", + "type": "GENE-N", + "text": ["retinol dehydrogenases"], + "offsets": [[115, 137]], + "normalized": [], + }, + { + "id": "17512723_T11", + "type": "GENE-N", + "text": ["human and murine RDH 12"], + "offsets": [[191, 214]], + "normalized": [], + }, + { + "id": "17512723_T12", + "type": "GENE-Y", + "text": ["RDH12"], + "offsets": [[0, 5]], + "normalized": [], + }, + { + "id": "17512723_T13", + "type": "GENE-N", + "text": ["retinol dehydrogenase"], + "offsets": [[9, 30]], + "normalized": [], + }, + ], + "events": [], + "coreferences": [], + "relations": [ + { + "id": "17512723_0", + "type": "PRODUCT-OF", + "arg1_id": "17512723_T1", + "arg2_id": "17512723_T9", + "normalized": [], + } + ], + } + else: + raise ValueError(f"Unknown dataset variant: {dataset_variant}") +
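+
+
+# A minimal referential-integrity check on top of the fixtures above: every relation
+# argument should reference an entity id defined in the same example. Both configs
+# prefix entity ids and relation argument ids with the document id, so the check
+# applies to the source as well as the bigbio_kb variant.
+def test_relation_arguments_reference_entities(hf_dataset):
+    # Checking only the first example of each split keeps the test fast; scanning
+    # all 4250 documents would work the same way.
+    for split in hf_dataset.values():
+        example = split[0]
+        entity_ids = {entity["id"] for entity in example["entities"]}
+        for relation in example["relations"]:
+            assert relation["arg1_id"] in entity_ids
+            assert relation["arg2_id"] in entity_ids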