From d35e983d82a4a0303cde2e8e03070d386264aea1 Mon Sep 17 00:00:00 2001 From: ArneBinder Date: Wed, 8 Nov 2023 14:09:08 +0100 Subject: [PATCH] remove Huggingface dataset scripts (moved to pie-datasets, see https://github.com/ArneBinder/pie-datasets/pull/36) (#368) --- src/pytorch_ie/data/datasets/__init__.py | 3 - .../data/datasets/hf_datasets/__init__.py | 0 .../data/datasets/hf_datasets/ace2004.py | 153 ---- .../data/datasets/hf_datasets/ace2005.py | 140 ---- .../data/datasets/hf_datasets/brat.py | 337 -------- .../data/datasets/hf_datasets/chemprot.py | 176 ----- .../data/datasets/hf_datasets/fewrel.py | 285 ------- .../data/datasets/hf_datasets/genia.py | 406 ---------- .../data/datasets/hf_datasets/ontonotes.py | 161 ---- .../data/datasets/hf_datasets/scierc.py | 132 ---- .../hf_datasets/semeval_2010_task_8.py | 185 ----- .../data/datasets/hf_datasets/tacred.py | 257 ------ .../data/datasets/hf_datasets/webred.py | 746 ------------------ .../data/datasets/hf_datasets/wiki80.py | 199 ----- .../data/datasets/hf_datasets/wikigold.py | 122 --- 15 files changed, 3302 deletions(-) delete mode 100644 src/pytorch_ie/data/datasets/__init__.py delete mode 100644 src/pytorch_ie/data/datasets/hf_datasets/__init__.py delete mode 100644 src/pytorch_ie/data/datasets/hf_datasets/ace2004.py delete mode 100644 src/pytorch_ie/data/datasets/hf_datasets/ace2005.py delete mode 100644 src/pytorch_ie/data/datasets/hf_datasets/brat.py delete mode 100644 src/pytorch_ie/data/datasets/hf_datasets/chemprot.py delete mode 100644 src/pytorch_ie/data/datasets/hf_datasets/fewrel.py delete mode 100644 src/pytorch_ie/data/datasets/hf_datasets/genia.py delete mode 100644 src/pytorch_ie/data/datasets/hf_datasets/ontonotes.py delete mode 100644 src/pytorch_ie/data/datasets/hf_datasets/scierc.py delete mode 100644 src/pytorch_ie/data/datasets/hf_datasets/semeval_2010_task_8.py delete mode 100644 src/pytorch_ie/data/datasets/hf_datasets/tacred.py delete mode 100644 src/pytorch_ie/data/datasets/hf_datasets/webred.py delete mode 100644 src/pytorch_ie/data/datasets/hf_datasets/wiki80.py delete mode 100644 src/pytorch_ie/data/datasets/hf_datasets/wikigold.py diff --git a/src/pytorch_ie/data/datasets/__init__.py b/src/pytorch_ie/data/datasets/__init__.py deleted file mode 100644 index ffbe5d00..00000000 --- a/src/pytorch_ie/data/datasets/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -import pathlib - -HF_DATASETS_ROOT = pathlib.Path(__file__).parent / "hf_datasets" diff --git a/src/pytorch_ie/data/datasets/hf_datasets/__init__.py b/src/pytorch_ie/data/datasets/hf_datasets/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/pytorch_ie/data/datasets/hf_datasets/ace2004.py b/src/pytorch_ie/data/datasets/hf_datasets/ace2004.py deleted file mode 100644 index 646f1f34..00000000 --- a/src/pytorch_ie/data/datasets/hf_datasets/ace2004.py +++ /dev/null @@ -1,153 +0,0 @@ -"""TODO: Add a description here.""" - - -import json -import os - -import datasets - -_CITATION_ACE2004 = """\ -@inproceedings{doddington-etal-2004-automatic, - title = "The Automatic Content Extraction ({ACE}) Program {--} Tasks, Data, and Evaluation", - author = "Doddington, George and - Mitchell, Alexis and - Przybocki, Mark and - Ramshaw, Lance and - Strassel, Stephanie and - Weischedel, Ralph", - booktitle = "Proceedings of the Fourth International Conference on Language Resources and Evaluation ({LREC}{'}04)", - month = may, - year = "2004", - address = "Lisbon, Portugal", - publisher = "European Language Resources Association (ELRA)", - url = 
"http://www.lrec-conf.org/proceedings/lrec2004/pdf/5.pdf", -} -""" - -# You can copy an official description -_DESCRIPTION = """\ -ACE 2004 Multilingual Training Corpus contains the complete set of English, Arabic and Chinese -training data for the 2004 Automatic Content Extraction (ACE) technology evaluation. The corpus consists of data of -various types annotated for entities and relations and was created by Linguistic Data Consortium with support from -the ACE Program, with additional assistance from the DARPA TIDES (Translingual Information Detection, Extraction and -Summarization) Program. This data was previously distributed as an e-corpus (LDC2004E17) to participants in the 2004 -ACE evaluation. - -The objective of the ACE program is to develop automatic content extraction technology to support automatic -processing of human language in text form. In September 2004, sites were evaluated on system performance in six -areas: Entity Detection and Recognition (EDR), Entity Mention Detection (EMD), EDR Co-reference, Relation Detection -and Recognition (RDR), Relation Mention Detection (RMD), and RDR given reference entities. All tasks were evaluated -in three languages: English, Chinese and Arabic. - -The current publication consists of the official training data for these evaluation tasks. A seventh evaluation area, -Timex Detection and Recognition, is supported by the ACE Time Normalization (TERN) 2004 English Training Data Corpus -(LDC2005T07). The TERN corpus source data largely overlaps with the English source data contained in the current -release. - -For more information about linguistic resources for the ACE program, including annotation guidelines, -task definitions, free annotation tools and other documentation, please visit LDC's ACE website: -https://www.ldc.upenn.edu/collaborations/past-projects/ace -""" - -_HOMEPAGE = "https://catalog.ldc.upenn.edu/LDC2005T09" - -# TODO: Add the license for the dataset here if you can find it -_LICENSE = """https://catalog.ldc.upenn.edu/license/ldc-non-members-agreement.pdf""" - -# TODO: Add class labels -_CLASS_LABELS = ["PHYS", "EMP-ORG", "ART", "OTHER-AFF", "GPE-AFF", "PER-SOC"] - - -class ACE2004(datasets.GeneratorBasedBuilder): - """TODO: Short description of my dataset.""" - - VERSION = datasets.Version("1.0.0") # type: ignore - - @property - def manual_download_instructions(self): - return ( - "To use ACE2004 you have to download it manually. " - "It is available via the LDC at https://catalog.ldc.upenn.edu/LDC2005T09" - "Preprocess the data as described in " - "https://github.com/LorrinWWW/two-are-better-than-one/tree/master/datasets and " - "extract test.ACE04_0,json, train.ACE04_0.json, valid.ACE04_0.json files from the " - "unified folder in one folder, and load the dataset with: " - "`datasets.load_dataset('ace2004', data_dir='path/to/folder/folder_name')`" - ) - - def _info(self): - return datasets.DatasetInfo( - # This is the description that will appear on the datasets page. - description=_DESCRIPTION, - # This defines the different columns of the dataset and their types - features=datasets.Features( - { - "tokens": datasets.Sequence(datasets.Value("string")), - "head_start": datasets.Value("int32"), - "head_end": datasets.Value("int32"), - "tail_start": datasets.Value("int32"), - "tail_end": datasets.Value("int32"), - "label": datasets.ClassLabel(names=_CLASS_LABELS), - } - ), - # If there's a common (input, target) tuple from the features, - # specify them here. 
They'll be used if as_supervised=True in - # builder.as_dataset. - supervised_keys=None, - # Homepage of the dataset for documentation - homepage=_HOMEPAGE, - # License for the dataset if available - license=_LICENSE, - # Citation for the dataset - citation=_CITATION_ACE2004, - ) - - def _split_generators(self, dl_manager): - """Returns SplitGenerators.""" - # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration - # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name - - data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir)) - - if not os.path.exists(data_dir): - raise FileNotFoundError( - "{} does not exist. Make sure you insert a manual dir via `datasets.load_dataset('ace2004', data_dir=...)` that includes the train, valid, test files. Manual download instructions: {}".format( - data_dir, self.manual_download_instructions - ) - ) - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={"filepath": os.path.join(data_dir, "train.ACE04_0.json")}, - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={"filepath": os.path.join(data_dir, "test.ACE04_0.json")}, - ), - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={"filepath": os.path.join(data_dir, "valid.ACE04_0.json")}, - ), - ] - - def _generate_examples(self, filepath): - """Yields examples.""" - with open(filepath, encoding="utf-8") as f: - data = json.load(f) - for example in data: - idx = 0 - for rel in example["relations"]: - head_start, head_end, tail_start, tail_end, label = rel - - id_ = str(idx) - idx += 1 - - yield id_, { - "tokens": example["tokens"], - "head_start": head_start, - "head_end": head_end, - "tail_start": tail_start, - "tail_end": tail_end, - "label": label, - } diff --git a/src/pytorch_ie/data/datasets/hf_datasets/ace2005.py b/src/pytorch_ie/data/datasets/hf_datasets/ace2005.py deleted file mode 100644 index be86fa70..00000000 --- a/src/pytorch_ie/data/datasets/hf_datasets/ace2005.py +++ /dev/null @@ -1,140 +0,0 @@ -"""TODO: Add a description here.""" - - -import json -import os - -import datasets - -_CITATION_ACE2005 = """\ -@article{walker2006ace, - title={ACE 2005 multilingual training corpus}, - author={Walker, Christopher and Strassel, Stephanie and Medero, Julie and Maeda, Kazuaki}, - journal={Linguistic Data Consortium, Philadelphia}, - volume={57}, - pages={45}, - year={2006} -} -""" - -# You can copy an official description -_DESCRIPTION = """\ -ACE 2005 Multilingual Training Corpus contains the complete set of English, Arabic and Chinese -training data for the 2005 Automatic Content Extraction (ACE) technology evaluation. The corpus consists of data of -various types annotated for entities, relations and events by the Linguistic Data Consortium (LDC) with support from -the ACE Program and additional assistance from LDC. - -The objective of the ACE program was to develop automatic content extraction technology to support automatic -processing of human language in text form. - -In November 2005, sites were evaluated on system performance in five primary areas: the recognition of entities, -values, temporal expressions, relations, and events. Entity, relation and event mention detection were also offered -as diagnostic tasks. All tasks with the exception of event tasks were performed for three languages, English, -Chinese and Arabic. 
Events tasks were evaluated in English and Chinese only. This release comprises the official -training data for these evaluation tasks. - -For more information about linguistic resources for the ACE Program, including annotation guidelines, -task definitions and other documentation, see LDC's ACE website: -http://projects.ldc.upenn.edu/ace/ -""" - -_HOMEPAGE = "https://catalog.ldc.upenn.edu/LDC2006T06" - -_LICENSE = """https://catalog.ldc.upenn.edu/license/ldc-non-members-agreement.pdf""" - -_CLASS_LABELS = ["PHYS", "ART", "PART-WHOLE", "ORG-AFF", "GEN-AFF", "PER-SOC"] - - -class ACE2004(datasets.GeneratorBasedBuilder): - """TODO: Short description of my dataset.""" - - VERSION = datasets.Version("1.0.0") # type: ignore - - @property - def manual_download_instructions(self): - return ( - "To use ACE2005 you have to download it manually. " - "It is available via the LDC at https://catalog.ldc.upenn.edu/LDC2006T06" - "Preprocess the data as described in " - "https://github.com/LorrinWWW/two-are-better-than-one/tree/master/datasets and " - "extract test.ACE05.json, train.ACE05.json, valid.ACE05.json files from the " - "unified folder in one folder, and load the dataset with: " - "`datasets.load_dataset('ace2005', data_dir='path/to/folder/folder_name')`" - ) - - def _info(self): - return datasets.DatasetInfo( - # This is the description that will appear on the datasets page. - description=_DESCRIPTION, - # This defines the different columns of the dataset and their types - features=datasets.Features( - { - "tokens": datasets.Sequence(datasets.Value("string")), - "head_start": datasets.Value("int32"), - "head_end": datasets.Value("int32"), - "tail_start": datasets.Value("int32"), - "tail_end": datasets.Value("int32"), - "label": datasets.ClassLabel(names=_CLASS_LABELS), - } - ), - # If there's a common (input, target) tuple from the features, - # specify them here. They'll be used if as_supervised=True in - # builder.as_dataset. - supervised_keys=None, - # Homepage of the dataset for documentation - homepage=_HOMEPAGE, - # License for the dataset if available - license=_LICENSE, - # Citation for the dataset - citation=_CITATION_ACE2005, - ) - - def _split_generators(self, dl_manager): - """Returns SplitGenerators.""" - # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration - # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name - - data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir)) - - if not os.path.exists(data_dir): - raise FileNotFoundError( - "{} does not exist. Make sure you insert a manual dir via `datasets.load_dataset('ace2005', data_dir=...)` that includes the train, valid, test files. 
Manual download instructions: {}".format( - data_dir, self.manual_download_instructions - ) - ) - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={"filepath": os.path.join(data_dir, "train.ACE05.json")}, - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={"filepath": os.path.join(data_dir, "test.ACE05.json")}, - ), - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={"filepath": os.path.join(data_dir, "valid.ACE05.json")}, - ), - ] - - def _generate_examples(self, filepath): - """Yields examples.""" - with open(filepath, encoding="utf-8") as f: - data = json.load(f) - for example in data: - idx = 0 - for rel in example["relations"]: - head_start, head_end, tail_start, tail_end, label = rel - - id_ = str(idx) - idx += 1 - - yield id_, { - "tokens": example["tokens"], - "head_start": head_start, - "head_end": head_end, - "tail_start": tail_start, - "tail_end": tail_end, - "label": label, - } diff --git a/src/pytorch_ie/data/datasets/hf_datasets/brat.py b/src/pytorch_ie/data/datasets/hf_datasets/brat.py deleted file mode 100644 index 32ae39ca..00000000 --- a/src/pytorch_ie/data/datasets/hf_datasets/brat.py +++ /dev/null @@ -1,337 +0,0 @@ -import glob -import logging -from dataclasses import dataclass -from os import listdir, path -from typing import Dict, List, Optional - -import datasets -from datasets import BuilderConfig, DatasetInfo, Features, Sequence, SplitGenerator, Value - -logger = logging.getLogger(__name__) - - -@dataclass -class BratConfig(BuilderConfig): - """BuilderConfig for BRAT.""" - - url: str = None # type: ignore - description: Optional[str] = None - citation: Optional[str] = None - homepage: Optional[str] = None - - subdirectory_mapping: Optional[Dict[str, str]] = None - file_name_blacklist: Optional[List[str]] = None - ann_file_extension: str = "ann" - txt_file_extension: str = "txt" - - -class Brat(datasets.GeneratorBasedBuilder): - BUILDER_CONFIG_CLASS = BratConfig - - def _info(self): - return DatasetInfo( - description=self.config.description, - citation=self.config.citation, - homepage=self.config.homepage, - features=Features( - { - "context": Value("string"), - "file_name": Value("string"), - "spans": Sequence( - { - "id": Value("string"), - "type": Value("string"), - "locations": Sequence( - { - "start": Value("int32"), - "end": Value("int32"), - } - ), - "text": Value("string"), - } - ), - "relations": Sequence( - { - "id": Value("string"), - "type": Value("string"), - "arguments": Sequence( - {"type": Value("string"), "target": Value("string")} - ), - } - ), - "equivalence_relations": Sequence( - { - "type": Value("string"), - "targets": Sequence(Value("string")), - } - ), - "events": Sequence( - { - "id": Value("string"), - "type": Value("string"), - "trigger": Value("string"), - "arguments": Sequence( - {"type": Value("string"), "target": Value("string")} - ), - } - ), - "attributions": Sequence( - { - "id": Value("string"), - "type": Value("string"), - "target": Value("string"), - "value": Value("string"), - } - ), - "normalizations": Sequence( - { - "id": Value("string"), - "type": Value("string"), - "target": Value("string"), - "resource_id": Value("string"), - "entity_id": Value("string"), - } - ), - "notes": Sequence( - { - "id": Value("string"), - "type": Value("string"), - "target": Value("string"), - "note": Value("string"), - } - ), - } - ), - ) - - @staticmethod - def _get_location(location_string): - parts = location_string.split(" ") - assert ( - len(parts) == 2 - ), 
f"Wrong number of entries in location string. Expected 2, but found: {parts}" - return {"start": int(parts[0]), "end": int(parts[1])} - - @staticmethod - def _get_span_annotation(annotation_line): - """ - example input: - T1 Organization 0 4 Sony - """ - - _id, remaining, text = annotation_line.split("\t", maxsplit=2) - _type, locations = remaining.split(" ", maxsplit=1) - return { - "id": _id, - "text": text, - "type": _type, - "locations": [Brat._get_location(loc) for loc in locations.split(";")], - } - - @staticmethod - def _get_event_annotation(annotation_line): - """ - example input: - E1 MERGE-ORG:T2 Org1:T1 Org2:T3 - """ - _id, remaining = annotation_line.strip().split("\t") - args = [dict(zip(["type", "target"], a.split(":"))) for a in remaining.split(" ")] - return { - "id": _id, - "type": args[0]["type"], - "trigger": args[0]["target"], - "arguments": args[1:], - } - - @staticmethod - def _get_relation_annotation(annotation_line): - """ - example input: - R1 Origin Arg1:T3 Arg2:T4 - """ - - _id, remaining = annotation_line.strip().split("\t") - _type, remaining = remaining.split(" ", maxsplit=1) - args = [dict(zip(["type", "target"], a.split(":"))) for a in remaining.split(" ")] - return {"id": _id, "type": _type, "arguments": args} - - @staticmethod - def _get_equivalence_relation_annotation(annotation_line): - """ - example input: - * Equiv T1 T2 T3 - """ - _, remaining = annotation_line.strip().split("\t") - parts = remaining.split(" ") - return {"type": parts[0], "targets": parts[1:]} - - @staticmethod - def _get_attribute_annotation(annotation_line): - """ - example input (binary: implicit value is True, if present, False otherwise): - A1 Negation E1 - example input (multi-value: explicit value) - A2 Confidence E2 L1 - """ - - _id, remaining = annotation_line.strip().split("\t") - parts = remaining.split(" ") - # if no value is present, it is implicitly "true" - if len(parts) == 2: - parts.append("true") - return { - "id": _id, - "type": parts[0], - "target": parts[1], - "value": parts[2], - } - - @staticmethod - def _get_normalization_annotation(annotation_line): - """ - example input: - N1 Reference T1 Wikipedia:534366 Barack Obama - """ - _id, remaining, text = annotation_line.split("\t", maxsplit=2) - _type, target, ref = remaining.split(" ") - res_id, ent_id = ref.split(":") - return { - "id": _id, - "type": _type, - "target": target, - "resource_id": res_id, - "entity_id": ent_id, - } - - @staticmethod - def _get_note_annotation(annotation_line): - """ - example input: - #1 AnnotatorNotes T1 this annotation is suspect - """ - _id, remaining, note = annotation_line.split("\t", maxsplit=2) - _type, target = remaining.split(" ") - return { - "id": _id, - "type": _type, - "target": target, - "note": note, - } - - @staticmethod - def _read_annotation_file(filename): - """ - reads a BRAT v1.3 annotations file (see https://brat.nlplab.org/standoff.html) - """ - - res = { - "spans": [], - "events": [], - "relations": [], - "equivalence_relations": [], - "attributions": [], - "normalizations": [], - "notes": [], - } - - with open(filename) as file: - for i, line in enumerate(file): - if len(line.strip()) == 0: - continue - ann_type = line[0] - - # strip away the new line character - if line.endswith("\n"): - line = line[:-1] - - if ann_type == "T": - res["spans"].append(Brat._get_span_annotation(line)) - elif ann_type == "E": - res["events"].append(Brat._get_event_annotation(line)) - elif ann_type == "R": - res["relations"].append(Brat._get_relation_annotation(line)) - elif 
ann_type == "*": - res["equivalence_relations"].append( - Brat._get_equivalence_relation_annotation(line) - ) - elif ann_type in ["A", "M"]: - res["attributions"].append(Brat._get_attribute_annotation(line)) - elif ann_type == "N": - res["normalizations"].append(Brat._get_normalization_annotation(line)) - elif ann_type == "#": - res["notes"].append(Brat._get_note_annotation(line)) - else: - raise ValueError( - f'unknown BRAT annotation id type: "{line}" (from file {filename} @line {i}). ' - f"Annotation ids have to start with T (spans), E (events), R (relations), " - f"A (attributions), or N (normalizations). See " - f"https://brat.nlplab.org/standoff.html for the BRAT annotation file " - f"specification." - ) - return res - - def _generate_examples(self, files=None, directory=None): - """Read context (.txt) and annotation (.ann) files.""" - if files is None: - assert ( - directory is not None - ), "If files is None, directory has to be provided, but it is also None." - _files = glob.glob(f"{directory}/*.{self.config.ann_file_extension}") - files = sorted(path.splitext(fn)[0] for fn in _files) - - for filename in files: - basename = path.basename(filename) - if ( - self.config.file_name_blacklist is not None - and basename in self.config.file_name_blacklist - ): - logger.info(f"skip annotation file: {basename} (blacklisted)") - continue - - ann_fn = f"{filename}.{self.config.ann_file_extension}" - brat_annotations = Brat._read_annotation_file(ann_fn) - - txt_fn = f"{filename}.{self.config.txt_file_extension}" - txt_content = open(txt_fn).read() - brat_annotations["context"] = txt_content - brat_annotations["file_name"] = basename - - yield basename, brat_annotations - - def _split_generators(self, dl_manager): - """Returns SplitGenerators.""" - - subdirectory_mapping = self.config.subdirectory_mapping - - # since subclasses of BuilderConfig are not allowed to define - # attributes without defaults, check here - assert self.config.url is not None, "data url not specified" - - # if url points to a local directory, just point to that - if path.exists(self.config.url) and path.isdir(self.config.url): - data_dir = self.config.url - # otherwise, download and extract - else: - data_dir = dl_manager.download_and_extract(self.config.url) - logging.info(f"load from data dir: {data_dir}") - - # if no subdirectory mapping is provided, ... - if subdirectory_mapping is None: - # ... use available subdirectories as split names ... - subdirs = [f for f in listdir(data_dir) if path.isdir(path.join(data_dir, f))] - if len(subdirs) > 0: - subdirectory_mapping = {subdir: subdir for subdir in subdirs} - else: - # ... 
otherwise, default to a single train split with the base directory - subdirectory_mapping = {"": "train"} - - return [ - SplitGenerator( - name=split, - # These kwargs will be passed to _generate_examples - gen_kwargs={ - "directory": path.join(data_dir, subdir), - }, - ) - for subdir, split in subdirectory_mapping.items() - ] diff --git a/src/pytorch_ie/data/datasets/hf_datasets/chemprot.py b/src/pytorch_ie/data/datasets/hf_datasets/chemprot.py deleted file mode 100644 index d09aa2c2..00000000 --- a/src/pytorch_ie/data/datasets/hf_datasets/chemprot.py +++ /dev/null @@ -1,176 +0,0 @@ -"""TODO: Add a description here.""" - - -import json -import re - -import datasets - -_CITATION_CHEMPROT = """\ -@article{article, -author = {Kringelum, Jens and Kjaerulff, Sonny and Brunak, Søren and Lund, Ole and Oprea, Tudor and Taboureau, Olivier}, -year = {2016}, -month = {02}, -pages = {bav123}, -title = {ChemProt-3.0: A global chemical biology diseases mapping}, -volume = {2016}, -journal = {Database}, -doi = {10.1093/database/bav123} -}""" - -# You can copy an official description -_DESCRIPTION = """\ -ChemProt is a publicly available compilation of chemical-protein-disease annotation resources that enables the study -of systems pharmacology for a small molecule across multiple layers of complexity from molecular to clinical levels. -In this third version, ChemProt has been updated to more than 1.7 million compounds with 7.8 million bioactivity -measurements for 19 504 proteins. -""" - -_HOMEPAGE = "http://potentia.cbs.dtu.dk/ChemProt/" - -# TODO: Add the license for the dataset here if you can find it -_LICENSE = "" - -# TODO: Add link to the official dataset URLs here, currently pointing to preprocessed scibert files -# The HuggingFace dataset library don't host the datasets but only point to the original files -# This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method) -_DATA_URLs = { - "train": "https://raw.githubusercontent.com/allenai/scibert/master/data/text_classification/chemprot/train.txt", - "dev": "https://raw.githubusercontent.com/allenai/scibert/master/data/text_classification/chemprot/dev.txt", - "test": "https://raw.githubusercontent.com/allenai/scibert/master/data/text_classification/chemprot/test.txt", -} - -_CLASS_LABELS = [ - "ACTIVATOR", - "AGONIST", - "AGONIST-ACTIVATOR", - "AGONIST-INHIBITOR", - "ANTAGONIST", - "DOWNREGULATOR", - "INDIRECT-DOWNREGULATOR", - "INDIRECT-UPREGULATOR", - "INHIBITOR", - "PRODUCT-OF", - "SUBSTRATE", - "SUBSTRATE_PRODUCT-OF", - "UPREGULATOR", -] - - -# TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case -class ChemProt(datasets.GeneratorBasedBuilder): - """TODO: Short description of my dataset.""" - - VERSION = datasets.Version("3.0.0") # type: ignore - - def _info(self): - return datasets.DatasetInfo( - # This is the description that will appear on the datasets page. - description=_DESCRIPTION, - # This defines the different columns of the dataset and their types - features=datasets.Features( - { - "tokens": datasets.Sequence(datasets.Value("string")), - "head_start": datasets.Value("int32"), - "head_end": datasets.Value("int32"), - "tail_start": datasets.Value("int32"), - "tail_end": datasets.Value("int32"), - "label": datasets.ClassLabel(names=_CLASS_LABELS), - } - ), - # If there's a common (input, target) tuple from the features, - # specify them here. They'll be used if as_supervised=True in - # builder.as_dataset. 
- supervised_keys=None, - # Homepage of the dataset for documentation - homepage=_HOMEPAGE, - # License for the dataset if available - license=_LICENSE, - # Citation for the dataset - citation=_CITATION_CHEMPROT, - ) - - def _split_generators(self, dl_manager): - """Returns SplitGenerators.""" - # This method is tasked with downloading/extracting the data and defining the splits depending on the configuration - # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name - - # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs - # It can accept any type or nested list/dict and will give back the same structure with the url replaced with path to local files. - # By default the archives will be extracted and a path to a cached folder where they are extracted is returned instead of the archive - data_files = dl_manager.download_and_extract(_DATA_URLs) - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={"filepath": data_files.get("train")}, - ), - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={"filepath": data_files.get("dev")}, - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={"filepath": data_files.get("test")}, - ), - ] - - def _generate_examples(self, filepath): - """Yields examples.""" - with open(filepath, encoding="utf-8") as f: - for idx, line in enumerate(f.readlines()): - example = json.loads(line) - raw_text = example["text"] - label = example["label"] - id_ = str(idx) - - # handle special case with square brackets surrounding entities in raw text - raw_text = re.sub(r"\[\[\[", "[ [[", raw_text) - raw_text = re.sub(r"\]\]\]", "]] ]", raw_text) - # handle unicode remnants - raw_text = re.sub(r"(\u2002|\xa0)", " ", raw_text) - - # TODO check whether adding whitespace before and after symbols may be too aggressive - raw_text = re.sub(r"([.,!?()])(\S)", r"\1 \2", raw_text) - raw_text = re.sub(r"(\S)([.,!?()])", r"\1 \2", raw_text) - - # add whitespace before start marker and after end marker - raw_text = re.sub(r"(\S)(\[\[)", r"\1 \2", raw_text) - raw_text = re.sub(r"(\S)(<<)", r"\1 \2", raw_text) - raw_text = re.sub(r"(\]\])(\S)", r"\1 \2", raw_text) - raw_text = re.sub(r"(>>)(\S)", r"\1 \2", raw_text) - - tokens = raw_text.split(" ") - - assert any(e in tokens for e in ["[[", "]]", "<<", ">>"]), ( - f"Missing head/tail markers in " f"{example}\n Tokens: {tokens}" - ) - - # Get head/tail order before determining head/tail indices and popping markers - head_start = tokens.index("[[") - tail_start = tokens.index("<<") - if head_start < tail_start: - tokens.pop(head_start) - head_end = tokens.index("]]") - tokens.pop(head_end) - tail_start = tokens.index("<<") - tokens.pop(tail_start) - tail_end = tokens.index(">>") - tokens.pop(tail_end) - else: - tokens.pop(tail_start) - tail_end = tokens.index(">>") - tokens.pop(tail_end) - head_start = tokens.index("[[") - tokens.pop(head_start) - head_end = tokens.index("]]") - tokens.pop(head_end) - - yield id_, { - "tokens": tokens, - "head_start": head_start, - "head_end": head_end, - "tail_start": tail_start, - "tail_end": tail_end, - "label": label, - } diff --git a/src/pytorch_ie/data/datasets/hf_datasets/fewrel.py b/src/pytorch_ie/data/datasets/hf_datasets/fewrel.py deleted file mode 100644 index 5eb73609..00000000 --- a/src/pytorch_ie/data/datasets/hf_datasets/fewrel.py +++ /dev/null @@ -1,285 +0,0 @@ -"""TODO: Add a description here.""" - - -import 
json - -import datasets - -# TODO: Add the licence for the dataset here if you can find it -_LICENSE = "" - -# TODO: Add link to the official dataset URLs here -# The HuggingFace dataset library don't host the datasets but only point to the original files -# This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method) - -_CITATION_FEWREL_1 = """\ -@inproceedings{han-etal-2018-fewrel, - title = "{F}ew{R}el: A Large-Scale Supervised Few-Shot Relation Classification Dataset with State-of-the-Art Evaluation", - author = "Han, Xu and Zhu, Hao and Yu, Pengfei and Wang, Ziyun and Yao, Yuan and Liu, Zhiyuan and Sun, Maosong", - booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing", - month = oct # "-" # nov, - year = "2018", - address = "Brussels, Belgium", - publisher = "Association for Computational Linguistics", - url = "https://www.aclweb.org/anthology/D18-1514", - doi = "10.18653/v1/D18-1514", - pages = "4803--4809" -}""" - -_CITATION_FEWREL_2 = """\ -@inproceedings{han-etal-2018-fewrel, - title = "{F}ew{R}el: A Large-Scale Supervised Few-Shot Relation Classification Dataset with State-of-the-Art Evaluation", - author = "Han, Xu and Zhu, Hao and Yu, Pengfei and Wang, Ziyun and Yao, Yuan and Liu, Zhiyuan and Sun, Maosong", - booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing", - month = oct # "-" # nov, - year = "2018", - address = "Brussels, Belgium", - publisher = "Association for Computational Linguistics", - url = "https://www.aclweb.org/anthology/D18-1514", - doi = "10.18653/v1/D18-1514", - pages = "4803--4809" -} - -@inproceedings{gao-etal-2019-fewrel, - title = "{F}ew{R}el 2.0: Towards More Challenging Few-Shot Relation Classification", - author = "Gao, Tianyu and Han, Xu and Zhu, Hao and Liu, Zhiyuan and Li, Peng and Sun, Maosong and Zhou, Jie", - booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)", - month = nov, - year = "2019", - address = "Hong Kong, China", - publisher = "Association for Computational Linguistics", - url = "https://www.aclweb.org/anthology/D19-1649", - doi = "10.18653/v1/D19-1649", - pages = "6251--6256" -} -""" - - -class FewRelConfig(datasets.BuilderConfig): - """BuilderConfig for FewRel.""" - - def __init__( - self, - data_url, - citation, - url, - class_labels, - description, - **kwargs, - ): - """BuilderConfig for FewRel. - Args: - data_url: `string`, url to download the zip file from - citation: `string`, citation for the data set - url: `string`, url for information about the data set - class_labels: `list[string]`, the list of classes if the label is - categorical. If not provided, then the label will be of type - `datasets.Value('float32')`. - **kwargs: keyword arguments forwarded to super. 
- """ - super().__init__(version=datasets.Version("1.0.0", ""), **kwargs) - self.class_labels = class_labels - self.data_url = data_url - self.citation = citation - self.url = url - self.description = description - - -# TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case -class FewRel(datasets.GeneratorBasedBuilder): - """TODO: Short description of my dataset.""" - - BUILDER_CONFIGS = [ - FewRelConfig( - name="fewrel_train", - data_url="https://github.com/thunlp/FewRel/raw/master/data/train_wiki.json", - citation=_CITATION_FEWREL_1, - url="https://thunlp.github.io/1/fewrel1.html", - class_labels=[ - "P931", - "P4552", - "P140", - "P1923", - "P150", - "P6", - "P27", - "P449", - "P1435", - "P175", - "P1344", - "P39", - "P527", - "P740", - "P706", - "P84", - "P495", - "P123", - "P57", - "P22", - "P178", - "P241", - "P403", - "P1411", - "P135", - "P991", - "P156", - "P176", - "P31", - "P1877", - "P102", - "P1408", - "P159", - "P3373", - "P1303", - "P17", - "P106", - "P551", - "P937", - "P355", - "P710", - "P137", - "P674", - "P466", - "P136", - "P306", - "P127", - "P400", - "P974", - "P1346", - "P460", - "P86", - "P118", - "P264", - "P750", - "P58", - "P3450", - "P105", - "P276", - "P101", - "P407", - "P1001", - "P800", - "P131", - ], - description="", - ), - FewRelConfig( - name="fewrel_validation", - data_url="https://github.com/thunlp/FewRel/raw/master/data/val_wiki.json", - citation=_CITATION_FEWREL_1, - url="https://thunlp.github.io/1/fewrel1.html", - class_labels=[ - "P177", - "P364", - "P2094", - "P361", - "P641", - "P59", - "P413", - "P206", - "P412", - "P155", - "P26", - "P410", - "P25", - "P463", - "P40", - "P921", - ], - description="", - ), - FewRelConfig( - name="fewrel2_validation", - data_url="https://github.com/thunlp/FewRel/raw/master/data/val_pubmed.json", - citation=_CITATION_FEWREL_2, - url="https://thunlp.github.io/2/fewrel2_da.html", - class_labels=[ - "biological_process_involves_gene_product", - "inheritance_type_of", - "is_normal_tissue_origin_of_disease", - "ingredient_of", - "is_primary_anatomic_site_of_disease", - "gene_found_in_organism", - "occurs_in", - "causative_agent_of", - "classified_as", - "gene_plays_role_in_process", - ], - description="", - ), - ] - - def _info(self): - features = datasets.Features( - { - "tokens": datasets.Sequence(datasets.Value("string")), - "head_start": datasets.Value("int32"), - "head_end": datasets.Value("int32"), - "tail_start": datasets.Value("int32"), - "tail_end": datasets.Value("int32"), - "label": datasets.ClassLabel(names=self.config.class_labels), - } - ) - - return datasets.DatasetInfo( - # This is the description that will appear on the datasets page. - description=self.config.description, - # This defines the different columns of the dataset and their types - features=features, # Here we define them above because they are different between the two configurations - # If there's a common (input, target) tuple from the features, - # specify them here. They'll be used if as_supervised=True in - # builder.as_dataset. 
- supervised_keys=None, - # Homepage of the dataset for documentation - homepage=self.config.url, - # License for the dataset if available - license=_LICENSE, - # Citation for the dataset - citation=self.config.citation, - ) - - def _split_generators(self, dl_manager): - """Returns SplitGenerators.""" - # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration - # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name - - # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs - # It can accept any type or nested list/dict and will give back the same structure with the url replaced with path to local files. - # By default the archives will be extracted and a path to a cached folder where they are extracted is returned instead of the archive - filepath = dl_manager.download_and_extract(self.config.data_url) - - split = ( - datasets.Split.VALIDATION if "validation" in self.config.name else datasets.Split.TRAIN - ) - - return [ - datasets.SplitGenerator( - name=split, - gen_kwargs={"filepath": filepath}, - ), - ] - - def _generate_examples(self, filepath): - """Yields examples.""" - with open(filepath, encoding="utf-8") as f: - data = json.load(f) - for label, examples in data.items(): - for idx, example in enumerate(examples): - id_ = label + "_" + str(idx) - - head_token_positions = example["h"][2][0] - tail_token_positions = example["t"][2][0] - - head_start = head_token_positions[0] - head_end = head_token_positions[-1] - tail_start = tail_token_positions[0] - tail_end = tail_token_positions[-1] - - yield id_, { - "tokens": example["tokens"], - "head_start": head_start, - "head_end": head_end + 1, # make end offset exclusive - "tail_start": tail_start, - "tail_end": tail_end + 1, # make end offset exclusive - "label": label, - } diff --git a/src/pytorch_ie/data/datasets/hf_datasets/genia.py b/src/pytorch_ie/data/datasets/hf_datasets/genia.py deleted file mode 100644 index 8fd88503..00000000 --- a/src/pytorch_ie/data/datasets/hf_datasets/genia.py +++ /dev/null @@ -1,406 +0,0 @@ -"""TODO: Add a description here.""" - - -import os - -import datasets -import spacy -from spacy.lang.en import English -from spacy.symbols import ORTH - -_CITATION_GENIA = """\ -@article{article, - author = {Kim, Jin-Dong and Ohta, Tomoko and Tateisi, Yuka and Tsujii, Jun'ichi}, - year = {2003}, - month = {02}, - pages = {i180-2}, - title = {GENIA corpus—A semantically annotated corpus for bio-textmining}, - volume = {19 Suppl 1}, - journal = {Bioinformatics (Oxford, England)}, - doi = {10.1093/bioinformatics/btg1023} -}""" - -# You can copy an official description -_DESCRIPTION = """ -The GENIA corpus is the primary collection of biomedical literature compiled and annotated within the scope -of the GENIA project. The corpus was created to support the development and evaluation of information -extraction and text mining systems for the domain of molecular biology. -""" - -_HOMEPAGE = "http://www.geniaproject.org/genia-corpus/relation-corpus" - -# TODO: Add the license for the dataset here if you can find it -_LICENSE = """\ -GENIA Project License for Annotated Corpora - -1. Copyright of abstracts - -Any abstracts contained in this corpus are from PubMed(R), a database -of the U.S. National Library of Medicine (NLM). - -NLM data are produced by a U.S. 
Government agency and include works of -the United States Government that are not protected by U.S. copyright -law but may be protected by non-US copyright law, as well as abstracts -originating from publications that may be protected by U.S. copyright -law. - -NLM assumes no responsibility or liability associated with use of -copyrighted material, including transmitting, reproducing, -redistributing, or making commercial use of the data. NLM does not -provide legal advice regarding copyright, fair use, or other aspects -of intellectual property rights. Persons contemplating any type of -transmission or reproduction of copyrighted material such as abstracts -are advised to consult legal counsel. - -2. Copyright of full texts - -Any full texts contained in this corpus are from the PMC Open Access -Subset of PubMed Central (PMC), the U.S. National Institutes of Health -(NIH) free digital archive of biomedical and life sciences journal -literature. - -Articles in the PMC Open Access Subset are protected by copyright, but -are made available under a Creative Commons or similar license that -generally allows more liberal redistribution and reuse than a -traditional copyrighted work. Please refer to the license of each -article for specific license terms. - -3. Copyright of annotations - -The copyrights of annotations created in the GENIA Project of Tsujii -Laboratory, University of Tokyo, belong in their entirety to the GENIA -Project. - -4. Licence terms - -Use and distribution of abstracts drawn from PubMed is subject to the -PubMed(R) license terms as stated in Clause 1. - -Use and distribution of full texts is subject to the license terms -applying to each publication. - -Annotations created by the GENIA Project are licensed under the -Creative Commons Attribution 3.0 Unported License. To view a copy of -this license, visit http://creativecommons.org/licenses/by/3.0/ or -send a letter to Creative Commons, 444 Castro Street, Suite 900, -Mountain View, California, 94041, USA. - -Annotations created by the GENIA Project must be attributed as -detailed in Clause 5. - -5. Attribution - -The GENIA Project was founded and led by prof. Jun'ichi Tsujii and -the project and its annotation efforts have been coordinated in part -by Nigel Collier, Yuka Tateisi, Sang-Zoo Lee, Tomoko Ohta, Jin-Dong -Kim, and Sampo Pyysalo. - -For a complete list of the GENIA Project members and contributors, -please refer to http://www.geniaproject.org. - -The GENIA Project has been supported by Grant-in-Aid for Scientific -Research on Priority Area "Genome Information Science" (MEXT, Japan), -Grant-in-Aid for Scientific Research on Priority Area "Systems -Genomics" (MEXT, Japan), Core Research for Evolutional Science & -Technology (CREST) "Information Mobility Project" (JST, Japan), -Solution Oriented Research for Science and Technology (SORST) (JST, -Japan), Genome Network Project (MEXT, Japan) and Grant-in-Aid for -Specially Promoted Research (MEXT, Japan). - -Annotations covered by this license must be attributed as follows: - - Corpus annotations (c) GENIA Project - -Distributions including annotations covered by this licence must -include this license text and Attribution section. - -6. 
References - -- GENIA Project : http://www.geniaproject.org -- PubMed : http://www.pubmed.gov/ -- NLM (United States National Library of Medicine) : http://www.nlm.nih.gov/ -- MEXT (Ministry of Education, Culture, Sports, Science and Technology) : http://www.mext.go.jp/ -- JST (Japan Science and Technology Agency) : http://www.jst.go.jp -""" - -# TODO: Add link to the official dataset URLs here, currently test points to blind test file -# The HuggingFace dataset library don't host the datasets but only point to the original files -# This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method) -_DATA_URLs = { - "train": "http://www.nactem.ac.uk/GENIA/current/GENIA-corpus/Relation/GENIA_relation_annotation_training_data.tar.gz", - "dev": "http://www.nactem.ac.uk/GENIA/current/GENIA-corpus/Relation/GENIA_relation_annotation_development_data.tar.gz", - # "test": "http://www.nactem.ac.uk/GENIA/current/GENIA-corpus/Relation/GENIA_relation_annotation_test_data.tar.gz" -} -# TODO: Add class labels -_CLASS_LABELS = ["Subunit-Complex", "Protein-Component"] - - -class Genia(datasets.GeneratorBasedBuilder): - """TODO: Short description of my dataset.""" - - VERSION = datasets.Version("1.0.0") # type: ignore - - def _info(self): - return datasets.DatasetInfo( - # This is the description that will appear on the datasets page. - description=_DESCRIPTION, - # This defines the different columns of the dataset and their types - features=datasets.Features( - { - "tokens": datasets.Sequence(datasets.Value("string")), - "head_start": datasets.Value("int32"), - "head_end": datasets.Value("int32"), - "tail_start": datasets.Value("int32"), - "tail_end": datasets.Value("int32"), - "label": datasets.ClassLabel(names=_CLASS_LABELS), - } - ), - # If there's a common (input, target) tuple from the features, - # specify them here. They'll be used if as_supervised=True in - # builder.as_dataset. - supervised_keys=None, - # Homepage of the dataset for documentation - homepage=_HOMEPAGE, - # License for the dataset if available - license=_LICENSE, - # Citation for the dataset - citation=_CITATION_GENIA, - ) - - def _split_generators(self, dl_manager): - """Returns SplitGenerators.""" - # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration - # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name - - # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs - # It can accept any type or nested list/dict and will give back the same structure with the url replaced with path to local files. 
- # By default the archives will be extracted and a path to a cached folder where they are extracted is returned instead of the archive - data_files = dl_manager.download_and_extract(_DATA_URLs) - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={"filepath": data_files.get("train")}, - ), - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={"filepath": data_files.get("dev")}, - ), - # datasets.SplitGenerator( - # name=datasets.Split.TEST, - # gen_kwargs={"filepath": data_files.get("test")}, - # ), - ] - - def _generate_examples(self, filepath): - """Yields examples.""" - doc_ids, list_of_files = self._get_doc_ids_and_file_paths(filepath) - processed_docs = self._get_processed_docs(doc_ids, list_of_files) - - idx = 0 - for doc in processed_docs: - if "sentences" in doc and "sent_rels" in doc: - sent_start_index = 0 - for sent, rels in zip(doc["sentences"], doc["sent_rels"]): - for rel in rels: - label = rel["label"] - head_start = rel["head_start"] - sent_start_index - head_end = rel["head_end"] - sent_start_index - tail_start = rel["tail_start"] - sent_start_index - tail_end = rel["tail_end"] - sent_start_index - - id_ = str(idx) + "_" + doc["doc_id"] - idx += 1 - - yield id_, { - "tokens": sent["tokens"], - "head_start": head_start, - "head_end": head_end, - "tail_start": tail_start, - "tail_end": tail_end, - "label": label, - } - - sent_start_index += len(sent) - else: - for rel in doc["relations"]: - label = rel["label"] - head_start = rel["head_start"] - head_end = rel["head_end"] - tail_start = rel["tail_start"] - tail_end = rel["tail_end"] - - id_ = str(idx) + "_" + doc["doc_id"] - idx += 1 - - yield id_, { - "tokens": doc["tokens"], - "head_start": head_start, - "head_end": head_end, - "tail_start": tail_start, - "tail_end": tail_end, - "label": label, - } - - def _get_doc_ids_and_file_paths(self, path): - list_of_files = {} - for root, dirs, files in os.walk(path): - for file in files: - if file not in ["LICENSE", "README"]: - list_of_files[file] = os.path.join(root, file) - doc_ids = list({file_name.split(".")[0] for file_name in list_of_files.keys()}) - doc_ids.sort() - doc_ids.sort(key=len) - return doc_ids, list_of_files - - def _get_processed_docs(self, doc_ids, list_of_files): - ssplit = False - try: - nlp = spacy.load("en_core_web_sm") - special_case = [{ORTH: "ca."}] - nlp.tokenizer.add_special_case("ca.", special_case) - ssplit = True - except OSError as e: - print(e) - print( - "You have to download the model first to enable sentence splitting: " - "\tpython -m spacy download en_core_web_sm" - ) - print("Resorting to tokenization only") - nlp = English() - processed_docs = [] - for doc_id in doc_ids: - try: - txt_file = list_of_files[doc_id + ".txt"] - a1_file = list_of_files[doc_id + ".a1"] - rel_file = list_of_files[doc_id + ".rel"] - except KeyError: - print(f"Missing annotation file for doc {doc_id}") - continue - - relations = [] - entities = {} - with open(txt_file, encoding="utf-8") as txt: - text = txt.read() - doc = nlp(text) - with open(a1_file, encoding="utf-8") as a1: - for line in a1.readlines(): - if line.startswith("T"): - entity_id, entity = self._retrieve_entity(line, doc, doc_id) - entities[entity_id] = entity - with open(rel_file, encoding="utf-8") as rel: - for line in rel.readlines(): - if line.startswith("T"): - entity_id, entity = self._retrieve_entity(line, doc, doc_id) - entities[entity_id] = entity - elif line.startswith("R"): - relations.append(self._retrieve_relation(line, entities)) - tokens 
= [token.text for token in doc] - processed_doc = { - "doc_id": doc_id, - "text": text, - "tokens": tokens, - "entities": entities, - "relations": relations, - } - if ssplit: - sentences = self._convert_sentences(doc.sents) - sentences = self._fix_ssplit(doc_id, sentences) - sentence_tokens = [] - sentence_relations = [] - left_over_rels_indices = [True for _ in relations] - for sent in sentences: - sent_rels = [] - for idx, relation in enumerate(relations): - if ( - min(relation["head_start"], relation["tail_start"]) >= sent["start"] - and max(relation["head_end"], relation["tail_end"]) <= sent["end"] - ): - sent_rels.append(relation) - left_over_rels_indices[idx] = False - sentence_tokens.append(sent["tokens"]) - sentence_relations.append(sent_rels) - left_over_rels = [] - for indicator, relation in zip(left_over_rels_indices, relations): - if indicator: - left_over_rels.append(relation) - if left_over_rels: - print( - f"Examples in doc {doc_id} where spaCy ssplit were not compatible with relation annotation:" - ) - print([list(sent) for sent in doc.sents]) - print(sentences) - print(left_over_rels) - processed_doc["sentences"] = sentences - processed_doc["sent_rels"] = sentence_relations - processed_docs.append(processed_doc) - return processed_docs - - def _retrieve_entity(self, line, doc, doc_id=""): - cols = line.strip().split() - entity_id, _, start_char, end_char = cols[0:4] - start_char, end_char = int(start_char), int(end_char) - entity_type = " ".join(cols[4:]) - # default alignment mode is strict, but charOffset in annotation sometimes does not translate to token offsets - # well, e.g. charOffsets only cover "LMP1" in "LMP1+" - span = doc.char_span(start_char, end_char, alignment_mode="expand") - if span: - start, end = span.start, span.end - else: - snippet_start = max(0, start_char - 10) - snippet_end = min(len(doc.text), end_char + 10) - raise ValueError( - f"{doc_id} Could not retrieve span for character offsets: " - f"text[{start_char},{end_char}] = {doc.text[start_char:end_char]}\n" - f"{doc.text[snippet_start:snippet_end]}\n" - f"{list(doc)}" - ) - return (entity_id, {"start": start, "end": end, "entity_type": entity_type}) - - def _retrieve_relation(self, line, entities): - cols = line.strip().split() - relation_id, rel_type, arg1, arg2 = cols - arg1 = arg1.split(":")[-1] - head_start, head_end = entities[arg1]["start"], entities[arg1]["end"] - arg2 = arg2.split(":")[-1] - tail_start, tail_end = entities[arg2]["start"], entities[arg2]["end"] - return { - "rel_id": relation_id, - "head_start": head_start, - "head_end": head_end, - "tail_start": tail_start, - "tail_end": tail_end, - "label": rel_type, - } - - def _convert_sentences(self, sentences): - sentence_dicts = [] - for sent in sentences: - start, end = sent.start, sent.end - tokens = [token.text for token in sent] - sentence_dicts.append({"tokens": tokens, "start": start, "end": end}) - return sentence_dicts - - def _fix_ssplit(self, doc_id, sentences): - if doc_id == "PMID-8164652": - sentences[2]["tokens"] += sentences[3]["tokens"] - sentences[2]["end"] = sentences[3]["end"] - del sentences[3] - elif doc_id == "PMID-9442380": - sentences[4]["tokens"].append(sentences[5]["tokens"].pop(0)) - sentences[4]["end"] += 1 - sentences[5]["start"] += 1 - elif doc_id == "PMID-10201929": - sentences[4]["tokens"] += sentences[5]["tokens"] - sentences[4]["end"] = sentences[5]["end"] - del sentences[5] - elif doc_id == "PMID-10428853": - sentences[3]["tokens"] += sentences[4]["tokens"] - sentences[3]["end"] = 
sentences[4]["end"] - del sentences[4] - elif doc_id == "PMID-1675604": - sentences[2]["tokens"] += sentences[3]["tokens"] - sentences[2]["end"] = sentences[3]["end"] - del sentences[3] - sentences = [sent for sent in sentences if sent["tokens"]] - return sentences diff --git a/src/pytorch_ie/data/datasets/hf_datasets/ontonotes.py b/src/pytorch_ie/data/datasets/hf_datasets/ontonotes.py deleted file mode 100644 index e1307e09..00000000 --- a/src/pytorch_ie/data/datasets/hf_datasets/ontonotes.py +++ /dev/null @@ -1,161 +0,0 @@ -import os - -import datasets -from tqdm import tqdm - -_CITATION = """ -""" - -_DESCRIPTION = """ -OntoNotes 5.0 -""" - -_URL = ( - "https://cloud.dfki.de/owncloud/index.php/s/S8pB4xTBZ3zQEic/download/OntoNotes-5.0-NER-BIO.zip" -) - -_LICENCE = "LDC User Agreement for Non-Members" - -# the label ids for ner_tags -NER_TAGS_DICT = { - "O": 0, - "CARDINAL": 1, - "DATE": 2, - "EVENT": 3, - "FAC": 4, - "GPE": 5, - "LANGUAGE": 6, - "LAW": 7, - "LOC": 8, - "MONEY": 9, - "NORP": 10, - "ORDINAL": 11, - "ORG": 12, - "PERCENT": 13, - "PERSON": 14, - "PRODUCT": 15, - "QUANTITY": 16, - "TIME": 17, - "WORK_OF_ART": 18, -} - - -class OntoNotesConfig(datasets.BuilderConfig): - """BuilderConfig for OntoNotes""" - - def __init__(self, **kwargs): - """BuilderConfig for OntoNotes. - Args: - **kwargs: keyword arguments forwarded to super. - """ - super().__init__(**kwargs) - - -class OntoNotes(datasets.GeneratorBasedBuilder): - def _info(self): - return datasets.DatasetInfo( - description=_DESCRIPTION, - features=datasets.Features( - { - "id": datasets.Value("string"), - "tokens": datasets.features.Sequence(datasets.Value("string")), - "pos_tags": datasets.features.Sequence(datasets.Value("string")), - "parsing": datasets.features.Sequence(datasets.Value("string")), - "ner_tags": datasets.features.Sequence( - datasets.features.ClassLabel( - names=[ - "O", - "CARDINAL", - "DATE", - "EVENT", - "FAC", - "GPE", - "LANGUAGE", - "LAW", - "LOC", - "MONEY", - "NORP", - "ORDINAL", - "ORG", - "PERCENT", - "PERSON", - "PRODUCT", - "QUANTITY", - "TIME", - "WORK_OF_ART", - ] - ) - ), - } - ), - supervised_keys=None, - homepage="https://catalog.ldc.upenn.edu/LDC2013T19", - citation=_CITATION, - license=_LICENCE, - ) - - def _split_generators(self, dl_manager): - """Returns SplitGenerators.""" - urls_to_download = dl_manager.download_and_extract(_URL) - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={ - "filepath": os.path.join( - urls_to_download, - "onto.train.ner", - ) - }, - ), - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={"filepath": os.path.join(urls_to_download, "onto.development.ner")}, - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={"filepath": os.path.join(urls_to_download, "onto.test.ner")}, - ), - ] - - def _generate_examples(self, filepath=None): - num_lines = sum(1 for _ in open(filepath)) - id = 0 - - with open(filepath) as f: - tokens, pos_tags, dependencies, ner_tags = [], [], [], [] - for line in tqdm(f, total=num_lines): - line = line.strip().split() - - if line: - assert len(line) == 4 - token, pos_tag, dependency, ner_tag = line - if ner_tag != "O": - ner_tag = ner_tag.split("-")[1] - tokens.append(token) - pos_tags.append(pos_tag) - dependencies.append(dependency) - ner_tags.append(NER_TAGS_DICT[ner_tag]) - - elif tokens: - # organize a record to be written into json - record = { - "tokens": tokens, - "id": str(id), - "pos_tags": pos_tags, - "parsing": dependencies, - "ner_tags": ner_tags, - } - 
tokens, pos_tags, dependencies, ner_tags = [], [], [], [] - id += 1 - yield record["id"], record - - # take the last sentence - if tokens: - record = { - "tokens": tokens, - "id": str(id), - "pos_tags": pos_tags, - "parsing": dependencies, - "ner_tags": ner_tags, - } - yield record["id"], record diff --git a/src/pytorch_ie/data/datasets/hf_datasets/scierc.py b/src/pytorch_ie/data/datasets/hf_datasets/scierc.py deleted file mode 100644 index 4b3d9b7b..00000000 --- a/src/pytorch_ie/data/datasets/hf_datasets/scierc.py +++ /dev/null @@ -1,132 +0,0 @@ -"""TODO: Add a description here.""" - - -import json -import os - -import datasets - -_CITATION_SCIERC = """\ -@InProceedings{luan2018multitask, - author = {Luan, Yi and He, Luheng and Ostendorf, Mari and Hajishirzi, Hannaneh}, - title = {Multi-Task Identification of Entities, Relations, and Coreferencefor Scientific Knowledge Graph Construction}, - booktitle = {Proc.\\ Conf. Empirical Methods Natural Language Process. (EMNLP)}, - year = {2018}, -}""" - -# You can copy an official description -_DESCRIPTION = """\ -SCIERC includes annotations for scientific entities, their relations, and coreference clusters -for 500 scientific abstracts. These abstracts are taken from 12 AI conference/workshop proceedings -in four AI communities, from the Semantic Scholar Corpus. SCI-ERC extends previous datasets in scientific -articles SemEval 2017 Task 10 and SemEval 2018 Task 7 by extending entity types, relation types, relation coverage, -and adding cross-sentence relations using coreference links. -""" - -_HOMEPAGE = "http://nlp.cs.washington.edu/sciIE/" - -# TODO: Add the licence for the dataset here if you can find it -_LICENSE = "" - -# The HuggingFace dataset library don't host the datasets but only point to the original files -# This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method) -_DATA_URL = "http://nlp.cs.washington.edu/sciIE/data/sciERC_processed.tar.gz" - -_CLASS_LABELS = [ - "USED-FOR", - "FEATURE-OF", - "HYPONYM-OF", - "PART-OF", - "COMPARE", - "CONJUNCTION", - "EVALUATE-FOR", # label in the data is not documented in annotation guidelines -] - - -# TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case -class SCIERC(datasets.GeneratorBasedBuilder): - """TODO: Short description of my dataset.""" - - VERSION = datasets.Version("1.0.0") # type: ignore - - def _info(self): - return datasets.DatasetInfo( - # This is the description that will appear on the datasets page. - description=_DESCRIPTION, - # This defines the different columns of the dataset and their types - features=datasets.Features( - { - "tokens": datasets.Sequence(datasets.Value("string")), - "head_start": datasets.Value("int32"), - "head_end": datasets.Value("int32"), - "tail_start": datasets.Value("int32"), - "tail_end": datasets.Value("int32"), - "label": datasets.ClassLabel(names=_CLASS_LABELS), - } - ), - # If there's a common (input, target) tuple from the features, - # specify them here. They'll be used if as_supervised=True in - # builder.as_dataset. 
- supervised_keys=None, - # Homepage of the dataset for documentation - homepage=_HOMEPAGE, - # License for the dataset if available - license=_LICENSE, - # Citation for the dataset - citation=_CITATION_SCIERC, - ) - - def _split_generators(self, dl_manager): - """Returns SplitGenerators.""" - # This method is tasked with downloading/extracting the data and defining the splits depending on the configuration - # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name - - # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs - # It can accept any type or nested list/dict and will give back the same structure with the url replaced with path to local files. - # By default the archives will be extracted and a path to a cached folder where they are extracted is returned instead of the archive - dl_dir = dl_manager.download_and_extract(_DATA_URL) - data_dir = os.path.join(dl_dir, "processed_data/json") - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={"filepath": os.path.join(data_dir, "train.json")}, - ), - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={"filepath": os.path.join(data_dir, "dev.json")}, - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={"filepath": os.path.join(data_dir, "test.json")}, - ), - ] - - def _generate_examples(self, filepath): - """Yields examples.""" - with open(filepath, encoding="utf-8") as f: - idx = 0 - for line in f.readlines(): - example = json.loads(line) - sent_start_index = 0 - for sent, rels in zip(example["sentences"], example["relations"]): - for rel in rels: - head_start, head_end, tail_start, tail_end, label = rel - head_start -= sent_start_index - head_end -= sent_start_index - tail_start -= sent_start_index - tail_end -= sent_start_index - - id_ = str(idx) + "_" + example["doc_key"] - idx += 1 - - yield id_, { - "tokens": sent, - "head_start": head_start, - "head_end": head_end + 1, # make end offset exclusive - "tail_start": tail_start, - "tail_end": tail_end + 1, # make end offset exclusive - "label": label, - } - - sent_start_index += len(sent) diff --git a/src/pytorch_ie/data/datasets/hf_datasets/semeval_2010_task_8.py b/src/pytorch_ie/data/datasets/hf_datasets/semeval_2010_task_8.py deleted file mode 100644 index 65879d8f..00000000 --- a/src/pytorch_ie/data/datasets/hf_datasets/semeval_2010_task_8.py +++ /dev/null @@ -1,185 +0,0 @@ -"""The SemEval-2010 Task 8 on Multi-way classification of semantic relations between pairs of nominals""" - - -import os -import re - -import datasets - -_CITATION = """\ -@inproceedings{hendrickx-etal-2010-semeval, - title = "{S}em{E}val-2010 Task 8: Multi-Way Classification of Semantic Relations between Pairs of Nominals", - author = "Hendrickx, Iris and - Kim, Su Nam and - Kozareva, Zornitsa and - Nakov, Preslav and - {\'O} S{\'e}aghdha, Diarmuid and - Pad{\'o}, Sebastian and - Pennacchiotti, Marco and - Romano, Lorenza and - Szpakowicz, Stan", - booktitle = "Proceedings of the 5th International Workshop on Semantic Evaluation", - month = jul, - year = "2010", - address = "Uppsala, Sweden", - publisher = "Association for Computational Linguistics", - url = "https://www.aclweb.org/anthology/S10-1006", - pages = "33--38", -} -""" - -_DESCRIPTION = """\ -The SemEval-2010 Task 8 focuses on Multi-way classification of semantic relations between pairs of nominals. 
-The task was designed to compare different approaches to semantic relation classification -and to provide a standard testbed for future research. -""" - -_URL = "https://drive.google.com/uc?export=download&id=0B_jQiLugGTAkMDQ5ZjZiMTUtMzQ1Yy00YWNmLWJlZDYtOWY1ZDMwY2U4YjFk" - -_CLASS_LABELS = [ - "Cause-Effect(e1,e2)", - "Cause-Effect(e2,e1)", - "Component-Whole(e1,e2)", - "Component-Whole(e2,e1)", - "Content-Container(e1,e2)", - "Content-Container(e2,e1)", - "Entity-Destination(e1,e2)", - "Entity-Destination(e2,e1)", - "Entity-Origin(e1,e2)", - "Entity-Origin(e2,e1)", - "Instrument-Agency(e1,e2)", - "Instrument-Agency(e2,e1)", - "Member-Collection(e1,e2)", - "Member-Collection(e2,e1)", - "Message-Topic(e1,e2)", - "Message-Topic(e2,e1)", - "Product-Producer(e1,e2)", - "Product-Producer(e2,e1)", - "Other", -] - - -class SemEval2010Task8(datasets.GeneratorBasedBuilder): - """The SemEval-2010 Task 8 focuses on Multi-way classification of semantic relations between pairs of nominals. - The task was designed to compare different approaches to semantic relation classification - and to provide a standard testbed for future research.""" - - VERSION = datasets.Version("1.0.0") # type: ignore - - def _info(self): - return datasets.DatasetInfo( - # This is the description that will appear on the datasets page. - description=_DESCRIPTION, - # This defines the different columns of the dataset and their types - features=datasets.Features( - { - "tokens": datasets.Sequence(datasets.Value("string")), - "head_start": datasets.Value("int32"), - "head_end": datasets.Value("int32"), - "tail_start": datasets.Value("int32"), - "tail_end": datasets.Value("int32"), - "label": datasets.ClassLabel(names=_CLASS_LABELS), - } - ), - # If there's a common (input, target) tuple from the features, - # specify them here. They'll be used if as_supervised=True in - # builder.as_dataset. 
-            supervised_keys=None,
-            # Homepage of the dataset for documentation
-            homepage="https://semeval2.fbk.eu/semeval2.php?location=tasks&taskid=11",
-            citation=_CITATION,
-        )
-
-    def _split_generators(self, dl_manager):
-        """Returns SplitGenerators."""
-        # dl_manager is a datasets.download.DownloadManager that can be used to
-        # download and extract URLs
-        dl_dir = dl_manager.download_and_extract(_URL)
-        data_dir = os.path.join(dl_dir, "SemEval2010_task8_all_data")
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                # These kwargs will be passed to _generate_examples
-                gen_kwargs={
-                    "filepath": os.path.join(
-                        data_dir, "SemEval2010_task8_training/TRAIN_FILE.TXT"
-                    ),
-                },
-            ),
-            datasets.SplitGenerator(
-                name=datasets.Split.TEST,
-                gen_kwargs={
-                    "filepath": os.path.join(
-                        data_dir, "SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT"
-                    ),
-                },
-            ),
-        ]
-
-    def _generate_examples(self, filepath):
-        """Yields examples."""
-        with open(filepath, encoding="utf-8") as f:
-            raw_lines = []
-            for line in f:
-                line = line.strip()
-
-                if not line:
-                    idx, example = self._raw_lines_to_example(raw_lines)
-                    yield idx, example
-                    raw_lines = []
-                    continue
-
-                raw_lines.append(line)
-
-    def _raw_lines_to_example(self, raw_lines):
-        raw_id, raw_text = raw_lines[0].split("\t")
-        label = raw_lines[1]
-        id_ = int(raw_id)
-        raw_text = raw_text.strip('"')
-
-        # Some special cases (e.g., missing spaces before entity marker)
-        if id_ in [213, 4612, 6373, 8411, 9867]:
-            raw_text = raw_text.replace("<e2>", " <e2>")
-        if id_ in [2740, 4219, 4784]:
-            raw_text = raw_text.replace("<e1>", " <e1>")
-        if id_ == 9256:
-            raw_text = raw_text.replace("log- jam", "log-jam")
-
-        # necessary if text should be whitespace tokenizeable
-        if id_ in [2609, 7589]:
-            raw_text = raw_text.replace("1 1/2", "1-1/2")
-        if id_ == 10591:
-            raw_text = raw_text.replace("1 1/4", "1-1/4")
-        if id_ == 10665:
-            raw_text = raw_text.replace("6 1/2", "6-1/2")
-
-        raw_text = re.sub(r"([.,!?()])$", r" \1", raw_text)
-        raw_text = re.sub(r"(e[12]>)([',;:\"\(\)])", r"\1 \2", raw_text)
-        raw_text = re.sub(r"([',;:\"\(\)])(<e[12]>)", r"\1 \2", raw_text)
-        raw_text = raw_text.replace("<e1>", " <e1> ")
-        raw_text = raw_text.replace("</e1>", " </e1> ")
-        raw_text = raw_text.replace("<e2>", " <e2> ")
-        raw_text = raw_text.replace("</e2>", " </e2> ")
-
-        tokens = raw_text.split(" ")
-
-        head_start = tokens.index("<e1>")
-        tokens.pop(head_start)
-
-        head_end = tokens.index("</e1>")
-        tokens.pop(head_end)
-
-        tail_start = tokens.index("<e2>")
-        tokens.pop(tail_start)
-
-        tail_end = tokens.index("</e2>")
-        tokens.pop(tail_end)
-
-        return id_, {
-            "tokens": tokens,
-            "head_start": head_start,
-            "head_end": head_end,
-            "tail_start": tail_start,
-            "tail_end": tail_end,
-            "label": label,
-        }
diff --git a/src/pytorch_ie/data/datasets/hf_datasets/tacred.py b/src/pytorch_ie/data/datasets/hf_datasets/tacred.py
deleted file mode 100644
index 3525fb1b..00000000
--- a/src/pytorch_ie/data/datasets/hf_datasets/tacred.py
+++ /dev/null
@@ -1,257 +0,0 @@
-"""TODO: Add a description here."""
-
-
-import json
-import os
-
-import datasets
-
-_CITATION = """\
-@inproceedings{zhang-etal-2017-position,
-    title = "Position-aware Attention and Supervised Data Improve Slot Filling",
-    author = "Zhang, Yuhao  and
-      Zhong, Victor  and
-      Chen, Danqi  and
-      Angeli, Gabor  and
-      Manning, Christopher D.",
-    booktitle = "Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing",
-    month = sep,
-    year = "2017",
-    address = "Copenhagen, Denmark",
-    publisher = "Association for Computational Linguistics",
-    url = "https://www.aclweb.org/anthology/D17-1004",
-    doi = "10.18653/v1/D17-1004",
-    
pages = "35--45", -} - -@inproceedings{alt-etal-2020-tacred, - title = "{TACRED} Revisited: A Thorough Evaluation of the {TACRED} Relation Extraction Task", - author = "Alt, Christoph and - Gabryszak, Aleksandra and - Hennig, Leonhard", - booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics", - month = jul, - year = "2020", - address = "Online", - publisher = "Association for Computational Linguistics", - url = "https://www.aclweb.org/anthology/2020.acl-main.142", - doi = "10.18653/v1/2020.acl-main.142", - pages = "1558--1569", -} -""" - -# TODO: Add description of the dataset here -# You can copy an official description -_DESCRIPTION = """\ -This new dataset is designed to solve this great NLP task and is crafted with a lot of care. -""" - -# TODO: Add a link to an official homepage for the dataset here -_HOMEPAGE = "" - -# TODO: Add the licence for the dataset here if you can find it -_LICENSE = "" - -# TODO: Add link to the official dataset URLs here -# The HuggingFace dataset library don't host the datasets but only point to the original files -# This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method) -_PATCH_URLs = { - "dev": "https://raw.githubusercontent.com/DFKI-NLP/tacrev/master/patch/dev_patch.json", - "test": "https://raw.githubusercontent.com/DFKI-NLP/tacrev/master/patch/test_patch.json", -} - -_CLASS_LABELS = [ - "no_relation", - "org:alternate_names", - "org:city_of_headquarters", - "org:country_of_headquarters", - "org:dissolved", - "org:founded", - "org:founded_by", - "org:member_of", - "org:members", - "org:number_of_employees/members", - "org:parents", - "org:political/religious_affiliation", - "org:shareholders", - "org:stateorprovince_of_headquarters", - "org:subsidiaries", - "org:top_members/employees", - "org:website", - "per:age", - "per:alternate_names", - "per:cause_of_death", - "per:charges", - "per:children", - "per:cities_of_residence", - "per:city_of_birth", - "per:city_of_death", - "per:countries_of_residence", - "per:country_of_birth", - "per:country_of_death", - "per:date_of_birth", - "per:date_of_death", - "per:employee_of", - "per:origin", - "per:other_family", - "per:parents", - "per:religion", - "per:schools_attended", - "per:siblings", - "per:spouse", - "per:stateorprovince_of_birth", - "per:stateorprovince_of_death", - "per:stateorprovinces_of_residence", - "per:title", -] - - -def convert_ptb_token(token: str) -> str: - """Convert PTB tokens to normal tokens""" - return { - "-lrb-": "(", - "-rrb-": ")", - "-lsb-": "[", - "-rsb-": "]", - "-lcb-": "{", - "-rcb-": "}", - }.get(token.lower(), token) - - -# TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case -class TACRED(datasets.GeneratorBasedBuilder): - """TODO: Short description of my dataset.""" - - # This is an example of a dataset with multiple configurations. - # If you don't want/need to define several sub-sets in your dataset, - # just remove the BUILDER_CONFIG_CLASS and the BUILDER_CONFIGS attributes. 
- - # If you need to make complex sub-parts in the datasets with configurable options - # You can create your own builder configuration class to store attribute, inheriting from datasets.BuilderConfig - # BUILDER_CONFIG_CLASS = MyBuilderConfig - - # You will be able to load one or the other configurations in the following list with - # data = datasets.load_dataset('my_dataset', 'first_domain') - # data = datasets.load_dataset('my_dataset', 'second_domain') - BUILDER_CONFIGS = [ - datasets.BuilderConfig( - name="original", version=datasets.Version("1.0.0"), description="The original TACRED." - ), - datasets.BuilderConfig( - name="revised", - version=datasets.Version("1.0.0"), - description="The revised TACRED (corrected labels in dev and test split).", - ), - ] - - DEFAULT_CONFIG_NAME = "original" # type: ignore - - @property - def manual_download_instructions(self): - return ( - "To use TACRED you have to download it manually. " - "It is available via the LDC at https://catalog.ldc.upenn.edu/LDC2018T24" - "Please extract all files in one folder and load the dataset with: " - "`datasets.load_dataset('tacred', data_dir='path/to/folder/folder_name')`" - ) - - def _info(self): - features = datasets.Features( - { - "tokens": datasets.Sequence(datasets.Value("string")), - "head_start": datasets.Value("int32"), - "head_end": datasets.Value("int32"), - "tail_start": datasets.Value("int32"), - "tail_end": datasets.Value("int32"), - "label": datasets.ClassLabel(names=_CLASS_LABELS), - } - ) - - return datasets.DatasetInfo( - # This is the description that will appear on the datasets page. - description=_DESCRIPTION, - # This defines the different columns of the dataset and their types - features=features, # Here we define them above because they are different between the two configurations - # If there's a common (input, target) tuple from the features, - # specify them here. They'll be used if as_supervised=True in - # builder.as_dataset. - supervised_keys=None, - # Homepage of the dataset for documentation - homepage=_HOMEPAGE, - # License for the dataset if available - license=_LICENSE, - # Citation for the dataset - citation=_CITATION, - ) - - def _split_generators(self, dl_manager): - """Returns SplitGenerators.""" - # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration - # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name - - # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs - # It can accept any type or nested list/dict and will give back the same structure with the url replaced with path to local files. - # By default the archives will be extracted and a path to a cached folder where they are extracted is returned instead of the archive - patch_files = {} - if self.config.name == "revised": - patch_files = dl_manager.download_and_extract(_PATCH_URLs) - - data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir)) - - if not os.path.exists(data_dir): - raise FileNotFoundError( - "{} does not exist. Make sure you insert a manual dir via `datasets.load_dataset('tacred', data_dir=...)` that includes the unzipped files from the TACRED_LDC zip. 
Manual download instructions: {}".format( - data_dir, self.manual_download_instructions - ) - ) - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={ - "filepath": os.path.join(data_dir, "train.json"), - "patch_filepath": None, - }, - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={ - "filepath": os.path.join(data_dir, "test.json"), - "patch_filepath": patch_files.get("test"), - }, - ), - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={ - "filepath": os.path.join(data_dir, "dev.json"), - "patch_filepath": patch_files.get("dev"), - }, - ), - ] - - def _generate_examples(self, filepath, patch_filepath): - """Yields examples.""" - # TODO: This method will receive as arguments the `gen_kwargs` defined in the previous `_split_generators` method. - # It is in charge of opening the given file and yielding (key, example) tuples from the dataset - # The key is not important, it's more here for legacy reason (legacy from tfds) - patch_examples = {} - if patch_filepath is not None: - with open(patch_filepath, encoding="utf-8") as f: - patch_examples = {example["id"]: example for example in json.load(f)} - - with open(filepath, encoding="utf-8") as f: - data = json.load(f) - for example in data: - id_ = example["id"] - - if id_ in patch_examples: - example.update(patch_examples[id_]) - - yield id_, { - "tokens": [convert_ptb_token(token) for token in example["token"]], - "head_start": example["subj_start"], - "head_end": example["subj_end"] + 1, # make end offset exclusive - "tail_start": example["obj_start"], - "tail_end": example["obj_end"] + 1, # make end offset exclusive - "label": example["relation"], - } diff --git a/src/pytorch_ie/data/datasets/hf_datasets/webred.py b/src/pytorch_ie/data/datasets/hf_datasets/webred.py deleted file mode 100644 index 8ae785a3..00000000 --- a/src/pytorch_ie/data/datasets/hf_datasets/webred.py +++ /dev/null @@ -1,746 +0,0 @@ -"""TODO: Add a description here.""" - - -import os -import re - -import datasets -import tensorflow as tf -from spacy.lang.en import English - -_CITATION_WEBRED = """\ -@misc{ormandi2021webred, - title={WebRED: Effective Pretraining And Finetuning For Relation Extraction On The Web}, - author={Robert Ormandi and Mohammad Saleh and Erin Winter and Vinay Rao}, - year={2021}, - eprint={2102.09681}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2102.09681}, -} -""" - -# You can copy an official description -_DESCRIPTION = """\ -A dataset for extracting relationships from a variety of text found on the World Wide Web. Text -on the web has diverse surface forms including writing styles, complexity and grammar. This dataset collects -sentences from a variety of webpages and documents that represent a variety of those categories. In each sentence, -there will be a subject and object entities tagged with subject SUBJ{...} and object OBJ{...}, respectively. The two -entities are either related by a relation from a set of pre-defined ones or has no relation. - -More information about the dataset can be found in our paper: https://arxiv.org/abs/2102.09681 -""" - -_HOMEPAGE = "https://github.com/google-research-datasets/WebRED" - -_LICENSE = """\ -This data is licensed by Google LLC under a Creative Commons Attribution 4.0 International License ( -http://creativecommons.org/licenses/by/4.0/) Users will be allowed to modify and repost it, and we encourage them to -analyze and publish research based on the data. 
-""" - -# The HuggingFace dataset library don't host the datasets but only point to the original files -# This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method) -_DATA_URL = "https://github.com/google-research-datasets/WebRED" - -_CLASS_LABELS = [ - "based on", - "part of the series", - "drug used for treatment", - "architectural style", - "writable file format", - "work location", - "position held", - "followed by", - "flash point", - "indigenous to", - "Mohs' hardness", - "political alignment", - "located in protected area", - "translator", - "director", - "highest judicial authority", - "producer", - "compressive modulus of elasticity", - "series spin-off", - "quantity", - "lyrics by", - "cell component", - "medical condition treated", - "place of death", - "number of seats", - "record label", - "league level above", - "military branch", - "origin of the watercourse", - "diameter", - "conversion to SI unit", - "works in collection", - "presenter", - "chairperson", - "temperature", - "currency", - "frequency", - "standards body", - "manufacturer", - "location of final assembly", - "coat of arms", - "astronaut mission", - "length", - "publication date", - "place of publication", - "country of citizenship", - "minimal lethal dose", - "game mechanics", - "afflicts", - "used by", - "oxidation state", - "mother", - "affiliation", - "head of state", - "creator", - "defendant", - "head coach of sports team", - "country", - "developer", - "approved by", - "cover artist", - "lake inflows", - "separated from", - "operating area", - "water as percent of area", - "head coach", - "update method", - "floruit", - "party chief representative", - "commander of", - "gestation period", - "religious order", - "school district", - "depicted by", - "publisher", - "excavation director", - "airline alliance", - "librettist", - "executive producer", - "donated by", - "mushroom ecological type", - "iconographic symbol", - "speed limit", - "number of representatives in an organization/legislature", - "subsidiary", - "educated at", - "number of participants", - "founded by", - "country of origin", - "family", - "package management system", - "subject has role", - "sibling", - "interchange station", - "facet of", - "decays to", - "repeals", - "legislative body", - "occupant", - "atomic number", - "CPU", - "GUI toolkit or framework", - "has parts of the class", - "director of photography", - "shares border with", - "parent organization", - "population", - "upper flammable limit", - "performer", - "isospin z-component", - "number of injured", - "number of seasons", - "choreographer", - "replaces", - "doctoral advisor", - "official residence", - "top-level Internet domain", - "VAT-rate", - "point in time", - "distance from Earth", - "public holiday", - "languages spoken, written or signed", - "located on astronomical location", - "solved by", - "designed by", - "twinned administrative body", - "encoded by", - "located in time zone", - "canonization status", - "date of official opening", - "student", - "brand", - "refractive index", - "inflation rate", - "home venue", - "neutron number", - "chief operating officer", - "lowest point", - "signatory", - "consecrator", - "model item", - "time of earliest written record", - "area", - "terminus location", - "significant event", - "inspired by", - "backup or reserve team or crew", - "maximum number of players", - "talk show guest", - "number of deaths", - "exclave of", - "maximal incubation period in humans", - "league", - "film crew 
member", - "electric charge", - "symptoms", - "replaced by", - "nominated for", - "religion", - "wavelength", - "total produced", - "time of discovery or invention", - "invasive to", - "use", - "negative therapeutic predictor", - "item operated", - "participating team", - "political ideology", - "compulsory education (maximum age)", - "applies to jurisdiction", - "history of topic", - "author", - "mass", - "heart rate", - "killed by", - "characters", - "diocese", - "Erdős number", - "time period", - "has part", - "age of candidacy", - "semi-major axis", - "dual to", - "official language", - "production company", - "replaced synonym (for nom. nov.)", - "main regulatory text", - "participant of", - "head of government", - "age of majority", - "heritage designation", - "drafted by", - "family relationship degree", - "discontinued date", - "operator", - "term length of office", - "spin quantum number", - "vehicles per capita (1000)", - "enclave within", - "embodied energy", - "represents", - "partner", - "stepparent", - "taxon synonym", - "time of spacecraft launch", - "conversion to standard unit", - "nominal GDP", - "lower flammable limit", - "readable file format", - "minimal incubation period in humans", - "connecting line", - "located in the administrative territorial entity", - "place of burial", - "contains administrative territorial entity", - "statistical leader", - "sports discipline competed in", - "tensile modulus of elasticity", - "research site", - "connects with", - "has cause", - "date of birth", - "location", - "age of consent", - "mains voltage", - "industry", - "basionym", - "marriageable age", - "visitors per year", - "Poisson's ratio", - "suicide rate", - "carries scientific instrument", - "connecting service", - "place of detention", - "crew member", - "place served by transport hub", - "organisation directed from the office or person", - "memory capacity", - "primary destinations", - "relative permeability", - "parent club", - "organizer", - "space launch vehicle", - "encodes", - "architect", - "notable work", - "commissioned by", - "depicts", - "individual tax rate", - "website account on", - "central bank", - "software engine", - "numeric value", - "official religion", - "wingspan", - "occupation", - "member count", - "ceiling exposure limit", - "date of first performance", - "discoverer or inventor", - "described by source", - "executive body", - "parent taxon", - "pole position", - "sports league level", - "pKa", - "genetic association", - "mountain range", - "part of", - "legal form", - "regulates (molecular biology)", - "end time", - "month of the year", - "employer", - "from fictional universe", - "spouse", - "copyright holder", - "lake outflow", - "solubility", - "located in or next to body of water", - "IDLH", - "office held by head of the organisation", - "office held by head of government", - "territory claimed by", - "tracklist", - "takes place in fictional universe", - "mount", - "season of club or team", - "this taxon is source of", - "theme music", - "Alexa rank", - "film editor", - "derivative work", - "territory overlaps", - "perimeter", - "price", - "secretary general", - "frequency of event", - "mascot", - "maintained by", - "duration", - "screenwriter", - "life expectancy", - "minimum number of players", - "winner", - "native language", - "start time", - "highest point", - "legislated by", - "parity", - "melting point", - "location of formation", - "ultimate tensile strength", - "defined daily dose", - "chief executive officer", - "number of parts of 
this work of art", - "endemic to", - "subclass of", - "dissolved, abolished or demolished", - "service entry", - "follows", - "number of constituencies", - "structural engineer", - "writing system", - "capital of", - "taxonomic type", - "next higher rank", - "commemorates", - "continent", - "relative", - "residence time of water", - "number of speakers", - "conferred by", - "Gram staining", - "work period (start)", - "sport", - "has effect", - "tributary", - "place of birth", - "member of sports team", - "relative permittivity", - "instrument", - "interested in", - "academic degree", - "location of discovery", - "electronegativity", - "located on terrain feature", - "conflict", - "height", - "short-term exposure limit", - "start point", - "original language of film or TV show", - "publication interval", - "amended by", - "material used", - "located in present-day administrative territorial entity", - "drainage basin", - "lakes on river", - "league level below", - "licensed to broadcast to", - "residence", - "after a work by", - "present in work", - "basin country", - "product certification", - "mouth of the watercourse", - "for work", - "has quality", - "uses", - "time-weighted average exposure limit", - "license", - "significant person", - "archives at", - "natural product of taxon", - "anthem", - "adjacent station", - "real gross domestic product growth rate", - "carries", - "member of political party", - "professional or sports partner", - "ethnic group", - "member of", - "platform", - "destination point", - "sports season of league or competition", - "country for sport", - "account charge / subscription fee", - "patron saint", - "compulsory education (minimum age)", - "route of administration", - "antiparticle", - "sponsor", - "floors above ground", - "timezone offset", - "programming language", - "stock exchange", - "opposite of", - "mouthpiece", - "unemployment rate", - "watershed area", - "editor", - "collection", - "award received", - "designated as terrorist by", - "illustrator", - "student of", - "dedicated to", - "youth wing", - "total fertility rate", - "elevation above sea level", - "repealed by", - "practiced by", - "named after", - "movement", - "flattening", - "position played on team / speciality", - "median lethal dose", - "employees", - "physically interacts with", - "highway system", - "parent peak", - "participant", - "number of cases", - "editor-in-chief", - "instance of", - "sidekick of", - "width", - "cites", - "child", - "has edition", - "doctoral student", - "original network", - "board member", - "service retirement", - "anatomical location", - "biological variant of", - "Euler characteristic", - "diplomatic relation", - "number of children", - "narrative location", - "incidence", - "allegiance", - "airline hub", - "vapor pressure", - "constellation", - "voice actor", - "number of platform tracks", - "work period (end)", - "military rank", - "vertical depth", - "vessel class", - "parent astronomical body", - "director/manager", - "owner of", - "distribution", - "court", - "angular resolution", - "located on street", - "owned by", - "retirement age", - "said to be the same as", - "language used", - "applies to part", - "business division", - "contains settlement", - "main subject", - "operating system", - "authority", - "number of representations", - "ancestral home", - "radius", - "binding energy", - "general manager", - "measured by", - "next lower rank", - "cast member", - "thermal conductivity", - "health specialty", - "father", - "worshipped by", - 
"headquarters location", - "child astronomical body", - "distributor", - "noble title", - "studied by", - "officeholder", - "genre", - "vaccine for", - "inception", - "produced by", - "narrator", - "different from", - "volcano observatory", - "art director", - "objective of project or action", - "composer", - "hardness", - "edition or translation of", - "isospin quantum number", - "foundational text", - "broadcast by", - "office held by head of state", - "boiling point", - "minimum wavelength of sensitivity", - "speaker", - "studies", - "capital", - "terminus", - "pressure", - "number of episodes", - "decomposition point", - "filming location", - "product or material produced", - "gene inversion association with", - "found in taxon", - "field of work", - "language of work or name", - "ranking", - "crosses", - "culture", - "location of first performance", - "dialect of", - "date of death", - "influenced by", -] - - -class WebRedConfig(datasets.BuilderConfig): - """BuilderConfig for WebRed.""" - - def __init__( - self, - data_url, - citation, - url, - class_labels, - description, - **kwargs, - ): - """BuilderConfig for WebRed. - Args: - data_url: `string`, url to download the zip file from - citation: `string`, citation for the data set - url: `string`, url for information about the data set - class_labels: `list[string]`, the list of classes if the label is - categorical. If not provided, then the label will be of type - `datasets.Value('float32')`. - **kwargs: keyword arguments forwarded to super. - """ - super().__init__(version=datasets.Version("1.0.0", ""), **kwargs) - self.class_labels = class_labels - self.data_url = data_url - self.citation = citation - self.url = url - self.description = description - - -# TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case -class WebRed(datasets.GeneratorBasedBuilder): - """TODO: Short description of my dataset.""" - - BUILDER_CONFIGS = [ - WebRedConfig( - name="webred_5", - data_url="https://github.com/google-research-datasets/WebRED/raw/main/webred_5.tfrecord", - citation=_CITATION_WEBRED, - url=_HOMEPAGE, - class_labels=_CLASS_LABELS, - description=_DESCRIPTION - + "\nEach example in WebRED 5 was annotated by exactly 5 independent human annotators.", - ), - WebRedConfig( - name="webred_21", - data_url="https://github.com/google-research-datasets/WebRED/raw/main/webred_21.tfrecord", - citation=_CITATION_WEBRED, - url=_HOMEPAGE, - class_labels=_CLASS_LABELS, - description=_DESCRIPTION - + "\nIn WebRED 2+1, each example was annotated by 2 independent annotators. If they " - "disagreed, an additional annotator (+1) was assigned to the example who also " - "provided a disambiguating annotation.", - ), - ] - - def _info(self): - return datasets.DatasetInfo( - # This is the description that will appear on the datasets page. - description=self.config.description, - # This defines the different columns of the dataset and their types - features=datasets.Features( - { - "tokens": datasets.Sequence(datasets.Value("string")), - "head_start": datasets.Value("int32"), - "head_end": datasets.Value("int32"), - "tail_start": datasets.Value("int32"), - "tail_end": datasets.Value("int32"), - "label": datasets.ClassLabel(names=_CLASS_LABELS), - } - ), - # If there's a common (input, target) tuple from the features, - # specify them here. They'll be used if as_supervised=True in - # builder.as_dataset. 
- supervised_keys=None, - # Homepage of the dataset for documentation - homepage=_HOMEPAGE, - # License for the dataset if available - license=_LICENSE, - # Citation for the dataset - citation=_CITATION_WEBRED, - ) - - def _split_generators(self, dl_manager): - """Returns SplitGenerators.""" - # This method is tasked with downloading/extracting the data and defining the splits depending on the configuration - # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name - - # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs - # It can accept any type or nested list/dict and will give back the same structure with the url replaced with path to local files. - # By default the archives will be extracted and a path to a cached folder where they are extracted is returned instead of the archive - file_path = dl_manager.download_and_extract(self.config.data_url) - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={"filepath": file_path}, - ), - ] - - def _generate_examples(self, filepath): - """Yields examples.""" - dataset = tf.data.TFRecordDataset(filepath) - idx = 0 - nlp = English() - - def get_feature_value(feature, key): - return feature[key].bytes_list.value[0].decode("utf-8") - - for raw_sentence in dataset: - example = tf.train.Example() - example.ParseFromString(raw_sentence.numpy()) - - rel_id = get_feature_value(example.features.feature, "relation_id") - sentence = get_feature_value(example.features.feature, "sentence") - label = get_feature_value(example.features.feature, "relation_name") - - # 1. Find OBJ{} and SUBJ{} marker indices - subj = re.search("SUBJ{.+?}", sentence) - obj = re.search("OBJ{.+?}", sentence) - if not subj or not obj: - print(f"Did not find OBJ or SUBJ marker in sentence: {sentence}") - continue - else: - subj_start, subj_end = subj.span() - obj_start, obj_end = obj.span() - # 2. OPTIONAL: Replace with source and target strings (they contain special characters while the sentence - # contains standard writing?) - # source = get_feature_value(sentence.features.feature, "source_name") - # target = get_feature_value(sentence.features.feature, "target_name") - - # 3. Remove markers and adjust indices: divide sentence at marker indices, remove marker, merge - # what if subj or obj is at the start or end of the sentence? - cleaned_sentence = "" - if subj_start < obj_start: - cleaned_sentence += sentence[:subj_start] - cleaned_sentence += sentence[subj_start + 5 : subj_end - 1] - cleaned_sentence += sentence[subj_end:obj_start] - cleaned_sentence += sentence[obj_start + 4 : obj_end - 1] - cleaned_sentence += sentence[obj_end:] - subj_end -= 6 - obj_start -= 6 - obj_end -= 11 - else: - cleaned_sentence += sentence[:obj_start] - cleaned_sentence += sentence[obj_start + 4 : obj_end - 1] - cleaned_sentence += sentence[obj_end:subj_start] - cleaned_sentence += sentence[subj_start + 5 : subj_end - 1] - cleaned_sentence += sentence[subj_end:] - obj_end -= 5 - subj_start -= 5 - subj_end -= 11 - # 4. 
Tokenize and calculate token indices from char offsets - doc = nlp(cleaned_sentence) - tokens = [token.text for token in doc] - subj_span = doc.char_span(subj_start, subj_end, alignment_mode="expand") - head_start = subj_span.start - head_end = subj_span.end - obj_span = doc.char_span(obj_start, obj_end, alignment_mode="expand") - tail_start = obj_span.start - tail_end = obj_span.end - - id_ = str(idx) + "_" + rel_id - idx += 1 - - yield id_, { - "tokens": tokens, - "head_start": head_start, - "head_end": head_end, - "tail_start": tail_start, - "tail_end": tail_end, - "label": label, - } diff --git a/src/pytorch_ie/data/datasets/hf_datasets/wiki80.py b/src/pytorch_ie/data/datasets/hf_datasets/wiki80.py deleted file mode 100644 index 02aa2a7d..00000000 --- a/src/pytorch_ie/data/datasets/hf_datasets/wiki80.py +++ /dev/null @@ -1,199 +0,0 @@ -"""TODO: Add a description here.""" - - -import json - -import datasets - -_CITATION_WIKI80 = """\ -@inproceedings{han-etal-2019-opennre, - title = "{O}pen{NRE}: An Open and Extensible Toolkit for Neural Relation Extraction", - author = "Han, Xu and Gao, Tianyu and Yao, Yuan and Ye, Deming and Liu, Zhiyuan and Sun, Maosong", - booktitle = "Proceedings of EMNLP-IJCNLP: System Demonstrations", - year = "2019", - url = "https://www.aclweb.org/anthology/D19-3029", - doi = "10.18653/v1/D19-3029", - pages = "169--174" -}""" - -# TODO: Add description of the dataset here -# You can copy an official description -_DESCRIPTION = """\ -Wiki80 is derived from FewRel, a large -scale few-shot dataset. It contains 80 relations and -56,000 instances from Wikipedia and Wikidata.""" - -# TODO: Add a link to an official homepage for the dataset here -_HOMEPAGE = "" - -# TODO: Add the licence for the dataset here if you can find it -_LICENSE = "" - -# The HuggingFace dataset library don't host the datasets but only point to the original files -# This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method) -_DATA_URLs = { - "train": "https://thunlp.oss-cn-qingdao.aliyuncs.com/opennre/benchmark/wiki80/wiki80_train.txt", - "validation": "https://thunlp.oss-cn-qingdao.aliyuncs.com/opennre/benchmark/wiki80/wiki80_val.txt", -} - -_CLASS_LABELS = [ - "place served by transport hub", - "mountain range", - "religion", - "participating team", - "contains administrative territorial entity", - "head of government", - "country of citizenship", - "original network", - "heritage designation", - "performer", - "participant of", - "position held", - "has part", - "location of formation", - "located on terrain feature", - "architect", - "country of origin", - "publisher", - "director", - "father", - "developer", - "military branch", - "mouth of the watercourse", - "nominated for", - "movement", - "successful candidate", - "followed by", - "manufacturer", - "instance of", - "after a work by", - "member of political party", - "licensed to broadcast to", - "headquarters location", - "sibling", - "instrument", - "country", - "occupation", - "residence", - "work location", - "subsidiary", - "participant", - "operator", - "characters", - "occupant", - "genre", - "operating system", - "owned by", - "platform", - "tributary", - "winner", - "said to be the same as", - "composer", - "league", - "record label", - "distributor", - "screenwriter", - "sports season of league or competition", - "taxon rank", - "location", - "field of work", - "language of work or name", - "applies to jurisdiction", - "notable work", - "located in the administrative territorial entity", - 
"crosses", - "original language of film or TV show", - "competition class", - "part of", - "sport", - "constellation", - "position played on team / speciality", - "located in or next to body of water", - "voice type", - "follows", - "spouse", - "military rank", - "mother", - "member of", - "child", - "main subject", -] - - -class Wiki80(datasets.GeneratorBasedBuilder): - """TODO: Short description of my dataset.""" - - VERSION = datasets.Version("1.0.0") # type: ignore - - def _info(self): - return datasets.DatasetInfo( - # This is the description that will appear on the datasets page. - description=_DESCRIPTION, - # This defines the different columns of the dataset and their types - features=datasets.Features( - { - "tokens": datasets.Sequence(datasets.Value("string")), - "head_start": datasets.Value("int32"), - "head_end": datasets.Value("int32"), - "tail_start": datasets.Value("int32"), - "tail_end": datasets.Value("int32"), - "label": datasets.ClassLabel(names=_CLASS_LABELS), - } - ), - # If there's a common (input, target) tuple from the features, - # specify them here. They'll be used if as_supervised=True in - # builder.as_dataset. - supervised_keys=None, - # Homepage of the dataset for documentation - homepage=_HOMEPAGE, - # License for the dataset if available - license=_LICENSE, - # Citation for the dataset - citation=_CITATION_WIKI80, - ) - - def _split_generators(self, dl_manager): - """Returns SplitGenerators.""" - # This method is tasked with downloading/extracting the data and defining the splits depending on the configuration - # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name - - # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs - # It can accept any type or nested list/dict and will give back the same structure with the url replaced with path to local files. 
- # By default the archives will be extracted and a path to a cached folder where they are extracted is returned instead of the archive - data_files = dl_manager.download_and_extract(_DATA_URLs) - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={"filepath": data_files.get("train")}, - ), - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={"filepath": data_files.get("validation")}, - ), - ] - - def _generate_examples(self, filepath): - """Yields examples.""" - with open(filepath, encoding="utf-8") as f: - for idx, line in enumerate(f.readlines()): - example = json.loads(line) - label = example["relation"] - id_ = str(idx) - - head_token_positions = example["h"]["pos"] - tail_token_positions = example["t"]["pos"] - - head_start = head_token_positions[0] - head_end = head_token_positions[-1] - tail_start = tail_token_positions[0] - tail_end = tail_token_positions[-1] - - yield id_, { - "tokens": example["token"], - "head_start": head_start, - "head_end": head_end, - "tail_start": tail_start, - "tail_end": tail_end, - "label": label, - } diff --git a/src/pytorch_ie/data/datasets/hf_datasets/wikigold.py b/src/pytorch_ie/data/datasets/hf_datasets/wikigold.py deleted file mode 100644 index ffd3a729..00000000 --- a/src/pytorch_ie/data/datasets/hf_datasets/wikigold.py +++ /dev/null @@ -1,122 +0,0 @@ -import datasets -from tqdm import tqdm - -_CITATION = """ -@inproceedings{balasuriya-etal-2009-named, - title = "Named Entity Recognition in Wikipedia", - author = "Balasuriya, Dominic and - Ringland, Nicky and - Nothman, Joel and - Murphy, Tara and - Curran, James R.", - booktitle = "Proceedings of the 2009 Workshop on The People{'}s Web Meets {NLP}: - Collaboratively Constructed Semantic Resources (People{'}s Web)", - month = aug, - year = "2009", - address = "Suntec, Singapore", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/W09-3302", - pages = "10--18", -} -""" - -_LICENCE = "CC-BY 4.0" - -_DESCRIPTION = """ -WikiGold dataset. -""" - -_URL = ( - "https://github.com/juand-r/entity-recognition-datasets/raw/master/" - "data/wikigold/CONLL-format/data/wikigold.conll.txt" -) - -# the label ids -NER_TAGS_DICT = { - "O": 0, - "PER": 1, - "LOC": 2, - "ORG": 3, - "MISC": 4, -} - - -class WikiGoldConfig(datasets.BuilderConfig): - """BuilderConfig for WikiGold""" - - def __init__(self, **kwargs): - """BuilderConfig for WikiGold. - Args: - **kwargs: keyword arguments forwarded to super. 
- """ - super().__init__(**kwargs) - - -class WikiGold(datasets.GeneratorBasedBuilder): - def _info(self): - return datasets.DatasetInfo( - description=_DESCRIPTION, - features=datasets.Features( - { - "id": datasets.Value("string"), - "tokens": datasets.features.Sequence(datasets.Value("string")), - "ner_tags": datasets.features.Sequence( - datasets.features.ClassLabel(names=["O", "PER", "LOC", "ORG", "MISC"]) - ), - } - ), - supervised_keys=None, - citation=_CITATION, - license=_LICENCE, - ) - - def _split_generators(self, dl_manager): - """Returns SplitGenerators.""" - urls_to_download = dl_manager.download_and_extract(_URL) - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={"filepath": urls_to_download}, - ), - ] - - def _generate_examples(self, filepath=None): - num_lines = sum(1 for _ in open(filepath)) - id = 0 - - with open(filepath) as f: - tokens, ner_tags = [], [] - for line in tqdm(f, total=num_lines): - line = line.strip().split() - - if line: - assert len(line) == 2 - token, ner_tag = line - - if token == "-DOCSTART-": - continue - - tokens.append(token) - if ner_tag != "O": - ner_tag = ner_tag.split("-")[1] - ner_tags.append(NER_TAGS_DICT[ner_tag]) - - elif tokens: - # organize a record to be written into json - record = { - "tokens": tokens, - "id": str(id), - "ner_tags": ner_tags, - } - tokens, ner_tags = [], [] - id += 1 - yield record["id"], record - - # take the last sentence - if tokens: - record = { - "tokens": tokens, - "id": str(id), - "ner_tags": ner_tags, - } - yield record["id"], record