diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3c6c8ee5..6c25d6f3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -80,9 +80,8 @@ repos: - id: codespell args: - --skip=logs/**,data/**,tests/fixtures/** - # hist: required for plotext.hist() - # ba: denotes beginning of an encoding with label as 'a'. More details at src/pie_utils/sequence_tagging/ill_formed.py - - --ignore-words-list=hist,ba + # arbitral: this is a legal term and used in example data (cdcp dataset) + - --ignore-words-list=arbitral # python static type checking - repo: https://github.com/pre-commit/mirrors-mypy diff --git a/dataset_builders/pie/cdcp/README.md b/dataset_builders/pie/cdcp/README.md new file mode 100644 index 00000000..33edaa3a --- /dev/null +++ b/dataset_builders/pie/cdcp/README.md @@ -0,0 +1,29 @@ +# PIE Dataset Card for "CDCP" + +This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the +[CDCP Huggingface dataset loading script](https://huggingface.co/datasets/DFKI-SLT/cdcp). + +## Data Schema + +The document type for this dataset is `CDCPDocument` which defines the following data fields: + +- `text` (str) +- `id` (str, optional) +- `metadata` (dictionary, optional) + +and the following annotation layers: + +- `propositions` (annotation type: `LabeledSpan`, target: `text`) +- `relations` (annotation type: `BinaryRelation`, target: `propositions`) +- `urls` (annotation type: `Attribute`, target: `propositions`) + +See [here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/annotations.py) for the annotation type definitions. + +## Document Converters + +The dataset provides document converters for the following target document types: + +- `pytorch_ie.documents.TextDocumentWithLabeledSpansAndBinaryRelations` + +See [here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/documents.py) for the document type +definitions. 
def ld2dl(list_of_dicts, keys: Optional[List[str]] = None):
    """Convert a list of dicts (rows) into a dict of lists (columns).

    Args:
        list_of_dicts: Iterable of mappings. Every mapping must contain all
            requested keys.
        keys: Keys to extract; they also define the key order of the result.
            If ``None`` (the default), the keys of the first row are used, and
            an empty input yields an empty dict.

    Returns:
        A dict mapping each key to the list of values stored under that key,
        in input order.

    Raises:
        KeyError: If a row lacks one of the requested keys.
    """
    # Materialize once: the rows are traversed once per key, so a generator
    # or other one-shot iterable would be exhausted after the first key.
    rows = list(list_of_dicts)
    if keys is None:
        # Previously the default ``keys=None`` was iterated directly, which
        # raised ``TypeError: 'NoneType' object is not iterable``.
        keys = list(rows[0]) if rows else []
    return {key: [row[key] for row in rows] for key in keys}
def document_to_example(
    document: CDCPDocument,
    relation_label: datasets.ClassLabel,
    proposition_label: datasets.ClassLabel,
) -> Dict[str, Any]:
    """Serialize a ``CDCPDocument`` back into a column-wise example dict.

    Args:
        document: The document to serialize.
        relation_label: Class label used to map relation label strings to ints.
        proposition_label: Class label used to map proposition label strings to ints.

    Returns:
        A dict with the keys ``id``, ``text``, ``propositions`` (columns
        ``start``, ``end``, ``label``, ``url``) and ``relations`` (columns
        ``head``, ``tail``, ``label``). ``head`` and ``tail`` are indices into
        the proposition layer.
    """
    example = {"id": document.id, "text": document.text}

    # One row per proposition, keyed by the annotation itself so that URL
    # attributes and relations can look their proposition up again.
    span_rows = {}
    span_positions = {}
    for position, span in enumerate(document.propositions):
        span_rows[span] = dict(
            start=span.start,
            end=span.end,
            label=proposition_label.str2int(span.label),
            url="",  # stays empty unless a URL attribute targets this span
        )
        span_positions[span] = position

    for attribute in document.urls:
        span_rows[attribute.annotation]["url"] = attribute.value

    proposition_columns = ["start", "end", "label", "url"]
    example["propositions"] = ld2dl(span_rows.values(), keys=proposition_columns)

    relation_rows = []
    for relation in document.relations:
        relation_rows.append(
            {
                "head": span_positions[relation.head],
                "tail": span_positions[relation.tail],
                "label": relation_label.str2int(relation.label),
            }
        )
    example["relations"] = ld2dl(relation_rows, keys=["head", "tail", "label"])

    return example
@dataclasses.dataclass
class TokenDocumentWithLabeledSpans(TokenBasedDocument):
    """Token-based document with one annotation layer of labeled spans."""

    # Layer of ``LabeledSpan`` annotations, declared with ``tokens`` as target.
    labeled_spans: AnnotationList[LabeledSpan] = annotation_field(target="tokens")
annotation_field(target="labeled_spans") diff --git a/tests/dataset_builders/pie/__init__.py b/tests/dataset_builders/pie/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/dataset_builders/pie/test_cdcp.py b/tests/dataset_builders/pie/test_cdcp.py new file mode 100644 index 00000000..e2cb01f1 --- /dev/null +++ b/tests/dataset_builders/pie/test_cdcp.py @@ -0,0 +1,441 @@ +import dataclasses +from typing import List + +import pytest +from datasets import disable_caching, load_dataset +from pytorch_ie.annotations import LabeledSpan +from pytorch_ie.core import AnnotationList, Document, annotation_field +from pytorch_ie.documents import ( + TextBasedDocument, + TextDocumentWithLabeledSpansAndBinaryRelations, +) +from transformers import AutoTokenizer, PreTrainedTokenizer + +from dataset_builders.pie.cdcp.cdcp import ( + CDCP, + CDCPDocument, + convert_to_text_document_with_labeled_spans_and_binary_relations, + document_to_example, + example_to_document, +) +from pie_datasets import DatasetDict +from pie_datasets.document.conversion import tokenize_document +from pie_datasets.document.types import TokenDocumentWithLabeledSpansAndBinaryRelations +from tests import FIXTURES_ROOT +from tests.dataset_builders.common import PIE_BASE_PATH, _deep_compare + +disable_caching() + +DATASET_NAME = "cdcp" +SPLIT_SIZES = {"train": 581, "test": 150} +HF_DATASET_PATH = CDCP.BASE_DATASET_PATH +PIE_DATASET_PATH = PIE_BASE_PATH / DATASET_NAME +DATA_PATH = FIXTURES_ROOT / "dataset_builders" / "cdcp_acl17.zip" + +HF_EXAMPLE_00195 = { + "id": "00195", + "text": "State and local court rules sometimes make default judgments much more likely. For example, " + "when a person who allegedly owes a debt is told to come to court on a work day, they may be " + "forced to choose between a default judgment and their job. 
def test_hf_example(hf_example, split):
    """Check the first example of each split against its hard-coded reference."""
    assert hf_example is not None
    expected_by_split = {
        "train": HF_EXAMPLE_00195,
        "test": HF_EXAMPLE_00194,
    }
    if split not in expected_by_split:
        raise ValueError(f"Unknown split: {split}")
    assert hf_example == expected_by_split[split]
def test_generated_document(generated_document, split):
    """Check type, text and annotation counts of the generated document per split."""
    assert isinstance(generated_document, CDCPDocument)

    # split -> (reference example, number of propositions, number of relations)
    expectations = {
        "train": (HF_EXAMPLE_00195, 3, 2),
        "test": (HF_EXAMPLE_00194, 3, 1),
    }
    if split not in expectations:
        raise ValueError(f"Unknown split: {split}")

    reference, num_propositions, num_relations = expectations[split]
    assert generated_document.text == reference["text"]
    assert len(generated_document.propositions) == num_propositions
    assert len(generated_document.relations) == num_relations
def _assert_no_span_overlap(document: Document, text_field: str, span_layer: str):
    """Assert that no two spans of an annotation layer cover the same position.

    Args:
        document: Object that is indexable by layer name and exposes the text
            as an attribute.
        text_field: Name of the attribute holding the text.
        span_layer: Name of the layer whose spans (with ``start`` and ``end``)
            are checked.

    Raises:
        AssertionError: If a span covers a position that an earlier span of
            the layer already covers.
    """
    layer = document[span_layer]
    # One flag per text position: True once some span has claimed it.
    covered = [False] * len(getattr(document, text_field))
    for annotation in layer:
        window = slice(annotation.start, annotation.end)
        claimed = covered[window]
        assert not any(claimed)
        covered[window] = [True] * len(claimed)
dataset_of_text_documents_with_labeled_spans_and_binary_relations[split][0] + assert isinstance(document, TextDocumentWithLabeledSpansAndBinaryRelations) + if split == "train": + assert document.id == "00195" + # check entities + assert len(document.labeled_spans) == 3 + entity_tuples = [(str(ent), ent.label) for ent in document.labeled_spans] + assert entity_tuples[0] == ( + "State and local court rules sometimes make default judgments much more likely.", + "value", + ) + assert entity_tuples[1] == ( + "For example, when a person who allegedly owes a debt is told to come to court on a work day, " + "they may be forced to choose between a default judgment and their job.", + "value", + ) + assert entity_tuples[2] == ( + "I urge the CFPB to find practices that involve scheduling hearings at inconvenient times unfair, " + "deceptive, and abusive, or inconsistent with 1692i.", + "policy", + ) + + # check relations + assert len(document.binary_relations) == 2 + relation_tuples = [ + (str(rel.head), rel.label, str(rel.tail)) for rel in document.binary_relations + ] + assert relation_tuples[0] == ( + "State and local court rules sometimes make default judgments much more likely.", + "reason", + "For example, when a person who allegedly owes a debt is told to come to court on a work day, " + "they may be forced to choose between a default judgment and their job.", + ) + assert relation_tuples[1] == ( + "I urge the CFPB to find practices that involve scheduling hearings at inconvenient times unfair, " + "deceptive, and abusive, or inconsistent with 1692i.", + "reason", + "State and local court rules sometimes make default judgments much more likely.", + ) + + elif split == "test": + assert document.id == "00194" + # check entities + assert len(document.labeled_spans) == 3 + entity_tuples = [(str(ent), ent.label) for ent in document.labeled_spans] + assert entity_tuples[0] == ( + "Recently, courts have held that debt collectors can escape 1692i's venue provisions entirely " 
+ "by pursuing debt collection through arbitration instead.", + "fact", + ) + assert entity_tuples[1] == ( + "As the NAF studies reflect, arbitration has not proven a satisfactory alternative.", + "value", + ) + assert entity_tuples[2] == ( + "I urge the CFPB to include in a rule language interpreting 1692i as requiring debt collectors to proceed " + "in court, not through largely-unregulated arbitral forums.", + "policy", + ) + + # check relations + assert len(document.binary_relations) == 1 + relation_tuples = [ + (str(rel.head), rel.label, str(rel.tail)) for rel in document.binary_relations + ] + assert relation_tuples[0] == ( + "I urge the CFPB to include in a rule language interpreting 1692i as requiring debt collectors to proceed " + "in court, not through largely-unregulated arbitral forums.", + "reason", + "As the NAF studies reflect, arbitration has not proven a satisfactory alternative.", + ) + else: + raise ValueError(f"Unknown Split {split}") + + +def test_convert_to_textdocument_with_entities_and_relations( + document, dataset_of_text_documents_with_labeled_spans_and_binary_relations, split +): + # just check that we get the same as in the converted dataset when explicitly calling the conversion method + converted_doc = convert_to_text_document_with_labeled_spans_and_binary_relations(document) + doc_from_converted_dataset = dataset_of_text_documents_with_labeled_spans_and_binary_relations[ + split + ][0] + assert converted_doc == doc_from_converted_dataset + + +@pytest.fixture(scope="module") +def tokenizer() -> PreTrainedTokenizer: + return AutoTokenizer.from_pretrained("bert-base-uncased") + + +@pytest.fixture(scope="module") +def tokenized_documents_with_labeled_spans_and_binary_relations( + dataset_of_text_documents_with_labeled_spans_and_binary_relations, tokenizer +) -> List[TokenDocumentWithLabeledSpansAndBinaryRelations]: + # get a document to check + doc = dataset_of_text_documents_with_labeled_spans_and_binary_relations["train"][0] + # Note, 
def test_tokenized_documents_with_labeled_spans_and_binary_relations(
    tokenized_documents_with_labeled_spans_and_binary_relations,
):
    """Check token count and the tokens covered by each labeled span."""
    tokenized = tokenized_documents_with_labeled_spans_and_binary_relations
    assert len(tokenized) == 1
    token_doc = tokenized[0]
    assert len(token_doc.tokens) == 84
    assert len(token_doc.labeled_spans) == 3

    # Expected tokens per labeled span, in layer order.
    expected_span_tokens = [
        (
            "state", "and", "local", "court", "rules", "sometimes", "make",
            "default", "judgments", "much", "more", "likely", ".",
        ),
        (
            "for", "example", ",", "when", "a", "person", "who", "allegedly",
            "owes", "a", "debt", "is", "told", "to", "come", "to", "court",
            "on", "a", "work", "day", ",", "they", "may", "be", "forced", "to",
            "choose", "between", "a", "default", "judgment", "and", "their",
            "job", ".",
        ),
        (
            "i", "urge", "the", "cf", "##p", "##b", "to", "find", "practices",
            "that", "involve", "scheduling", "hearings", "at", "inc", "##on",
            "##ven", "##ient", "times", "unfair", ",", "dec", "##eptive", ",",
            "and", "abusive", ",", "or", "inconsistent", "with", "1692", "##i",
            ".",
        ),
    ]
    for position, expected_tokens in enumerate(expected_span_tokens):
        span = token_doc.labeled_spans[position]
        assert span.target[span.start : span.end] == expected_tokens
this is a list of documents, because the document may be split into chunks + # if the input text is too long. + tokenized_docs = tokenize_document( + doc, + tokenizer=tokenizer, + return_overflowing_tokens=True, + result_document_type=TokenDocumentWithLabeledSpansAndBinaryRelations, + verbose=True, + ) + # we just ensure that we get at least one tokenized document + assert tokenized_docs is not None + assert len(tokenized_docs) > 0 diff --git a/tests/fixtures/dataset_builders/cdcp_acl17.zip b/tests/fixtures/dataset_builders/cdcp_acl17.zip new file mode 100644 index 00000000..d16997cc Binary files /dev/null and b/tests/fixtures/dataset_builders/cdcp_acl17.zip differ