Skip to content

Commit

Permalink
Merge pull request #90 from ArneBinder/add-scientific_papers
Browse files Browse the repository at this point in the history
Add `scientific_papers` dataset
  • Loading branch information
ArneBinder authored Dec 15, 2023
2 parents 856ca5d + 9e0fe9e commit 991cace
Show file tree
Hide file tree
Showing 4 changed files with 278 additions and 0 deletions.
15 changes: 15 additions & 0 deletions dataset_builders/pie/scientific_papers/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# PIE Dataset Card for "scientific_papers"

This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the
[scientific_papers Huggingface dataset loading script](https://huggingface.co/datasets/scientific_papers).

## Data Schema

The document type for this dataset is `ScientificPapersDocument` which defines the following data fields:

- `text` (str)

and the following annotation layers:

- `abstract` (annotation type: `AbstractiveSummary`, targets: `None`)
- `section_names` (annotation type: `SectionName`, targets: `None`)
2 changes: 2 additions & 0 deletions dataset_builders/pie/scientific_papers/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
pie-datasets>=0.8.0,<0.9.0
pie-modules>=0.8.2,<0.9.0
106 changes: 106 additions & 0 deletions dataset_builders/pie/scientific_papers/scientific_papers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
import dataclasses
from typing import Any, Dict, List

import datasets
from pytorch_ie.core import (
Annotation,
AnnotationLayer,
AnnotationList,
annotation_field,
)
from pytorch_ie.documents import TextBasedDocument

from pie_datasets import GeneratorBasedBuilder


@dataclasses.dataclass(eq=True, frozen=True)
class AbstractiveSummary(Annotation):
    """An abstractive summary of a document, stored as free text.

    The annotation carries its own string in ``text`` instead of pointing at
    a span of the document text.
    """

    # The summary string itself.
    text: str

    def __str__(self) -> str:
        """Return the summary text."""
        return self.text


@dataclasses.dataclass(eq=True, frozen=True)
class SectionName(Annotation):
    """The name (heading) of one section of a document, stored as free text.

    The annotation carries its own string in ``text`` instead of pointing at
    a span of the document text.
    """

    # The section name string itself.
    text: str

    def __str__(self) -> str:
        """Return the section name text."""
        return self.text


@dataclasses.dataclass
class ScientificPapersDocument(TextBasedDocument):
    """A PIE document for the scientific papers dataset.

    The article body is stored in the inherited ``text`` field; the abstract
    and the section names are kept in two separate annotation layers.
    """

    # Layer for the paper abstract (``example_to_document`` adds exactly one entry).
    abstract: AnnotationLayer[AbstractiveSummary] = annotation_field()
    # Layer for the section names, one annotation per name.
    # NOTE(review): declared as ``AnnotationLayer`` to match ``abstract``; this
    # assumes ``AnnotationList`` and ``AnnotationLayer`` are interchangeable in
    # pytorch-ie - confirm.
    section_names: AnnotationLayer[SectionName] = annotation_field()


def example_to_document(
    example: Dict[str, Any],
) -> ScientificPapersDocument:
    """Build a PIE document from one Huggingface Scientific Papers example.

    Args:
        example: mapping with the string entries ``"article"``, ``"abstract"``
            and ``"section_names"`` (section names separated by newlines).

    Returns:
        A ``ScientificPapersDocument`` whose text is the article, with one
        abstract annotation and one section name annotation per line of
        ``example["section_names"]``.
    """
    doc = ScientificPapersDocument(text=example["article"])

    summary = AbstractiveSummary(text=example["abstract"])
    doc.abstract.append(summary)

    # The source dataset stores all section names in one newline-separated string.
    names = example["section_names"].split("\n")
    doc.section_names.extend([SectionName(text=name) for name in names])

    return doc


def document_to_example(doc: ScientificPapersDocument) -> Dict[str, Any]:
    """Turn a PIE document back into a Huggingface Scientific Papers example.

    Args:
        doc: document whose ``abstract`` layer holds at least one annotation.

    Returns:
        A dict with the keys ``"article"``, ``"abstract"`` and
        ``"section_names"``; the section names are joined with newlines.

    Raises:
        IndexError: if the ``abstract`` layer is empty.
    """
    article = doc.text
    # Only the first abstract annotation is exported.
    abstract = doc.abstract[0].text
    joined_names = "\n".join(name.text for name in doc.section_names)
    return {
        "article": article,
        "abstract": abstract,
        "section_names": joined_names,
    }


class ScientificPapersConfig(datasets.BuilderConfig):
    """Builder configuration used for the Scientific Papers dataset variants."""

    def __init__(self, **kwargs):
        """Create the configuration.

        Args:
            **kwargs: keyword arguments passed on unchanged to
                ``datasets.BuilderConfig``.
        """
        super().__init__(**kwargs)


class ScientificPapers(GeneratorBasedBuilder):
    """PIE dataset builder that wraps the Huggingface ``scientific_papers`` dataset."""

    DOCUMENT_TYPE = ScientificPapersDocument

    # The wrapped Huggingface dataset and the revision it is pinned to.
    BASE_DATASET_PATH = "scientific_papers"
    BASE_DATASET_REVISION = "14c5296f2d707630f5835c9da59dcaddeea19b20"

    # One config per dataset variant; both use the same version.
    BUILDER_CONFIGS = [
        ScientificPapersConfig(
            name=variant,
            version=datasets.Version("1.1.1"),
            description=description,
        )
        for variant, description in (
            ("arxiv", "Scientific Papers dataset - ArXiv variant"),
            ("pubmed", "Scientific Papers dataset - PubMed variant"),
        )
    ]

    DEFAULT_CONFIG_NAME = "arxiv"

    def _generate_document(self, example, **kwargs):
        """Convert a Huggingface example to a document; extra kwargs are ignored."""
        return example_to_document(example)

    def _generate_example(self, document, **kwargs):
        """Convert a document back to a Huggingface example; extra kwargs are ignored."""
        return document_to_example(document)
155 changes: 155 additions & 0 deletions tests/dataset_builders/pie/test_scientific_papers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
import pytest
from datasets import disable_caching, load_dataset

from dataset_builders.pie.scientific_papers.scientific_papers import ScientificPapers
from pie_datasets import load_dataset as load_pie_dataset
from tests.dataset_builders.common import PIE_BASE_PATH

# Turn off the Huggingface datasets cache for everything in this test module.
disable_caching()

# Shared settings for the fixtures and tests below.
DATASET_NAME = "scientific_papers"
BUILDER_CLASS = ScientificPapers
# Number of leading examples taken from each streamed dataset.
STREAM_SIZE = 10
DOCUMENT_TYPE = BUILDER_CLASS.DOCUMENT_TYPE
HF_DATASET_PATH = BUILDER_CLASS.BASE_DATASET_PATH
PIE_DATASET_PATH = PIE_BASE_PATH / DATASET_NAME
SPLIT = "train"


@pytest.fixture(
    scope="module",
    params=[cfg.name for cfg in BUILDER_CLASS.BUILDER_CONFIGS],
)
def dataset_variant(request):
    """Parametrize over the config names declared by the builder class."""
    variant = request.param
    return variant


@pytest.fixture(scope="module")
def hf_dataset(dataset_variant):
    """Return the first ``STREAM_SIZE`` examples of the base Huggingface dataset.

    The dataset is loaded in streaming mode at the revision pinned by the
    builder class, so only the head of the split is materialized as a list.
    """
    dataset = load_dataset(
        # Use the module constant, mirroring how ``pie_dataset`` uses PIE_DATASET_PATH.
        HF_DATASET_PATH,
        revision=BUILDER_CLASS.BASE_DATASET_REVISION,
        name=dataset_variant,
        split=SPLIT,
        streaming=True,
    )
    return list(dataset.take(STREAM_SIZE))


@pytest.fixture(scope="module")
def pie_dataset(dataset_variant):
    """Return the first ``STREAM_SIZE`` entries of the streamed PIE dataset as a list."""
    streamed = load_pie_dataset(
        str(PIE_DATASET_PATH),
        name=dataset_variant,
        split=SPLIT,
        streaming=True,
    )
    return list(streamed.take(STREAM_SIZE))


@pytest.fixture(scope="module")
def hf_example(hf_dataset):
    """Return the first example of the Huggingface dataset head."""
    first_example = hf_dataset[0]
    return first_example


@pytest.fixture(scope="module")
def pie_example(pie_dataset):
    """Return the first entry of the PIE dataset head."""
    first_document = pie_dataset[0]
    return first_document


@pytest.fixture(scope="module")
def expected_output(dataset_variant):
    """Return the expected string prefixes of the first example for the variant.

    The tests compare with ``str.startswith``, so only the beginning of each
    field is listed here.
    """
    arxiv_prefixes = {
        "article": "additive models @xcite provide an important family of models for semiparametric regression or class",
        "abstract": " additive models play an important role in semiparametric statistics . \n this paper gives learning r",
        "section_names": "introduction\nmain results on learning rates\ncomparison of learning rates",
    }
    pubmed_prefixes = {
        "article": "a recent systematic analysis showed that in 2011 , 314 ( 296 - 331 ) million children younger than 5",
        "abstract": " background : the present study was carried out to assess the effects of community nutrition interve",
        "section_names": "INTRODUCTION\nMATERIALS AND METHODS\nParticipants\nInstruments\nProcedure\nFirst step\nSecond step\nThird s",
    }
    prefixes_by_variant = {"arxiv": arxiv_prefixes, "pubmed": pubmed_prefixes}
    return prefixes_by_variant[dataset_variant]


@pytest.fixture(scope="module")
def generate_document_kwargs(hf_dataset, dataset_variant):
    """Return the builder's document-generation kwargs, or ``{}`` if it yields a falsy value."""
    builder = BUILDER_CLASS(config_name=dataset_variant)
    kwargs = builder._generate_document_kwargs(hf_dataset)
    return kwargs or {}


@pytest.fixture(scope="module")
def generated_document(hf_example, generate_document_kwargs, dataset_variant):
    """Convert the first Huggingface example to a document via the builder."""
    builder = BUILDER_CLASS(config_name=dataset_variant)
    return builder._generate_document(hf_example, **generate_document_kwargs)


@pytest.fixture(scope="module")
def generated_example(generated_document, dataset_variant):
    """Convert the generated document back to a Huggingface-style example via the builder."""
    builder = BUILDER_CLASS(config_name=dataset_variant)
    return builder._generate_example(generated_document)


def test_hf_dataset(hf_dataset):
    """Check that the head of the Huggingface dataset was actually loaded."""
    assert hf_dataset is not None
    # The fixture always returns a list, so the ``None`` check alone can never
    # fail; also verify that the expected number of examples was streamed.
    assert len(hf_dataset) == STREAM_SIZE


def test_pie_dataset(pie_dataset):
    """Check that the head of the PIE dataset was actually loaded."""
    assert pie_dataset is not None
    # The fixture always returns a list, so the ``None`` check alone can never
    # fail; also verify that the expected number of documents was streamed.
    assert len(pie_dataset) == STREAM_SIZE


def test_hf_example(hf_example, expected_output):
    """Check that the first Huggingface example starts with the expected prefixes."""
    assert hf_example is not None
    assert isinstance(hf_example, dict)
    for field in ("article", "abstract", "section_names"):
        assert hf_example[field].startswith(expected_output[field])


def test_pie_example(pie_example, expected_output):
    """Check that the first PIE document starts with the expected prefixes."""
    assert pie_example is not None
    assert pie_example.text.startswith(expected_output["article"])

    # Note that we use the string representation of the abstract and section name annotations
    abstract_str = str(pie_example.abstract[0])
    assert abstract_str.startswith(expected_output["abstract"])

    section_name_strs = [str(name) for name in pie_example.section_names]
    assert "\n".join(section_name_strs).startswith(expected_output["section_names"])


def test_generate_document_kwargs(hf_dataset, generate_document_kwargs):
    """Check that the document-generation kwargs are a dict."""
    kwargs = generate_document_kwargs
    assert kwargs is not None
    assert isinstance(kwargs, dict)


def test_generate_document(generated_document, expected_output):
    """Check type and content of the document generated from the first example."""
    doc = generated_document
    assert doc is not None
    assert isinstance(doc, DOCUMENT_TYPE)
    assert doc.text is not None
    assert doc.abstract is not None
    assert doc.section_names is not None

    assert doc.text.startswith(expected_output["article"])
    assert doc.abstract[0].text.startswith(expected_output["abstract"])
    joined_section_names = "\n".join(name.text for name in doc.section_names)
    assert joined_section_names.startswith(expected_output["section_names"])


def test_generate_example(generated_example, expected_output):
    """Check that the example regenerated from the document has the expected prefixes."""
    assert generated_example is not None
    assert isinstance(generated_example, dict)

    for field in ("article", "abstract", "section_names"):
        assert generated_example[field].startswith(expected_output[field])


def test_compare_document_and_generated_document(generated_document, pie_example):
    """Check that the directly generated document matches the loaded PIE document."""

    def section_texts(document):
        # Collect the plain section name strings of a document, in order.
        return [name.text for name in document.section_names]

    assert generated_document.text == pie_example.text
    assert generated_document.abstract[0].text == pie_example.abstract[0].text
    assert section_texts(generated_document) == section_texts(pie_example)

0 comments on commit 991cace

Please sign in to comment.