-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #90 from ArneBinder/add-scientific_papers
Add `scientific_papers` dataset
- Loading branch information
Showing
4 changed files
with
278 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# PIE Dataset Card for "scientific_papers" | ||
|
||
This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the | ||
[scientific_papers Huggingface dataset loading script](https://huggingface.co/datasets/scientific_papers). | ||
|
||
## Data Schema | ||
|
||
The document type for this dataset is `ScientificPapersDocument` which defines the following data fields: | ||
|
||
- `text` (str) | ||
|
||
and the following annotation layers: | ||
|
||
- `abstract` (annotation type: `AbstractiveSummary`, target: `None`) | ||
- `section_names` (annotation type: `SectionName`, targets: `None`) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
pie-datasets>=0.8.0,<0.9.0 | ||
pie-modules>=0.8.2,<0.9.0 |
106 changes: 106 additions & 0 deletions
106
dataset_builders/pie/scientific_papers/scientific_papers.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
import dataclasses | ||
from typing import Any, Dict, List | ||
|
||
import datasets | ||
from pytorch_ie.core import ( | ||
Annotation, | ||
AnnotationLayer, | ||
AnnotationList, | ||
annotation_field, | ||
) | ||
from pytorch_ie.documents import TextBasedDocument | ||
|
||
from pie_datasets import GeneratorBasedBuilder | ||
|
||
|
||
@dataclasses.dataclass(eq=True, frozen=True) | ||
class AbstractiveSummary(Annotation): | ||
"""A question about a context.""" | ||
|
||
text: str | ||
|
||
def __str__(self) -> str: | ||
return self.text | ||
|
||
|
||
@dataclasses.dataclass(eq=True, frozen=True) | ||
class SectionName(Annotation): | ||
"""A question about a context.""" | ||
|
||
text: str | ||
|
||
def __str__(self) -> str: | ||
return self.text | ||
|
||
|
||
@dataclasses.dataclass | ||
class ScientificPapersDocument(TextBasedDocument): | ||
"""A PIE document for scientific papers dataset.""" | ||
|
||
abstract: AnnotationLayer[AbstractiveSummary] = annotation_field() | ||
section_names: AnnotationList[SectionName] = annotation_field() | ||
|
||
|
||
def example_to_document( | ||
example: Dict[str, Any], | ||
) -> ScientificPapersDocument: | ||
"""Convert a Huggingface Scientific Papers example to a PIE document.""" | ||
document = ScientificPapersDocument( | ||
text=example["article"], | ||
) | ||
document.abstract.append(AbstractiveSummary(text=example["abstract"])) | ||
document.section_names.extend( | ||
[SectionName(text=section_name) for section_name in example["section_names"].split("\n")] | ||
) | ||
|
||
return document | ||
|
||
|
||
def document_to_example(doc: ScientificPapersDocument) -> Dict[str, Any]: | ||
"""Convert a PIE document to a Huggingface Scientific Papers example.""" | ||
example = { | ||
"article": doc.text, | ||
"abstract": doc.abstract[0].text, | ||
"section_names": "\n".join([section_name.text for section_name in doc.section_names]), | ||
} | ||
return example | ||
|
||
|
||
class ScientificPapersConfig(datasets.BuilderConfig): | ||
"""BuilderConfig for Scientific Papers.""" | ||
|
||
def __init__(self, **kwargs): | ||
"""BuilderConfig for Scientific Papers. | ||
Args: | ||
**kwargs: keyword arguments forwarded to super. | ||
""" | ||
super().__init__(**kwargs) | ||
|
||
|
||
class ScientificPapers(GeneratorBasedBuilder): | ||
DOCUMENT_TYPE = ScientificPapersDocument | ||
|
||
BASE_DATASET_PATH = "scientific_papers" | ||
BASE_DATASET_REVISION = "14c5296f2d707630f5835c9da59dcaddeea19b20" | ||
|
||
BUILDER_CONFIGS = [ | ||
ScientificPapersConfig( | ||
name="arxiv", | ||
version=datasets.Version("1.1.1"), | ||
description="Scientific Papers dataset - ArXiv variant", | ||
), | ||
ScientificPapersConfig( | ||
name="pubmed", | ||
version=datasets.Version("1.1.1"), | ||
description="Scientific Papers dataset - PubMed variant", | ||
), | ||
] | ||
|
||
DEFAULT_CONFIG_NAME = "arxiv" | ||
|
||
def _generate_document(self, example, **kwargs): | ||
return example_to_document(example) | ||
|
||
def _generate_example(self, document, **kwargs): | ||
return document_to_example(document) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,155 @@ | ||
import pytest | ||
from datasets import disable_caching, load_dataset | ||
|
||
from dataset_builders.pie.scientific_papers.scientific_papers import ScientificPapers | ||
from pie_datasets import load_dataset as load_pie_dataset | ||
from tests.dataset_builders.common import PIE_BASE_PATH | ||
|
||
disable_caching() | ||
|
||
DATASET_NAME = "scientific_papers" | ||
BUILDER_CLASS = ScientificPapers | ||
STREAM_SIZE = 10 | ||
DOCUMENT_TYPE = BUILDER_CLASS.DOCUMENT_TYPE | ||
HF_DATASET_PATH = BUILDER_CLASS.BASE_DATASET_PATH | ||
PIE_DATASET_PATH = PIE_BASE_PATH / DATASET_NAME | ||
SPLIT = "train" | ||
|
||
|
||
@pytest.fixture(scope="module", params=[config.name for config in BUILDER_CLASS.BUILDER_CONFIGS]) | ||
def dataset_variant(request): | ||
return request.param | ||
|
||
|
||
@pytest.fixture(scope="module") | ||
def hf_dataset(dataset_variant): | ||
dataset = load_dataset( | ||
BUILDER_CLASS.BASE_DATASET_PATH, | ||
revision=BUILDER_CLASS.BASE_DATASET_REVISION, | ||
name=dataset_variant, | ||
split=SPLIT, | ||
streaming=True, | ||
) | ||
dataset_head = dataset.take(STREAM_SIZE) | ||
return list(dataset_head) | ||
|
||
|
||
@pytest.fixture(scope="module") | ||
def pie_dataset(dataset_variant): | ||
dataset = load_pie_dataset( | ||
str(PIE_DATASET_PATH), name=dataset_variant, split=SPLIT, streaming=True | ||
) | ||
dataset_head = dataset.take(STREAM_SIZE) | ||
return list(dataset_head) | ||
|
||
|
||
@pytest.fixture(scope="module") | ||
def hf_example(hf_dataset): | ||
return hf_dataset[0] | ||
|
||
|
||
@pytest.fixture(scope="module") | ||
def pie_example(pie_dataset): | ||
return pie_dataset[0] | ||
|
||
|
||
@pytest.fixture(scope="module") | ||
def expected_output(dataset_variant): | ||
results = { | ||
"arxiv": { | ||
"article": "additive models @xcite provide an important family of models for semiparametric regression or class", | ||
"abstract": " additive models play an important role in semiparametric statistics . \n this paper gives learning r", | ||
"section_names": "introduction\nmain results on learning rates\ncomparison of learning rates", | ||
}, | ||
"pubmed": { | ||
"article": "a recent systematic analysis showed that in 2011 , 314 ( 296 - 331 ) million children younger than 5", | ||
"abstract": " background : the present study was carried out to assess the effects of community nutrition interve", | ||
"section_names": "INTRODUCTION\nMATERIALS AND METHODS\nParticipants\nInstruments\nProcedure\nFirst step\nSecond step\nThird s", | ||
}, | ||
} | ||
return results[dataset_variant] | ||
|
||
|
||
@pytest.fixture(scope="module") | ||
def generate_document_kwargs(hf_dataset, dataset_variant): | ||
return BUILDER_CLASS(config_name=dataset_variant)._generate_document_kwargs(hf_dataset) or {} | ||
|
||
|
||
@pytest.fixture(scope="module") | ||
def generated_document(hf_example, generate_document_kwargs, dataset_variant): | ||
doc = BUILDER_CLASS(config_name=dataset_variant)._generate_document( | ||
hf_example, **generate_document_kwargs | ||
) | ||
return doc | ||
|
||
|
||
@pytest.fixture(scope="module") | ||
def generated_example(generated_document, dataset_variant): | ||
example = BUILDER_CLASS(config_name=dataset_variant)._generate_example(generated_document) | ||
return example | ||
|
||
|
||
def test_hf_dataset(hf_dataset): | ||
assert hf_dataset is not None | ||
|
||
|
||
def test_pie_dataset(pie_dataset): | ||
assert pie_dataset is not None | ||
|
||
|
||
def test_hf_example(hf_example, expected_output): | ||
assert hf_example is not None | ||
assert isinstance(hf_example, dict) | ||
assert hf_example["article"].startswith(expected_output["article"]) | ||
assert hf_example["abstract"].startswith(expected_output["abstract"]) | ||
assert hf_example["section_names"].startswith(expected_output["section_names"]) | ||
|
||
|
||
def test_pie_example(pie_example, expected_output): | ||
assert pie_example is not None | ||
assert pie_example.text.startswith(expected_output["article"]) | ||
# Note that we use the string representation of the abstract and section name annotations | ||
assert str(pie_example.abstract[0]).startswith(expected_output["abstract"]) | ||
str_section_names = "\n".join( | ||
[str(section_name) for section_name in pie_example.section_names] | ||
) | ||
assert str_section_names.startswith(expected_output["section_names"]) | ||
|
||
|
||
def test_generate_document_kwargs(hf_dataset, generate_document_kwargs): | ||
assert generate_document_kwargs is not None | ||
assert isinstance(generate_document_kwargs, dict) | ||
|
||
|
||
def test_generate_document(generated_document, expected_output): | ||
assert generated_document is not None | ||
assert isinstance(generated_document, DOCUMENT_TYPE) | ||
assert generated_document.text is not None | ||
assert generated_document.abstract is not None | ||
assert generated_document.section_names is not None | ||
|
||
assert generated_document.text.startswith(expected_output["article"]) | ||
assert generated_document.abstract[0].text.startswith(expected_output["abstract"]) | ||
str_section_names = "\n".join( | ||
[section_name.text for section_name in generated_document.section_names] | ||
) | ||
assert str_section_names.startswith(expected_output["section_names"]) | ||
|
||
|
||
def test_generate_example(generated_example, expected_output): | ||
assert generated_example is not None | ||
assert isinstance(generated_example, dict) | ||
|
||
assert generated_example["article"].startswith(expected_output["article"]) | ||
assert generated_example["abstract"].startswith(expected_output["abstract"]) | ||
assert generated_example["section_names"].startswith(expected_output["section_names"]) | ||
|
||
|
||
def test_compare_document_and_generated_document(generated_document, pie_example): | ||
assert generated_document.text == pie_example.text | ||
assert generated_document.abstract[0].text == pie_example.abstract[0].text | ||
generated_section_names = [ | ||
section_name.text for section_name in generated_document.section_names | ||
] | ||
pie_section_names = [section_name.text for section_name in pie_example.section_names] | ||
assert generated_section_names == pie_section_names |