From 6aea7ad173e5a6b1236441578f6620fa3faae9be Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Tue, 7 Nov 2023 22:25:24 +0100 Subject: [PATCH 01/10] add metrics from pytorch-ie --- src/pie_datasets/__init__.py | 2 + src/pie_datasets/common.py | 41 +++- src/pie_datasets/document/__init__.py | 0 src/pie_datasets/metric.py | 76 ++++++++ src/pie_datasets/metrics/__init__.py | 3 + src/pie_datasets/metrics/f1.py | 118 ++++++++++++ src/pie_datasets/metrics/statistics.py | 248 +++++++++++++++++++++++++ src/pie_datasets/statistic.py | 234 +++++++++++++++++++++++ tests/unit/metrics/__init__.py | 0 tests/unit/metrics/test_f1.py | 109 +++++++++++ tests/unit/metrics/test_statistic.py | 225 ++++++++++++++++++++++ tests/unit/test_metric.py | 116 ++++++++++++ 12 files changed, 1170 insertions(+), 2 deletions(-) create mode 100644 src/pie_datasets/document/__init__.py create mode 100644 src/pie_datasets/metric.py create mode 100644 src/pie_datasets/metrics/__init__.py create mode 100644 src/pie_datasets/metrics/f1.py create mode 100644 src/pie_datasets/metrics/statistics.py create mode 100644 src/pie_datasets/statistic.py create mode 100644 tests/unit/metrics/__init__.py create mode 100644 tests/unit/metrics/test_f1.py create mode 100644 tests/unit/metrics/test_statistic.py create mode 100644 tests/unit/test_metric.py diff --git a/src/pie_datasets/__init__.py b/src/pie_datasets/__init__.py index 6c3d2550..b4f2086f 100644 --- a/src/pie_datasets/__init__.py +++ b/src/pie_datasets/__init__.py @@ -8,6 +8,8 @@ from .dataset import Dataset, IterableDataset from .dataset_dict import DatasetDict from .document_formatter import DocumentFormatter +from .metric import DocumentMetric +from .statistic import DocumentStatistic __all__ = [ "GeneratorBasedBuilder", diff --git a/src/pie_datasets/common.py b/src/pie_datasets/common.py index e3213b2e..5eee2cf9 100644 --- a/src/pie_datasets/common.py +++ b/src/pie_datasets/common.py @@ -1,7 +1,13 @@ +import logging from abc import ABC, abstractmethod -from typing import Optional, Union +from typing import Optional, Type, Union -from .dataset import Dataset, IterableDataset +from pytorch_ie.core.document import Document + +from pie_datasets import DatasetDict +from pie_datasets.dataset import Dataset, IterableDataset + +logger = logging.getLogger(__name__) class EnterDatasetMixin(ABC): @@ -38,3 +44,34 @@ class ExitDatasetDictMixin(ABC): @abstractmethod def exit_dataset_dict(self, dataset_dict) -> None: """Exit dataset dict context.""" + + +class RequiresDocumentTypeMixin: + DOCUMENT_TYPE: Optional[Type[Document]] = None + + @property + def document_type(self) -> Optional[Type[Document]]: + return self.DOCUMENT_TYPE + + def convert_dataset(self, dataset: DatasetDict) -> DatasetDict: + name = type(self).__name__ + # auto-convert the dataset if a document type is specified + if self.document_type is not None: + if issubclass(dataset.document_type, self.document_type): + logger.info( + f"the dataset is already of the document type that is specified by {name}: " + f"{self.document_type}" + ) + else: + logger.info( + f"convert the dataset to the document type that is specified by {name}: " + f"{self.document_type}" + ) + dataset = dataset.to_document_type(self.document_type) + else: + logger.warning( + f"{name} does not specify a document type. The dataset can not be automatically converted " + f"to a document type." 
+ ) + + return dataset diff --git a/src/pie_datasets/document/__init__.py b/src/pie_datasets/document/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/pie_datasets/metric.py b/src/pie_datasets/metric.py new file mode 100644 index 00000000..31f44dc6 --- /dev/null +++ b/src/pie_datasets/metric.py @@ -0,0 +1,76 @@ +from abc import ABC, abstractmethod +from typing import Dict, Generic, Iterable, Optional, TypeVar, Union + +from pytorch_ie.core.document import Document + +from pie_datasets.common import RequiresDocumentTypeMixin + +T = TypeVar("T") + + +class DocumentMetric(ABC, RequiresDocumentTypeMixin, Generic[T]): + """This defines the interface for a document metric.""" + + def __init__(self): + self.reset() + self._current_split: Optional[str] = None + + @abstractmethod + def reset(self) -> None: + """Any reset logic that needs to be performed before the metric is called again.""" + + def __call__( + self, + document_or_collection: Union[Iterable[Document], Document, Dict[str, Iterable[Document]]], + ) -> Union[Dict[str, T], T]: + """This method is called to update the metric with a document or collection of documents. + + If a collection is passed, the metric is also computed and the result is returned. If the + collection is a dictionary, the metric is computed for each split and the result is + returned as a dictionary. + """ + if isinstance(document_or_collection, Document): + # do not reset here to allow for multiple calls + self._update(document_or_collection) + return self.compute(reset=False) + elif isinstance(document_or_collection, dict): + result: Dict[str, T] = {} + for split_name, split in document_or_collection.items(): + self._current_split = split_name + self.reset() + split_values: T = self(split) # type: ignore + result[split_name] = split_values + self._current_split = None + return result + elif isinstance(document_or_collection, Iterable): + for doc in document_or_collection: + if not isinstance(doc, Document): + raise TypeError( + f"document_or_collection contains an object that is not a document: {type(doc)}" + ) + self._update(doc) + # do not reset here to allow for multiple calls + return self.compute(reset=False) + else: + raise TypeError( + f"document_or_collection has unknown type: {type(document_or_collection)}" + ) + + def compute(self, reset: bool = True) -> T: + metric_values = self._compute() + if reset: + self.reset() + return metric_values + + @abstractmethod + def _update(self, document: Document) -> None: + """This method is called to update the metric with the new document.""" + + @abstractmethod + def _compute(self) -> T: + """This method is called to get the metric values.""" + + @property + def current_split(self) -> Optional[str]: + """The current split that is being processed.""" + return self._current_split diff --git a/src/pie_datasets/metrics/__init__.py b/src/pie_datasets/metrics/__init__.py new file mode 100644 index 00000000..ec2a853a --- /dev/null +++ b/src/pie_datasets/metrics/__init__.py @@ -0,0 +1,3 @@ +from .f1 import F1Metric + +__all__ = ["F1Metric"] diff --git a/src/pie_datasets/metrics/f1.py b/src/pie_datasets/metrics/f1.py new file mode 100644 index 00000000..7b2fbf78 --- /dev/null +++ b/src/pie_datasets/metrics/f1.py @@ -0,0 +1,118 @@ +import logging +from collections import defaultdict +from functools import partial +from typing import Callable, Collection, Dict, Optional, Tuple + +import pandas as pd +from pytorch_ie.core import Annotation, Document + +from pie_datasets.metric import DocumentMetric + 
+logger = logging.getLogger(__name__) + + +def has_one_of_the_labels(ann: Annotation, label_field: str, labels: Collection[str]) -> bool: + return getattr(ann, label_field) in labels + + +def has_this_label(ann: Annotation, label_field: str, label: str) -> bool: + return getattr(ann, label_field) == label + + +class F1Metric(DocumentMetric): + """Computes the (micro aggregated) F1 score for a given layer. If labels are provided, it also + computes the F1 score for each label separately and the macro F1 score. + + Args: + layer: The layer to compute the F1 score for. + labels: If provided, calculate F1 score for each label. + label_field: The field to use for the label. Defaults to "label". + show_as_markdown: If True, logs the F1 score as markdown on the console when calling compute(). + """ + + def __init__( + self, + layer: str, + labels: Optional[Collection[str]] = None, + label_field: str = "label", + show_as_markdown: bool = False, + ): + super().__init__() + self.layer = layer + self.label_field = label_field + self.show_as_markdown = show_as_markdown + + self.per_label = labels is not None + self.labels = labels or [] + if self.per_label: + if "MICRO" in self.labels or "MACRO" in self.labels: + raise ValueError( + "labels cannot contain 'MICRO' or 'MACRO' because they are used to capture aggregated metrics" + ) + if len(self.labels) == 0: + raise ValueError("labels cannot be empty") + + def reset(self): + self.counts = defaultdict(lambda: (0, 0, 0)) + + def calculate_counts( + self, + document: Document, + annotation_filter: Optional[Callable[[Annotation], bool]] = None, + ) -> Tuple[int, int, int]: + annotation_filter = annotation_filter or (lambda ann: True) + predicted_annotations = { + ann for ann in document[self.layer].predictions if annotation_filter(ann) + } + gold_annotations = {ann for ann in document[self.layer] if annotation_filter(ann)} + tp = len([ann for ann in predicted_annotations & gold_annotations]) + fn = len([ann for ann in gold_annotations - predicted_annotations]) + fp = len([ann for ann in predicted_annotations - gold_annotations]) + return tp, fp, fn + + def add_counts(self, counts: Tuple[int, int, int], label: str): + self.counts[label] = ( + self.counts[label][0] + counts[0], + self.counts[label][1] + counts[1], + self.counts[label][2] + counts[2], + ) + + def _update(self, document: Document): + new_counts = self.calculate_counts( + document=document, + annotation_filter=partial( + has_one_of_the_labels, label_field=self.label_field, labels=self.labels + ) + if self.per_label + else None, + ) + self.add_counts(new_counts, label="MICRO") + for label in self.labels: + new_counts = self.calculate_counts( + document=document, + annotation_filter=partial( + has_this_label, label_field=self.label_field, label=label + ), + ) + self.add_counts(new_counts, label=label) + + def _compute(self) -> Dict[str, Dict[str, float]]: + res = dict() + if self.per_label: + res["MACRO"] = {"f1": 0.0, "p": 0.0, "r": 0.0} + for label, counts in self.counts.items(): + tp, fp, fn = counts + if tp == 0: + p, r, f1 = 0.0, 0.0, 0.0 + else: + p = tp / (tp + fp) + r = tp / (tp + fn) + f1 = 2 * p * r / (p + r) + res[label] = {"f1": f1, "p": p, "r": r} + if label in self.labels: + res["MACRO"]["f1"] += f1 / len(self.labels) + res["MACRO"]["p"] += p / len(self.labels) + res["MACRO"]["r"] += r / len(self.labels) + if self.show_as_markdown: + logger.info(f"\n{self.layer}:\n{pd.DataFrame(res).round(3).T.to_markdown()}") + return res diff --git a/src/pie_datasets/metrics/statistics.py 
b/src/pie_datasets/metrics/statistics.py
new file mode 100644
index 00000000..549a4fef
--- /dev/null
+++ b/src/pie_datasets/metrics/statistics.py
@@ -0,0 +1,248 @@
+import logging
+from collections import defaultdict
+from typing import Any, Dict, List, Optional, Type, Union
+
+from pytorch_ie.annotations import Span
+from pytorch_ie.core import Document
+from pytorch_ie.documents import TextBasedDocument, TokenBasedDocument
+from pytorch_ie.utils.hydra import resolve_optional_document_type
+from transformers import AutoTokenizer, PreTrainedTokenizer
+
+from pie_datasets.document.conversion import tokenize_document
+from pie_datasets.statistic import DocumentStatistic
+
+logger = logging.getLogger(__name__)
+
+
+class TokenCountCollector(DocumentStatistic):
+    """Collects the token count of a field when tokenizing its content with a Huggingface
+    tokenizer.
+
+    The content of the field should be a string.
+    """
+
+    def __init__(
+        self,
+        tokenizer: Union[str, PreTrainedTokenizer],
+        text_field: str = "text",
+        tokenizer_kwargs: Optional[Dict[str, Any]] = None,
+        document_type: Optional[Type[Document]] = None,
+        **kwargs,
+    ):
+        if document_type is None and text_field == "text":
+            document_type = TextBasedDocument
+        super().__init__(document_type=document_type, **kwargs)
+        self.tokenizer = (
+            AutoTokenizer.from_pretrained(tokenizer) if isinstance(tokenizer, str) else tokenizer
+        )
+        self.tokenizer_kwargs = tokenizer_kwargs or {}
+        self.text_field = text_field
+
+    def _collect(self, doc: Document) -> int:
+        text = getattr(doc, self.text_field)
+        encodings = self.tokenizer(text, **self.tokenizer_kwargs)
+        tokens = encodings.tokens()
+        return len(tokens)
+
+
+class FieldLengthCollector(DocumentStatistic):
+    """Collects the length of a field, e.g. to collect the number of characters in the input
+    text.
+
+    The field should be a list of sized elements.
+    """
+
+    def __init__(self, field: str, **kwargs):
+        super().__init__(**kwargs)
+        self.field = field
+
+    def _collect(self, doc: Document) -> int:
+        field_obj = getattr(doc, self.field)
+        return len(field_obj)
+
+
+class SubFieldLengthCollector(DocumentStatistic):
+    """Collects the length of a subfield in a field, e.g. to collect the number of arguments of
+    N-ary relations."""
+
+    def __init__(self, field: str, subfield: str, **kwargs):
+        super().__init__(**kwargs)
+        self.field = field
+        self.subfield = subfield
+
+    def _collect(self, doc: Document) -> List[int]:
+        field_obj = getattr(doc, self.field)
+        lengths = []
+        for entry in field_obj:
+            subfield_obj = getattr(entry, self.subfield)
+            lengths.append(len(subfield_obj))
+        return lengths
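The collectors above only implement `_collect`; iteration and aggregation live in `DocumentStatistic`, which this patch adds below. A minimal usage sketch, not part of the patch — the `ExampleDocument` class and the literal result values are illustrative assumptions:

```python
from dataclasses import dataclass

from pytorch_ie.annotations import LabeledSpan
from pytorch_ie.core import AnnotationList, annotation_field
from pytorch_ie.documents import TextBasedDocument

from pie_datasets.metrics.statistics import FieldLengthCollector


@dataclass
class ExampleDocument(TextBasedDocument):
    entities: AnnotationList[LabeledSpan] = annotation_field(target="text")


docs = [ExampleDocument(text="Hello world."), ExampleDocument(text="Hi.")]

# collects len(doc.text) per document and aggregates with the default
# aggregation functions of DocumentStatistic: mean, std, min, max
statistic = FieldLengthCollector(field="text")
values = statistic(docs)
# -> {"mean": 7.5, "std": 4.5, "min": 3, "max": 12}
```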
+ """ + + DEFAULT_AGGREGATION_FUNCTIONS = ["len", "mean", "std", "min", "max"] + + def __init__( + self, + layer: str, + tokenize: bool = False, + tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, + tokenized_document_type: Optional[Union[str, Type[TokenBasedDocument]]] = None, + labels: Optional[Union[List[str], str]] = None, + label_attribute: str = "label", + tokenize_kwargs: Optional[Dict[str, Any]] = None, + **kwargs, + ): + super().__init__(**kwargs) + self.layer = layer + if isinstance(labels, str) and labels != "INFERRED": + raise ValueError("labels must be a list of strings or 'INFERRED'") + if labels == "INFERRED": + logger.warning( + f"Inferring labels with {self.__class__.__name__} from data produces wrong results " + f"for certain aggregation functions (e.g. 'mean', 'std', 'min') because zero values " + f"are not included in the calculation. We remove these aggregation functions from " + f"this collector, but be aware that the results may be wrong for your own aggregation " + f"functions that rely on zero values." + ) + self.aggregation_functions = { + name: func + for name, func in self.aggregation_functions.items() + if name not in ["mean", "std", "min"] + } + self.labels = labels + self.label_field = label_attribute + self.tokenize = tokenize + if self.tokenize: + if tokenizer is None: + raise ValueError( + "tokenizer must be provided to calculate the span length in means of tokens" + ) + if isinstance(tokenizer, str): + tokenizer = AutoTokenizer.from_pretrained(tokenizer) + self.tokenizer = tokenizer + resolved_tokenized_document_type = resolve_optional_document_type( + tokenized_document_type + ) + if resolved_tokenized_document_type is None: + raise ValueError( + "tokenized_document_type must be provided to calculate the span length in means of tokens" + ) + if not ( + isinstance(resolved_tokenized_document_type, type) + and issubclass(resolved_tokenized_document_type, TokenBasedDocument) + ): + raise TypeError( + f"tokenized_document_type must be a subclass of TokenBasedDocument, but it is: " + f"{resolved_tokenized_document_type}" + ) + self.tokenized_document_type = resolved_tokenized_document_type + self.tokenize_kwargs = tokenize_kwargs or {} + + def _collect(self, doc: Document) -> Union[List[int], Dict[str, List[int]]]: + docs: Union[List[Document], List[TokenBasedDocument]] + if self.tokenize: + if not isinstance(doc, TextBasedDocument): + raise ValueError( + "doc must be a TextBasedDocument to calculate the span length in means of tokens" + ) + if not isinstance(doc, TextBasedDocument): + raise ValueError( + "doc must be a TextBasedDocument to calculate the span length in means of tokens" + ) + docs = tokenize_document( + doc, + tokenizer=self.tokenizer, + result_document_type=self.tokenized_document_type, + **self.tokenize_kwargs, + ) + else: + docs = [doc] + + values: Dict[str, List[int]] + if isinstance(self.labels, str): + values = defaultdict(list) + else: + values = {label: [] for label in self.labels or ["ALL"]} + for doc in docs: + layer_obj = getattr(doc, self.layer) + for span in layer_obj: + if not isinstance(span, Span): + raise TypeError( + f"span length calculation is not yet supported for {type(span)}" + ) + length = span.end - span.start + if self.labels is None: + label = "ALL" + else: + label = getattr(span, self.label_field) + values[label].append(length) + + return values if self.labels is not None else values["ALL"] + + +class DummyCollector(DocumentStatistic): + """A dummy collector that always returns 1, e.g. 
+
+
+class DummyCollector(DocumentStatistic):
+    """A dummy collector that always returns 1.
+
+    Can be used to count the number of documents.
+    """
+
+    DEFAULT_AGGREGATION_FUNCTIONS = ["sum"]
+
+    def _collect(self, doc: Document) -> int:
+        return 1
+
+
+class LabelCountCollector(DocumentStatistic):
+    """Collects the number of field entries per label, e.g. to collect the number of entities per
+    type.
+
+    The field should be a list of elements with a label attribute.
+
+    Important: To make correct use of the result data, missing values need to be filled with 0, e.g.:
+    {("ORG",): [2, 3], ("LOC",): [2]} -> {("ORG",): [2, 3], ("LOC",): [2, 0]}
+    """
+
+    DEFAULT_AGGREGATION_FUNCTIONS = ["mean", "std", "min", "max", "len", "sum"]
+
+    def __init__(
+        self, field: str, labels: Union[List[str], str], label_attribute: str = "label", **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.field = field
+        self.label_attribute = label_attribute
+        if not (isinstance(labels, list) or labels == "INFERRED"):
+            raise ValueError("labels must be a list of strings or 'INFERRED'")
+        if labels == "INFERRED":
+            logger.warning(
+                f"Inferring labels with {self.__class__.__name__} from data produces wrong results "
+                f"for certain aggregation functions (e.g. 'mean', 'std', 'min') because zero values "
+                f"are not included in the calculation. We remove these aggregation functions from "
+                f"this collector, but be aware that the results may be wrong for your own aggregation "
+                f"functions that rely on zero values."
+            )
+            self.aggregation_functions = {
+                name: func
+                for name, func in self.aggregation_functions.items()
+                if name not in ["mean", "std", "min"]
+            }
+
+        self.labels = labels
+
+    def _collect(self, doc: Document) -> Dict[str, int]:
+        field_obj = getattr(doc, self.field)
+        counts: Dict[str, int]
+        if self.labels == "INFERRED":
+            counts = defaultdict(int)
+        else:
+            counts = {label: 0 for label in self.labels}
+        for elem in field_obj:
+            label = getattr(elem, self.label_attribute)
+            counts[label] += 1
+        return dict(counts)
diff --git a/src/pie_datasets/statistic.py b/src/pie_datasets/statistic.py
new file mode 100644
index 00000000..9feeb20d
--- /dev/null
+++ b/src/pie_datasets/statistic.py
@@ -0,0 +1,234 @@
+import logging
+from abc import abstractmethod
+from collections import defaultdict
+from typing import Any, Dict, Generator, List, Optional, Tuple, Type, Union
+
+from pytorch_ie.core.document import Document
+from pytorch_ie.utils.hydra import (
+    InstantiationException,
+    resolve_optional_document_type,
+    resolve_target,
+)
+
+from pie_datasets.metric import DocumentMetric
+
+logger = logging.getLogger(__name__)
+
+
+def _flatten_dict_gen(d, parent_key: Tuple[str, ...] = ()) -> Generator:
+    for k, v in d.items():
+        new_key = parent_key + (k,)
+        if isinstance(v, dict):
+            yield from dict(_flatten_dict_gen(v, new_key)).items()
+        else:
+            yield new_key, v
+
+
+def flatten_dict(d: Dict[str, Any]) -> Dict[Tuple[str, ...], Any]:
+    return dict(_flatten_dict_gen(d))
+
+
+def unflatten_dict(d: Dict[Tuple[str, ...], Any]) -> Union[Dict[str, Any], Any]:
+    """Unflattens a dictionary with nested keys.
+
+    Example:
+        >>> d = {("a", "b", "c"): 1, ("a", "b", "d"): 2, ("a", "e"): 3}
+        >>> unflatten_dict(d)
+        {'a': {'b': {'c': 1, 'd': 2}, 'e': 3}}
+    """
+    result: Dict[str, Any] = {}
+    for k, v in d.items():
+        if len(k) == 0:
+            if len(result) > 1:
+                raise ValueError("Cannot unflatten dictionary with multiple root keys.")
+            return v
+        current = result
+        for key in k[:-1]:
+            current = current.setdefault(key, {})
+        current[k[-1]] = v
+    return result
+
+
+def _min(values: List[float]) -> Optional[float]:
+    if len(values) == 0:
+        return None
+    return min(values)
+
+
+def _max(values: List[float]) -> Optional[float]:
+    if len(values) == 0:
+        return None
+    return max(values)
+
+
+def _mean(values: List[float]) -> Optional[float]:
+    if len(values) == 0:
+        return None
+    return sum(values) / len(values)
+
+
+def _median(values: List[float]) -> Optional[float]:
+    if len(values) == 0:
+        return None
+    return sorted(values)[len(values) // 2]
+
+
+def _std(values: List[float]) -> Optional[float]:
+    mean_value = _mean(values)
+    if mean_value is None:
+        return None
+    return (sum((x - mean_value) ** 2 for x in values) / len(values)) ** 0.5
+
+
+AGGREGATION_FUNCTIONS = {
+    "min": _min,
+    "max": _max,
+    "mean": _mean,
+    "median": _median,
+    "std": _std,
+}
+
+
+def resolve_agg_function(name: str):
+    if name in AGGREGATION_FUNCTIONS:
+        return AGGREGATION_FUNCTIONS[name]
+    else:
+        try:
+            return resolve_target(name)
+        except InstantiationException:
+            try:
+                return resolve_target(f"builtins.{name}")
+            except InstantiationException:
+                raise ImportError(f"Cannot resolve aggregation function: {name}")
+
+
+class DocumentStatistic(DocumentMetric):
+    """A special type of metric that collects statistics from a document.
+
+    Usage:
+
+    ```python
+    from transformers import AutoTokenizer, PreTrainedTokenizer
+    from pytorch_ie.core import Document
+
+    from pie_datasets import DatasetDict, DocumentStatistic
+
+    class TokenCountCollector(DocumentStatistic):
+
+        def __init__(
+            self,
+            tokenizer: Union[str, PreTrainedTokenizer],
+            text_field: str,
+            tokenizer_kwargs: Optional[Dict[str, Any]] = None,
+            **kwargs,
+        ):
+            super().__init__(**kwargs)
+            self.tokenizer = (
+                AutoTokenizer.from_pretrained(tokenizer) if isinstance(tokenizer, str) else tokenizer
+            )
+            self.tokenizer_kwargs = tokenizer_kwargs or {}
+            self.text_field = text_field
+
+        def _collect(self, doc: Document) -> int:
+            text = getattr(doc, self.text_field)
+            encodings = self.tokenizer(text, **self.tokenizer_kwargs)
+            tokens = encodings.tokens()
+            return len(tokens)
+
+    dataset = DatasetDict.load_dataset("pie/conll2003")
+    statistic = TokenCountCollector(
+        text_field="text",
+        tokenizer="bert-base-uncased",
+        tokenizer_kwargs=dict(add_special_tokens=False),
+    )
+    values = statistic(dataset)
+    assert values == {
+        'train': {'mean': 17.950502100989958, 'std': 13.016237876955675, 'min': 1, 'max': 162},
+        'validation': {'mean': 19.368307692307692, 'std': 14.583363922289669, 'min': 1, 'max': 144},
+        'test': {'mean': 16.774978279756734, 'std': 13.176981022988947, 'min': 1, 'max': 138}
+    }
+    ```
+    """
+
+    DEFAULT_AGGREGATION_FUNCTIONS = ["mean", "std", "min", "max"]
+
+    def __init__(
+        self,
+        show_histogram: bool = False,
+        show_as_markdown: bool = False,
+        aggregation_functions: Optional[List[str]] = None,
+        title: Optional[str] = None,
+        document_type: Optional[Union[Type[Document], str]] = None,
+    ) -> None:
+        super().__init__()
+        self.aggregation_functions = {
+            f_name: resolve_agg_function(f_name)
+            for f_name in aggregation_functions or self.DEFAULT_AGGREGATION_FUNCTIONS
+ } + self.show_histogram = show_histogram + self.show_as_markdown = show_as_markdown + self.title = title or self.__class__.__name__ + self._document_type = resolve_optional_document_type(document_type) + + @property + def document_type(self) -> Optional[Type[Document]]: + return self._document_type or super().document_type + + def reset(self) -> None: + self._values: List[Any] = [] + + @abstractmethod + def _collect(self, doc: Document) -> Any: + """Collect any values from a document.""" + + def _update(self, document: Document) -> None: + values = self._collect(document) + self._values.append(values) + + def _compute(self) -> Any: + """We just integrate the values by creating lists for each leaf of the (nested) + dictionary.""" + stats = defaultdict(list) + for collected_result in self._values: + if isinstance(collected_result, dict): + collected_result_flat = flatten_dict(collected_result) + for k, v in collected_result_flat.items(): + if isinstance(v, list): + stats[k].extend(v) + else: + stats[k].append(v) + else: + if isinstance(collected_result, list): + stats[()].extend(collected_result) + else: + stats[()].append(collected_result) + if self.current_split is not None: + title = f"{self.title} (split: {self.current_split}, {len(self._values)} documents)" + else: + title = f"{self.title} ({len(self._values)} documents)" + if self.show_histogram: + import plotext as plt + + for k, values in stats.items(): + if isinstance(values, list): + plt.hist(values, label=".".join(k) if len(k) > 0 else None) + plt.title(title) + plt.show() + plt.clear_figure() + + aggregated_stats = {} + for k, v in stats.items(): + for f_name, f in self.aggregation_functions.items(): + aggregated_stats[k + (f_name,)] = f(v) + + if self.show_as_markdown: + import pandas as pd + + series = pd.Series(aggregated_stats) + if len(series.index.levels) > 1: + df = series.unstack(-1) + logger.info(f"{title}\n{df.round(3).to_markdown()}") + else: + series.index = series.index.get_level_values(0) + logger.info(f"{title}\n{series.round(3).to_markdown()}") + + return unflatten_dict(aggregated_stats) diff --git a/tests/unit/metrics/__init__.py b/tests/unit/metrics/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/metrics/test_f1.py b/tests/unit/metrics/test_f1.py new file mode 100644 index 00000000..3dd3acb5 --- /dev/null +++ b/tests/unit/metrics/test_f1.py @@ -0,0 +1,109 @@ +from dataclasses import dataclass + +import pytest +from pytorch_ie.annotations import LabeledSpan +from pytorch_ie.core import AnnotationList, annotation_field +from pytorch_ie.documents import TextBasedDocument + +from pie_datasets.metrics import F1Metric + + +@pytest.fixture +def documents(): + @dataclass + class TextDocumentWithEntities(TextBasedDocument): + entities: AnnotationList[LabeledSpan] = annotation_field(target="text") + + # a test sentence with two entities + doc1 = TextDocumentWithEntities( + text="The quick brown fox jumps over the lazy dog.", + ) + doc1.entities.append(LabeledSpan(start=4, end=19, label="animal")) + doc1.entities.append(LabeledSpan(start=35, end=43, label="animal")) + assert str(doc1.entities[0]) == "quick brown fox" + assert str(doc1.entities[1]) == "lazy dog" + + # a second test sentence with a different text and a single entity (a company) + doc2 = TextDocumentWithEntities(text="Apple is a great company.") + doc2.entities.append(LabeledSpan(start=0, end=5, label="company")) + assert str(doc2.entities[0]) == "Apple" + + documents = [doc1, doc2] + + # add predictions + # correct + 
documents[0].entities.predictions.append(LabeledSpan(start=4, end=19, label="animal")) + # correct, but duplicate, this should not be counted + documents[0].entities.predictions.append(LabeledSpan(start=4, end=19, label="animal")) + # correct + documents[0].entities.predictions.append(LabeledSpan(start=35, end=43, label="animal")) + # wrong label + documents[0].entities.predictions.append(LabeledSpan(start=35, end=43, label="cat")) + # correct + documents[1].entities.predictions.append(LabeledSpan(start=0, end=5, label="company")) + # wrong span + documents[1].entities.predictions.append(LabeledSpan(start=10, end=15, label="company")) + + return documents + + +def test_f1(documents): + metric = F1Metric(layer="entities") + metric(documents) + # tp, fp, fn for micro + assert dict(metric.counts) == {"MICRO": (3, 2, 0)} + assert metric.compute() == {"MICRO": {"f1": 0.7499999999999999, "p": 0.6, "r": 1.0}} + + +def test_f1_per_label(documents): + metric = F1Metric(layer="entities", labels=["animal", "company", "cat"]) + metric(documents) + # tp, fp, fn for micro and per label + assert dict(metric.counts) == { + "MICRO": (3, 2, 0), + "cat": (0, 1, 0), + "company": (1, 1, 0), + "animal": (2, 0, 0), + } + assert metric.compute() == { + "MACRO": {"f1": 0.5555555555555556, "p": 0.5, "r": 0.6666666666666666}, + "MICRO": {"f1": 0.7499999999999999, "p": 0.6, "r": 1.0}, + "cat": {"f1": 0.0, "p": 0.0, "r": 0.0}, + "company": {"f1": 0.6666666666666666, "p": 0.5, "r": 1.0}, + "animal": {"f1": 1.0, "p": 1.0, "r": 1.0}, + } + + +def test_f1_per_label_no_labels(documents): + with pytest.raises(ValueError) as excinfo: + F1Metric(layer="entities", labels=[]) + assert str(excinfo.value) == "labels cannot be empty" + + +def test_f1_per_label_not_allowed(): + with pytest.raises(ValueError) as excinfo: + F1Metric(layer="entities", labels=["animal", "MICRO"]) + assert ( + str(excinfo.value) + == "labels cannot contain 'MICRO' or 'MACRO' because they are used to capture aggregated metrics" + ) + + +# def test_f1_show_as_markdown(documents, caplog): +# metric = F1Metric(layer="entities", labels=["animal", "company", "cat"], show_as_markdown=True) +# metric(documents) +# caplog.set_level(logging.INFO) +# caplog.clear() +# metric.compute() +# assert len(caplog.records) == 1 +# assert ( +# caplog.records[0].message == "\n" +# "entities:\n" +# "| | f1 | p | r |\n" +# "|:--------|------:|----:|------:|\n" +# "| MACRO | 0.556 | 0.5 | 0.667 |\n" +# "| MICRO | 0.75 | 0.6 | 1 |\n" +# "| animal | 1 | 1 | 1 |\n" +# "| company | 0.667 | 0.5 | 1 |\n" +# "| cat | 0 | 0 | 0 |" +# ) diff --git a/tests/unit/metrics/test_statistic.py b/tests/unit/metrics/test_statistic.py new file mode 100644 index 00000000..a67d83ce --- /dev/null +++ b/tests/unit/metrics/test_statistic.py @@ -0,0 +1,225 @@ +import dataclasses + +import pytest +from pytorch_ie.annotations import LabeledSpan +from pytorch_ie.core import AnnotationList, annotation_field +from pytorch_ie.documents import TextBasedDocument, TokenBasedDocument + +from pie_datasets import DatasetDict +from pie_datasets.metrics.statistics import ( + DummyCollector, + FieldLengthCollector, + LabelCountCollector, + SpanLengthCollector, + SubFieldLengthCollector, + TokenCountCollector, +) +from tests import FIXTURES_ROOT + + +@pytest.fixture +def dataset(): + @dataclasses.dataclass + class Conll2003Document(TextBasedDocument): + entities: AnnotationList[LabeledSpan] = annotation_field(target="text") + + return DatasetDict.from_json( + data_dir=FIXTURES_ROOT / "dataset_dict" / 
"conll2003_extract", + document_type=Conll2003Document, + ) + + +def test_statistics(dataset): + statistic = DummyCollector() + values = statistic(dataset) + assert values == {"train": {"sum": 3}, "test": {"sum": 3}, "validation": {"sum": 3}} + + statistic = LabelCountCollector(field="entities", labels=["LOC", "PER", "ORG", "MISC"]) + values = statistic(dataset) + assert values == { + "train": { + "LOC": { + "mean": 0.3333333333333333, + "std": 0.4714045207910317, + "min": 0, + "max": 1, + "len": 3, + "sum": 1, + }, + "PER": { + "mean": 0.3333333333333333, + "std": 0.4714045207910317, + "min": 0, + "max": 1, + "len": 3, + "sum": 1, + }, + "ORG": { + "mean": 0.3333333333333333, + "std": 0.4714045207910317, + "min": 0, + "max": 1, + "len": 3, + "sum": 1, + }, + "MISC": { + "mean": 0.6666666666666666, + "std": 0.9428090415820634, + "min": 0, + "max": 2, + "len": 3, + "sum": 2, + }, + }, + "validation": { + "LOC": { + "mean": 0.3333333333333333, + "std": 0.4714045207910317, + "min": 0, + "max": 1, + "len": 3, + "sum": 1, + }, + "PER": { + "mean": 0.3333333333333333, + "std": 0.4714045207910317, + "min": 0, + "max": 1, + "len": 3, + "sum": 1, + }, + "ORG": {"mean": 1.0, "std": 0.816496580927726, "min": 0, "max": 2, "len": 3, "sum": 3}, + "MISC": { + "mean": 0.3333333333333333, + "std": 0.4714045207910317, + "min": 0, + "max": 1, + "len": 3, + "sum": 1, + }, + }, + "test": { + "LOC": {"mean": 1.0, "std": 0.816496580927726, "min": 0, "max": 2, "len": 3, "sum": 3}, + "PER": { + "mean": 0.6666666666666666, + "std": 0.4714045207910317, + "min": 0, + "max": 1, + "len": 3, + "sum": 2, + }, + "ORG": {"mean": 0.0, "std": 0.0, "min": 0, "max": 0, "len": 3, "sum": 0}, + "MISC": {"mean": 0.0, "std": 0.0, "min": 0, "max": 0, "len": 3, "sum": 0}, + }, + } + + statistic = LabelCountCollector(field="entities", labels="INFERRED") + values = statistic(dataset) + assert values == { + "train": { + "ORG": {"max": 1, "len": 1, "sum": 1}, + "MISC": {"max": 2, "len": 1, "sum": 2}, + "PER": {"max": 1, "len": 1, "sum": 1}, + "LOC": {"max": 1, "len": 1, "sum": 1}, + }, + "validation": { + "ORG": {"max": 2, "len": 2, "sum": 3}, + "LOC": {"max": 1, "len": 1, "sum": 1}, + "MISC": {"max": 1, "len": 1, "sum": 1}, + "PER": {"max": 1, "len": 1, "sum": 1}, + }, + "test": {"LOC": {"max": 2, "len": 2, "sum": 3}, "PER": {"max": 1, "len": 2, "sum": 2}}, + } + + statistic = FieldLengthCollector(field="text") + values = statistic(dataset) + assert values == { + "test": {"max": 57, "mean": 36.0, "min": 11, "std": 18.991226044325487}, + "train": {"max": 48, "mean": 27.333333333333332, "min": 15, "std": 14.70449666674185}, + "validation": {"max": 187, "mean": 89.66666666666667, "min": 17, "std": 71.5603863103665}, + } + + statistic = SpanLengthCollector(layer="entities") + values = statistic(dataset) + assert values == { + "train": {"len": 5, "mean": 7.6, "std": 4.223742416388575, "min": 2, "max": 15}, + "validation": { + "len": 6, + "mean": 10.833333333333334, + "std": 2.9674156357941426, + "min": 6, + "max": 14, + }, + "test": {"len": 5, "mean": 9.4, "std": 5.748043145279966, "min": 5, "max": 20}, + } + + statistic = SpanLengthCollector(layer="entities", labels="INFERRED") + values = statistic(dataset) + assert values == { + "train": { + "ORG": {"max": 2, "len": 1}, + "MISC": {"max": 7, "len": 2}, + "PER": {"max": 15, "len": 1}, + "LOC": {"max": 8, "len": 1}, + }, + "test": { + "LOC": { + "max": 20, + "len": 3, + }, + "PER": {"max": 11, "len": 2}, + }, + "validation": { + "ORG": {"max": 14, "len": 3}, + "LOC": {"max": 6, "len": 1}, + 
"MISC": {"max": 11, "len": 1}, + "PER": {"max": 12, "len": 1}, + }, + } + + # this is not super useful, we just collect the lengths of the labels, but it is enough to test the code + statistic = SubFieldLengthCollector(field="entities", subfield="label") + values = statistic(dataset) + assert values == { + "test": {"max": 3, "mean": 3.0, "min": 3, "std": 0.0}, + "train": {"max": 4, "mean": 3.4, "min": 3, "std": 0.4898979485566356}, + "validation": {"max": 4, "mean": 3.1666666666666665, "min": 3, "std": 0.3726779962499649}, + } + + +@pytest.mark.slow +def test_statistics_with_tokenize(dataset): + statistic = TokenCountCollector( + text_field="text", + tokenizer="bert-base-uncased", + tokenizer_kwargs=dict(add_special_tokens=False), + ) + values = statistic(dataset) + assert values == { + "test": {"max": 12, "mean": 9.333333333333334, "min": 4, "std": 3.7712361663282534}, + "train": {"max": 9, "mean": 5.666666666666667, "min": 2, "std": 2.8674417556808756}, + "validation": {"max": 38, "mean": 18.333333333333332, "min": 6, "std": 14.055445761538678}, + } + + @dataclasses.dataclass + class TokenDocumentWithLabeledEntities(TokenBasedDocument): + entities: AnnotationList[LabeledSpan] = annotation_field(target="tokens") + + statistic = SpanLengthCollector( + layer="entities", + tokenize=True, + tokenizer="bert-base-uncased", + tokenized_document_type=TokenDocumentWithLabeledEntities, + ) + values = statistic(dataset) + assert values == { + "test": {"len": 5, "max": 4, "mean": 2.4, "min": 1, "std": 1.2000000000000002}, + "train": {"len": 5, "max": 2, "mean": 1.2, "min": 1, "std": 0.4}, + "validation": { + "len": 6, + "max": 2, + "mean": 1.3333333333333333, + "min": 1, + "std": 0.4714045207910317, + }, + } diff --git a/tests/unit/test_metric.py b/tests/unit/test_metric.py new file mode 100644 index 00000000..1b51ac67 --- /dev/null +++ b/tests/unit/test_metric.py @@ -0,0 +1,116 @@ +from dataclasses import dataclass +from typing import Optional + +import pytest +from pytorch_ie.annotations import LabeledSpan +from pytorch_ie.core import AnnotationList, Document, annotation_field +from pytorch_ie.documents import TextBasedDocument + +from pie_datasets import DocumentMetric + + +@pytest.fixture +def documents(): + @dataclass + class TextDocumentWithEntities(TextBasedDocument): + entities: AnnotationList[LabeledSpan] = annotation_field(target="text") + + # a test sentence with two entities + doc1 = TextDocumentWithEntities( + text="The quick brown fox jumps over the lazy dog.", + ) + doc1.entities.append(LabeledSpan(start=4, end=19, label="animal")) + doc1.entities.append(LabeledSpan(start=35, end=43, label="animal")) + assert str(doc1.entities[0]) == "quick brown fox" + assert str(doc1.entities[1]) == "lazy dog" + + # a second test sentence with a different text and a single entity (a company) + doc2 = TextDocumentWithEntities(text="Apple is a great company.") + doc2.entities.append(LabeledSpan(start=0, end=5, label="company")) + assert str(doc2.entities[0]) == "Apple" + + documents = [doc1, doc2] + + # add predictions + # correct + documents[0].entities.predictions.append(LabeledSpan(start=4, end=19, label="animal")) + # correct, but duplicate, this should not be counted + documents[0].entities.predictions.append(LabeledSpan(start=4, end=19, label="animal")) + # correct + documents[0].entities.predictions.append(LabeledSpan(start=35, end=43, label="animal")) + # wrong label + documents[0].entities.predictions.append(LabeledSpan(start=35, end=43, label="cat")) + # correct + 
documents[1].entities.predictions.append(LabeledSpan(start=0, end=5, label="company")) + # wrong span + documents[1].entities.predictions.append(LabeledSpan(start=10, end=15, label="company")) + + return documents + + +class Accuracy(DocumentMetric): + def __init__(self, layer: str): + super().__init__() + self.layer = layer + + def reset(self) -> None: + self.total = 0 + self.correct = 0 + + def _update(self, document: Document) -> None: + layer = document[self.layer] + predictions = layer.predictions + self.total += len(set(predictions)) + self.correct += len(set(layer) & set(predictions)) + + def _compute(self) -> Optional[float]: + if self.total == 0: + return None + return self.correct / self.total + + +def test_document_metric(documents): + accuracy = Accuracy(layer="entities") + accuracy(documents[0]) + assert accuracy.total == 3 + assert accuracy.correct == 2 + assert accuracy.compute() == 2 / 3 + assert accuracy.total == 0 + assert accuracy.correct == 0 + + +def test_document_metric_iterable(documents): + accuracy = Accuracy(layer="entities") + accuracy(documents) + assert accuracy.total == 5 + assert accuracy.correct == 3 + assert accuracy.compute() == 3 / 5 + assert accuracy.total == 0 + assert accuracy.correct == 0 + + +def test_document_metric_wrong_iterable(): + accuracy = Accuracy(layer="entities") + with pytest.raises(TypeError) as excinfo: + accuracy([1, 2]) + assert ( + str(excinfo.value) + == "document_or_collection contains an object that is not a document: " + ) + + +def test_document_metric_dict(documents): + dummy_dataset_dict = {"train": [documents[0]], "val": [], "test": [documents[1]]} + accuracy = Accuracy(layer="entities") + result = accuracy(dummy_dataset_dict) + + assert result["train"] == 2 / 3 + assert result["test"] == 0.5 + assert result["val"] is None + + +def test_document_metric_wrong_type(): + accuracy = Accuracy(layer="entities") + with pytest.raises(TypeError) as excinfo: + accuracy(1) + assert str(excinfo.value) == "document_or_collection has unknown type: " From 13ecd17e4601b3c3552b40bf35762b535695efbb Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Tue, 7 Nov 2023 23:22:25 +0100 Subject: [PATCH 02/10] add mixins to dataset_dict.py; do not directly export mixins; remove metrics --- src/pie_datasets/__init__.py | 15 +-- src/pie_datasets/common.py | 77 ------------ src/pie_datasets/dataset_dict.py | 47 +++++-- .../document/processing/regex_partitioner.py | 2 +- src/pie_datasets/metric.py | 76 ----------- src/pie_datasets/metrics/__init__.py | 3 - src/pie_datasets/metrics/f1.py | 118 ------------------ src/pie_datasets/statistic.py | 3 +- src/pie_datasets/{metrics => }/statistics.py | 0 tests/unit/metrics/__init__.py | 0 tests/unit/metrics/test_f1.py | 109 ---------------- tests/unit/test_dataset_dict.py | 6 +- tests/unit/test_metric.py | 116 ----------------- .../test_statistic.py => test_statistics.py} | 2 +- 14 files changed, 47 insertions(+), 527 deletions(-) delete mode 100644 src/pie_datasets/common.py delete mode 100644 src/pie_datasets/metric.py delete mode 100644 src/pie_datasets/metrics/__init__.py delete mode 100644 src/pie_datasets/metrics/f1.py rename src/pie_datasets/{metrics => }/statistics.py (100%) delete mode 100644 tests/unit/metrics/__init__.py delete mode 100644 tests/unit/metrics/test_f1.py delete mode 100644 tests/unit/test_metric.py rename tests/unit/{metrics/test_statistic.py => test_statistics.py} (99%) diff --git a/src/pie_datasets/__init__.py b/src/pie_datasets/__init__.py index b4f2086f..1732fec7 100644 --- 
a/src/pie_datasets/__init__.py +++ b/src/pie_datasets/__init__.py @@ -1,24 +1,15 @@ -from .builder import GeneratorBasedBuilder -from .common import ( - EnterDatasetDictMixin, - EnterDatasetMixin, - ExitDatasetDictMixin, - ExitDatasetMixin, -) +from .builder import ArrowBasedBuilder, GeneratorBasedBuilder from .dataset import Dataset, IterableDataset from .dataset_dict import DatasetDict from .document_formatter import DocumentFormatter -from .metric import DocumentMetric from .statistic import DocumentStatistic __all__ = [ "GeneratorBasedBuilder", + "ArrowBasedBuilder", "Dataset", "IterableDataset", "DatasetDict", "DocumentFormatter", - "EnterDatasetMixin", - "ExitDatasetMixin", - "EnterDatasetDictMixin", - "ExitDatasetDictMixin", + "DocumentStatistic", ] diff --git a/src/pie_datasets/common.py b/src/pie_datasets/common.py deleted file mode 100644 index 5eee2cf9..00000000 --- a/src/pie_datasets/common.py +++ /dev/null @@ -1,77 +0,0 @@ -import logging -from abc import ABC, abstractmethod -from typing import Optional, Type, Union - -from pytorch_ie.core.document import Document - -from pie_datasets import DatasetDict -from pie_datasets.dataset import Dataset, IterableDataset - -logger = logging.getLogger(__name__) - - -class EnterDatasetMixin(ABC): - """Mixin for processors that enter a dataset context.""" - - @abstractmethod - def enter_dataset( - self, dataset: Union[Dataset, IterableDataset], name: Optional[str] = None - ) -> None: - """Enter dataset context.""" - - -class ExitDatasetMixin(ABC): - """Mixin for processors that exit a dataset context.""" - - @abstractmethod - def exit_dataset( - self, dataset: Union[Dataset, IterableDataset], name: Optional[str] = None - ) -> None: - """Exit dataset context.""" - - -class EnterDatasetDictMixin(ABC): - """Mixin for processors that enter a dataset dict context.""" - - @abstractmethod - def enter_dataset_dict(self, dataset_dict) -> None: - """Enter dataset dict context.""" - - -class ExitDatasetDictMixin(ABC): - """Mixin for processors that exit a dataset dict context.""" - - @abstractmethod - def exit_dataset_dict(self, dataset_dict) -> None: - """Exit dataset dict context.""" - - -class RequiresDocumentTypeMixin: - DOCUMENT_TYPE: Optional[Type[Document]] = None - - @property - def document_type(self) -> Optional[Type[Document]]: - return self.DOCUMENT_TYPE - - def convert_dataset(self, dataset: DatasetDict) -> DatasetDict: - name = type(self).__name__ - # auto-convert the dataset if a document type is specified - if self.document_type is not None: - if issubclass(dataset.document_type, self.document_type): - logger.info( - f"the dataset is already of the document type that is specified by {name}: " - f"{self.document_type}" - ) - else: - logger.info( - f"convert the dataset to the document type that is specified by {name}: " - f"{self.document_type}" - ) - dataset = dataset.to_document_type(self.document_type) - else: - logger.warning( - f"{name} does not specify a document type. The dataset can not be automatically converted " - f"to a document type." 
- ) - - return dataset diff --git a/src/pie_datasets/dataset_dict.py b/src/pie_datasets/dataset_dict.py index ef0d5467..df2ff3c2 100644 --- a/src/pie_datasets/dataset_dict.py +++ b/src/pie_datasets/dataset_dict.py @@ -1,6 +1,7 @@ import json import logging import os +from abc import ABC, abstractmethod from pathlib import Path from typing import ( Any, @@ -15,16 +16,10 @@ ) import datasets -from pytorch_ie.core import Document +from pytorch_ie.core.document import Document from pytorch_ie.utils.hydra import resolve_target, serialize_document_type -from .common import ( - EnterDatasetDictMixin, - EnterDatasetMixin, - ExitDatasetDictMixin, - ExitDatasetMixin, -) -from .dataset import Dataset, IterableDataset, get_pie_dataset_type +from pie_datasets.dataset import Dataset, IterableDataset, get_pie_dataset_type logger = logging.getLogger(__name__) @@ -34,6 +29,42 @@ D = TypeVar("D", bound=Document) +class EnterDatasetMixin(ABC): + """Mixin for processors that enter a dataset context.""" + + @abstractmethod + def enter_dataset( + self, dataset: Union[Dataset, IterableDataset], name: Optional[str] = None + ) -> None: + """Enter dataset context.""" + + +class ExitDatasetMixin(ABC): + """Mixin for processors that exit a dataset context.""" + + @abstractmethod + def exit_dataset( + self, dataset: Union[Dataset, IterableDataset], name: Optional[str] = None + ) -> None: + """Exit dataset context.""" + + +class EnterDatasetDictMixin(ABC): + """Mixin for processors that enter a dataset dict context.""" + + @abstractmethod + def enter_dataset_dict(self, dataset_dict) -> None: + """Enter dataset dict context.""" + + +class ExitDatasetDictMixin(ABC): + """Mixin for processors that exit a dataset dict context.""" + + @abstractmethod + def exit_dataset_dict(self, dataset_dict) -> None: + """Exit dataset dict context.""" + + class DatasetDict(datasets.DatasetDict): def __getitem__(self, k) -> Union[Dataset, IterableDataset]: # type: ignore """Returns an individual dataset split.""" diff --git a/src/pie_datasets/document/processing/regex_partitioner.py b/src/pie_datasets/document/processing/regex_partitioner.py index ae831db9..e99dcf16 100644 --- a/src/pie_datasets/document/processing/regex_partitioner.py +++ b/src/pie_datasets/document/processing/regex_partitioner.py @@ -9,7 +9,7 @@ from pytorch_ie.annotations import LabeledSpan from pytorch_ie.documents import TextBasedDocument -from pie_datasets import Dataset, EnterDatasetMixin, ExitDatasetMixin, IterableDataset +from pie_datasets.dataset_dict import Dataset, EnterDatasetMixin, ExitDatasetMixin, IterableDataset logger = logging.getLogger(__name__) diff --git a/src/pie_datasets/metric.py b/src/pie_datasets/metric.py deleted file mode 100644 index 31f44dc6..00000000 --- a/src/pie_datasets/metric.py +++ /dev/null @@ -1,76 +0,0 @@ -from abc import ABC, abstractmethod -from typing import Dict, Generic, Iterable, Optional, TypeVar, Union - -from pytorch_ie.core.document import Document - -from pie_datasets.common import RequiresDocumentTypeMixin - -T = TypeVar("T") - - -class DocumentMetric(ABC, RequiresDocumentTypeMixin, Generic[T]): - """This defines the interface for a document metric.""" - - def __init__(self): - self.reset() - self._current_split: Optional[str] = None - - @abstractmethod - def reset(self) -> None: - """Any reset logic that needs to be performed before the metric is called again.""" - - def __call__( - self, - document_or_collection: Union[Iterable[Document], Document, Dict[str, Iterable[Document]]], - ) -> Union[Dict[str, T], T]: - """This 
method is called to update the metric with a document or collection of documents. - - If a collection is passed, the metric is also computed and the result is returned. If the - collection is a dictionary, the metric is computed for each split and the result is - returned as a dictionary. - """ - if isinstance(document_or_collection, Document): - # do not reset here to allow for multiple calls - self._update(document_or_collection) - return self.compute(reset=False) - elif isinstance(document_or_collection, dict): - result: Dict[str, T] = {} - for split_name, split in document_or_collection.items(): - self._current_split = split_name - self.reset() - split_values: T = self(split) # type: ignore - result[split_name] = split_values - self._current_split = None - return result - elif isinstance(document_or_collection, Iterable): - for doc in document_or_collection: - if not isinstance(doc, Document): - raise TypeError( - f"document_or_collection contains an object that is not a document: {type(doc)}" - ) - self._update(doc) - # do not reset here to allow for multiple calls - return self.compute(reset=False) - else: - raise TypeError( - f"document_or_collection has unknown type: {type(document_or_collection)}" - ) - - def compute(self, reset: bool = True) -> T: - metric_values = self._compute() - if reset: - self.reset() - return metric_values - - @abstractmethod - def _update(self, document: Document) -> None: - """This method is called to update the metric with the new document.""" - - @abstractmethod - def _compute(self) -> T: - """This method is called to get the metric values.""" - - @property - def current_split(self) -> Optional[str]: - """The current split that is being processed.""" - return self._current_split diff --git a/src/pie_datasets/metrics/__init__.py b/src/pie_datasets/metrics/__init__.py deleted file mode 100644 index ec2a853a..00000000 --- a/src/pie_datasets/metrics/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .f1 import F1Metric - -__all__ = ["F1Metric"] diff --git a/src/pie_datasets/metrics/f1.py b/src/pie_datasets/metrics/f1.py deleted file mode 100644 index 7b2fbf78..00000000 --- a/src/pie_datasets/metrics/f1.py +++ /dev/null @@ -1,118 +0,0 @@ -import logging -from collections import defaultdict -from functools import partial -from typing import Callable, Collection, Dict, Optional, Tuple - -import pandas as pd -from pytorch_ie.core import Annotation, Document - -from pie_datasets.metric import DocumentMetric - -logger = logging.getLogger(__name__) - - -def has_one_of_the_labels(ann: Annotation, label_field: str, labels: Collection[str]) -> bool: - return getattr(ann, label_field) in labels - - -def has_this_label(ann: Annotation, label_field: str, label: str) -> bool: - return getattr(ann, label_field) == label - - -class F1Metric(DocumentMetric): - """Computes the (micro aggregated) F1 score for a given layer. If labels are provided, it also - computes the F1 score for each label separately and the macro F1 score. - - Args: - layer: The layer to compute the F1 score for. - labels: If provided, calculate F1 score for each label. - label_field: The field to use for the label. Defaults to "label". - show_as_markdown: If True, logs the F1 score as markdown on the console when calling compute(). 
- """ - - def __init__( - self, - layer: str, - labels: Optional[Collection[str]] = None, - label_field: str = "label", - show_as_markdown: bool = False, - ): - super().__init__() - self.layer = layer - self.label_field = label_field - self.show_as_markdown = show_as_markdown - - self.per_label = labels is not None - self.labels = labels or [] - if self.per_label: - if "MICRO" in self.labels or "MACRO" in self.labels: - raise ValueError( - "labels cannot contain 'MICRO' or 'MACRO' because they are used to capture aggregated metrics" - ) - if len(self.labels) == 0: - raise ValueError("labels cannot be empty") - - def reset(self): - self.counts = defaultdict(lambda: (0, 0, 0)) - - def calculate_counts( - self, - document: Document, - annotation_filter: Optional[Callable[[Annotation], bool]] = None, - ) -> Tuple[int, int, int]: - annotation_filter = annotation_filter or (lambda ann: True) - predicted_annotations = { - ann for ann in document[self.layer].predictions if annotation_filter(ann) - } - gold_annotations = {ann for ann in document[self.layer] if annotation_filter(ann)} - tp = len([ann for ann in predicted_annotations & gold_annotations]) - fn = len([ann for ann in gold_annotations - predicted_annotations]) - fp = len([ann for ann in predicted_annotations - gold_annotations]) - return tp, fp, fn - - def add_counts(self, counts: Tuple[int, int, int], label: str): - self.counts[label] = ( - self.counts[label][0] + counts[0], - self.counts[label][1] + counts[1], - self.counts[label][2] + counts[2], - ) - - def _update(self, document: Document): - new_counts = self.calculate_counts( - document=document, - annotation_filter=partial( - has_one_of_the_labels, label_field=self.label_field, labels=self.labels - ) - if self.per_label - else None, - ) - self.add_counts(new_counts, label="MICRO") - for label in self.labels: - new_counts = self.calculate_counts( - document=document, - annotation_filter=partial( - has_this_label, label_field=self.label_field, label=label - ), - ) - self.add_counts(new_counts, label=label) - - def _compute(self) -> Dict[str, Dict[str, float]]: - res = dict() - if self.per_label: - res["MACRO"] = {"f1": 0.0, "p": 0.0, "r": 0.0} - for label, counts in self.counts.items(): - tp, fp, fn = counts - if tp == 0: - p, r, f1 = 0.0, 0.0, 0.0 - else: - p = tp / (tp + fp) - r = tp / (tp + fn) - f1 = 2 * p * r / (p + r) - res[label] = {"f1": f1, "p": p, "r": r} - if label in self.labels: - res["MACRO"]["f1"] += f1 / len(self.labels) - res["MACRO"]["p"] += p / len(self.labels) - res["MACRO"]["r"] += r / len(self.labels) - if self.show_as_markdown: - logger.info(f"\n{self.layer}:\n{pd.DataFrame(res).round(3).T.to_markdown()}") - return res diff --git a/src/pie_datasets/statistic.py b/src/pie_datasets/statistic.py index 9feeb20d..b4518eaf 100644 --- a/src/pie_datasets/statistic.py +++ b/src/pie_datasets/statistic.py @@ -3,6 +3,7 @@ from collections import defaultdict from typing import Any, Dict, Generator, List, Optional, Tuple, Type, Union +from pytorch_ie.core import DocumentMetric from pytorch_ie.core.document import Document from pytorch_ie.utils.hydra import ( InstantiationException, @@ -10,8 +11,6 @@ resolve_target, ) -from pie_datasets.metric import DocumentMetric - logger = logging.getLogger(__name__) diff --git a/src/pie_datasets/metrics/statistics.py b/src/pie_datasets/statistics.py similarity index 100% rename from src/pie_datasets/metrics/statistics.py rename to src/pie_datasets/statistics.py diff --git a/tests/unit/metrics/__init__.py b/tests/unit/metrics/__init__.py 
deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/unit/metrics/test_f1.py b/tests/unit/metrics/test_f1.py deleted file mode 100644 index 3dd3acb5..00000000 --- a/tests/unit/metrics/test_f1.py +++ /dev/null @@ -1,109 +0,0 @@ -from dataclasses import dataclass - -import pytest -from pytorch_ie.annotations import LabeledSpan -from pytorch_ie.core import AnnotationList, annotation_field -from pytorch_ie.documents import TextBasedDocument - -from pie_datasets.metrics import F1Metric - - -@pytest.fixture -def documents(): - @dataclass - class TextDocumentWithEntities(TextBasedDocument): - entities: AnnotationList[LabeledSpan] = annotation_field(target="text") - - # a test sentence with two entities - doc1 = TextDocumentWithEntities( - text="The quick brown fox jumps over the lazy dog.", - ) - doc1.entities.append(LabeledSpan(start=4, end=19, label="animal")) - doc1.entities.append(LabeledSpan(start=35, end=43, label="animal")) - assert str(doc1.entities[0]) == "quick brown fox" - assert str(doc1.entities[1]) == "lazy dog" - - # a second test sentence with a different text and a single entity (a company) - doc2 = TextDocumentWithEntities(text="Apple is a great company.") - doc2.entities.append(LabeledSpan(start=0, end=5, label="company")) - assert str(doc2.entities[0]) == "Apple" - - documents = [doc1, doc2] - - # add predictions - # correct - documents[0].entities.predictions.append(LabeledSpan(start=4, end=19, label="animal")) - # correct, but duplicate, this should not be counted - documents[0].entities.predictions.append(LabeledSpan(start=4, end=19, label="animal")) - # correct - documents[0].entities.predictions.append(LabeledSpan(start=35, end=43, label="animal")) - # wrong label - documents[0].entities.predictions.append(LabeledSpan(start=35, end=43, label="cat")) - # correct - documents[1].entities.predictions.append(LabeledSpan(start=0, end=5, label="company")) - # wrong span - documents[1].entities.predictions.append(LabeledSpan(start=10, end=15, label="company")) - - return documents - - -def test_f1(documents): - metric = F1Metric(layer="entities") - metric(documents) - # tp, fp, fn for micro - assert dict(metric.counts) == {"MICRO": (3, 2, 0)} - assert metric.compute() == {"MICRO": {"f1": 0.7499999999999999, "p": 0.6, "r": 1.0}} - - -def test_f1_per_label(documents): - metric = F1Metric(layer="entities", labels=["animal", "company", "cat"]) - metric(documents) - # tp, fp, fn for micro and per label - assert dict(metric.counts) == { - "MICRO": (3, 2, 0), - "cat": (0, 1, 0), - "company": (1, 1, 0), - "animal": (2, 0, 0), - } - assert metric.compute() == { - "MACRO": {"f1": 0.5555555555555556, "p": 0.5, "r": 0.6666666666666666}, - "MICRO": {"f1": 0.7499999999999999, "p": 0.6, "r": 1.0}, - "cat": {"f1": 0.0, "p": 0.0, "r": 0.0}, - "company": {"f1": 0.6666666666666666, "p": 0.5, "r": 1.0}, - "animal": {"f1": 1.0, "p": 1.0, "r": 1.0}, - } - - -def test_f1_per_label_no_labels(documents): - with pytest.raises(ValueError) as excinfo: - F1Metric(layer="entities", labels=[]) - assert str(excinfo.value) == "labels cannot be empty" - - -def test_f1_per_label_not_allowed(): - with pytest.raises(ValueError) as excinfo: - F1Metric(layer="entities", labels=["animal", "MICRO"]) - assert ( - str(excinfo.value) - == "labels cannot contain 'MICRO' or 'MACRO' because they are used to capture aggregated metrics" - ) - - -# def test_f1_show_as_markdown(documents, caplog): -# metric = F1Metric(layer="entities", labels=["animal", "company", "cat"], show_as_markdown=True) -# 
metric(documents) -# caplog.set_level(logging.INFO) -# caplog.clear() -# metric.compute() -# assert len(caplog.records) == 1 -# assert ( -# caplog.records[0].message == "\n" -# "entities:\n" -# "| | f1 | p | r |\n" -# "|:--------|------:|----:|------:|\n" -# "| MACRO | 0.556 | 0.5 | 0.667 |\n" -# "| MICRO | 0.75 | 0.6 | 1 |\n" -# "| animal | 1 | 1 | 1 |\n" -# "| company | 0.667 | 0.5 | 1 |\n" -# "| cat | 0 | 0 | 0 |" -# ) diff --git a/tests/unit/test_dataset_dict.py b/tests/unit/test_dataset_dict.py index 2f7b97eb..aa382343 100644 --- a/tests/unit/test_dataset_dict.py +++ b/tests/unit/test_dataset_dict.py @@ -9,14 +9,12 @@ from pytorch_ie.core import AnnotationList, Document, annotation_field from pytorch_ie.documents import TextBasedDocument, TextDocument -from pie_datasets import ( - Dataset, - DatasetDict, +from pie_datasets import Dataset, DatasetDict, IterableDataset +from pie_datasets.dataset_dict import ( EnterDatasetDictMixin, EnterDatasetMixin, ExitDatasetDictMixin, ExitDatasetMixin, - IterableDataset, ) from tests import DATASET_BUILDERS_ROOT, FIXTURES_ROOT from tests.conftest import CREATE_FIXTURE_DATA, TestDocument diff --git a/tests/unit/test_metric.py b/tests/unit/test_metric.py deleted file mode 100644 index 1b51ac67..00000000 --- a/tests/unit/test_metric.py +++ /dev/null @@ -1,116 +0,0 @@ -from dataclasses import dataclass -from typing import Optional - -import pytest -from pytorch_ie.annotations import LabeledSpan -from pytorch_ie.core import AnnotationList, Document, annotation_field -from pytorch_ie.documents import TextBasedDocument - -from pie_datasets import DocumentMetric - - -@pytest.fixture -def documents(): - @dataclass - class TextDocumentWithEntities(TextBasedDocument): - entities: AnnotationList[LabeledSpan] = annotation_field(target="text") - - # a test sentence with two entities - doc1 = TextDocumentWithEntities( - text="The quick brown fox jumps over the lazy dog.", - ) - doc1.entities.append(LabeledSpan(start=4, end=19, label="animal")) - doc1.entities.append(LabeledSpan(start=35, end=43, label="animal")) - assert str(doc1.entities[0]) == "quick brown fox" - assert str(doc1.entities[1]) == "lazy dog" - - # a second test sentence with a different text and a single entity (a company) - doc2 = TextDocumentWithEntities(text="Apple is a great company.") - doc2.entities.append(LabeledSpan(start=0, end=5, label="company")) - assert str(doc2.entities[0]) == "Apple" - - documents = [doc1, doc2] - - # add predictions - # correct - documents[0].entities.predictions.append(LabeledSpan(start=4, end=19, label="animal")) - # correct, but duplicate, this should not be counted - documents[0].entities.predictions.append(LabeledSpan(start=4, end=19, label="animal")) - # correct - documents[0].entities.predictions.append(LabeledSpan(start=35, end=43, label="animal")) - # wrong label - documents[0].entities.predictions.append(LabeledSpan(start=35, end=43, label="cat")) - # correct - documents[1].entities.predictions.append(LabeledSpan(start=0, end=5, label="company")) - # wrong span - documents[1].entities.predictions.append(LabeledSpan(start=10, end=15, label="company")) - - return documents - - -class Accuracy(DocumentMetric): - def __init__(self, layer: str): - super().__init__() - self.layer = layer - - def reset(self) -> None: - self.total = 0 - self.correct = 0 - - def _update(self, document: Document) -> None: - layer = document[self.layer] - predictions = layer.predictions - self.total += len(set(predictions)) - self.correct += len(set(layer) & set(predictions)) - - def 
_compute(self) -> Optional[float]:
-        if self.total == 0:
-            return None
-        return self.correct / self.total
-
-
-def test_document_metric(documents):
-    accuracy = Accuracy(layer="entities")
-    accuracy(documents[0])
-    assert accuracy.total == 3
-    assert accuracy.correct == 2
-    assert accuracy.compute() == 2 / 3
-    assert accuracy.total == 0
-    assert accuracy.correct == 0
-
-
-def test_document_metric_iterable(documents):
-    accuracy = Accuracy(layer="entities")
-    accuracy(documents)
-    assert accuracy.total == 5
-    assert accuracy.correct == 3
-    assert accuracy.compute() == 3 / 5
-    assert accuracy.total == 0
-    assert accuracy.correct == 0
-
-
-def test_document_metric_wrong_iterable():
-    accuracy = Accuracy(layer="entities")
-    with pytest.raises(TypeError) as excinfo:
-        accuracy([1, 2])
-    assert (
-        str(excinfo.value)
-        == "document_or_collection contains an object that is not a document: <class 'int'>"
-    )
-
-
-def test_document_metric_dict(documents):
-    dummy_dataset_dict = {"train": [documents[0]], "val": [], "test": [documents[1]]}
-    accuracy = Accuracy(layer="entities")
-    result = accuracy(dummy_dataset_dict)
-
-    assert result["train"] == 2 / 3
-    assert result["test"] == 0.5
-    assert result["val"] is None
-
-
-def test_document_metric_wrong_type():
-    accuracy = Accuracy(layer="entities")
-    with pytest.raises(TypeError) as excinfo:
-        accuracy(1)
-    assert str(excinfo.value) == "document_or_collection has unknown type: <class 'int'>"
diff --git a/tests/unit/metrics/test_statistic.py b/tests/unit/test_statistics.py
similarity index 99%
rename from tests/unit/metrics/test_statistic.py
rename to tests/unit/test_statistics.py
index a67d83ce..85496850 100644
--- a/tests/unit/metrics/test_statistic.py
+++ b/tests/unit/test_statistics.py
@@ -6,7 +6,7 @@
 from pytorch_ie.documents import TextBasedDocument, TokenBasedDocument
 
 from pie_datasets import DatasetDict
-from pie_datasets.metrics.statistics import (
+from pie_datasets.statistics import (
     DummyCollector,
     FieldLengthCollector,
     LabelCountCollector,

From 546f0b1dc6e769cd3f4b1b78f6be2e909107eb20 Mon Sep 17 00:00:00 2001
From: Arne Binder
Date: Tue, 7 Nov 2023 23:47:57 +0100
Subject: [PATCH 03/10] remove DocumentStatistic (use it from pytorch-ie)

---
 src/pie_datasets/__init__.py   |   2 -
 src/pie_datasets/statistic.py  | 233 ---------------------------------
 src/pie_datasets/statistics.py |   3 +-
 3 files changed, 1 insertion(+), 237 deletions(-)
 delete mode 100644 src/pie_datasets/statistic.py

diff --git a/src/pie_datasets/__init__.py b/src/pie_datasets/__init__.py
index 1732fec7..2d1b01fc 100644
--- a/src/pie_datasets/__init__.py
+++ b/src/pie_datasets/__init__.py
@@ -2,7 +2,6 @@
 from .dataset import Dataset, IterableDataset
 from .dataset_dict import DatasetDict
 from .document_formatter import DocumentFormatter
-from .statistic import DocumentStatistic
 
 __all__ = [
     "GeneratorBasedBuilder",
@@ -11,5 +10,4 @@
     "IterableDataset",
     "DatasetDict",
     "DocumentFormatter",
-    "DocumentStatistic",
 ]
diff --git a/src/pie_datasets/statistic.py b/src/pie_datasets/statistic.py
deleted file mode 100644
index b4518eaf..00000000
--- a/src/pie_datasets/statistic.py
+++ /dev/null
@@ -1,233 +0,0 @@
-import logging
-from abc import abstractmethod
-from collections import defaultdict
-from typing import Any, Dict, Generator, List, Optional, Tuple, Type, Union
-
-from pytorch_ie.core import DocumentMetric
-from pytorch_ie.core.document import Document
-from pytorch_ie.utils.hydra import (
-    InstantiationException,
-    resolve_optional_document_type,
-    resolve_target,
-)
-
-logger = logging.getLogger(__name__)
-
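The `flatten_dict`/`unflatten_dict` helpers removed just below are easiest to grasp from their round trip. A small standalone sketch follows; `flatten` and `unflatten` are local stand-in names for the deleted module functions, and the example values are taken verbatim from the `unflatten_dict` docstring further down.

```python
from typing import Any, Dict, Tuple

def flatten(d: Dict[str, Any], prefix: Tuple[str, ...] = ()) -> Dict[Tuple[str, ...], Any]:
    # Nested string keys become key tuples, e.g. {"a": {"b": 1}} -> {("a", "b"): 1}.
    flat: Dict[Tuple[str, ...], Any] = {}
    for key, value in d.items():
        if isinstance(value, dict):
            flat.update(flatten(value, prefix + (key,)))
        else:
            flat[prefix + (key,)] = value
    return flat

def unflatten(d: Dict[Tuple[str, ...], Any]) -> Dict[str, Any]:
    # Inverse operation: key tuples are expanded back into nested dicts.
    nested: Dict[str, Any] = {}
    for key_tuple, value in d.items():
        current = nested
        for key in key_tuple[:-1]:
            current = current.setdefault(key, {})
        current[key_tuple[-1]] = value
    return nested

nested = {"a": {"b": {"c": 1, "d": 2}, "e": 3}}
flat = {("a", "b", "c"): 1, ("a", "b", "d"): 2, ("a", "e"): 3}
assert flatten(nested) == flat and unflatten(flat) == nested
```

This round trip is what lets `DocumentStatistic._compute` aggregate arbitrarily nested per-document results into flat lists and then return them in their original nested shape.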
- -def _flatten_dict_gen(d, parent_key: Tuple[str, ...] = ()) -> Generator: - for k, v in d.items(): - new_key = parent_key + (k,) - if isinstance(v, dict): - yield from dict(_flatten_dict_gen(v, new_key)).items() - else: - yield new_key, v - - -def flatten_dict(d: Dict[str, Any]) -> Dict[Tuple[str, ...], Any]: - return dict(_flatten_dict_gen(d)) - - -def unflatten_dict(d: Dict[Tuple[str, ...], Any]) -> Union[Dict[str, Any], Any]: - """Unflattens a dictionary with nested keys. - - Example: - >>> d = {("a", "b", "c"): 1, ("a", "b", "d"): 2, ("a", "e"): 3} - >>> unflatten_dict(d) - {'a': {'b': {'c': 1, 'd': 2}, 'e': 3}} - """ - result: Dict[str, Any] = {} - for k, v in d.items(): - if len(k) == 0: - if len(result) > 1: - raise ValueError("Cannot unflatten dictionary with multiple root keys.") - return v - current = result - for key in k[:-1]: - current = current.setdefault(key, {}) - current[k[-1]] = v - return result - - -def _min(values: List[float]) -> Optional[float]: - if len(values) == 0: - return None - return min(values) - - -def _max(values: List[float]) -> Optional[float]: - if len(values) == 0: - return None - return max(values) - - -def _mean(values: List[float]) -> Optional[float]: - if len(values) == 0: - return None - return sum(values) / len(values) - - -def _median(values: List[float]) -> Optional[float]: - if len(values) == 0: - return None - return sorted(values)[len(values) // 2] - - -def _std(values: List[float]) -> Optional[float]: - mean_value = _mean(values) - if mean_value is None: - return None - return (sum((x - mean_value) ** 2 for x in values) / len(values)) ** 0.5 - - -AGGREGATION_FUNCTIONS = { - "min": _min, - "max": _max, - "mean": _mean, - "median": _median, - "std": _std, -} - - -def resolve_agg_function(name: str): - if name in AGGREGATION_FUNCTIONS: - return AGGREGATION_FUNCTIONS[name] - else: - try: - return resolve_target(name) - except InstantiationException: - try: - return resolve_target(f"builtins.{name}") - except InstantiationException: - raise ImportError(f"Cannot resolve aggregation function: {name}") - - -class DocumentStatistic(DocumentMetric): - """A special type of metric that collects statistics from a document. 
- - Usage: - - ```python - from transformers import AutoTokenizer, PreTrainedTokenizer - from pytorch_ie import DatasetDict - from pytorch_ie.core import Document, DocumentStatistic - - class TokenCountCollector(DocumentStatistic): - - def __init__( - self, - tokenizer: Union[str, PreTrainedTokenizer], - text_field: str, - tokenizer_kwargs: Optional[Dict[str, Any]] = None, - **kwargs, - ): - super().__init__(**kwargs) - self.tokenizer = ( - AutoTokenizer.from_pretrained(tokenizer) if isinstance(tokenizer, str) else tokenizer - ) - self.tokenizer_kwargs = tokenizer_kwargs or {} - self.text_field = text_field - - def _collect(self, doc: Document) -> int: - text = getattr(doc, self.text_field) - encodings = self.tokenizer(text, **self.tokenizer_kwargs) - tokens = encodings.tokens() - return len(tokens) - - dataset = DatasetDict.load_dataset("pie/conll2003") - statistic = TokenCountCollector( - text_field="text", - tokenizer="bert-base-uncased", - tokenizer_kwargs=dict(add_special_tokens=False), - ) - values = statistic(dataset) - assert values == { - 'train': {'mean': 17.950502100989958, 'std': 13.016237876955675, 'min': 1, 'max': 162}, - 'validation': {'mean': 19.368307692307692, 'std': 14.583363922289669, 'min': 1, 'max': 144}, - 'test': {'mean': 16.774978279756734, 'std': 13.176981022988947, 'min': 1, 'max': 138} - } - ``` - """ - - DEFAULT_AGGREGATION_FUNCTIONS = ["mean", "std", "min", "max"] - - def __init__( - self, - show_histogram: bool = False, - show_as_markdown: bool = False, - aggregation_functions: Optional[List[str]] = None, - title: Optional[str] = None, - document_type: Optional[Union[Type[Document], str]] = None, - ) -> None: - super().__init__() - self.aggregation_functions = { - f_name: resolve_agg_function(f_name) - for f_name in aggregation_functions or self.DEFAULT_AGGREGATION_FUNCTIONS - } - self.show_histogram = show_histogram - self.show_as_markdown = show_as_markdown - self.title = title or self.__class__.__name__ - self._document_type = resolve_optional_document_type(document_type) - - @property - def document_type(self) -> Optional[Type[Document]]: - return self._document_type or super().document_type - - def reset(self) -> None: - self._values: List[Any] = [] - - @abstractmethod - def _collect(self, doc: Document) -> Any: - """Collect any values from a document.""" - - def _update(self, document: Document) -> None: - values = self._collect(document) - self._values.append(values) - - def _compute(self) -> Any: - """We just integrate the values by creating lists for each leaf of the (nested) - dictionary.""" - stats = defaultdict(list) - for collected_result in self._values: - if isinstance(collected_result, dict): - collected_result_flat = flatten_dict(collected_result) - for k, v in collected_result_flat.items(): - if isinstance(v, list): - stats[k].extend(v) - else: - stats[k].append(v) - else: - if isinstance(collected_result, list): - stats[()].extend(collected_result) - else: - stats[()].append(collected_result) - if self.current_split is not None: - title = f"{self.title} (split: {self.current_split}, {len(self._values)} documents)" - else: - title = f"{self.title} ({len(self._values)} documents)" - if self.show_histogram: - import plotext as plt - - for k, values in stats.items(): - if isinstance(values, list): - plt.hist(values, label=".".join(k) if len(k) > 0 else None) - plt.title(title) - plt.show() - plt.clear_figure() - - aggregated_stats = {} - for k, v in stats.items(): - for f_name, f in self.aggregation_functions.items(): - aggregated_stats[k + 
(f_name,)] = f(v) - - if self.show_as_markdown: - import pandas as pd - - series = pd.Series(aggregated_stats) - if len(series.index.levels) > 1: - df = series.unstack(-1) - logger.info(f"{title}\n{df.round(3).to_markdown()}") - else: - series.index = series.index.get_level_values(0) - logger.info(f"{title}\n{series.round(3).to_markdown()}") - - return unflatten_dict(aggregated_stats) diff --git a/src/pie_datasets/statistics.py b/src/pie_datasets/statistics.py index 549a4fef..3c0c8850 100644 --- a/src/pie_datasets/statistics.py +++ b/src/pie_datasets/statistics.py @@ -3,13 +3,12 @@ from typing import Any, Dict, List, Optional, Type, Union from pytorch_ie.annotations import Span -from pytorch_ie.core import Document +from pytorch_ie.core import Document, DocumentStatistic from pytorch_ie.documents import TextBasedDocument, TokenBasedDocument from pytorch_ie.utils.hydra import resolve_optional_document_type from transformers import AutoTokenizer, PreTrainedTokenizer from pie_datasets.document.conversion import tokenize_document -from pie_datasets.statistic import DocumentStatistic logger = logging.getLogger(__name__) From ac54515385e040bf3da7beee8efa16d2f975de13 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Tue, 7 Nov 2023 23:59:51 +0100 Subject: [PATCH 04/10] make mypy happy --- src/pie_datasets/statistics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pie_datasets/statistics.py b/src/pie_datasets/statistics.py index 3c0c8850..0a1ae0e4 100644 --- a/src/pie_datasets/statistics.py +++ b/src/pie_datasets/statistics.py @@ -1,6 +1,6 @@ import logging from collections import defaultdict -from typing import Any, Dict, List, Optional, Type, Union +from typing import Any, Callable, Dict, List, Optional, Type, Union from pytorch_ie.annotations import Span from pytorch_ie.core import Document, DocumentStatistic @@ -110,7 +110,7 @@ def __init__( f"this collector, but be aware that the results may be wrong for your own aggregation " f"functions that rely on zero values." ) - self.aggregation_functions = { + self.aggregation_functions: Dict[str, Callable[[List], Any]] = { name: func for name, func in self.aggregation_functions.items() if name not in ["mean", "std", "min"] @@ -226,7 +226,7 @@ def __init__( f"this collector, but be aware that the results may be wrong for your own aggregation " f"functions that rely on zero values." 
) - self.aggregation_functions = { + self.aggregation_functions: Dict[str, Callable[[List], Any]] = { name: func for name, func in self.aggregation_functions.items() if name not in ["mean", "std", "min"] From b84e2455a3424065a594bfe28076c9f77148d76c Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Wed, 8 Nov 2023 00:33:57 +0100 Subject: [PATCH 05/10] use branch "remove_datasets" of pytorch-ie --- poetry.lock | 275 +++---------------------------------------------- pyproject.toml | 3 +- 2 files changed, 17 insertions(+), 261 deletions(-) diff --git a/poetry.lock b/poetry.lock index 3c4aa10d..1f16e66e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -361,62 +361,6 @@ tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.1 [package.extras] toml = ["tomli"] -[[package]] -name = "datasets" -version = "2.14.6" -description = "HuggingFace community-driven open-source library of datasets" -optional = false -python-versions = ">=3.8.0" -files = [ - {file = "datasets-2.14.6-py3-none-any.whl", hash = "sha256:4de857ffce21cfc847236745c69f102e33cd1f0fa8398e7be9964525fd4cd5db"}, - {file = "datasets-2.14.6.tar.gz", hash = "sha256:97ebbace8ec7af11434a87d1215379927f8fee2beab2c4a674003756ecfe920c"}, -] - -[package.dependencies] -aiohttp = "*" -dill = ">=0.3.0,<0.3.8" -fsspec = {version = ">=2023.1.0,<=2023.10.0", extras = ["http"]} -huggingface-hub = ">=0.14.0,<1.0.0" -multiprocess = "*" -numpy = ">=1.17" -packaging = "*" -pandas = "*" -pyarrow = ">=8.0.0" -pyyaml = ">=5.1" -requests = ">=2.19.0" -tqdm = ">=4.62.1" -xxhash = "*" - -[package.extras] -apache-beam = ["apache-beam (>=2.26.0,<2.44.0)"] -audio = ["librosa", "soundfile (>=0.12.1)"] -benchmarks = ["tensorflow (==2.12.0)", "torch (==2.0.1)", "transformers (==4.30.1)"] -dev = ["Pillow (>=6.2.1)", "absl-py", "apache-beam (>=2.26.0,<2.44.0)", "black (>=23.1,<24.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.6.4)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "pyyaml (>=5.3.1)", "rarfile (>=4.0)", "ruff (>=0.0.241)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "sqlalchemy (<2.0.0)", "tensorflow (>=2.2.0,!=2.6.0,!=2.6.1)", "tensorflow (>=2.3,!=2.6.0,!=2.6.1)", "tensorflow-macos", "tiktoken", "torch", "transformers", "zstandard"] -docs = ["s3fs", "tensorflow (>=2.2.0,!=2.6.0,!=2.6.1)", "tensorflow-macos", "torch", "transformers"] -jax = ["jax (>=0.2.8,!=0.3.2,<=0.3.25)", "jaxlib (>=0.1.65,<=0.3.25)"] -metrics-tests = ["Werkzeug (>=1.0.1)", "accelerate", "bert-score (>=0.3.6)", "jiwer", "langdetect", "mauve-text", "nltk", "requests-file (>=1.5.1)", "rouge-score", "sacrebleu", "sacremoses", "scikit-learn", "scipy", "sentencepiece", "seqeval", "six (>=1.15.0,<1.16.0)", "spacy (>=3.0.0)", "texttable (>=1.6.3)", "tldextract", "tldextract (>=3.1.0)", "toml (>=0.10.1)", "typer (<0.5.0)"] -quality = ["black (>=23.1,<24.0)", "pyyaml (>=5.3.1)", "ruff (>=0.0.241)"] -s3 = ["s3fs"] -tensorflow = ["tensorflow (>=2.2.0,!=2.6.0,!=2.6.1)", "tensorflow-macos"] -tensorflow-gpu = ["tensorflow-gpu (>=2.2.0,!=2.6.0,!=2.6.1)"] -tests = ["Pillow (>=6.2.1)", "absl-py", "apache-beam (>=2.26.0,<2.44.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.6.4)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "sqlalchemy (<2.0.0)", "tensorflow (>=2.3,!=2.6.0,!=2.6.1)", "tensorflow-macos", "tiktoken", "torch", "transformers", "zstandard"] -torch 
= ["torch"] -vision = ["Pillow (>=6.2.1)"] - -[[package]] -name = "dill" -version = "0.3.7" -description = "serialize all of Python" -optional = false -python-versions = ">=3.7" -files = [ - {file = "dill-0.3.7-py3-none-any.whl", hash = "sha256:76b122c08ef4ce2eedcd4d1abd8e641114bfc6c2867f49f3c41facf65bf19f5e"}, - {file = "dill-0.3.7.tar.gz", hash = "sha256:cc1c8b182eb3013e24bd475ff2e9295af86c1a38eb1aff128dac8962a9ce3c03"}, -] - -[package.extras] -graph = ["objgraph (>=1.7.2)"] - [[package]] name = "distlib" version = "0.3.7" @@ -842,34 +786,6 @@ files = [ {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"}, ] -[[package]] -name = "multiprocess" -version = "0.70.15" -description = "better multiprocessing and multithreading in Python" -optional = false -python-versions = ">=3.7" -files = [ - {file = "multiprocess-0.70.15-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:aa36c7ed16f508091438687fe9baa393a7a8e206731d321e443745e743a0d4e5"}, - {file = "multiprocess-0.70.15-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:20e024018c46d0d1602024c613007ac948f9754659e3853b0aa705e83f6931d8"}, - {file = "multiprocess-0.70.15-pp37-pypy37_pp73-manylinux_2_24_i686.whl", hash = "sha256:e576062981c91f0fe8a463c3d52506e598dfc51320a8dd8d78b987dfca91c5db"}, - {file = "multiprocess-0.70.15-pp37-pypy37_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:e73f497e6696a0f5433ada2b3d599ae733b87a6e8b008e387c62ac9127add177"}, - {file = "multiprocess-0.70.15-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:73db2e7b32dcc7f9b0f075c2ffa45c90b6729d3f1805f27e88534c8d321a1be5"}, - {file = "multiprocess-0.70.15-pp38-pypy38_pp73-manylinux_2_24_i686.whl", hash = "sha256:4271647bd8a49c28ecd6eb56a7fdbd3c212c45529ad5303b40b3c65fc6928e5f"}, - {file = "multiprocess-0.70.15-pp38-pypy38_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:cf981fb998d6ec3208cb14f0cf2e9e80216e834f5d51fd09ebc937c32b960902"}, - {file = "multiprocess-0.70.15-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:18f9f2c7063346d1617bd1684fdcae8d33380ae96b99427260f562e1a1228b67"}, - {file = "multiprocess-0.70.15-pp39-pypy39_pp73-manylinux_2_24_i686.whl", hash = "sha256:0eac53214d664c49a34695e5824872db4006b1a465edd7459a251809c3773370"}, - {file = "multiprocess-0.70.15-pp39-pypy39_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:1a51dd34096db47fb21fa2b839e615b051d51b97af9a67afbcdaa67186b44883"}, - {file = "multiprocess-0.70.15-py310-none-any.whl", hash = "sha256:7dd58e33235e83cf09d625e55cffd7b0f0eede7ee9223cdd666a87624f60c21a"}, - {file = "multiprocess-0.70.15-py311-none-any.whl", hash = "sha256:134f89053d82c9ed3b73edd3a2531eb791e602d4f4156fc92a79259590bd9670"}, - {file = "multiprocess-0.70.15-py37-none-any.whl", hash = "sha256:f7d4a1629bccb433114c3b4885f69eccc200994323c80f6feee73b0edc9199c5"}, - {file = "multiprocess-0.70.15-py38-none-any.whl", hash = "sha256:bee9afba476c91f9ebee7beeee0601face9eff67d822e893f9a893725fbd6316"}, - {file = "multiprocess-0.70.15-py39-none-any.whl", hash = "sha256:3e0953f5d52b4c76f1c973eaf8214554d146f2be5decb48e928e55c7a2d19338"}, - {file = "multiprocess-0.70.15.tar.gz", hash = "sha256:f20eed3036c0ef477b07a4177cf7c1ba520d9a2677870a4f47fe026f0cd6787e"}, -] - -[package.dependencies] -dill = ">=0.3.7" - [[package]] name = "networkx" version = "3.2.1" @@ -1120,54 +1036,6 @@ nodeenv = ">=0.11.1" pyyaml = ">=5.1" virtualenv = ">=20.10.0" -[[package]] -name = "pyarrow" -version = "14.0.0" -description = "Python library for Apache Arrow" -optional = false 
-python-versions = ">=3.8" -files = [ - {file = "pyarrow-14.0.0-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:4fce1db17efbc453080c5b306f021926de7c636456a128328797e574c151f81a"}, - {file = "pyarrow-14.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:28de7c05b4d7a71ec660360639cc9b65ceb1175e0e9d4dfccd879a1545bc38f7"}, - {file = "pyarrow-14.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1541e9209c094e7f4d7b43fdd9de3a8c71d3069cf6fc03b59bf5774042411849"}, - {file = "pyarrow-14.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c05e6c45d303c80e41ab04996430a0251321f70986ed51213903ea7bc0b7efd"}, - {file = "pyarrow-14.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:426ffec63ab9b4dff23dec51be2150e3a4a99eb38e66c10a70e2c48779fe9c9d"}, - {file = "pyarrow-14.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:968844f591902160bd3c9ee240ce8822a3b4e7de731e91daea76ad43fe0ff062"}, - {file = "pyarrow-14.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:dcedbc0b4ea955c530145acfe99e324875c386419a09db150291a24cb01aeb81"}, - {file = "pyarrow-14.0.0-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:97993a12aacc781efad9c92d4545a877e803c4d106d34237ec4ce987bec825a3"}, - {file = "pyarrow-14.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:80225768d94024d59a31320374f5e6abf8899866c958dfb4f4ea8e2d9ec91bde"}, - {file = "pyarrow-14.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b61546977a8bd7e3d0c697ede723341ef4737e761af2239aef6e1db447f97727"}, - {file = "pyarrow-14.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42509e6c93b4a1c8ae8ccd939a43f437097783fe130a1991497a6a1abbba026f"}, - {file = "pyarrow-14.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:3eccce331a1392e46573f2ce849a9ee3c074e0d7008e9be0b44566ac149fd6a1"}, - {file = "pyarrow-14.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:ecc463c45f2b6b36431f5f2025842245e8c15afe4d42072230575785f3bb00c6"}, - {file = "pyarrow-14.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:4362ed90def81640addcd521811dd16a13015f0a8255bec324a41262c1524b6c"}, - {file = "pyarrow-14.0.0-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:2fbb7ab62537782c5ab31aa08db0e1f6de92c2c515fdfc0790128384e919adcb"}, - {file = "pyarrow-14.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ad7095f8f0fe0bfa3d3fca1909b8fa15c70e630b0cc1ff8d35e143f5e2704064"}, - {file = "pyarrow-14.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e6602272fce71c0fb64f266e7cdbe51b93b00c22fc1bb57f2b0cb681c4aeedf4"}, - {file = "pyarrow-14.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5b2b8f87951b08a3e72265c8963da3fe4f737bb81290269037e047dd172aa591"}, - {file = "pyarrow-14.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:a1c9675966662a042caebbaafa1ae7fc26291287ebc3da06aa63ad74c323ec30"}, - {file = "pyarrow-14.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:771079fddc0b4440c41af541dbdebc711a7062c93d3c4764476a9442606977db"}, - {file = "pyarrow-14.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:c4096136318de1c4937370c0c365f949961c371201c396d8cc94a353f342069d"}, - {file = "pyarrow-14.0.0-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:6c94056fb5f0ee0bae2206c3f776881e1db2bd0d133d06805755ae7ac5145349"}, - {file = "pyarrow-14.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:687d0df1e08876b2d24d42abae129742fc655367e3fe6700aa4d79fcf2e3215e"}, - {file = 
"pyarrow-14.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f4054e5ee6c88ca256a67fc8b27f9c59bcd385216346265831d462a6069033f"}, - {file = "pyarrow-14.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:768b962e4c042ab2c96576ca0757935472e220d11af855c7d0be3279d7fced5f"}, - {file = "pyarrow-14.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:77293b1319c7044f68ebfa43db8c929a0a5254ce371f1a0873d343f1460171d0"}, - {file = "pyarrow-14.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d2bc7c53941d85f0133b1bd5a814bca0af213922f50d8a8dc0eed4d9ed477845"}, - {file = "pyarrow-14.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:378955365dd087c285ef4f34ad939d7e551b7715326710e8cd21cfa2ce511bd7"}, - {file = "pyarrow-14.0.0-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:f05e81b4c621e6ad4bcd8f785e3aa1d6c49a935818b809ea6e7bf206a5b1a4e8"}, - {file = "pyarrow-14.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6867f6a8057eaef5a7ac6d27fe5518133f67973c5d4295d79a943458350e7c61"}, - {file = "pyarrow-14.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca54b87c46abdfe027f18f959ca388102bd7326c344838f72244807462d091b2"}, - {file = "pyarrow-14.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:35abf61bd0cc9daca3afc715f6ba74ea83d792fa040025352624204bec66bf6a"}, - {file = "pyarrow-14.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:65c377523b369f7ef1ba02be814e832443bb3b15065010838f02dae5bdc0f53c"}, - {file = "pyarrow-14.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:e8a1e470e4b5f7bda7bede0410291daec55ab69f346d77795d34fd6a45b41579"}, - {file = "pyarrow-14.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:466c1a5a7a4b279cfa363ac34dedd0c3c6af388cec9e6a468ffc095a6627849a"}, - {file = "pyarrow-14.0.0.tar.gz", hash = "sha256:45d3324e1c9871a07de6b4d514ebd73225490963a6dd46c64c465c4b6079fe1e"}, -] - -[package.dependencies] -numpy = ">=1.16.6" - [[package]] name = "pytest" version = "7.4.3" @@ -1227,20 +1095,24 @@ name = "pytorch-ie" version = "0.27.0" description = "State-of-the-art Information Extraction in PyTorch" optional = false -python-versions = ">=3.9,<4.0" -files = [ - {file = "pytorch_ie-0.27.0-py3-none-any.whl", hash = "sha256:d8eec1183d260e2ad13b3aeea10342bd46ef2b3cefb64fafdbddecc91181c14e"}, - {file = "pytorch_ie-0.27.0.tar.gz", hash = "sha256:6711d8afe63c7754e70dc6bf20427f005edd0b0a60d1d670290b4d81068614a4"}, -] +python-versions = "^3.9" +files = [] +develop = false [package.dependencies] -absl-py = ">=1.0.0,<2.0.0" -datasets = ">=2.13,<3.0" +absl-py = "^1.0.0" fsspec = "<2023.9.0" -pytorch-lightning = ">=2,<3" +pandas = "^2.0.0" +pytorch-lightning = "^2" torch = ">=1.10" -torchmetrics = ">=1,<2" -transformers = ">=4.18,<5.0" +torchmetrics = "^1" +transformers = "^4.18" + +[package.source] +type = "git" +url = "https://github.com/ChristophAlt/pytorch-ie.git" +reference = "remove_datasets" +resolved_reference = "d53ee0a1245b45336e6c978abc5744e906a6ac80" [[package]] name = "pytorch-lightning" @@ -1956,123 +1828,6 @@ platformdirs = ">=3.9.1,<4" docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"] test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", 
"setuptools (>=68)", "time-machine (>=2.10)"] -[[package]] -name = "xxhash" -version = "3.4.1" -description = "Python binding for xxHash" -optional = false -python-versions = ">=3.7" -files = [ - {file = "xxhash-3.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:91dbfa55346ad3e18e738742236554531a621042e419b70ad8f3c1d9c7a16e7f"}, - {file = "xxhash-3.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:665a65c2a48a72068fcc4d21721510df5f51f1142541c890491afc80451636d2"}, - {file = "xxhash-3.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb11628470a6004dc71a09fe90c2f459ff03d611376c1debeec2d648f44cb693"}, - {file = "xxhash-3.4.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5bef2a7dc7b4f4beb45a1edbba9b9194c60a43a89598a87f1a0226d183764189"}, - {file = "xxhash-3.4.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c0f7b2d547d72c7eda7aa817acf8791f0146b12b9eba1d4432c531fb0352228"}, - {file = "xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00f2fdef6b41c9db3d2fc0e7f94cb3db86693e5c45d6de09625caad9a469635b"}, - {file = "xxhash-3.4.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:23cfd9ca09acaf07a43e5a695143d9a21bf00f5b49b15c07d5388cadf1f9ce11"}, - {file = "xxhash-3.4.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6a9ff50a3cf88355ca4731682c168049af1ca222d1d2925ef7119c1a78e95b3b"}, - {file = "xxhash-3.4.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:f1d7c69a1e9ca5faa75546fdd267f214f63f52f12692f9b3a2f6467c9e67d5e7"}, - {file = "xxhash-3.4.1-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:672b273040d5d5a6864a36287f3514efcd1d4b1b6a7480f294c4b1d1ee1b8de0"}, - {file = "xxhash-3.4.1-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:4178f78d70e88f1c4a89ff1ffe9f43147185930bb962ee3979dba15f2b1cc799"}, - {file = "xxhash-3.4.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9804b9eb254d4b8cc83ab5a2002128f7d631dd427aa873c8727dba7f1f0d1c2b"}, - {file = "xxhash-3.4.1-cp310-cp310-win32.whl", hash = "sha256:c09c49473212d9c87261d22c74370457cfff5db2ddfc7fd1e35c80c31a8c14ce"}, - {file = "xxhash-3.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:ebbb1616435b4a194ce3466d7247df23499475c7ed4eb2681a1fa42ff766aff6"}, - {file = "xxhash-3.4.1-cp310-cp310-win_arm64.whl", hash = "sha256:25dc66be3db54f8a2d136f695b00cfe88018e59ccff0f3b8f545869f376a8a46"}, - {file = "xxhash-3.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:58c49083801885273e262c0f5bbeac23e520564b8357fbb18fb94ff09d3d3ea5"}, - {file = "xxhash-3.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b526015a973bfbe81e804a586b703f163861da36d186627e27524f5427b0d520"}, - {file = "xxhash-3.4.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:36ad4457644c91a966f6fe137d7467636bdc51a6ce10a1d04f365c70d6a16d7e"}, - {file = "xxhash-3.4.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:248d3e83d119770f96003271fe41e049dd4ae52da2feb8f832b7a20e791d2920"}, - {file = "xxhash-3.4.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2070b6d5bbef5ee031666cf21d4953c16e92c2f8a24a94b5c240f8995ba3b1d0"}, - {file = "xxhash-3.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b2746035f518f0410915e247877f7df43ef3372bf36cfa52cc4bc33e85242641"}, - {file = 
"xxhash-3.4.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2a8ba6181514681c2591840d5632fcf7356ab287d4aff1c8dea20f3c78097088"}, - {file = "xxhash-3.4.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0aac5010869240e95f740de43cd6a05eae180c59edd182ad93bf12ee289484fa"}, - {file = "xxhash-3.4.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4cb11d8debab1626181633d184b2372aaa09825bde709bf927704ed72765bed1"}, - {file = "xxhash-3.4.1-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:b29728cff2c12f3d9f1d940528ee83918d803c0567866e062683f300d1d2eff3"}, - {file = "xxhash-3.4.1-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:a15cbf3a9c40672523bdb6ea97ff74b443406ba0ab9bca10ceccd9546414bd84"}, - {file = "xxhash-3.4.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6e66df260fed01ed8ea790c2913271641c58481e807790d9fca8bfd5a3c13844"}, - {file = "xxhash-3.4.1-cp311-cp311-win32.whl", hash = "sha256:e867f68a8f381ea12858e6d67378c05359d3a53a888913b5f7d35fbf68939d5f"}, - {file = "xxhash-3.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:200a5a3ad9c7c0c02ed1484a1d838b63edcf92ff538770ea07456a3732c577f4"}, - {file = "xxhash-3.4.1-cp311-cp311-win_arm64.whl", hash = "sha256:1d03f1c0d16d24ea032e99f61c552cb2b77d502e545187338bea461fde253583"}, - {file = "xxhash-3.4.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c4bbba9b182697a52bc0c9f8ec0ba1acb914b4937cd4a877ad78a3b3eeabefb3"}, - {file = "xxhash-3.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9fd28a9da300e64e434cfc96567a8387d9a96e824a9be1452a1e7248b7763b78"}, - {file = "xxhash-3.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6066d88c9329ab230e18998daec53d819daeee99d003955c8db6fc4971b45ca3"}, - {file = "xxhash-3.4.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:93805bc3233ad89abf51772f2ed3355097a5dc74e6080de19706fc447da99cd3"}, - {file = "xxhash-3.4.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:64da57d5ed586ebb2ecdde1e997fa37c27fe32fe61a656b77fabbc58e6fbff6e"}, - {file = "xxhash-3.4.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a97322e9a7440bf3c9805cbaac090358b43f650516486746f7fa482672593df"}, - {file = "xxhash-3.4.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bbe750d512982ee7d831838a5dee9e9848f3fb440e4734cca3f298228cc957a6"}, - {file = "xxhash-3.4.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:fd79d4087727daf4d5b8afe594b37d611ab95dc8e29fe1a7517320794837eb7d"}, - {file = "xxhash-3.4.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:743612da4071ff9aa4d055f3f111ae5247342931dedb955268954ef7201a71ff"}, - {file = "xxhash-3.4.1-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:b41edaf05734092f24f48c0958b3c6cbaaa5b7e024880692078c6b1f8247e2fc"}, - {file = "xxhash-3.4.1-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:a90356ead70d715fe64c30cd0969072de1860e56b78adf7c69d954b43e29d9fa"}, - {file = "xxhash-3.4.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ac56eebb364e44c85e1d9e9cc5f6031d78a34f0092fea7fc80478139369a8b4a"}, - {file = "xxhash-3.4.1-cp312-cp312-win32.whl", hash = "sha256:911035345932a153c427107397c1518f8ce456f93c618dd1c5b54ebb22e73747"}, - {file = "xxhash-3.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:f31ce76489f8601cc7b8713201ce94b4bd7b7ce90ba3353dccce7e9e1fee71fa"}, - {file = "xxhash-3.4.1-cp312-cp312-win_arm64.whl", hash = 
"sha256:b5beb1c6a72fdc7584102f42c4d9df232ee018ddf806e8c90906547dfb43b2da"}, - {file = "xxhash-3.4.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:6d42b24d1496deb05dee5a24ed510b16de1d6c866c626c2beb11aebf3be278b9"}, - {file = "xxhash-3.4.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3b685fab18876b14a8f94813fa2ca80cfb5ab6a85d31d5539b7cd749ce9e3624"}, - {file = "xxhash-3.4.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:419ffe34c17ae2df019a4685e8d3934d46b2e0bbe46221ab40b7e04ed9f11137"}, - {file = "xxhash-3.4.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0e041ce5714f95251a88670c114b748bca3bf80cc72400e9f23e6d0d59cf2681"}, - {file = "xxhash-3.4.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc860d887c5cb2f524899fb8338e1bb3d5789f75fac179101920d9afddef284b"}, - {file = "xxhash-3.4.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:312eba88ffe0a05e332e3a6f9788b73883752be63f8588a6dc1261a3eaaaf2b2"}, - {file = "xxhash-3.4.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:e01226b6b6a1ffe4e6bd6d08cfcb3ca708b16f02eb06dd44f3c6e53285f03e4f"}, - {file = "xxhash-3.4.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:9f3025a0d5d8cf406a9313cd0d5789c77433ba2004b1c75439b67678e5136537"}, - {file = "xxhash-3.4.1-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:6d3472fd4afef2a567d5f14411d94060099901cd8ce9788b22b8c6f13c606a93"}, - {file = "xxhash-3.4.1-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:43984c0a92f06cac434ad181f329a1445017c33807b7ae4f033878d860a4b0f2"}, - {file = "xxhash-3.4.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a55e0506fdb09640a82ec4f44171273eeabf6f371a4ec605633adb2837b5d9d5"}, - {file = "xxhash-3.4.1-cp37-cp37m-win32.whl", hash = "sha256:faec30437919555b039a8bdbaba49c013043e8f76c999670aef146d33e05b3a0"}, - {file = "xxhash-3.4.1-cp37-cp37m-win_amd64.whl", hash = "sha256:c9e1b646af61f1fc7083bb7b40536be944f1ac67ef5e360bca2d73430186971a"}, - {file = "xxhash-3.4.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:961d948b7b1c1b6c08484bbce3d489cdf153e4122c3dfb07c2039621243d8795"}, - {file = "xxhash-3.4.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:719a378930504ab159f7b8e20fa2aa1896cde050011af838af7e7e3518dd82de"}, - {file = "xxhash-3.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:74fb5cb9406ccd7c4dd917f16630d2e5e8cbbb02fc2fca4e559b2a47a64f4940"}, - {file = "xxhash-3.4.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5dab508ac39e0ab988039bc7f962c6ad021acd81fd29145962b068df4148c476"}, - {file = "xxhash-3.4.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8c59f3e46e7daf4c589e8e853d700ef6607afa037bfad32c390175da28127e8c"}, - {file = "xxhash-3.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cc07256eff0795e0f642df74ad096f8c5d23fe66bc138b83970b50fc7f7f6c5"}, - {file = "xxhash-3.4.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e9f749999ed80f3955a4af0eb18bb43993f04939350b07b8dd2f44edc98ffee9"}, - {file = "xxhash-3.4.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:7688d7c02149a90a3d46d55b341ab7ad1b4a3f767be2357e211b4e893efbaaf6"}, - {file = "xxhash-3.4.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a8b4977963926f60b0d4f830941c864bed16aa151206c01ad5c531636da5708e"}, - {file = 
"xxhash-3.4.1-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:8106d88da330f6535a58a8195aa463ef5281a9aa23b04af1848ff715c4398fb4"}, - {file = "xxhash-3.4.1-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:4c76a77dbd169450b61c06fd2d5d436189fc8ab7c1571d39265d4822da16df22"}, - {file = "xxhash-3.4.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:11f11357c86d83e53719c592021fd524efa9cf024dc7cb1dfb57bbbd0d8713f2"}, - {file = "xxhash-3.4.1-cp38-cp38-win32.whl", hash = "sha256:0c786a6cd74e8765c6809892a0d45886e7c3dc54de4985b4a5eb8b630f3b8e3b"}, - {file = "xxhash-3.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:aabf37fb8fa27430d50507deeab2ee7b1bcce89910dd10657c38e71fee835594"}, - {file = "xxhash-3.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6127813abc1477f3a83529b6bbcfeddc23162cece76fa69aee8f6a8a97720562"}, - {file = "xxhash-3.4.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ef2e194262f5db16075caea7b3f7f49392242c688412f386d3c7b07c7733a70a"}, - {file = "xxhash-3.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:71be94265b6c6590f0018bbf73759d21a41c6bda20409782d8117e76cd0dfa8b"}, - {file = "xxhash-3.4.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:10e0a619cdd1c0980e25eb04e30fe96cf8f4324758fa497080af9c21a6de573f"}, - {file = "xxhash-3.4.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fa122124d2e3bd36581dd78c0efa5f429f5220313479fb1072858188bc2d5ff1"}, - {file = "xxhash-3.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e17032f5a4fea0a074717fe33477cb5ee723a5f428de7563e75af64bfc1b1e10"}, - {file = "xxhash-3.4.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca7783b20e3e4f3f52f093538895863f21d18598f9a48211ad757680c3bd006f"}, - {file = "xxhash-3.4.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d77d09a1113899fad5f354a1eb4f0a9afcf58cefff51082c8ad643ff890e30cf"}, - {file = "xxhash-3.4.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:21287bcdd299fdc3328cc0fbbdeaa46838a1c05391264e51ddb38a3f5b09611f"}, - {file = "xxhash-3.4.1-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:dfd7a6cc483e20b4ad90224aeb589e64ec0f31e5610ab9957ff4314270b2bf31"}, - {file = "xxhash-3.4.1-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:543c7fcbc02bbb4840ea9915134e14dc3dc15cbd5a30873a7a5bf66039db97ec"}, - {file = "xxhash-3.4.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:fe0a98d990e433013f41827b62be9ab43e3cf18e08b1483fcc343bda0d691182"}, - {file = "xxhash-3.4.1-cp39-cp39-win32.whl", hash = "sha256:b9097af00ebf429cc7c0e7d2fdf28384e4e2e91008130ccda8d5ae653db71e54"}, - {file = "xxhash-3.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:d699b921af0dcde50ab18be76c0d832f803034d80470703700cb7df0fbec2832"}, - {file = "xxhash-3.4.1-cp39-cp39-win_arm64.whl", hash = "sha256:2be491723405e15cc099ade1280133ccfbf6322d2ef568494fb7d07d280e7eee"}, - {file = "xxhash-3.4.1-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:431625fad7ab5649368c4849d2b49a83dc711b1f20e1f7f04955aab86cd307bc"}, - {file = "xxhash-3.4.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc6dbd5fc3c9886a9e041848508b7fb65fd82f94cc793253990f81617b61fe49"}, - {file = "xxhash-3.4.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3ff8dbd0ec97aec842476cb8ccc3e17dd288cd6ce3c8ef38bff83d6eb927817"}, - {file = 
"xxhash-3.4.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ef73a53fe90558a4096e3256752268a8bdc0322f4692ed928b6cd7ce06ad4fe3"}, - {file = "xxhash-3.4.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:450401f42bbd274b519d3d8dcf3c57166913381a3d2664d6609004685039f9d3"}, - {file = "xxhash-3.4.1-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:a162840cf4de8a7cd8720ff3b4417fbc10001eefdd2d21541a8226bb5556e3bb"}, - {file = "xxhash-3.4.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b736a2a2728ba45017cb67785e03125a79d246462dfa892d023b827007412c52"}, - {file = "xxhash-3.4.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d0ae4c2e7698adef58710d6e7a32ff518b66b98854b1c68e70eee504ad061d8"}, - {file = "xxhash-3.4.1-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6322c4291c3ff174dcd104fae41500e75dad12be6f3085d119c2c8a80956c51"}, - {file = "xxhash-3.4.1-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:dd59ed668801c3fae282f8f4edadf6dc7784db6d18139b584b6d9677ddde1b6b"}, - {file = "xxhash-3.4.1-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:92693c487e39523a80474b0394645b393f0ae781d8db3474ccdcead0559ccf45"}, - {file = "xxhash-3.4.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4603a0f642a1e8d7f3ba5c4c25509aca6a9c1cc16f85091004a7028607ead663"}, - {file = "xxhash-3.4.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fa45e8cbfbadb40a920fe9ca40c34b393e0b067082d94006f7f64e70c7490a6"}, - {file = "xxhash-3.4.1-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:595b252943b3552de491ff51e5bb79660f84f033977f88f6ca1605846637b7c6"}, - {file = "xxhash-3.4.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:562d8b8f783c6af969806aaacf95b6c7b776929ae26c0cd941d54644ea7ef51e"}, - {file = "xxhash-3.4.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:41ddeae47cf2828335d8d991f2d2b03b0bdc89289dc64349d712ff8ce59d0647"}, - {file = "xxhash-3.4.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c44d584afdf3c4dbb3277e32321d1a7b01d6071c1992524b6543025fb8f4206f"}, - {file = "xxhash-3.4.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd7bddb3a5b86213cc3f2c61500c16945a1b80ecd572f3078ddbbe68f9dabdfb"}, - {file = "xxhash-3.4.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9ecb6c987b62437c2f99c01e97caf8d25660bf541fe79a481d05732e5236719c"}, - {file = "xxhash-3.4.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:696b4e18b7023527d5c50ed0626ac0520edac45a50ec7cf3fc265cd08b1f4c03"}, - {file = "xxhash-3.4.1.tar.gz", hash = "sha256:0379d6cf1ff987cd421609a264ce025e74f346e3e145dd106c0cc2e3ec3f99a9"}, -] - [[package]] name = "yarl" version = "1.9.2" @@ -2163,4 +1918,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "05fdb17a8a21088696573d3a3356c1b815446af17e6589c28c11eb80d2e8788e" +content-hash = "b51eeb301b26a29c59cef01b4ff3877cf7e67c59fa40a48179be633908ee5e62" diff --git a/pyproject.toml b/pyproject.toml index ae4f83ee..6036c7dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,8 @@ classifiers = [ [tool.poetry.dependencies] python = "^3.9" -pytorch-ie = ">=0.27.0,<0.28.0" +#pytorch-ie = ">=0.28.0,<0.29.0" 
+pytorch-ie = { git = "https://github.com/ChristophAlt/pytorch-ie.git", branch = "remove_datasets" } [tool.poetry.group.dev.dependencies] torch = {version = "^2.1.0+cpu", source = "pytorch"} From 818ce8bce067b3903d7d7ee1f8925027e0124dd6 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Wed, 8 Nov 2023 00:53:35 +0100 Subject: [PATCH 06/10] fix: use branch "remove_datasets" of pytorch-ie --- poetry.lock | 253 ++++++++++++++++++++++++++++++++++++++++++++++++- pyproject.toml | 1 + 2 files changed, 252 insertions(+), 2 deletions(-) diff --git a/poetry.lock b/poetry.lock index 1f16e66e..2ca390cb 100644 --- a/poetry.lock +++ b/poetry.lock @@ -361,6 +361,62 @@ tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.1 [package.extras] toml = ["tomli"] +[[package]] +name = "datasets" +version = "2.14.6" +description = "HuggingFace community-driven open-source library of datasets" +optional = false +python-versions = ">=3.8.0" +files = [ + {file = "datasets-2.14.6-py3-none-any.whl", hash = "sha256:4de857ffce21cfc847236745c69f102e33cd1f0fa8398e7be9964525fd4cd5db"}, + {file = "datasets-2.14.6.tar.gz", hash = "sha256:97ebbace8ec7af11434a87d1215379927f8fee2beab2c4a674003756ecfe920c"}, +] + +[package.dependencies] +aiohttp = "*" +dill = ">=0.3.0,<0.3.8" +fsspec = {version = ">=2023.1.0,<=2023.10.0", extras = ["http"]} +huggingface-hub = ">=0.14.0,<1.0.0" +multiprocess = "*" +numpy = ">=1.17" +packaging = "*" +pandas = "*" +pyarrow = ">=8.0.0" +pyyaml = ">=5.1" +requests = ">=2.19.0" +tqdm = ">=4.62.1" +xxhash = "*" + +[package.extras] +apache-beam = ["apache-beam (>=2.26.0,<2.44.0)"] +audio = ["librosa", "soundfile (>=0.12.1)"] +benchmarks = ["tensorflow (==2.12.0)", "torch (==2.0.1)", "transformers (==4.30.1)"] +dev = ["Pillow (>=6.2.1)", "absl-py", "apache-beam (>=2.26.0,<2.44.0)", "black (>=23.1,<24.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.6.4)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "pyyaml (>=5.3.1)", "rarfile (>=4.0)", "ruff (>=0.0.241)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "sqlalchemy (<2.0.0)", "tensorflow (>=2.2.0,!=2.6.0,!=2.6.1)", "tensorflow (>=2.3,!=2.6.0,!=2.6.1)", "tensorflow-macos", "tiktoken", "torch", "transformers", "zstandard"] +docs = ["s3fs", "tensorflow (>=2.2.0,!=2.6.0,!=2.6.1)", "tensorflow-macos", "torch", "transformers"] +jax = ["jax (>=0.2.8,!=0.3.2,<=0.3.25)", "jaxlib (>=0.1.65,<=0.3.25)"] +metrics-tests = ["Werkzeug (>=1.0.1)", "accelerate", "bert-score (>=0.3.6)", "jiwer", "langdetect", "mauve-text", "nltk", "requests-file (>=1.5.1)", "rouge-score", "sacrebleu", "sacremoses", "scikit-learn", "scipy", "sentencepiece", "seqeval", "six (>=1.15.0,<1.16.0)", "spacy (>=3.0.0)", "texttable (>=1.6.3)", "tldextract", "tldextract (>=3.1.0)", "toml (>=0.10.1)", "typer (<0.5.0)"] +quality = ["black (>=23.1,<24.0)", "pyyaml (>=5.3.1)", "ruff (>=0.0.241)"] +s3 = ["s3fs"] +tensorflow = ["tensorflow (>=2.2.0,!=2.6.0,!=2.6.1)", "tensorflow-macos"] +tensorflow-gpu = ["tensorflow-gpu (>=2.2.0,!=2.6.0,!=2.6.1)"] +tests = ["Pillow (>=6.2.1)", "absl-py", "apache-beam (>=2.26.0,<2.44.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.6.4)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "sqlalchemy (<2.0.0)", "tensorflow (>=2.3,!=2.6.0,!=2.6.1)", "tensorflow-macos", "tiktoken", "torch", "transformers", "zstandard"] +torch = 
["torch"] +vision = ["Pillow (>=6.2.1)"] + +[[package]] +name = "dill" +version = "0.3.7" +description = "serialize all of Python" +optional = false +python-versions = ">=3.7" +files = [ + {file = "dill-0.3.7-py3-none-any.whl", hash = "sha256:76b122c08ef4ce2eedcd4d1abd8e641114bfc6c2867f49f3c41facf65bf19f5e"}, + {file = "dill-0.3.7.tar.gz", hash = "sha256:cc1c8b182eb3013e24bd475ff2e9295af86c1a38eb1aff128dac8962a9ce3c03"}, +] + +[package.extras] +graph = ["objgraph (>=1.7.2)"] + [[package]] name = "distlib" version = "0.3.7" @@ -786,6 +842,34 @@ files = [ {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"}, ] +[[package]] +name = "multiprocess" +version = "0.70.15" +description = "better multiprocessing and multithreading in Python" +optional = false +python-versions = ">=3.7" +files = [ + {file = "multiprocess-0.70.15-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:aa36c7ed16f508091438687fe9baa393a7a8e206731d321e443745e743a0d4e5"}, + {file = "multiprocess-0.70.15-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:20e024018c46d0d1602024c613007ac948f9754659e3853b0aa705e83f6931d8"}, + {file = "multiprocess-0.70.15-pp37-pypy37_pp73-manylinux_2_24_i686.whl", hash = "sha256:e576062981c91f0fe8a463c3d52506e598dfc51320a8dd8d78b987dfca91c5db"}, + {file = "multiprocess-0.70.15-pp37-pypy37_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:e73f497e6696a0f5433ada2b3d599ae733b87a6e8b008e387c62ac9127add177"}, + {file = "multiprocess-0.70.15-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:73db2e7b32dcc7f9b0f075c2ffa45c90b6729d3f1805f27e88534c8d321a1be5"}, + {file = "multiprocess-0.70.15-pp38-pypy38_pp73-manylinux_2_24_i686.whl", hash = "sha256:4271647bd8a49c28ecd6eb56a7fdbd3c212c45529ad5303b40b3c65fc6928e5f"}, + {file = "multiprocess-0.70.15-pp38-pypy38_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:cf981fb998d6ec3208cb14f0cf2e9e80216e834f5d51fd09ebc937c32b960902"}, + {file = "multiprocess-0.70.15-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:18f9f2c7063346d1617bd1684fdcae8d33380ae96b99427260f562e1a1228b67"}, + {file = "multiprocess-0.70.15-pp39-pypy39_pp73-manylinux_2_24_i686.whl", hash = "sha256:0eac53214d664c49a34695e5824872db4006b1a465edd7459a251809c3773370"}, + {file = "multiprocess-0.70.15-pp39-pypy39_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:1a51dd34096db47fb21fa2b839e615b051d51b97af9a67afbcdaa67186b44883"}, + {file = "multiprocess-0.70.15-py310-none-any.whl", hash = "sha256:7dd58e33235e83cf09d625e55cffd7b0f0eede7ee9223cdd666a87624f60c21a"}, + {file = "multiprocess-0.70.15-py311-none-any.whl", hash = "sha256:134f89053d82c9ed3b73edd3a2531eb791e602d4f4156fc92a79259590bd9670"}, + {file = "multiprocess-0.70.15-py37-none-any.whl", hash = "sha256:f7d4a1629bccb433114c3b4885f69eccc200994323c80f6feee73b0edc9199c5"}, + {file = "multiprocess-0.70.15-py38-none-any.whl", hash = "sha256:bee9afba476c91f9ebee7beeee0601face9eff67d822e893f9a893725fbd6316"}, + {file = "multiprocess-0.70.15-py39-none-any.whl", hash = "sha256:3e0953f5d52b4c76f1c973eaf8214554d146f2be5decb48e928e55c7a2d19338"}, + {file = "multiprocess-0.70.15.tar.gz", hash = "sha256:f20eed3036c0ef477b07a4177cf7c1ba520d9a2677870a4f47fe026f0cd6787e"}, +] + +[package.dependencies] +dill = ">=0.3.7" + [[package]] name = "networkx" version = "3.2.1" @@ -1036,6 +1120,54 @@ nodeenv = ">=0.11.1" pyyaml = ">=5.1" virtualenv = ">=20.10.0" +[[package]] +name = "pyarrow" +version = "14.0.0" +description = "Python library for Apache Arrow" +optional = false 
+python-versions = ">=3.8" +files = [ + {file = "pyarrow-14.0.0-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:4fce1db17efbc453080c5b306f021926de7c636456a128328797e574c151f81a"}, + {file = "pyarrow-14.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:28de7c05b4d7a71ec660360639cc9b65ceb1175e0e9d4dfccd879a1545bc38f7"}, + {file = "pyarrow-14.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1541e9209c094e7f4d7b43fdd9de3a8c71d3069cf6fc03b59bf5774042411849"}, + {file = "pyarrow-14.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c05e6c45d303c80e41ab04996430a0251321f70986ed51213903ea7bc0b7efd"}, + {file = "pyarrow-14.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:426ffec63ab9b4dff23dec51be2150e3a4a99eb38e66c10a70e2c48779fe9c9d"}, + {file = "pyarrow-14.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:968844f591902160bd3c9ee240ce8822a3b4e7de731e91daea76ad43fe0ff062"}, + {file = "pyarrow-14.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:dcedbc0b4ea955c530145acfe99e324875c386419a09db150291a24cb01aeb81"}, + {file = "pyarrow-14.0.0-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:97993a12aacc781efad9c92d4545a877e803c4d106d34237ec4ce987bec825a3"}, + {file = "pyarrow-14.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:80225768d94024d59a31320374f5e6abf8899866c958dfb4f4ea8e2d9ec91bde"}, + {file = "pyarrow-14.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b61546977a8bd7e3d0c697ede723341ef4737e761af2239aef6e1db447f97727"}, + {file = "pyarrow-14.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42509e6c93b4a1c8ae8ccd939a43f437097783fe130a1991497a6a1abbba026f"}, + {file = "pyarrow-14.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:3eccce331a1392e46573f2ce849a9ee3c074e0d7008e9be0b44566ac149fd6a1"}, + {file = "pyarrow-14.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:ecc463c45f2b6b36431f5f2025842245e8c15afe4d42072230575785f3bb00c6"}, + {file = "pyarrow-14.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:4362ed90def81640addcd521811dd16a13015f0a8255bec324a41262c1524b6c"}, + {file = "pyarrow-14.0.0-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:2fbb7ab62537782c5ab31aa08db0e1f6de92c2c515fdfc0790128384e919adcb"}, + {file = "pyarrow-14.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ad7095f8f0fe0bfa3d3fca1909b8fa15c70e630b0cc1ff8d35e143f5e2704064"}, + {file = "pyarrow-14.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e6602272fce71c0fb64f266e7cdbe51b93b00c22fc1bb57f2b0cb681c4aeedf4"}, + {file = "pyarrow-14.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5b2b8f87951b08a3e72265c8963da3fe4f737bb81290269037e047dd172aa591"}, + {file = "pyarrow-14.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:a1c9675966662a042caebbaafa1ae7fc26291287ebc3da06aa63ad74c323ec30"}, + {file = "pyarrow-14.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:771079fddc0b4440c41af541dbdebc711a7062c93d3c4764476a9442606977db"}, + {file = "pyarrow-14.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:c4096136318de1c4937370c0c365f949961c371201c396d8cc94a353f342069d"}, + {file = "pyarrow-14.0.0-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:6c94056fb5f0ee0bae2206c3f776881e1db2bd0d133d06805755ae7ac5145349"}, + {file = "pyarrow-14.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:687d0df1e08876b2d24d42abae129742fc655367e3fe6700aa4d79fcf2e3215e"}, + {file = 
"pyarrow-14.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f4054e5ee6c88ca256a67fc8b27f9c59bcd385216346265831d462a6069033f"}, + {file = "pyarrow-14.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:768b962e4c042ab2c96576ca0757935472e220d11af855c7d0be3279d7fced5f"}, + {file = "pyarrow-14.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:77293b1319c7044f68ebfa43db8c929a0a5254ce371f1a0873d343f1460171d0"}, + {file = "pyarrow-14.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d2bc7c53941d85f0133b1bd5a814bca0af213922f50d8a8dc0eed4d9ed477845"}, + {file = "pyarrow-14.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:378955365dd087c285ef4f34ad939d7e551b7715326710e8cd21cfa2ce511bd7"}, + {file = "pyarrow-14.0.0-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:f05e81b4c621e6ad4bcd8f785e3aa1d6c49a935818b809ea6e7bf206a5b1a4e8"}, + {file = "pyarrow-14.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6867f6a8057eaef5a7ac6d27fe5518133f67973c5d4295d79a943458350e7c61"}, + {file = "pyarrow-14.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca54b87c46abdfe027f18f959ca388102bd7326c344838f72244807462d091b2"}, + {file = "pyarrow-14.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:35abf61bd0cc9daca3afc715f6ba74ea83d792fa040025352624204bec66bf6a"}, + {file = "pyarrow-14.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:65c377523b369f7ef1ba02be814e832443bb3b15065010838f02dae5bdc0f53c"}, + {file = "pyarrow-14.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:e8a1e470e4b5f7bda7bede0410291daec55ab69f346d77795d34fd6a45b41579"}, + {file = "pyarrow-14.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:466c1a5a7a4b279cfa363ac34dedd0c3c6af388cec9e6a468ffc095a6627849a"}, + {file = "pyarrow-14.0.0.tar.gz", hash = "sha256:45d3324e1c9871a07de6b4d514ebd73225490963a6dd46c64c465c4b6079fe1e"}, +] + +[package.dependencies] +numpy = ">=1.16.6" + [[package]] name = "pytest" version = "7.4.3" @@ -1112,7 +1244,7 @@ transformers = "^4.18" type = "git" url = "https://github.com/ChristophAlt/pytorch-ie.git" reference = "remove_datasets" -resolved_reference = "d53ee0a1245b45336e6c978abc5744e906a6ac80" +resolved_reference = "f35fa328fe9fb036f1d628d00468e196684d37bc" [[package]] name = "pytorch-lightning" @@ -1828,6 +1960,123 @@ platformdirs = ">=3.9.1,<4" docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"] test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"] +[[package]] +name = "xxhash" +version = "3.4.1" +description = "Python binding for xxHash" +optional = false +python-versions = ">=3.7" +files = [ + {file = "xxhash-3.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:91dbfa55346ad3e18e738742236554531a621042e419b70ad8f3c1d9c7a16e7f"}, + {file = "xxhash-3.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:665a65c2a48a72068fcc4d21721510df5f51f1142541c890491afc80451636d2"}, + {file = "xxhash-3.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb11628470a6004dc71a09fe90c2f459ff03d611376c1debeec2d648f44cb693"}, + {file = 
"xxhash-3.4.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5bef2a7dc7b4f4beb45a1edbba9b9194c60a43a89598a87f1a0226d183764189"}, + {file = "xxhash-3.4.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c0f7b2d547d72c7eda7aa817acf8791f0146b12b9eba1d4432c531fb0352228"}, + {file = "xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00f2fdef6b41c9db3d2fc0e7f94cb3db86693e5c45d6de09625caad9a469635b"}, + {file = "xxhash-3.4.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:23cfd9ca09acaf07a43e5a695143d9a21bf00f5b49b15c07d5388cadf1f9ce11"}, + {file = "xxhash-3.4.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6a9ff50a3cf88355ca4731682c168049af1ca222d1d2925ef7119c1a78e95b3b"}, + {file = "xxhash-3.4.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:f1d7c69a1e9ca5faa75546fdd267f214f63f52f12692f9b3a2f6467c9e67d5e7"}, + {file = "xxhash-3.4.1-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:672b273040d5d5a6864a36287f3514efcd1d4b1b6a7480f294c4b1d1ee1b8de0"}, + {file = "xxhash-3.4.1-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:4178f78d70e88f1c4a89ff1ffe9f43147185930bb962ee3979dba15f2b1cc799"}, + {file = "xxhash-3.4.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9804b9eb254d4b8cc83ab5a2002128f7d631dd427aa873c8727dba7f1f0d1c2b"}, + {file = "xxhash-3.4.1-cp310-cp310-win32.whl", hash = "sha256:c09c49473212d9c87261d22c74370457cfff5db2ddfc7fd1e35c80c31a8c14ce"}, + {file = "xxhash-3.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:ebbb1616435b4a194ce3466d7247df23499475c7ed4eb2681a1fa42ff766aff6"}, + {file = "xxhash-3.4.1-cp310-cp310-win_arm64.whl", hash = "sha256:25dc66be3db54f8a2d136f695b00cfe88018e59ccff0f3b8f545869f376a8a46"}, + {file = "xxhash-3.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:58c49083801885273e262c0f5bbeac23e520564b8357fbb18fb94ff09d3d3ea5"}, + {file = "xxhash-3.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b526015a973bfbe81e804a586b703f163861da36d186627e27524f5427b0d520"}, + {file = "xxhash-3.4.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:36ad4457644c91a966f6fe137d7467636bdc51a6ce10a1d04f365c70d6a16d7e"}, + {file = "xxhash-3.4.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:248d3e83d119770f96003271fe41e049dd4ae52da2feb8f832b7a20e791d2920"}, + {file = "xxhash-3.4.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2070b6d5bbef5ee031666cf21d4953c16e92c2f8a24a94b5c240f8995ba3b1d0"}, + {file = "xxhash-3.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b2746035f518f0410915e247877f7df43ef3372bf36cfa52cc4bc33e85242641"}, + {file = "xxhash-3.4.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2a8ba6181514681c2591840d5632fcf7356ab287d4aff1c8dea20f3c78097088"}, + {file = "xxhash-3.4.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0aac5010869240e95f740de43cd6a05eae180c59edd182ad93bf12ee289484fa"}, + {file = "xxhash-3.4.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4cb11d8debab1626181633d184b2372aaa09825bde709bf927704ed72765bed1"}, + {file = "xxhash-3.4.1-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:b29728cff2c12f3d9f1d940528ee83918d803c0567866e062683f300d1d2eff3"}, + {file = "xxhash-3.4.1-cp311-cp311-musllinux_1_1_s390x.whl", hash = 
"sha256:a15cbf3a9c40672523bdb6ea97ff74b443406ba0ab9bca10ceccd9546414bd84"}, + {file = "xxhash-3.4.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6e66df260fed01ed8ea790c2913271641c58481e807790d9fca8bfd5a3c13844"}, + {file = "xxhash-3.4.1-cp311-cp311-win32.whl", hash = "sha256:e867f68a8f381ea12858e6d67378c05359d3a53a888913b5f7d35fbf68939d5f"}, + {file = "xxhash-3.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:200a5a3ad9c7c0c02ed1484a1d838b63edcf92ff538770ea07456a3732c577f4"}, + {file = "xxhash-3.4.1-cp311-cp311-win_arm64.whl", hash = "sha256:1d03f1c0d16d24ea032e99f61c552cb2b77d502e545187338bea461fde253583"}, + {file = "xxhash-3.4.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c4bbba9b182697a52bc0c9f8ec0ba1acb914b4937cd4a877ad78a3b3eeabefb3"}, + {file = "xxhash-3.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9fd28a9da300e64e434cfc96567a8387d9a96e824a9be1452a1e7248b7763b78"}, + {file = "xxhash-3.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6066d88c9329ab230e18998daec53d819daeee99d003955c8db6fc4971b45ca3"}, + {file = "xxhash-3.4.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:93805bc3233ad89abf51772f2ed3355097a5dc74e6080de19706fc447da99cd3"}, + {file = "xxhash-3.4.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:64da57d5ed586ebb2ecdde1e997fa37c27fe32fe61a656b77fabbc58e6fbff6e"}, + {file = "xxhash-3.4.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a97322e9a7440bf3c9805cbaac090358b43f650516486746f7fa482672593df"}, + {file = "xxhash-3.4.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bbe750d512982ee7d831838a5dee9e9848f3fb440e4734cca3f298228cc957a6"}, + {file = "xxhash-3.4.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:fd79d4087727daf4d5b8afe594b37d611ab95dc8e29fe1a7517320794837eb7d"}, + {file = "xxhash-3.4.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:743612da4071ff9aa4d055f3f111ae5247342931dedb955268954ef7201a71ff"}, + {file = "xxhash-3.4.1-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:b41edaf05734092f24f48c0958b3c6cbaaa5b7e024880692078c6b1f8247e2fc"}, + {file = "xxhash-3.4.1-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:a90356ead70d715fe64c30cd0969072de1860e56b78adf7c69d954b43e29d9fa"}, + {file = "xxhash-3.4.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ac56eebb364e44c85e1d9e9cc5f6031d78a34f0092fea7fc80478139369a8b4a"}, + {file = "xxhash-3.4.1-cp312-cp312-win32.whl", hash = "sha256:911035345932a153c427107397c1518f8ce456f93c618dd1c5b54ebb22e73747"}, + {file = "xxhash-3.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:f31ce76489f8601cc7b8713201ce94b4bd7b7ce90ba3353dccce7e9e1fee71fa"}, + {file = "xxhash-3.4.1-cp312-cp312-win_arm64.whl", hash = "sha256:b5beb1c6a72fdc7584102f42c4d9df232ee018ddf806e8c90906547dfb43b2da"}, + {file = "xxhash-3.4.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:6d42b24d1496deb05dee5a24ed510b16de1d6c866c626c2beb11aebf3be278b9"}, + {file = "xxhash-3.4.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3b685fab18876b14a8f94813fa2ca80cfb5ab6a85d31d5539b7cd749ce9e3624"}, + {file = "xxhash-3.4.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:419ffe34c17ae2df019a4685e8d3934d46b2e0bbe46221ab40b7e04ed9f11137"}, + {file = "xxhash-3.4.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:0e041ce5714f95251a88670c114b748bca3bf80cc72400e9f23e6d0d59cf2681"}, + {file = "xxhash-3.4.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc860d887c5cb2f524899fb8338e1bb3d5789f75fac179101920d9afddef284b"}, + {file = "xxhash-3.4.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:312eba88ffe0a05e332e3a6f9788b73883752be63f8588a6dc1261a3eaaaf2b2"}, + {file = "xxhash-3.4.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:e01226b6b6a1ffe4e6bd6d08cfcb3ca708b16f02eb06dd44f3c6e53285f03e4f"}, + {file = "xxhash-3.4.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:9f3025a0d5d8cf406a9313cd0d5789c77433ba2004b1c75439b67678e5136537"}, + {file = "xxhash-3.4.1-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:6d3472fd4afef2a567d5f14411d94060099901cd8ce9788b22b8c6f13c606a93"}, + {file = "xxhash-3.4.1-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:43984c0a92f06cac434ad181f329a1445017c33807b7ae4f033878d860a4b0f2"}, + {file = "xxhash-3.4.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a55e0506fdb09640a82ec4f44171273eeabf6f371a4ec605633adb2837b5d9d5"}, + {file = "xxhash-3.4.1-cp37-cp37m-win32.whl", hash = "sha256:faec30437919555b039a8bdbaba49c013043e8f76c999670aef146d33e05b3a0"}, + {file = "xxhash-3.4.1-cp37-cp37m-win_amd64.whl", hash = "sha256:c9e1b646af61f1fc7083bb7b40536be944f1ac67ef5e360bca2d73430186971a"}, + {file = "xxhash-3.4.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:961d948b7b1c1b6c08484bbce3d489cdf153e4122c3dfb07c2039621243d8795"}, + {file = "xxhash-3.4.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:719a378930504ab159f7b8e20fa2aa1896cde050011af838af7e7e3518dd82de"}, + {file = "xxhash-3.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:74fb5cb9406ccd7c4dd917f16630d2e5e8cbbb02fc2fca4e559b2a47a64f4940"}, + {file = "xxhash-3.4.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5dab508ac39e0ab988039bc7f962c6ad021acd81fd29145962b068df4148c476"}, + {file = "xxhash-3.4.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8c59f3e46e7daf4c589e8e853d700ef6607afa037bfad32c390175da28127e8c"}, + {file = "xxhash-3.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cc07256eff0795e0f642df74ad096f8c5d23fe66bc138b83970b50fc7f7f6c5"}, + {file = "xxhash-3.4.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e9f749999ed80f3955a4af0eb18bb43993f04939350b07b8dd2f44edc98ffee9"}, + {file = "xxhash-3.4.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:7688d7c02149a90a3d46d55b341ab7ad1b4a3f767be2357e211b4e893efbaaf6"}, + {file = "xxhash-3.4.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a8b4977963926f60b0d4f830941c864bed16aa151206c01ad5c531636da5708e"}, + {file = "xxhash-3.4.1-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:8106d88da330f6535a58a8195aa463ef5281a9aa23b04af1848ff715c4398fb4"}, + {file = "xxhash-3.4.1-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:4c76a77dbd169450b61c06fd2d5d436189fc8ab7c1571d39265d4822da16df22"}, + {file = "xxhash-3.4.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:11f11357c86d83e53719c592021fd524efa9cf024dc7cb1dfb57bbbd0d8713f2"}, + {file = "xxhash-3.4.1-cp38-cp38-win32.whl", hash = "sha256:0c786a6cd74e8765c6809892a0d45886e7c3dc54de4985b4a5eb8b630f3b8e3b"}, + {file = "xxhash-3.4.1-cp38-cp38-win_amd64.whl", hash = 
"sha256:aabf37fb8fa27430d50507deeab2ee7b1bcce89910dd10657c38e71fee835594"}, + {file = "xxhash-3.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6127813abc1477f3a83529b6bbcfeddc23162cece76fa69aee8f6a8a97720562"}, + {file = "xxhash-3.4.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ef2e194262f5db16075caea7b3f7f49392242c688412f386d3c7b07c7733a70a"}, + {file = "xxhash-3.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:71be94265b6c6590f0018bbf73759d21a41c6bda20409782d8117e76cd0dfa8b"}, + {file = "xxhash-3.4.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:10e0a619cdd1c0980e25eb04e30fe96cf8f4324758fa497080af9c21a6de573f"}, + {file = "xxhash-3.4.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fa122124d2e3bd36581dd78c0efa5f429f5220313479fb1072858188bc2d5ff1"}, + {file = "xxhash-3.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e17032f5a4fea0a074717fe33477cb5ee723a5f428de7563e75af64bfc1b1e10"}, + {file = "xxhash-3.4.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca7783b20e3e4f3f52f093538895863f21d18598f9a48211ad757680c3bd006f"}, + {file = "xxhash-3.4.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d77d09a1113899fad5f354a1eb4f0a9afcf58cefff51082c8ad643ff890e30cf"}, + {file = "xxhash-3.4.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:21287bcdd299fdc3328cc0fbbdeaa46838a1c05391264e51ddb38a3f5b09611f"}, + {file = "xxhash-3.4.1-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:dfd7a6cc483e20b4ad90224aeb589e64ec0f31e5610ab9957ff4314270b2bf31"}, + {file = "xxhash-3.4.1-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:543c7fcbc02bbb4840ea9915134e14dc3dc15cbd5a30873a7a5bf66039db97ec"}, + {file = "xxhash-3.4.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:fe0a98d990e433013f41827b62be9ab43e3cf18e08b1483fcc343bda0d691182"}, + {file = "xxhash-3.4.1-cp39-cp39-win32.whl", hash = "sha256:b9097af00ebf429cc7c0e7d2fdf28384e4e2e91008130ccda8d5ae653db71e54"}, + {file = "xxhash-3.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:d699b921af0dcde50ab18be76c0d832f803034d80470703700cb7df0fbec2832"}, + {file = "xxhash-3.4.1-cp39-cp39-win_arm64.whl", hash = "sha256:2be491723405e15cc099ade1280133ccfbf6322d2ef568494fb7d07d280e7eee"}, + {file = "xxhash-3.4.1-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:431625fad7ab5649368c4849d2b49a83dc711b1f20e1f7f04955aab86cd307bc"}, + {file = "xxhash-3.4.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc6dbd5fc3c9886a9e041848508b7fb65fd82f94cc793253990f81617b61fe49"}, + {file = "xxhash-3.4.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3ff8dbd0ec97aec842476cb8ccc3e17dd288cd6ce3c8ef38bff83d6eb927817"}, + {file = "xxhash-3.4.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ef73a53fe90558a4096e3256752268a8bdc0322f4692ed928b6cd7ce06ad4fe3"}, + {file = "xxhash-3.4.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:450401f42bbd274b519d3d8dcf3c57166913381a3d2664d6609004685039f9d3"}, + {file = "xxhash-3.4.1-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:a162840cf4de8a7cd8720ff3b4417fbc10001eefdd2d21541a8226bb5556e3bb"}, + {file = "xxhash-3.4.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b736a2a2728ba45017cb67785e03125a79d246462dfa892d023b827007412c52"}, + {file = 
"xxhash-3.4.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d0ae4c2e7698adef58710d6e7a32ff518b66b98854b1c68e70eee504ad061d8"}, + {file = "xxhash-3.4.1-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6322c4291c3ff174dcd104fae41500e75dad12be6f3085d119c2c8a80956c51"}, + {file = "xxhash-3.4.1-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:dd59ed668801c3fae282f8f4edadf6dc7784db6d18139b584b6d9677ddde1b6b"}, + {file = "xxhash-3.4.1-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:92693c487e39523a80474b0394645b393f0ae781d8db3474ccdcead0559ccf45"}, + {file = "xxhash-3.4.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4603a0f642a1e8d7f3ba5c4c25509aca6a9c1cc16f85091004a7028607ead663"}, + {file = "xxhash-3.4.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fa45e8cbfbadb40a920fe9ca40c34b393e0b067082d94006f7f64e70c7490a6"}, + {file = "xxhash-3.4.1-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:595b252943b3552de491ff51e5bb79660f84f033977f88f6ca1605846637b7c6"}, + {file = "xxhash-3.4.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:562d8b8f783c6af969806aaacf95b6c7b776929ae26c0cd941d54644ea7ef51e"}, + {file = "xxhash-3.4.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:41ddeae47cf2828335d8d991f2d2b03b0bdc89289dc64349d712ff8ce59d0647"}, + {file = "xxhash-3.4.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c44d584afdf3c4dbb3277e32321d1a7b01d6071c1992524b6543025fb8f4206f"}, + {file = "xxhash-3.4.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd7bddb3a5b86213cc3f2c61500c16945a1b80ecd572f3078ddbbe68f9dabdfb"}, + {file = "xxhash-3.4.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9ecb6c987b62437c2f99c01e97caf8d25660bf541fe79a481d05732e5236719c"}, + {file = "xxhash-3.4.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:696b4e18b7023527d5c50ed0626ac0520edac45a50ec7cf3fc265cd08b1f4c03"}, + {file = "xxhash-3.4.1.tar.gz", hash = "sha256:0379d6cf1ff987cd421609a264ce025e74f346e3e145dd106c0cc2e3ec3f99a9"}, +] + [[package]] name = "yarl" version = "1.9.2" @@ -1918,4 +2167,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "b51eeb301b26a29c59cef01b4ff3877cf7e67c59fa40a48179be633908ee5e62" +content-hash = "c47c186281af81e139759fa9ef042e6b4e4b39706937212b6d334ddefad7d0c5" diff --git a/pyproject.toml b/pyproject.toml index 6036c7dd..08170474 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,7 @@ classifiers = [ python = "^3.9" #pytorch-ie = ">=0.28.0,<0.29.0" pytorch-ie = { git = "https://github.com/ChristophAlt/pytorch-ie.git", branch = "remove_datasets" } +datasets = "^2" [tool.poetry.group.dev.dependencies] torch = {version = "^2.1.0+cpu", source = "pytorch"} From d2f3a666bc2de39ad5419ac34f7100fce0dc45d1 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Wed, 8 Nov 2023 00:59:23 +0100 Subject: [PATCH 07/10] use Dataset and IterableDataset from here --- src/pie_datasets/document/processing/regex_partitioner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pie_datasets/document/processing/regex_partitioner.py b/src/pie_datasets/document/processing/regex_partitioner.py index e99dcf16..2e12bc9d 100644 --- 
a/src/pie_datasets/document/processing/regex_partitioner.py +++ b/src/pie_datasets/document/processing/regex_partitioner.py @@ -9,7 +9,7 @@ from pytorch_ie.annotations import LabeledSpan from pytorch_ie.documents import TextBasedDocument -from pie_datasets.dataset_dict import Dataset, EnterDatasetMixin, ExitDatasetMixin, IterableDataset +from pie_datasets import Dataset, IterableDataset logger = logging.getLogger(__name__) From 14d7f384dcca741e04512a59fa1a2ae34e953ad0 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Wed, 8 Nov 2023 01:01:49 +0100 Subject: [PATCH 08/10] revert: use branch remove_datasets of pytorch-ie --- poetry.lock | 26 +++++++++++--------------- pyproject.toml | 4 +--- 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/poetry.lock b/poetry.lock index 2ca390cb..3c4aa10d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1227,24 +1227,20 @@ name = "pytorch-ie" version = "0.27.0" description = "State-of-the-art Information Extraction in PyTorch" optional = false -python-versions = "^3.9" -files = [] -develop = false +python-versions = ">=3.9,<4.0" +files = [ + {file = "pytorch_ie-0.27.0-py3-none-any.whl", hash = "sha256:d8eec1183d260e2ad13b3aeea10342bd46ef2b3cefb64fafdbddecc91181c14e"}, + {file = "pytorch_ie-0.27.0.tar.gz", hash = "sha256:6711d8afe63c7754e70dc6bf20427f005edd0b0a60d1d670290b4d81068614a4"}, +] [package.dependencies] -absl-py = "^1.0.0" +absl-py = ">=1.0.0,<2.0.0" +datasets = ">=2.13,<3.0" fsspec = "<2023.9.0" -pandas = "^2.0.0" -pytorch-lightning = "^2" +pytorch-lightning = ">=2,<3" torch = ">=1.10" -torchmetrics = "^1" -transformers = "^4.18" - -[package.source] -type = "git" -url = "https://github.com/ChristophAlt/pytorch-ie.git" -reference = "remove_datasets" -resolved_reference = "f35fa328fe9fb036f1d628d00468e196684d37bc" +torchmetrics = ">=1,<2" +transformers = ">=4.18,<5.0" [[package]] name = "pytorch-lightning" @@ -2167,4 +2163,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "c47c186281af81e139759fa9ef042e6b4e4b39706937212b6d334ddefad7d0c5" +content-hash = "05fdb17a8a21088696573d3a3356c1b815446af17e6589c28c11eb80d2e8788e" diff --git a/pyproject.toml b/pyproject.toml index 08170474..ae4f83ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,9 +24,7 @@ classifiers = [ [tool.poetry.dependencies] python = "^3.9" -#pytorch-ie = ">=0.28.0,<0.29.0" -pytorch-ie = { git = "https://github.com/ChristophAlt/pytorch-ie.git", branch = "remove_datasets" } -datasets = "^2" +pytorch-ie = ">=0.27.0,<0.28.0" [tool.poetry.group.dev.dependencies] torch = {version = "^2.1.0+cpu", source = "pytorch"} From fe751a8f2f2b4838fc787f6dc79cf985debd10af Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Wed, 8 Nov 2023 01:13:56 +0100 Subject: [PATCH 09/10] fix import --- src/pie_datasets/document/processing/regex_partitioner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/pie_datasets/document/processing/regex_partitioner.py b/src/pie_datasets/document/processing/regex_partitioner.py index 2e12bc9d..02e76b74 100644 --- a/src/pie_datasets/document/processing/regex_partitioner.py +++ b/src/pie_datasets/document/processing/regex_partitioner.py @@ -10,6 +10,7 @@ from pytorch_ie.documents import TextBasedDocument from pie_datasets import Dataset, IterableDataset +from pie_datasets.dataset_dict import EnterDatasetMixin, ExitDatasetMixin logger = logging.getLogger(__name__) From 25fe52dc6eb63134ee4919d0edd95de2faafc39c Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Wed, 8 Nov 2023 01:18:25 +0100 Subject: [PATCH 
10/10] remove slow marker from test_statistics_with_tokenize() --- tests/unit/test_statistics.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/test_statistics.py b/tests/unit/test_statistics.py index 85496850..d201dd65 100644 --- a/tests/unit/test_statistics.py +++ b/tests/unit/test_statistics.py @@ -187,7 +187,6 @@ def test_statistics(dataset): } -@pytest.mark.slow def test_statistics_with_tokenize(dataset): statistic = TokenCountCollector( text_field="text",