From 5ef42049790c85bba2100cee9f6cd240961cf623 Mon Sep 17 00:00:00 2001 From: epwalsh Date: Tue, 27 Aug 2024 13:36:08 -0700 Subject: [PATCH 1/2] Add data mixes --- CHANGELOG.md | 2 +- MANIFEST.in | 1 + pyproject.toml | 3 +- src/olmo_core/data/__init__.py | 6 +- src/olmo_core/data/memmap_dataset.py | 49 +- src/olmo_core/data/mixes/OLMoE-mix-0824.txt | 1135 +++++++++++++++++++ src/olmo_core/data/mixes/__init__.py | 58 + src/olmo_core/data/mixes/dolma17.txt | 1057 +++++++++++++++++ src/olmo_core/data/tokenizer.py | 10 +- src/scripts/train/OLMo-7B.py | 11 +- src/test/data/mixes_test.py | 20 + 11 files changed, 2331 insertions(+), 21 deletions(-) create mode 100644 MANIFEST.in create mode 100644 src/olmo_core/data/mixes/OLMoE-mix-0824.txt create mode 100644 src/olmo_core/data/mixes/__init__.py create mode 100644 src/olmo_core/data/mixes/dolma17.txt create mode 100644 src/test/data/mixes_test.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 0baa0f52..e9196587 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Added support for unsharding model state into `safetensors` format with `olmo_core.distributed.checkpoint.unshard_checkpoint(..., use_safetensors=True)`. -- Added `data.TokenizerConfig` config class and `data.TokenizerNames` enumeration. +- Added `data.TokenizerConfig` config class and `data.TokenizerName` enumeration. ## [v1.0.1](https://github.com/allenai/OLMo-core/releases/tag/v1.0.1) - 2024-08-26 diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..01c8d0fc --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include src/olmo_core/data/mixes/*.txt diff --git a/pyproject.toml b/pyproject.toml index 5cc90f34..5e99e6f1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ dependencies = [ "rich", "omegaconf", "safetensors", + "importlib_resources", ] [project.urls] @@ -65,7 +66,7 @@ all = [ include-package-data = true [tool.setuptools.package-data] -olmo_core = ["py.typed"] +olmo_core = ["py.typed", "*.txt"] [tool.setuptools.dynamic] version = { attr = "olmo_core.version.VERSION" } diff --git a/src/olmo_core/data/__init__.py b/src/olmo_core/data/__init__.py index e5abb8b7..ce1a6b36 100644 --- a/src/olmo_core/data/__init__.py +++ b/src/olmo_core/data/__init__.py @@ -1,14 +1,16 @@ from .collator import DataCollator, PaddingDirection from .iterable_dataset import IterableDataset from .memmap_dataset import MemMapDataset, MemMapDatasetConfig, MemMapDType -from .tokenizer import TokenizerConfig, TokenizerNames +from .mixes import DataMix +from .tokenizer import TokenizerConfig, TokenizerName __all__ = [ "MemMapDatasetConfig", "MemMapDataset", "MemMapDType", "TokenizerConfig", - "TokenizerNames", + "TokenizerName", + "DataMix", "DataCollator", "PaddingDirection", "IterableDataset", diff --git a/src/olmo_core/data/memmap_dataset.py b/src/olmo_core/data/memmap_dataset.py index 1b1940a7..584cd0e4 100644 --- a/src/olmo_core/data/memmap_dataset.py +++ b/src/olmo_core/data/memmap_dataset.py @@ -9,12 +9,13 @@ import torch from torch.utils.data import Dataset -from olmo_core.exceptions import OLMoEnvironmentError +from olmo_core.exceptions import OLMoConfigurationError, OLMoEnvironmentError from ..aliases import PathOrStr from ..config import Config, StrEnum from ..io import _get_s3_client, file_size, get_bytes_range from ..utils import get_document_lengths +from .mixes import DataMix from .tokenizer import TokenizerConfig __all__ = ["MemMapDatasetConfig", "MemMapDataset"] @@ -41,9 +42,10 @@ class MemMapDatasetConfig(Config): A config class for easily building :class:`MemMapDataset` classes. """ - paths: List[str] sequence_length: int tokenizer: TokenizerConfig + paths: Optional[List[str]] = None + mix: Optional[DataMix] = None memmap_dtype: Optional[MemMapDType] = None metadata: Optional[List[Dict[str, Any]]] = None include_instance_metadata: bool = True @@ -62,10 +64,28 @@ def glob(cls, *glob_paths: str, **kwargs) -> "MemMapDatasetConfig": If any of the globs don't expand to any matches a :class:`FileNotFoundError` error is raised - :returns: A new config. + :returns: A new dataset config. """ return cls(paths=list(glob_paths), expand_glob=True, **kwargs) + @classmethod + def from_data_mix( + cls, mix: DataMix, *, tokenizer: TokenizerConfig, **kwargs + ) -> "MemMapDatasetConfig": + """ + Initialize a dataset config from an official data mix. + + :param mix: The data mix. + :param tokenizer: The tokenizer config. + + :returns: A new dataset config. + """ + if tokenizer.identifier is None: + raise OLMoConfigurationError( + "Missing tokenizer identifier required to construct data mix" + ) + return cls(mix=mix, tokenizer=tokenizer, **kwargs) + def get_memmap_dtype( self, ) -> Union[Type[np.uint8], Type[np.uint16], Type[np.uint32], Type[np.uint64]]: @@ -85,12 +105,18 @@ def get_memmap_dtype( raise ValueError("vocab size too big!") - def build(self) -> MemMapDataset: + def build(self, mix_base_dir: Optional[str] = None) -> MemMapDataset: """ Construct the corresponding :class:`MemMapDataset`. + + :param mix_base_dir: The base directory for the :data:`mix`, e.g. "s3://ai2-llm". + Required if initializing from a data mix. """ + if (self.paths is None) == (self.mix is None): + raise OLMoConfigurationError("Exactly one of 'paths' or 'mix' is required") + paths: List[str] = [] - if self.expand_glob: + if self.paths and self.expand_glob: from glob import glob for glob_path in self.paths: @@ -101,8 +127,19 @@ def build(self) -> MemMapDataset: for path in matches: log.info(f" - '{path}'") paths.extend(matches) - else: + elif self.paths: paths = self.paths + else: + assert self.mix is not None + if mix_base_dir is None: + raise OLMoConfigurationError( + "'mix_base_dir' is required to build a dataset from a mix" + ) + if self.tokenizer.identifier is None: + raise OLMoConfigurationError( + "Missing tokenizer identifier required to construct data mix" + ) + paths = self.mix.build(mix_base_dir, self.tokenizer.identifier) dataset = MemMapDataset( *paths, diff --git a/src/olmo_core/data/mixes/OLMoE-mix-0824.txt b/src/olmo_core/data/mixes/OLMoE-mix-0824.txt new file mode 100644 index 00000000..638f550e --- /dev/null +++ b/src/olmo_core/data/mixes/OLMoE-mix-0824.txt @@ -0,0 +1,1135 @@ +# ProofPile 2: Algebraic Stack Data +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-00-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-01-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-02-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-03-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-04-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-05-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-06-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-07-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-08-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-09-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-10-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-11-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-12-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-13-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-14-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-15-00000.npy + +# ProofPile 2: Arxiv Data +preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/{TOKENIZER}/part-00-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/{TOKENIZER}/part-01-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/{TOKENIZER}/part-02-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/{TOKENIZER}/part-03-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/{TOKENIZER}/part-04-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/{TOKENIZER}/part-05-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/{TOKENIZER}/part-06-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/{TOKENIZER}/part-07-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/{TOKENIZER}/part-08-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/{TOKENIZER}/part-09-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/{TOKENIZER}/part-10-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/{TOKENIZER}/part-11-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/{TOKENIZER}/part-12-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/{TOKENIZER}/part-13-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/{TOKENIZER}/part-14-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/{TOKENIZER}/part-15-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/{TOKENIZER}/part-16-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/{TOKENIZER}/part-17-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/{TOKENIZER}/part-18-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/{TOKENIZER}/part-19-00000.npy + +# ProofPile 2: Open Web Math Data +preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/{TOKENIZER}/part-00-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/{TOKENIZER}/part-01-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/{TOKENIZER}/part-02-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/{TOKENIZER}/part-03-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/{TOKENIZER}/part-04-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/{TOKENIZER}/part-05-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/{TOKENIZER}/part-06-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/{TOKENIZER}/part-07-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/{TOKENIZER}/part-08-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/{TOKENIZER}/part-09-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/{TOKENIZER}/part-10-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/{TOKENIZER}/part-11-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/{TOKENIZER}/part-12-00000.npy + +# Pes2o Data +preprocessed/pes2o/{TOKENIZER}/part-00-00000.npy +preprocessed/pes2o/{TOKENIZER}/part-01-00000.npy +preprocessed/pes2o/{TOKENIZER}/part-02-00000.npy +preprocessed/pes2o/{TOKENIZER}/part-03-00000.npy +preprocessed/pes2o/{TOKENIZER}/part-04-00000.npy +preprocessed/pes2o/{TOKENIZER}/part-05-00000.npy +preprocessed/pes2o/{TOKENIZER}/part-06-00000.npy +preprocessed/pes2o/{TOKENIZER}/part-07-00000.npy +preprocessed/pes2o/{TOKENIZER}/part-08-00000.npy +preprocessed/pes2o/{TOKENIZER}/part-09-00000.npy +preprocessed/pes2o/{TOKENIZER}/part-10-00000.npy +preprocessed/pes2o/{TOKENIZER}/part-11-00000.npy +preprocessed/pes2o/{TOKENIZER}/part-12-00000.npy +preprocessed/pes2o/{TOKENIZER}/part-13-00000.npy +preprocessed/pes2o/{TOKENIZER}/part-14-00000.npy +preprocessed/pes2o/{TOKENIZER}/part-15-00000.npy +preprocessed/pes2o/{TOKENIZER}/part-16-00000.npy +preprocessed/pes2o/{TOKENIZER}/part-17-00000.npy +preprocessed/pes2o/{TOKENIZER}/part-18-00000.npy +preprocessed/pes2o/{TOKENIZER}/part-19-00000.npy +preprocessed/pes2o/{TOKENIZER}/part-20-00000.npy +preprocessed/pes2o/{TOKENIZER}/part-21-00000.npy +preprocessed/pes2o/{TOKENIZER}/part-22-00000.npy +preprocessed/pes2o/{TOKENIZER}/part-23-00000.npy +preprocessed/pes2o/{TOKENIZER}/part-24-00000.npy +preprocessed/pes2o/{TOKENIZER}/part-25-00000.npy + +# Starcoder Data (fixed!) +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-000-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-001-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-002-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-003-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-004-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-005-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-006-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-007-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-008-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-009-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-010-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-011-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-012-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-013-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-014-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-015-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-016-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-017-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-018-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-019-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-020-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-021-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-022-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-023-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-024-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-025-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-026-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-027-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-028-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-029-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-030-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-031-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-032-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-033-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-034-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-035-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-036-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-037-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-038-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-039-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-040-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-041-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-042-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-043-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-044-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-045-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-046-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-047-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-048-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-049-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-050-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-051-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-052-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-053-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-054-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-055-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-056-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-057-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-058-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-059-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-060-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-061-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-062-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-063-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-064-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-065-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-066-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-067-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-068-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-069-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-070-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-071-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-072-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-073-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-074-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-075-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-076-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-077-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-078-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-079-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-080-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-081-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-082-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-083-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-084-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-085-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-086-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-087-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-088-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-089-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-090-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-091-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-092-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-093-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-094-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-095-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-096-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-097-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-098-00000.npy +preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/{TOKENIZER}/part-099-00000.npy + +# DCLM Data +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-000-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-000-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-000-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-000-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-000-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-001-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-001-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-001-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-001-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-001-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-002-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-002-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-002-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-002-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-002-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-003-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-003-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-003-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-003-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-003-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-004-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-004-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-004-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-004-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-004-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-005-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-005-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-005-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-005-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-005-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-006-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-006-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-006-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-006-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-006-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-007-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-007-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-007-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-007-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-007-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-008-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-008-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-008-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-008-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-008-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-009-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-009-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-009-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-009-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-009-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-010-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-010-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-010-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-010-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-010-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-011-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-011-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-011-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-011-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-011-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-012-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-012-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-012-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-012-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-012-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-013-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-013-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-013-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-013-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-013-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-014-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-014-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-014-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-014-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-014-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-015-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-015-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-015-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-015-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-015-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-016-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-016-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-016-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-016-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-016-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-017-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-017-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-017-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-017-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-017-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-018-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-018-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-018-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-018-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-018-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-019-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-019-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-019-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-019-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-019-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-020-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-020-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-020-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-020-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-020-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-021-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-021-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-021-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-021-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-021-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-022-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-022-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-022-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-022-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-022-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-023-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-023-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-023-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-023-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-023-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-024-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-024-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-024-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-024-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-024-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-025-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-025-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-025-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-025-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-025-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-026-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-026-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-026-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-026-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-026-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-027-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-027-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-027-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-027-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-027-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-028-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-028-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-028-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-028-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-028-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-029-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-029-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-029-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-029-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-029-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-030-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-030-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-030-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-030-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-030-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-031-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-031-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-031-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-031-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-031-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-032-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-032-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-032-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-032-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-032-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-033-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-033-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-033-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-033-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-033-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-034-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-034-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-034-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-034-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-034-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-035-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-035-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-035-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-035-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-035-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-036-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-036-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-036-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-036-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-036-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-037-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-037-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-037-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-037-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-037-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-038-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-038-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-038-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-038-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-038-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-039-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-039-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-039-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-039-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-039-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-040-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-040-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-040-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-040-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-040-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-041-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-041-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-041-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-041-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-041-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-042-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-042-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-042-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-042-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-042-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-043-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-043-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-043-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-043-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-043-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-044-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-044-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-044-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-044-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-044-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-045-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-045-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-045-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-045-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-045-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-046-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-046-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-046-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-046-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-046-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-047-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-047-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-047-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-047-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-047-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-048-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-048-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-048-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-048-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-048-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-049-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-049-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-049-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-049-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-049-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-050-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-050-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-050-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-050-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-050-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-051-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-051-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-051-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-051-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-051-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-052-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-052-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-052-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-052-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-052-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-053-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-053-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-053-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-053-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-053-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-054-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-054-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-054-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-054-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-054-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-055-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-055-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-055-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-055-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-055-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-056-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-056-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-056-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-056-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-056-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-057-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-057-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-057-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-057-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-057-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-058-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-058-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-058-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-058-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-058-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-059-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-059-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-059-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-059-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-059-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-060-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-060-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-060-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-060-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-060-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-061-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-061-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-061-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-061-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-061-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-062-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-062-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-062-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-062-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-062-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-063-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-063-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-063-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-063-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-063-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-064-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-064-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-064-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-064-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-064-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-065-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-065-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-065-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-065-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-065-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-066-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-066-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-066-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-066-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-066-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-067-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-067-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-067-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-067-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-067-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-068-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-068-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-068-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-068-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-068-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-069-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-069-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-069-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-069-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-069-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-070-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-070-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-070-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-070-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-070-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-071-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-071-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-071-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-071-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-071-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-072-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-072-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-072-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-072-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-072-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-073-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-073-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-073-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-073-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-073-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-074-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-074-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-074-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-074-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-074-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-075-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-075-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-075-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-075-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-075-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-076-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-076-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-076-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-076-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-076-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-077-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-077-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-077-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-077-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-077-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-078-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-078-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-078-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-078-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-078-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-079-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-079-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-079-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-079-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-079-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-080-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-080-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-080-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-080-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-080-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-081-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-081-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-081-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-081-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-081-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-082-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-082-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-082-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-082-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-082-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-083-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-083-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-083-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-083-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-083-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-084-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-084-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-084-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-084-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-084-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-085-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-085-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-085-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-085-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-085-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-086-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-086-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-086-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-086-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-086-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-087-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-087-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-087-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-087-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-087-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-088-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-088-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-088-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-088-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-088-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-089-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-089-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-089-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-089-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-089-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-090-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-090-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-090-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-090-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-090-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-091-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-091-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-091-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-091-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-091-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-092-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-092-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-092-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-092-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-092-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-093-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-093-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-093-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-093-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-093-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-094-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-094-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-094-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-094-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-094-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-095-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-095-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-095-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-095-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-095-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-096-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-096-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-096-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-096-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-096-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-097-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-097-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-097-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-097-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-097-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-098-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-098-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-098-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-098-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-098-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-099-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-099-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-099-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-099-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-099-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-100-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-100-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-100-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-100-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-100-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-101-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-101-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-101-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-101-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-101-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-102-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-102-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-102-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-102-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-102-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-103-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-103-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-103-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-103-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-103-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-104-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-104-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-104-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-104-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-104-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-104-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-104-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-104-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-104-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-104-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-105-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-105-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-105-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-105-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-105-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-106-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-106-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-106-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-106-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-106-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-107-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-107-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-107-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-107-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-107-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-108-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-108-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-108-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-108-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-108-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-109-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-109-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-109-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-109-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-109-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-110-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-110-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-110-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-110-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-110-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-111-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-111-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-111-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-111-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-111-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-112-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-112-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-112-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-112-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-112-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-113-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-113-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-113-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-113-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-113-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-114-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-114-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-114-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-114-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-114-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-115-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-115-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-115-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-115-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-115-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-116-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-116-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-116-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-116-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-116-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-117-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-117-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-117-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-117-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-117-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-118-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-118-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-118-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-118-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-118-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-119-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-119-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-119-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-119-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-119-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-120-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-120-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-120-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-120-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-120-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-121-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-121-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-121-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-121-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-121-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-122-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-122-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-122-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-122-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-122-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-123-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-123-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-123-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-123-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-123-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-124-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-124-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-124-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-124-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-124-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-125-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-125-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-125-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-125-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-125-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-126-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-126-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-126-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-126-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-126-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-127-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-127-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-127-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-127-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-127-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-128-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-128-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-128-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-128-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-128-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-129-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-129-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-129-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-129-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-129-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-130-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-130-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-130-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-130-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-130-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-131-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-131-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-131-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-131-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-131-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-132-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-132-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-132-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-132-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-132-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-133-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-133-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-133-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-133-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-133-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-134-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-134-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-134-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-134-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-134-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-135-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-135-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-135-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-135-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-135-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-136-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-136-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-136-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-136-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-136-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-137-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-137-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-137-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-137-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-137-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-138-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-138-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-138-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-138-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-138-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-139-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-139-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-139-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-139-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-139-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-140-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-140-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-140-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-140-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-140-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-141-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-141-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-141-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-141-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-141-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-142-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-142-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-142-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-142-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-142-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-143-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-143-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-143-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-143-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-143-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-144-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-144-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-144-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-144-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-144-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-145-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-145-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-145-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-145-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-145-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-146-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-146-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-146-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-146-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-146-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-147-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-147-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-147-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-147-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-147-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-148-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-148-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-148-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-148-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-148-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-149-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-149-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-149-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-149-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-149-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-150-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-150-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-150-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-150-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-150-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-151-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-151-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-151-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-151-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-151-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-152-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-152-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-152-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-152-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-152-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-153-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-153-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-153-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-153-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-153-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-154-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-154-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-154-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-154-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-154-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-155-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-155-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-155-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-155-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-155-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-156-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-156-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-156-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-156-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-156-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-157-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-157-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-157-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-157-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-157-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-158-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-158-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-158-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-158-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-158-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-159-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-159-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-159-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-159-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-159-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-160-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-160-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-160-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-160-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-160-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-161-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-161-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-161-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-161-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-161-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-162-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-162-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-162-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-162-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-162-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-163-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-163-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-163-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-163-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-163-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-164-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-164-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-164-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-164-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-164-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-165-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-165-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-165-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-165-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-165-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-166-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-166-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-166-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-166-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-166-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-167-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-167-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-167-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-167-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-167-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-168-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-168-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-168-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-168-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-168-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-169-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-169-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-169-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-169-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-169-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-170-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-170-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-170-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-170-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-170-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-171-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-171-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-171-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-171-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-171-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-172-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-172-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-172-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-172-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-172-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-173-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-173-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-173-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-173-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-173-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-174-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-174-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-174-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-174-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-174-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-175-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-175-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-175-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-175-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-175-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-176-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-176-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-176-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-176-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-176-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-177-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-177-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-177-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-177-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-177-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-178-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-178-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-178-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-178-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-178-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-179-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-179-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-179-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-179-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-179-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-180-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-180-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-180-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-180-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-180-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-181-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-181-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-181-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-181-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-181-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-182-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-182-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-182-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-182-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-182-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-183-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-183-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-183-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-183-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-183-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-184-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-184-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-184-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-184-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-184-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-185-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-185-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-185-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-185-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-185-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-186-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-186-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-186-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-186-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-186-00004.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-187-00000.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-187-00001.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-187-00002.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-187-00003.npy +preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/{TOKENIZER}/part-187-00004.npy + +# Wikipedia +preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/{TOKENIZER}/part-0-00000.npy +preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/{TOKENIZER}/part-1-00000.npy diff --git a/src/olmo_core/data/mixes/__init__.py b/src/olmo_core/data/mixes/__init__.py new file mode 100644 index 00000000..dbdfe1b2 --- /dev/null +++ b/src/olmo_core/data/mixes/__init__.py @@ -0,0 +1,58 @@ +import os +from contextlib import contextmanager +from pathlib import Path +from typing import Generator, List + +from olmo_core.config import StrEnum + +from ..tokenizer import TokenizerName + + +class DataMix(StrEnum): + """ + An enumeration of data mix names. + """ + + OLMoE_mix_0824 = "OLMoE-mix-0824" + dolma17 = "dolma17" + + def build(self, base_dir: str, tokenizer: TokenizerName) -> List[str]: + """ + Construct the data mix. + + :param base_dir: Where the mix is stored, e.g. "s3://ai2-llm" or "/weka/oe-training-default/ai2-llm". + :param tokenizer: The tokenizer identifier. + + :returns: A list of paths/URLs to the tokenized numpy data files in the mix. + """ + if not base_dir.endswith("/"): + base_dir = base_dir + "/" + + tokenizer_id: str = tokenizer + if tokenizer == TokenizerName.gpt_neox_olmo_dolma_v1_5: + tokenizer_id = "gpt-neox-olmo-dolma-v1_5" + + paths = [] + with self._get_data_mix_path() as mix_path: + with mix_path.open() as f: + for line in f: + line = line.strip() + if not line or line.startswith("#"): + continue + line = line.replace("{TOKENIZER}", tokenizer_id) + paths.append(f"{base_dir}{line}") + return paths + + @contextmanager + def _get_data_mix_path(self) -> Generator[Path, None, None]: + import importlib_resources + + try: + with importlib_resources.as_file( + importlib_resources.files("olmo_core").joinpath( + f"data/mixes/{os.path.basename(self)}.txt" + ) + ) as path: + yield path + finally: + pass diff --git a/src/olmo_core/data/mixes/dolma17.txt b/src/olmo_core/data/mixes/dolma17.txt new file mode 100644 index 00000000..ee4bd94f --- /dev/null +++ b/src/olmo_core/data/mixes/dolma17.txt @@ -0,0 +1,1057 @@ +######### NON WEB DATA ######### +# ~> GUTENBERG BOOKS (5.256 GT) +preprocessed/olmo-mix/v1_6-decontaminated/books/{TOKENIZER}/part-0-00000.npy +preprocessed/olmo-mix/v1_6-decontaminated/books/{TOKENIZER}/part-1-00000.npy +preprocessed/olmo-mix/v1_6-decontaminated/books/{TOKENIZER}/part-2-00000.npy +# ~> PES2O STEM PAPERS (57.21 GT) +preprocessed/olmo-mix/v1_6-decontaminated/pes2o/{TOKENIZER}/part-00-00000.npy +preprocessed/olmo-mix/v1_6-decontaminated/pes2o/{TOKENIZER}/part-01-00000.npy +preprocessed/olmo-mix/v1_6-decontaminated/pes2o/{TOKENIZER}/part-02-00000.npy +preprocessed/olmo-mix/v1_6-decontaminated/pes2o/{TOKENIZER}/part-03-00000.npy +preprocessed/olmo-mix/v1_6-decontaminated/pes2o/{TOKENIZER}/part-04-00000.npy +preprocessed/olmo-mix/v1_6-decontaminated/pes2o/{TOKENIZER}/part-05-00000.npy +preprocessed/olmo-mix/v1_6-decontaminated/pes2o/{TOKENIZER}/part-06-00000.npy +preprocessed/olmo-mix/v1_6-decontaminated/pes2o/{TOKENIZER}/part-07-00000.npy +preprocessed/olmo-mix/v1_6-decontaminated/pes2o/{TOKENIZER}/part-08-00000.npy +preprocessed/olmo-mix/v1_6-decontaminated/pes2o/{TOKENIZER}/part-09-00000.npy +preprocessed/olmo-mix/v1_6-decontaminated/pes2o/{TOKENIZER}/part-10-00000.npy +preprocessed/olmo-mix/v1_6-decontaminated/pes2o/{TOKENIZER}/part-11-00000.npy +preprocessed/olmo-mix/v1_6-decontaminated/pes2o/{TOKENIZER}/part-12-00000.npy +preprocessed/olmo-mix/v1_6-decontaminated/pes2o/{TOKENIZER}/part-13-00000.npy +preprocessed/olmo-mix/v1_6-decontaminated/pes2o/{TOKENIZER}/part-14-00000.npy +preprocessed/olmo-mix/v1_6-decontaminated/pes2o/{TOKENIZER}/part-15-00000.npy +preprocessed/olmo-mix/v1_6-decontaminated/pes2o/{TOKENIZER}/part-16-00000.npy +preprocessed/olmo-mix/v1_6-decontaminated/pes2o/{TOKENIZER}/part-17-00000.npy +preprocessed/olmo-mix/v1_6-decontaminated/pes2o/{TOKENIZER}/part-18-00000.npy +preprocessed/olmo-mix/v1_6-decontaminated/pes2o/{TOKENIZER}/part-19-00000.npy +preprocessed/olmo-mix/v1_6-decontaminated/pes2o/{TOKENIZER}/part-20-00000.npy +preprocessed/olmo-mix/v1_6-decontaminated/pes2o/{TOKENIZER}/part-21-00000.npy +preprocessed/olmo-mix/v1_6-decontaminated/pes2o/{TOKENIZER}/part-22-00000.npy +preprocessed/olmo-mix/v1_6-decontaminated/pes2o/{TOKENIZER}/part-23-00000.npy +preprocessed/olmo-mix/v1_6-decontaminated/pes2o/{TOKENIZER}/part-24-00000.npy +preprocessed/olmo-mix/v1_6-decontaminated/pes2o/{TOKENIZER}/part-25-00000.npy +# ~> WIKIPEDIA & WIKIBOOKS (3.689 GT) repeated twice to up-sample +preprocessed/olmo-mix/v1_6-decontaminated/wiki/{TOKENIZER}/part-0-00000.npy +preprocessed/olmo-mix/v1_6-decontaminated/wiki/{TOKENIZER}/part-1-00000.npy +preprocessed/olmo-mix/v1_6-decontaminated/wiki/{TOKENIZER}/part-0-00000.npy +preprocessed/olmo-mix/v1_6-decontaminated/wiki/{TOKENIZER}/part-1-00000.npy +# MEGAWIKA v1 (4.6 GT) +preprocessed/megawika/v1/{TOKENIZER}/part-00-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-01-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-01-00001.npy +preprocessed/megawika/v1/{TOKENIZER}/part-02-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-02-00001.npy +preprocessed/megawika/v1/{TOKENIZER}/part-03-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-03-00001.npy +preprocessed/megawika/v1/{TOKENIZER}/part-04-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-04-00001.npy +preprocessed/megawika/v1/{TOKENIZER}/part-05-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-05-00001.npy +preprocessed/megawika/v1/{TOKENIZER}/part-06-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-06-00001.npy +preprocessed/megawika/v1/{TOKENIZER}/part-07-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-07-00001.npy +preprocessed/megawika/v1/{TOKENIZER}/part-08-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-08-00001.npy +preprocessed/megawika/v1/{TOKENIZER}/part-09-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-09-00001.npy +preprocessed/megawika/v1/{TOKENIZER}/part-10-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-10-00001.npy +preprocessed/megawika/v1/{TOKENIZER}/part-11-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-11-00001.npy +preprocessed/megawika/v1/{TOKENIZER}/part-12-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-12-00001.npy +preprocessed/megawika/v1/{TOKENIZER}/part-13-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-13-00001.npy +preprocessed/megawika/v1/{TOKENIZER}/part-14-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-14-00001.npy +preprocessed/megawika/v1/{TOKENIZER}/part-15-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-16-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-16-00001.npy +preprocessed/megawika/v1/{TOKENIZER}/part-17-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-17-00001.npy +preprocessed/megawika/v1/{TOKENIZER}/part-18-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-18-00001.npy +preprocessed/megawika/v1/{TOKENIZER}/part-19-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-19-00001.npy +preprocessed/megawika/v1/{TOKENIZER}/part-20-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-20-00001.npy +preprocessed/megawika/v1/{TOKENIZER}/part-21-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-21-00001.npy +preprocessed/megawika/v1/{TOKENIZER}/part-22-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-22-00001.npy +preprocessed/megawika/v1/{TOKENIZER}/part-23-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-23-00001.npy +preprocessed/megawika/v1/{TOKENIZER}/part-23-00002.npy +preprocessed/megawika/v1/{TOKENIZER}/part-24-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-24-00001.npy +preprocessed/megawika/v1/{TOKENIZER}/part-25-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-25-00001.npy +preprocessed/megawika/v1/{TOKENIZER}/part-26-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-26-00001.npy +preprocessed/megawika/v1/{TOKENIZER}/part-27-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-27-00001.npy +preprocessed/megawika/v1/{TOKENIZER}/part-28-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-29-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-30-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-30-00001.npy +preprocessed/megawika/v1/{TOKENIZER}/part-31-00000.npy +preprocessed/megawika/v1/{TOKENIZER}/part-31-00001.npy +# ~> REDPAJAMA STACK-EXCHANGE (19.63 GT) +preprocessed/redpajama_v1_decon_fix/stackexchange/{TOKENIZER}/part-00-00000.npy +preprocessed/redpajama_v1_decon_fix/stackexchange/{TOKENIZER}/part-01-00000.npy +preprocessed/redpajama_v1_decon_fix/stackexchange/{TOKENIZER}/part-02-00000.npy +preprocessed/redpajama_v1_decon_fix/stackexchange/{TOKENIZER}/part-03-00000.npy +preprocessed/redpajama_v1_decon_fix/stackexchange/{TOKENIZER}/part-04-00000.npy +preprocessed/redpajama_v1_decon_fix/stackexchange/{TOKENIZER}/part-05-00000.npy +preprocessed/redpajama_v1_decon_fix/stackexchange/{TOKENIZER}/part-06-00000.npy +preprocessed/redpajama_v1_decon_fix/stackexchange/{TOKENIZER}/part-07-00000.npy +preprocessed/redpajama_v1_decon_fix/stackexchange/{TOKENIZER}/part-08-00000.npy +preprocessed/redpajama_v1_decon_fix/stackexchange/{TOKENIZER}/part-09-00000.npy +preprocessed/redpajama_v1_decon_fix/stackexchange/{TOKENIZER}/part-10-00000.npy +preprocessed/redpajama_v1_decon_fix/stackexchange/{TOKENIZER}/part-11-00000.npy +preprocessed/redpajama_v1_decon_fix/stackexchange/{TOKENIZER}/part-12-00000.npy +preprocessed/redpajama_v1_decon_fix/stackexchange/{TOKENIZER}/part-13-00000.npy +preprocessed/redpajama_v1_decon_fix/stackexchange/{TOKENIZER}/part-14-00000.npy +preprocessed/redpajama_v1_decon_fix/stackexchange/{TOKENIZER}/part-15-00000.npy +preprocessed/redpajama_v1_decon_fix/stackexchange/{TOKENIZER}/part-16-00000.npy +preprocessed/redpajama_v1_decon_fix/stackexchange/{TOKENIZER}/part-17-00000.npy +preprocessed/redpajama_v1_decon_fix/stackexchange/{TOKENIZER}/part-18-00000.npy +preprocessed/redpajama_v1_decon_fix/stackexchange/{TOKENIZER}/part-19-00000.npy +preprocessed/redpajama_v1_decon_fix/stackexchange/{TOKENIZER}/part-20-00000.npy +preprocessed/redpajama_v1_decon_fix/stackexchange/{TOKENIZER}/part-21-00000.npy +preprocessed/redpajama_v1_decon_fix/stackexchange/{TOKENIZER}/part-22-00000.npy +preprocessed/redpajama_v1_decon_fix/stackexchange/{TOKENIZER}/part-23-00000.npy +preprocessed/redpajama_v1_decon_fix/stackexchange/{TOKENIZER}/part-24-00000.npy +preprocessed/redpajama_v1_decon_fix/stackexchange/{TOKENIZER}/part-25-00000.npy +# ~> REDPAJAMA ARXIV (27.97 GT) +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-00-00000.npy +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-01-00000.npy +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-02-00000.npy +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-03-00000.npy +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-04-00000.npy +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-05-00000.npy +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-06-00000.npy +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-07-00000.npy +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-08-00000.npy +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-09-00000.npy +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-10-00000.npy +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-11-00000.npy +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-12-00000.npy +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-13-00000.npy +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-14-00000.npy +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-15-00000.npy +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-16-00000.npy +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-17-00000.npy +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-18-00000.npy +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-19-00000.npy +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-20-00000.npy +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-21-00000.npy +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-22-00000.npy +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-23-00000.npy +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-24-00000.npy +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-25-00000.npy +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-26-00000.npy +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-27-00000.npy +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-28-00000.npy +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-29-00000.npy +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-30-00000.npy +preprocessed/redpajama_v1_decon_fix/arxiv/{TOKENIZER}/part-31-00000.npy +# ~> PROOFPILE2 ALGEBRAIC STACK (12.623 GT) +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-00-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-01-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-02-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-03-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-04-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-05-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-06-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-07-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-08-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-09-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-10-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-11-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-12-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-13-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-14-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/{TOKENIZER}/part-15-00000.npy +# ~> PROOFPILE2 OPENWEBMATH (12.734 GT) +preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/{TOKENIZER}/part-00-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/{TOKENIZER}/part-01-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/{TOKENIZER}/part-02-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/{TOKENIZER}/part-03-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/{TOKENIZER}/part-04-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/{TOKENIZER}/part-05-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/{TOKENIZER}/part-06-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/{TOKENIZER}/part-07-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/{TOKENIZER}/part-08-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/{TOKENIZER}/part-09-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/{TOKENIZER}/part-10-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/{TOKENIZER}/part-11-00000.npy +preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/{TOKENIZER}/part-12-00000.npy +# ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-00-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-01-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-02-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-03-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-04-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-05-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-06-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-07-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-08-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-09-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-10-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-11-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-12-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-13-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-14-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-15-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-16-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-17-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-18-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-19-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-20-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-21-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-22-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-23-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-24-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-25-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-26-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-27-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-28-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-29-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-30-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-31-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-32-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-33-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-34-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-35-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-36-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-37-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-38-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-39-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-40-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-41-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-42-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-43-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-44-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-45-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-46-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-47-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-48-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-49-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-50-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-51-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-52-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-53-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-54-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-55-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-56-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-57-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-58-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-59-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-60-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-61-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-62-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-63-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-64-00000.npy +preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/{TOKENIZER}/part-65-00000.npy +# ~> CC NEWS (14.3 GT) +preprocessed/cc-news/v3/{TOKENIZER}/part-0-00000.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-0-00001.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-0-00002.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-0-00003.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-1-00000.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-1-00001.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-1-00002.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-1-00003.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-2-00000.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-2-00001.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-2-00002.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-2-00003.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-3-00000.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-3-00001.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-3-00002.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-3-00003.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-4-00000.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-4-00001.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-4-00002.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-4-00003.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-5-00000.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-5-00001.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-6-00000.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-6-00001.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-6-00002.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-6-00003.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-7-00000.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-7-00001.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-7-00002.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-7-00003.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-8-00000.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-8-00001.npy +preprocessed/cc-news/v3/{TOKENIZER}/part-8-00002.npy +#################################### +######### CODE ######### +# ~> STARCODER (263.775 GT) +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-00-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-00-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-01-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-02-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-03-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-03-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-04-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-04-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-05-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-05-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-06-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-06-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-07-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-07-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-08-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-08-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-09-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-09-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-10-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-10-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-11-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-11-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-12-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-12-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-13-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-13-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-14-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-14-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-15-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-15-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-16-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-16-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-17-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-17-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-18-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-18-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-19-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-19-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-20-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-20-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-21-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-21-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-22-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-22-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-23-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-23-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-24-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-24-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-25-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-25-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-26-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-26-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-27-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-27-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-28-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-29-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-30-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-30-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-31-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-31-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-32-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-32-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-33-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-33-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-34-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-34-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-35-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-35-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-36-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-36-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-37-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-37-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-38-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-38-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-39-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-39-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-40-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-40-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-41-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-41-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-42-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-42-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-43-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-43-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-44-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-44-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-45-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-46-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-46-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-47-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-47-00001.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-48-00000.npy +preprocessed/starcoder/v0_decontaminated_doc_only/{TOKENIZER}/part-48-00001.npy +#################################### +######### WEB HIGH QUALITY ######### +# ~> C4 (138.4 GT) +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-000-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-001-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-002-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-003-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-004-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-005-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-006-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-007-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-008-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-009-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-010-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-011-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-012-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-013-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-014-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-015-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-016-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-017-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-018-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-019-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-020-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-021-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-022-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-023-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-024-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-025-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-026-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-027-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-028-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-029-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-030-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-031-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-032-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-033-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-034-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-035-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-036-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-037-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-038-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-039-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-040-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-041-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-042-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-043-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-044-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-045-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-046-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-047-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-048-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-049-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-050-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-051-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-052-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-053-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-054-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-055-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-056-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-057-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-058-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-059-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-060-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-061-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-062-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-063-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-064-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-065-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-066-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-067-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-068-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-069-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-070-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-071-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-072-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-073-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-074-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-075-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-076-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-077-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-078-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-079-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-080-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-081-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-082-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-083-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-084-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-085-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-086-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-087-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-088-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-089-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-090-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-091-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-092-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-093-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-094-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-095-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-096-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-097-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-098-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-099-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-100-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-101-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-102-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-103-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-104-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-105-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-106-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-107-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-108-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-109-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-110-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-111-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-112-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-113-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-114-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-115-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-116-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-117-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-118-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-119-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-120-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-121-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-122-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-123-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-124-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-125-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-126-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-127-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-128-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-129-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-130-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-131-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-132-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-133-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-134-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-135-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-136-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-137-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-138-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-139-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-140-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-141-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-142-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-143-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-144-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-145-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-146-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-147-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-148-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-149-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-150-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-151-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-152-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-153-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-154-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-155-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-156-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-157-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-158-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-159-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-160-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-161-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-162-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-163-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-164-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-165-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-166-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-167-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-168-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-169-00000.npy +preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/{TOKENIZER}/part-170-00000.npy +# ~> REDDIT (79.9 GT) +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-00-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-01-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-02-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-03-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-04-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-05-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-06-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-07-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-08-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-09-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-10-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-11-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-12-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-13-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-14-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-15-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-16-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-17-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-18-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-19-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-20-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-21-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-22-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-23-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-24-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-25-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-26-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-27-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-28-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-29-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-30-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-31-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-32-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-33-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-34-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-35-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-36-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-37-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-38-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-39-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-40-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-41-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-42-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-43-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-44-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-45-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-46-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-47-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-48-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-49-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-50-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-51-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-52-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-53-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-54-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-55-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-56-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-57-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-58-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-59-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-60-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-61-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-62-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-63-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-64-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-65-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-66-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-67-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-68-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-69-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-70-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-71-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-72-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-73-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-74-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-75-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-76-00000.npy +preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/{TOKENIZER}/part-77-00000.npy +# ~> FALCON (547.341 GT) +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-000-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-001-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-002-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-003-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-004-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-005-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-006-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-007-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-008-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-009-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-010-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-011-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-012-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-013-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-014-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-015-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-016-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-017-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-018-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-019-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-020-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-021-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-022-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-023-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-024-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-025-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-026-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-027-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-028-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-029-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-030-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-031-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-032-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-033-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-034-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-035-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-036-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-037-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-038-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-039-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-040-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-041-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-042-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-043-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-044-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-045-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-046-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-047-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-048-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-049-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-050-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-051-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-052-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-053-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-054-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-055-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-056-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-057-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-058-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-059-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-060-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-061-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-062-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-063-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-064-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-065-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-066-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-067-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-068-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-069-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-070-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-071-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-072-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-073-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-074-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-075-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-076-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-077-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-078-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-079-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-080-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-081-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-082-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-083-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-084-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-085-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-086-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-087-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-088-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-089-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-090-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-091-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-092-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-093-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-094-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-095-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-096-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-097-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-098-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-099-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-100-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-101-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-102-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-103-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-104-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-105-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-106-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-107-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-108-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-109-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-110-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-111-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-112-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-113-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-114-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-115-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-116-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-117-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-118-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-119-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-120-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-121-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-122-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-123-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-124-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-125-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-126-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-127-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-128-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-129-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-130-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-131-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-132-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-133-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-134-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-135-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-136-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-137-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-138-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-139-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-140-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-141-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-142-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-143-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-144-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-145-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-146-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-147-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-148-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-149-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-150-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-151-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-152-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-153-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-154-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-155-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-156-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-157-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-158-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-159-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-160-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-161-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-162-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-163-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-164-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-165-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-166-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-167-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-168-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-169-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-170-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-171-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-172-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-173-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-174-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-175-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-176-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-177-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-178-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-179-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-180-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-181-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-182-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-183-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-184-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-185-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-186-00000.npy +preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/{TOKENIZER}/part-187-00000.npy +#################################### +######### WEB REST ######### +# ~> DOLMA CC HEAD 50% (178.4 GT) +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-000-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-001-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-002-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-003-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-004-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-005-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-006-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-006-00001.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-007-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-008-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-009-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-010-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-011-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-012-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-013-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-014-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-015-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-016-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-017-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-018-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-019-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-020-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-021-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-022-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-023-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-024-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-025-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-026-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-027-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-028-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-029-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-030-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-031-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-032-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-033-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-034-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-035-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-036-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-037-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-038-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-039-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-040-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-041-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-042-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-043-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-044-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-045-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-046-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-047-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-048-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-049-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-050-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-051-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-052-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-053-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-054-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-055-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-056-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-057-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-058-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-059-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-060-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-061-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-062-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/{TOKENIZER}/part-063-00000.npy +# ~> DOLMA CC MIDDLE 33% (242.05 GT) +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-000-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-001-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-002-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-003-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-004-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-005-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-006-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-007-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-008-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-009-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-010-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-011-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-012-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-013-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-014-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-015-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-016-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-017-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-018-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-019-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-020-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-021-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-022-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-023-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-024-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-025-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-026-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-027-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-028-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-029-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-030-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-031-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-032-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-033-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-034-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-035-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-036-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-037-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-038-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-039-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-040-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-041-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-042-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-043-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-044-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-045-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-046-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-047-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-048-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-049-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-050-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-051-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-052-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-053-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-054-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-055-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-056-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-057-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-058-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-059-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-060-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-061-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-062-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-063-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/{TOKENIZER}/part-064-00000.npy +# ~> DOLMA CC TAIL 33% (191.4 GT) +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-000-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-001-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-002-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-003-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-004-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-005-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-006-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-007-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-008-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-009-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-010-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-011-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-012-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-013-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-014-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-015-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-016-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-017-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-018-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-019-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-020-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-021-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-022-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-023-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-024-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-025-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-026-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-027-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-028-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-029-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-030-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-031-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-032-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-033-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-034-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-035-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-036-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-037-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-038-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-039-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-040-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-041-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-042-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-043-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-044-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-045-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-046-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-047-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-048-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-049-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-050-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-051-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-052-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-053-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-054-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-055-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-056-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-057-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-058-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-059-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-060-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-061-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-062-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-063-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-064-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-065-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-066-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-067-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-068-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-069-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-070-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-071-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-072-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-073-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-074-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-075-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-076-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-077-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-078-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-079-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-080-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-081-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-082-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-083-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-084-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-085-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-086-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-087-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-088-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-089-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-090-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-091-00000.npy +preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/{TOKENIZER}/part-092-00000.npy diff --git a/src/olmo_core/data/tokenizer.py b/src/olmo_core/data/tokenizer.py index 86701130..4a66e504 100644 --- a/src/olmo_core/data/tokenizer.py +++ b/src/olmo_core/data/tokenizer.py @@ -4,7 +4,7 @@ from ..config import Config, StrEnum -class TokenizerNames(StrEnum): +class TokenizerName(StrEnum): """ An enumeration of supported tokenizer names. """ @@ -24,7 +24,7 @@ class TokenizerConfig(Config): eos_token_id: int pad_token_id: int bos_token_id: Optional[int] = None - identifier: Optional[str] = None + identifier: Optional[TokenizerName] = None def padded_vocab_size(self, pad_multiple: int = 128) -> int: """ @@ -39,7 +39,7 @@ def dolma2(cls) -> "TokenizerConfig": vocab_size=100278, eos_token_id=100257, pad_token_id=100277, - identifier=TokenizerNames.dolma2, + identifier=TokenizerName.dolma2, ) @classmethod @@ -48,7 +48,7 @@ def gpt_neox_olmo_dolma_v1_5(cls) -> "TokenizerConfig": vocab_size=50280, eos_token_id=50279, pad_token_id=1, - identifier=TokenizerNames.gpt_neox_olmo_dolma_v1_5, + identifier=TokenizerName.gpt_neox_olmo_dolma_v1_5, ) @classmethod @@ -57,5 +57,5 @@ def gpt2(cls) -> "TokenizerConfig": vocab_size=50280, eos_token_id=50256, pad_token_id=50256, - identifier=TokenizerNames.gpt2, + identifier=TokenizerName.gpt2, ) diff --git a/src/scripts/train/OLMo-7B.py b/src/scripts/train/OLMo-7B.py index 6b2c1a2f..817bd3f4 100644 --- a/src/scripts/train/OLMo-7B.py +++ b/src/scripts/train/OLMo-7B.py @@ -10,7 +10,7 @@ from beaker import Beaker from olmo_core.config import Config, DType, StrEnum -from olmo_core.data import MemMapDatasetConfig, TokenizerConfig +from olmo_core.data import DataMix, MemMapDatasetConfig, TokenizerConfig from olmo_core.distributed.parallel import DataParallelConfig, DataParallelType from olmo_core.distributed.utils import get_num_nodes, get_rank, init_hybrid_shard_mesh from olmo_core.launch.beaker import ( @@ -112,11 +112,10 @@ def build_config(run_name: str, overrides: List[str]) -> ExperimentConfig: ], ) - dataset_config = MemMapDatasetConfig.glob( - # Wikipedia - "/weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/*.npy", - sequence_length=4096, + dataset_config = MemMapDatasetConfig.from_data_mix( + DataMix.OLMoE_mix_0824, tokenizer=tokenizer_config, + sequence_length=4096, ) save_folder = f"/weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/{beaker_user.lower()}/{run_name}" @@ -190,7 +189,7 @@ def train(config: ExperimentConfig): dp_mesh=None if get_num_nodes() == 1 else init_hybrid_shard_mesh(), ) optim = config.optim.build(model) - dataset = config.dataset.build() + dataset = config.dataset.build(mix_base_dir="/weka/oe-training-default/ai2-llm") trainer = config.trainer.build(model, optim, dataset) # Save config to file. diff --git a/src/test/data/mixes_test.py b/src/test/data/mixes_test.py new file mode 100644 index 00000000..c4a4930c --- /dev/null +++ b/src/test/data/mixes_test.py @@ -0,0 +1,20 @@ +from olmo_core.data import DataMix, TokenizerName +from olmo_core.io import file_exists + + +def test_olmoe_mix(): + mix = DataMix.OLMoE_mix_0824.build("s3://ai2-llm", TokenizerName.dolma2) + assert ( + mix[-1] + == "s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy" + ) + assert file_exists(mix[-1]) + + +def test_dolma17_mix(): + mix = DataMix.dolma17.build("s3://ai2-llm", TokenizerName.gpt_neox_olmo_dolma_v1_5) + assert ( + mix[-1] + == "s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy" + ) + assert file_exists(mix[-1]) From 10a03da19e52916fe4b9915d63cd1dd4a13a30ca Mon Sep 17 00:00:00 2001 From: epwalsh Date: Tue, 27 Aug 2024 13:37:54 -0700 Subject: [PATCH 2/2] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e9196587..3a09d500 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added support for unsharding model state into `safetensors` format with `olmo_core.distributed.checkpoint.unshard_checkpoint(..., use_safetensors=True)`. - Added `data.TokenizerConfig` config class and `data.TokenizerName` enumeration. +- Added data mixes with `data.DataMix` API. ## [v1.0.1](https://github.com/allenai/OLMo-core/releases/tag/v1.0.1) - 2024-08-26