diff --git a/README.md b/README.md index ce529582c..20638d1db 100644 --- a/README.md +++ b/README.md @@ -96,7 +96,7 @@ Then, follow these simple steps: ```shell tar -chf downstream_dataset.tar /path/to/downstream/dataset/image/folder ``` - + 4. Infer the auxiliary files `query_entries.npy` and `query_file_indices.npy` : ``` @@ -143,19 +143,20 @@ Then run: ```shell python -m torch.distributed.run --nproc_per_node=gpu dinov2/train/train.py \ --config-file dinov2/configs/train/vitl14.yaml \ - train.dataset_path=Pathology:root={path/to/data/root}:subset={subset} + train.dataset_path=Pathology:root={path/to/tarball/root}:extra={path/to/entry/root}:subset={subset} ``` -Replace `{path/to/data/root}` with the folder you chose for `--output_root` in data preparation (e.g. `Pathology:root=/root/data`).
+Replace `{path/to/tarball/root}` with the root folder where tarballs are saved, and `{path/to/entry/root}` with the root folder where numpy entry files are saved (e.g. `Pathology:root=/root/data:extra=/root/data`).
Leave out `:subset={subset}` if you didn't restrict the dataset to a specific subset when preparing data.
-Otherwise, replace `{subset}` with the suffix you chose for `--suffix` in data preparation (e.g. `Pathology:root=/root/data:subset=train`). +Otherwise, replace `{subset}` with the suffix you chose for `--suffix` in data preparation (e.g. `Pathology:root=/root/data:extra=/root/data:subset=train`). In case you want to run downstream tuning, make sure to update the following two parameters in your config: ```shell tune: - query_dataset_path: KNN:root={path/to/data/root}:split=query - test_dataset_path: KNN:root={path/to/data/root}:split=test + query_dataset_path: KNN:root={path/to/data/root}:extra={path/to/entry/root}:split=query + test_dataset_path: KNN:root={path/to/data/root}:extra={path/to/entry/root}:split=test ``` -Replace `{path/to/data/root}` with the folder where you dumped the downstream `.tar` file and `.npy` files during data preparation. +Replace `{path/to/data/root}` with the folder where you dumped the downstream `.tar` files. +Replace `{path/to/entry/root}` with the folder where you dumped the downstream `.npy` entry files. diff --git a/README_foundation.md b/README_foundation.md index 138ddcf38..85239ce9e 100644 --- a/README_foundation.md +++ b/README_foundation.md @@ -118,7 +118,7 @@ Update `dinov2/configs/train/vitl14.yaml` if you want to change some parameters, ```shell python -m torch.distributed.run --nproc_per_node=gpu dinov2/train/train.py \ --config-file dinov2/configs/train/vitl14.yaml \ - train.dataset_path=PathologyFoundation:root={path/to/data/root} + train.dataset_path=PathologyFoundation:root={path/to/data/root}:extra={path/to/entry/root} ``` -Replace `{path/to/data/root}` with the folder you chose for `--output_root` in data preparation (e.g. `PathologyFoundation:root=/root/data`). \ No newline at end of file +Replace `{path/to/data/root}` with the root folder where tarballs are saved, and `{path/to/entry/root}` with the root folder where numpy entry files are saved (e.g. `PathologyFoundation:root=/root/data:extra=/root/data`).
\ No newline at end of file diff --git a/dinov2/data/datasets/foundation.py b/dinov2/data/datasets/foundation.py index 1d8087eb2..e0c1a42ec 100644 --- a/dinov2/data/datasets/foundation.py +++ b/dinov2/data/datasets/foundation.py @@ -42,6 +42,7 @@ def __init__( self, *, root: str, + extra: str, subset: Optional["PathologyFoundationDataset.Subset"] = None, transforms: Optional[Callable] = None, transform: Optional[Callable] = None, @@ -49,6 +50,7 @@ def __init__( mmap_cache_size: int = _DEFAULT_MMAP_CACHE_SIZE, ) -> None: super().__init__(root, transforms, transform, target_transform) + self.extra = extra self._subset = subset self._get_entries() self._get_cohort_names() @@ -70,14 +72,14 @@ def _get_entries(self) -> np.ndarray: self._entries = self._load_entries(self._entries_name) def _load_entries(self, _entries_name: str) -> np.ndarray: - entries_path = Path(self.root, _entries_name) + entries_path = Path(self.extra, _entries_name) return np.load(entries_path, mmap_mode="r") def _get_cohort_names(self) -> dict: self._cohort_names = self._load_cohort_names() def _load_cohort_names(self) -> dict: - cohort_dict_path = Path(self.root, "cohort_indices.npy") + cohort_dict_path = Path(self.extra, "cohort_indices.npy") return np.load(cohort_dict_path, allow_pickle=True).item() def get_image_data(self, index: int) -> bytes: diff --git a/dinov2/data/datasets/pathology.py b/dinov2/data/datasets/pathology.py index fadaef830..5f9cb6cbe 100644 --- a/dinov2/data/datasets/pathology.py +++ b/dinov2/data/datasets/pathology.py @@ -29,12 +29,14 @@ def __init__( self, *, root: str, + extra: str, subset: Optional["PathologyDataset.Subset"] = None, transforms: Optional[Callable] = None, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None, ) -> None: super().__init__(root, transforms, transform, target_transform) + self.extra = extra self._subset = subset self._get_entries() self._mmap_tarball = _make_mmap_tarball(Path(root, "pretrain_dataset.tar")) @@ -51,7 
+53,7 @@ def _get_entries(self) -> np.ndarray: self._entries = self._load_entries(self._entries_name) def _load_entries(self, _entries_name: str) -> np.ndarray: - entries_path = Path(self.root, _entries_name) + entries_path = Path(self.extra, _entries_name) return np.load(entries_path, mmap_mode="r") def get_image_data(self, index: int) -> bytes: