diff --git a/README.md b/README.md
index ce529582c..20638d1db 100644
--- a/README.md
+++ b/README.md
@@ -96,7 +96,7 @@ Then, follow these simple steps:
```shell
tar -chf downstream_dataset.tar /path/to/downstream/dataset/image/folder
```
-
+
4. Infer the auxiliary files `query_entries.npy` and `query_file_indices.npy` :
```
@@ -143,19 +143,20 @@ Then run:
```shell
python -m torch.distributed.run --nproc_per_node=gpu dinov2/train/train.py \
--config-file dinov2/configs/train/vitl14.yaml \
- train.dataset_path=Pathology:root={path/to/data/root}:subset={subset}
+ train.dataset_path=Pathology:root={path/to/tarball/root}:extra={path/to/entry/root}:subset={subset}
```
-Replace `{path/to/data/root}` with the folder you chose for `--output_root` in data preparation (e.g. `Pathology:root=/root/data`).
+Replace `{path/to/tarball/root}` with the root folder where tarballs are saved, and `{path/to/entry/root}` with the root folder where numpy entry files are saved (e.g. `Pathology:root=/root/data:extra=/root/data`).
Leave out `:subset={subset}` if you didn't restrict the dataset to a specific subset when preparing data.
-Otherwise, replace `{subset}` with the suffix you chose for `--suffix` in data preparation (e.g. `Pathology:root=/root/data:subset=train`).
+Otherwise, replace `{subset}` with the suffix you chose for `--suffix` in data preparation (e.g. `Pathology:root=/root/data:extra=/root/data:subset=train`).
In case you want to run downstream tuning, make sure to update the following two parameters in your config:
```shell
tune:
- query_dataset_path: KNN:root={path/to/data/root}:split=query
- test_dataset_path: KNN:root={path/to/data/root}:split=test
+ query_dataset_path: KNN:root={path/to/data/root}:extra={path/to/entry/root}:split=query
+ test_dataset_path: KNN:root={path/to/data/root}:extra={path/to/entry/root}:split=test
```
-Replace `{path/to/data/root}` with the folder where you dumped the downstream `.tar` file and `.npy` files during data preparation.
+Replace `{path/to/data/root}` with the folder where you dumped the downstream `.tar` files.
+Replace `{path/to/entry/root}` with the folder where you dumped the downstream `.npy` entry files.
diff --git a/README_foundation.md b/README_foundation.md
index 138ddcf38..85239ce9e 100644
--- a/README_foundation.md
+++ b/README_foundation.md
@@ -118,7 +118,7 @@ Update `dinov2/configs/train/vitl14.yaml` if you want to change some parameters,
```shell
python -m torch.distributed.run --nproc_per_node=gpu dinov2/train/train.py \
--config-file dinov2/configs/train/vitl14.yaml \
- train.dataset_path=PathologyFoundation:root={path/to/data/root}
+ train.dataset_path=PathologyFoundation:root={path/to/data/root}:extra={path/to/entry/root}
```
-Replace `{path/to/data/root}` with the folder you chose for `--output_root` in data preparation (e.g. `PathologyFoundation:root=/root/data`).
\ No newline at end of file
+Replace `{path/to/data/root}` with the root folder where tarballs are saved, and `{path/to/entry/root}` with the root folder where numpy entry files are saved (e.g. `PathologyFoundation:root=/root/data:extra=/root/data`).
\ No newline at end of file
diff --git a/dinov2/data/datasets/foundation.py b/dinov2/data/datasets/foundation.py
index 1d8087eb2..e0c1a42ec 100644
--- a/dinov2/data/datasets/foundation.py
+++ b/dinov2/data/datasets/foundation.py
@@ -42,6 +42,7 @@ def __init__(
self,
*,
root: str,
+ extra: str,
subset: Optional["PathologyFoundationDataset.Subset"] = None,
transforms: Optional[Callable] = None,
transform: Optional[Callable] = None,
@@ -49,6 +50,7 @@ def __init__(
mmap_cache_size: int = _DEFAULT_MMAP_CACHE_SIZE,
) -> None:
super().__init__(root, transforms, transform, target_transform)
+ self.extra = extra
self._subset = subset
self._get_entries()
self._get_cohort_names()
@@ -70,14 +72,14 @@ def _get_entries(self) -> np.ndarray:
self._entries = self._load_entries(self._entries_name)
def _load_entries(self, _entries_name: str) -> np.ndarray:
- entries_path = Path(self.root, _entries_name)
+ entries_path = Path(self.extra, _entries_name)
return np.load(entries_path, mmap_mode="r")
def _get_cohort_names(self) -> dict:
self._cohort_names = self._load_cohort_names()
def _load_cohort_names(self) -> dict:
- cohort_dict_path = Path(self.root, "cohort_indices.npy")
+ cohort_dict_path = Path(self.extra, "cohort_indices.npy")
return np.load(cohort_dict_path, allow_pickle=True).item()
def get_image_data(self, index: int) -> bytes:
diff --git a/dinov2/data/datasets/pathology.py b/dinov2/data/datasets/pathology.py
index fadaef830..5f9cb6cbe 100644
--- a/dinov2/data/datasets/pathology.py
+++ b/dinov2/data/datasets/pathology.py
@@ -29,12 +29,14 @@ def __init__(
self,
*,
root: str,
+ extra: str,
subset: Optional["PathologyDataset.Subset"] = None,
transforms: Optional[Callable] = None,
transform: Optional[Callable] = None,
target_transform: Optional[Callable] = None,
) -> None:
super().__init__(root, transforms, transform, target_transform)
+ self.extra = extra
self._subset = subset
self._get_entries()
self._mmap_tarball = _make_mmap_tarball(Path(root, "pretrain_dataset.tar"))
@@ -51,7 +53,7 @@ def _get_entries(self) -> np.ndarray:
self._entries = self._load_entries(self._entries_name)
def _load_entries(self, _entries_name: str) -> np.ndarray:
- entries_path = Path(self.root, _entries_name)
+ entries_path = Path(self.extra, _entries_name)
return np.load(entries_path, mmap_mode="r")
def get_image_data(self, index: int) -> bytes: