Commit

update dataset structure
bw4sz committed Aug 29, 2024
1 parent 326b7bc commit ac39aeb
Showing 3 changed files with 73 additions and 7 deletions.
7 changes: 3 additions & 4 deletions data_prep/collect_tasks.py
@@ -90,13 +90,13 @@
# Save splits

# Clean the columns
-Boxes_columns = ["xmin","ymin","xmax","ymax","filename","split","source","resolution"]
+Boxes_columns = ["xmin","ymin","xmax","ymax","filename","split","source"]
TreeBoxes_datasets = TreeBoxes_datasets[Boxes_columns]

-Polygons_columns = ["polygon","filename","split","source","resolution"]
+Polygons_columns = ["polygon","filename","split","source"]
TreePolygons_datasets = TreePolygons_datasets[Polygons_columns]

-Points_columns = ["x","y","filename","split","source","resolution"]
+Points_columns = ["x","y","filename","split","source"]
TreePoints_datasets = TreePoints_datasets[Points_columns]

TreePolygons_datasets.to_csv("/orange/ewhite/DeepForest/MillionTrees/TreePolygons_v0.0/official.csv", index=False)
@@ -170,7 +170,6 @@
if not os.path.exists(destination + os.path.basename(image)):
    shutil.copy("/orange/ewhite/DeepForest/MillionTrees/TreePolygons_v0.0/images/" + image, destination)


shutil.make_archive("/orange/ewhite/DeepForest/MillionTrees/MiniTreeBoxes_v0.0", 'zip', "/orange/ewhite/DeepForest/MillionTrees/MiniTreeBoxes_v0.0")
shutil.make_archive("/orange/ewhite/DeepForest/MillionTrees/MiniTreePoints_v0.0", 'zip', "/orange/ewhite/DeepForest/MillionTrees/MiniTreePoints_v0.0")
shutil.make_archive("/orange/ewhite/DeepForest/MillionTrees/MiniTreePolygons_v0.0", 'zip', "/orange/ewhite/DeepForest/MillionTrees/MiniTreePolygons_v0.0")
70 changes: 69 additions & 1 deletion docs/dataset_structure.md
@@ -1,4 +1,72 @@
# Dataset structure

The organization of this dataset was inspired by the WILDS benchmark and the torchgeo Python package.
-There are three overarching datasets. 'Points', 'Polygons' and 'Boxes' based on the annotation geometry. See (datasets)[datasets.md] for the coponent
+There are three overarching datasets, 'Points', 'Polygons', and 'Boxes', based on the annotation geometry.

## Data download

```
# Import path assumed by analogy with milliontrees/datasets/TreePolygons.py
from milliontrees.datasets.TreePoints import TreePointsDataset

dataset = TreePointsDataset(download=True, root_dir=<directory to save data>)
```

## Dataloaders

Part of the inspiration for this package is to keep most users from needing to interact with the filesystem. The dataloaders are built in, and for many applications the user will never need to work with csv files or image paths directly. All datasets are PyTorch dataloaders and can be iterated over. There is a 'label' column, but given that we have just one class, "Tree", it is mostly a convenience.

```
for image, label, metadata in dataset:
    assert image.shape == (3, 100, 100)
    assert label.shape == (2,)
    assert len(metadata) == 2
```

Users can select a subset of the dataset and optionally supply a torchvision transform:

```
from torchvision import transforms

transform = transforms.Compose([
    transforms.Resize((448, 448)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ToTensor()
])
train_dataset = dataset.get_subset("train", transform=transform)
for image, label, metadata in train_dataset:
    assert image.shape == (3, 448, 448)
    assert label.shape == (4,)
    assert len(metadata) == 2
```
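
Once a subset exists, it can be batched like any other PyTorch dataset. The sketch below is an illustration rather than part of the documented API: `DataLoader` is standard PyTorch, and the `collate_fn` is a placeholder, since images with different numbers of annotations usually cannot be stacked by the default collate.

```
from torch.utils.data import DataLoader

# A minimal sketch, assuming train_dataset behaves like a standard PyTorch dataset.
# The lambda collate keeps each sample intact; the default collate would try to
# stack labels of different lengths and fail.
loader = DataLoader(train_dataset,
                    batch_size=16,
                    shuffle=True,
                    collate_fn=lambda batch: tuple(zip(*batch)))

for images, labels, metadata in loader:
    break  # each of the three is a tuple of length batch_size
```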

## Split Schemes

One of the great things about supplying data as dataloaders is easy access to different ways of combining datasets. The MillionTrees benchmark has multiple tasks, and each is a 'split_scheme', following the terminology from the WILDS benchmark. To see the supported schemes for each dataset, see each dataset's documentation, as well as the [leaderboard](leaderboard.md).

```
dataset = TreePointsDataset(download=True, root_dir=<directory to save data>, split_scheme="official")
```
This reads the file official.csv and uses its 'split' column, which designates whether each image is in train, test, or val for the given task.
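
As a rough illustration of what that lookup amounts to (the real logic lives inside the dataset class; pandas here is just for the sketch):

```
import pandas as pd

# Sketch only: official.csv maps each annotation to a split.
df = pd.read_csv("official.csv")
train_files = df[df["split"] == "train"]["filename"].unique()
test_files = df[df["split"] == "test"]["filename"].unique()
```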

## Underlying data

If a user does need to inspect the underlying data, they will find the following design.

### filename

The filename is the name of the image. All filenames are relative to the data directory.
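
For example, resolving an annotation to a file on disk is a join against the data directory. This is a sketch; the "images" subfolder is an assumption based on the paths in data_prep/collect_tasks.py.

```
import os

# Sketch: filenames are relative, so prepend the data directory.
# root_dir, the "images" subfolder, and the filename are placeholders.
root_dir = "/path/to/data"
image_path = os.path.join(root_dir, "images", "example.png")
```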

### source

The source dataset or author of the images. See [datasets](datasets.md) for the component pieces.

## Annotation geometry

### Boxes

Box annotations are given as xmin, ymin, xmax, ymax coordinates relative to the image origin (top-left).
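
For a concrete sketch of the convention (the same [xmin, ymin, xmax, ymax] layout that torchvision detection models use):

```
import torch

# One box in [xmin, ymin, xmax, ymax] order, in pixels from the top-left origin.
box = torch.tensor([15.0, 20.0, 55.0, 90.0])
width, height = box[2] - box[0], box[3] - box[1]
```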

### Points

Point annotations are given as x, y coordinates relative to the image origin.

### Polygons

Polygon annotations are given as well-known text coordinates, e.g. "POLYGON((x1 y1, x2 y2, x3 y3, ...))", relative to the image origin.
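
A short sketch of reading such a string with shapely (an assumption; shapely is not necessarily a dependency of this package):

```
from shapely import wkt

# Parse a well-known text polygon; coordinates are pixels from the top-left origin.
poly = wkt.loads("POLYGON((0 0, 100 0, 100 100, 0 100, 0 0))")
xs, ys = poly.exterior.coords.xy  # exterior ring vertices
```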
3 changes: 1 addition & 2 deletions milliontrees/datasets/TreePolygons.py
@@ -20,8 +20,7 @@ class TreePolygonsDataset(MillionTreesDataset):
- Random: 80% of the data randomly split into train and 20% in test
- location: 80% of the locations randomly split into train and 20% in test
Supported `split_scheme`:
-  - 'Random'
-  - 'location'
+  - 'official'
Input (x):
RGB images from camera traps
Label (y):
