From 9dbf3ff2e39ba932871218381c510071b1e7b8f0 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Tue, 9 Jan 2024 10:59:29 +0100 Subject: [PATCH 1/6] Remove `task` arg in `load_dataset` in image-classification example --- .../image-classification/run_image_classification.py | 8 ++++++-- .../run_image_classification_no_trainer.py | 9 +++++++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py index 95ffdbf04ed61b..829a4bf842a1cf 100755 --- a/examples/pytorch/image-classification/run_image_classification.py +++ b/examples/pytorch/image-classification/run_image_classification.py @@ -255,7 +255,6 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - task="image-classification", token=model_args.token, ) else: @@ -268,9 +267,14 @@ def main(): "imagefolder", data_files=data_files, cache_dir=model_args.cache_dir, - task="image-classification", ) + # Rename image and label columns if needed (e.g. Cifar10) + if "img" in dataset["train"].features: + dataset = dataset.rename_column("img", "image") + if "label" in dataset["train"].features: + dataset = dataset.rename_column("label", "labels") + # If we don't have a validation split, split off a percentage of train as validation. 
data_args.train_val_split = None if "validation" in dataset.keys() else data_args.train_val_split if isinstance(data_args.train_val_split, float) and data_args.train_val_split > 0.0: diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index a9e0758ee7c24f..a4b9e921d7b37e 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -272,7 +272,7 @@ def main(): # download the dataset. if args.dataset_name is not None: # Downloading and loading a dataset from the hub. - dataset = load_dataset(args.dataset_name, task="image-classification") + dataset = load_dataset(args.dataset_name) else: data_files = {} if args.train_dir is not None: @@ -283,11 +283,16 @@ def main(): "imagefolder", data_files=data_files, cache_dir=args.cache_dir, - task="image-classification", ) # See more about loading custom images at # https://huggingface.co/docs/datasets/v2.0.0/en/image_process#imagefolder. + # Rename image and label columns if needed (e.g. Cifar10) + if "img" in dataset["train"].features: + dataset = dataset.rename_column("img", "image") + if "label" in dataset["train"].features: + dataset = dataset.rename_column("label", "labels") + # If we don't have a validation split, split off a percentage of train as validation. 
args.train_val_split = None if "validation" in dataset.keys() else args.train_val_split if isinstance(args.train_val_split, float) and args.train_val_split > 0.0: From 9e6157c1345a9e9c887e02bb907ffa903d0a81a0 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Tue, 9 Jan 2024 15:04:16 +0100 Subject: [PATCH 2/6] Manage case where "train" is not in dataset --- .../pytorch/image-classification/run_image_classification.py | 4 ++-- .../run_image_classification_no_trainer.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py index 829a4bf842a1cf..79bf47fda75dcf 100755 --- a/examples/pytorch/image-classification/run_image_classification.py +++ b/examples/pytorch/image-classification/run_image_classification.py @@ -270,9 +270,9 @@ def main(): ) # Rename image and label columns if needed (e.g. Cifar10) - if "img" in dataset["train"].features: + if "img" in (dataset["train"].features if "train" in dataset else dataset["validation"].features): dataset = dataset.rename_column("img", "image") - if "label" in dataset["train"].features: + if "label" in (dataset["train"].features if "train" in dataset else dataset["validation"].features): dataset = dataset.rename_column("label", "labels") # If we don't have a validation split, split off a percentage of train as validation. diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index a4b9e921d7b37e..9d57347e582676 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -288,9 +288,9 @@ def main(): # https://huggingface.co/docs/datasets/v2.0.0/en/image_process#imagefolder. 
# Rename image and label columns if needed (e.g. Cifar10) - if "img" in dataset["train"].features: + if "img" in (dataset["train"].features if "train" in dataset else dataset["validation"].features): dataset = dataset.rename_column("img", "image") - if "label" in dataset["train"].features: + if "label" in (dataset["train"].features if "train" in dataset else dataset["validation"].features): dataset = dataset.rename_column("label", "labels") # If we don't have a validation split, split off a percentage of train as validation. From 53f84f2f1d22b388a77f48fc2742a10a00cf29c4 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Wed, 10 Jan 2024 10:32:02 +0100 Subject: [PATCH 3/6] Add new args to manage image and label column names --- .../run_image_classification.py | 15 ++++++++++---- .../run_image_classification_no_trainer.py | 20 +++++++++++++++---- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py index 79bf47fda75dcf..2d11066ec2d53f 100755 --- a/examples/pytorch/image-classification/run_image_classification.py +++ b/examples/pytorch/image-classification/run_image_classification.py @@ -111,6 +111,13 @@ class DataTrainingArguments: ) }, ) + image_column_name: str = field( + default="image", + metadata={"help": "The name of the dataset column containing the image data. Defaults to 'image'."}, + ) + label_column_name: str = field( + default="label", metadata={"help": "The name of the dataset column containing the labels. Defaults to 'label'."} + ) def __post_init__(self): if self.dataset_name is None and (self.train_dir is None and self.validation_dir is None): @@ -270,10 +277,10 @@ def main(): ) # Rename image and label columns if needed (e.g. 
Cifar10) - if "img" in (dataset["train"].features if "train" in dataset else dataset["validation"].features): - dataset = dataset.rename_column("img", "image") - if "label" in (dataset["train"].features if "train" in dataset else dataset["validation"].features): - dataset = dataset.rename_column("label", "labels") + if data_args.image_column_name != "image" and data_args.image_column_name in (dataset["train"].features if "train" in dataset else dataset["validation"].features): + dataset = dataset.rename_column(data_args.image_column_name, "image") + if data_args.label_column_name != "labels" and data_args.label_column_name in (dataset["train"].features if "train" in dataset else dataset["validation"].features): + dataset = dataset.rename_column(data_args.label_column_name, "labels") # If we don't have a validation split, split off a percentage of train as validation. data_args.train_val_split = None if "validation" in dataset.keys() else data_args.train_val_split diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index 9d57347e582676..aa4e0b43f72882 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -189,6 +189,18 @@ def parse_args(): action="store_true", help="Whether or not to enable to load a pretrained model whose head dimensions are different.", ) + parser.add_argument( + "--image_column_name", + type=str, + default="image", + help="The name of the dataset column containing the image data. Defaults to 'image'.", + ) + parser.add_argument( + "--label_column_name", + type=str, + default="label", + help="The name of the dataset column containing the labels. 
Defaults to 'label'.", + ) args = parser.parse_args() # Sanity checks @@ -288,10 +300,10 @@ def main(): # https://huggingface.co/docs/datasets/v2.0.0/en/image_process#imagefolder. # Rename image and label columns if needed (e.g. Cifar10) - if "img" in (dataset["train"].features if "train" in dataset else dataset["validation"].features): - dataset = dataset.rename_column("img", "image") - if "label" in (dataset["train"].features if "train" in dataset else dataset["validation"].features): - dataset = dataset.rename_column("label", "labels") + if args.image_column_name != "image" and args.image_column_name in (dataset["train"].features if "train" in dataset else dataset["validation"].features): + dataset = dataset.rename_column(args.image_column_name, "image") + if args.label_column_name != "labels" and args.label_column_name in (dataset["train"].features if "train" in dataset else dataset["validation"].features): + dataset = dataset.rename_column(args.label_column_name, "labels") # If we don't have a validation split, split off a percentage of train as validation. 
args.train_val_split = None if "validation" in dataset.keys() else args.train_val_split From 6916845662cb2d0e5c63c0dc3e86a0fe597d2bc6 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Thu, 11 Jan 2024 00:15:44 +0100 Subject: [PATCH 4/6] Similar to audio-classification example --- .../run_image_classification.py | 40 ++++++++++++------- .../run_image_classification_no_trainer.py | 30 +++++++++----- 2 files changed, 46 insertions(+), 24 deletions(-) diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py index 2d11066ec2d53f..4b312b3f31d639 100755 --- a/examples/pytorch/image-classification/run_image_classification.py +++ b/examples/pytorch/image-classification/run_image_classification.py @@ -116,7 +116,8 @@ class DataTrainingArguments: metadata={"help": "The name of the dataset column containing the image data. Defaults to 'image'."}, ) label_column_name: str = field( - default="label", metadata={"help": "The name of the dataset column containing the labels. Defaults to 'label'."} + default="label", + metadata={"help": "The name of the dataset column containing the labels. Defaults to 'label'."}, ) def __post_init__(self): @@ -182,12 +183,6 @@ class ModelArguments: ) -def collate_fn(examples): - pixel_values = torch.stack([example["pixel_values"] for example in examples]) - labels = torch.tensor([example["labels"] for example in examples]) - return {"pixel_values": pixel_values, "labels": labels} - - def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. @@ -276,11 +271,24 @@ def main(): cache_dir=model_args.cache_dir, ) - # Rename image and label columns if needed (e.g. 
Cifar10) - if data_args.image_column_name != "image" and data_args.image_column_name in (dataset["train"].features if "train" in dataset else dataset["validation"].features): - dataset = dataset.rename_column(data_args.image_column_name, "image") - if data_args.label_column_name != "labels" and data_args.label_column_name in (dataset["train"].features if "train" in dataset else dataset["validation"].features): - dataset = dataset.rename_column(data_args.label_column_name, "labels") + dataset_column_names = dataset["train"].column_names if "train" in dataset else dataset["validation"].column_names + if data_args.image_column_name not in dataset_column_names: + raise ValueError( + f"--image_column_name {data_args.image_column_name} not found in dataset '{data_args.dataset_name}'. " + "Make sure to set `--image_column_name` to the correct image column - one of " + f"{', '.join(dataset_column_names)}." + ) + if data_args.label_column_name not in dataset_column_names: + raise ValueError( + f"--label_column_name {data_args.label_column_name} not found in dataset '{data_args.dataset_name}'. " + "Make sure to set `--label_column_name` to the correct label column - one of " + f"{', '.join(dataset_column_names)}." + ) + + def collate_fn(examples): + pixel_values = torch.stack([example["pixel_values"] for example in examples]) + labels = torch.tensor([example[data_args.label_column_name] for example in examples]) + return {"pixel_values": pixel_values, "labels": labels} # If we don't have a validation split, split off a percentage of train as validation. data_args.train_val_split = None if "validation" in dataset.keys() else data_args.train_val_split @@ -291,7 +299,7 @@ def main(): # Prepare label mappings. # We'll include these in the model's config to get human readable labels in the Inference API. 
- labels = dataset["train"].features["labels"].names + labels = dataset["train"].features[data_args.label_column_name].names label2id, id2label = {}, {} for i, label in enumerate(labels): label2id[label] = str(i) @@ -365,13 +373,15 @@ def compute_metrics(p): def train_transforms(example_batch): """Apply _train_transforms across a batch.""" example_batch["pixel_values"] = [ - _train_transforms(pil_img.convert("RGB")) for pil_img in example_batch["image"] + _train_transforms(pil_img.convert("RGB")) for pil_img in example_batch[data_args.image_column_name] ] return example_batch def val_transforms(example_batch): """Apply _val_transforms across a batch.""" - example_batch["pixel_values"] = [_val_transforms(pil_img.convert("RGB")) for pil_img in example_batch["image"]] + example_batch["pixel_values"] = [ + _val_transforms(pil_img.convert("RGB")) for pil_img in example_batch[data_args.image_column_name] + ] return example_batch if training_args.do_train: diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index aa4e0b43f72882..25506216c4c627 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -299,11 +299,19 @@ def main(): # See more about loading custom images at # https://huggingface.co/docs/datasets/v2.0.0/en/image_process#imagefolder. - # Rename image and label columns if needed (e.g. 
Cifar10) - if args.image_column_name != "image" and args.image_column_name in (dataset["train"].features if "train" in dataset else dataset["validation"].features): - dataset = dataset.rename_column(args.image_column_name, "image") - if args.label_column_name != "labels" and args.label_column_name in (dataset["train"].features if "train" in dataset else dataset["validation"].features): - dataset = dataset.rename_column(args.label_column_name, "labels") + dataset_column_names = dataset["train"].column_names if "train" in dataset else dataset["validation"].column_names + if args.image_column_name not in dataset_column_names: + raise ValueError( + f"--image_column_name {args.image_column_name} not found in dataset '{args.dataset_name}'. " + "Make sure to set `--image_column_name` to the correct image column - one of " + f"{', '.join(dataset_column_names)}." + ) + if args.label_column_name not in dataset_column_names: + raise ValueError( + f"--label_column_name {args.label_column_name} not found in dataset '{args.dataset_name}'. " + "Make sure to set `--label_column_name` to the correct label column - one of " + f"{', '.join(dataset_column_names)}." + ) # If we don't have a validation split, split off a percentage of train as validation. args.train_val_split = None if "validation" in dataset.keys() else args.train_val_split @@ -314,7 +322,7 @@ def main(): # Prepare label mappings. # We'll include these in the model's config to get human readable labels in the Inference API. 
- labels = dataset["train"].features["labels"].names + labels = dataset["train"].features[args.label_column_name].names label2id = {label: str(i) for i, label in enumerate(labels)} id2label = {str(i): label for i, label in enumerate(labels)} @@ -373,12 +381,16 @@ def main(): def preprocess_train(example_batch): """Apply _train_transforms across a batch.""" - example_batch["pixel_values"] = [train_transforms(image.convert("RGB")) for image in example_batch["image"]] + example_batch["pixel_values"] = [ + train_transforms(image.convert("RGB")) for image in example_batch[args.image_column_name] + ] return example_batch def preprocess_val(example_batch): """Apply _val_transforms across a batch.""" - example_batch["pixel_values"] = [val_transforms(image.convert("RGB")) for image in example_batch["image"]] + example_batch["pixel_values"] = [ + val_transforms(image.convert("RGB")) for image in example_batch[args.image_column_name] + ] return example_batch with accelerator.main_process_first(): @@ -394,7 +406,7 @@ def preprocess_val(example_batch): # DataLoaders creation: def collate_fn(examples): pixel_values = torch.stack([example["pixel_values"] for example in examples]) - labels = torch.tensor([example["labels"] for example in examples]) + labels = torch.tensor([example[args.label_column_name] for example in examples]) return {"pixel_values": pixel_values, "labels": labels} train_dataloader = DataLoader( From a7f1d8e396965a4b3b4221acf5dc3ca52d9947ea Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Thu, 11 Jan 2024 00:33:41 +0100 Subject: [PATCH 5/6] Fix README --- examples/pytorch/image-classification/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/pytorch/image-classification/README.md b/examples/pytorch/image-classification/README.md index 04b4748774ddf7..c95f180d4502cb 100644 --- a/examples/pytorch/image-classification/README.md +++ b/examples/pytorch/image-classification/README.md @@ 
-41,6 +41,7 @@ python run_image_classification.py \ --dataset_name beans \ --output_dir ./beans_outputs/ \ --remove_unused_columns False \ + --label_column_name labels \ --do_train \ --do_eval \ --push_to_hub \ @@ -197,7 +198,7 @@ accelerate test that will check everything is ready for training. Finally, you can launch training with ```bash -accelerate launch run_image_classification_trainer.py +accelerate launch run_image_classification_no_trainer.py --image_column_name img ``` This command is the same and will work for: From 445f30ab31cc2582a964c4f4d7b2c179097d358d Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Mon, 15 Jan 2024 17:26:11 +0100 Subject: [PATCH 6/6] Update tests --- examples/pytorch/test_accelerate_examples.py | 1 + examples/pytorch/test_pytorch_examples.py | 1 + 2 files changed, 2 insertions(+) diff --git a/examples/pytorch/test_accelerate_examples.py b/examples/pytorch/test_accelerate_examples.py index 8749c8add77950..fc485cf59a2ebb 100644 --- a/examples/pytorch/test_accelerate_examples.py +++ b/examples/pytorch/test_accelerate_examples.py @@ -322,6 +322,7 @@ def test_run_image_classification_no_trainer(self): --output_dir {tmp_dir} --with_tracking --checkpointing_steps 1 + --label_column_name labels """.split() run_command(self._launch_args + testargs) diff --git a/examples/pytorch/test_pytorch_examples.py b/examples/pytorch/test_pytorch_examples.py index a0781b356595ba..0aabbb4bcb881c 100644 --- a/examples/pytorch/test_pytorch_examples.py +++ b/examples/pytorch/test_pytorch_examples.py @@ -398,6 +398,7 @@ def test_run_image_classification(self): --max_steps 10 --train_val_split 0.1 --seed 42 + --label_column_name labels """.split() if is_torch_fp16_available_on_device(torch_device):