From 14f2bbe14f11c3efd75b147a2b5798ec980dfbe0 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Fri, 9 Feb 2024 15:10:56 +0000 Subject: [PATCH 01/64] added assert of torch vs numpy types --- .../data_pipeline/data_sampling/data_analyzer.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index cb0d366ce798..014caf75d602 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -84,12 +84,14 @@ def init_metric_results(self, thread_id, metric_names, metric_types, metric_dtyp metric_results.append({"metric_value": metric_value, "metric_value_fname": metric_value_fname}) return metric_results - def update_metric_results(self, data, metric_types, metric_functions, metric_results): + def update_metric_results(self, data, metric_types, metric_dtypes, metric_functions, metric_results): for m_idx in range(len(metric_types)): - metric_type, metric_function, metric_result = metric_types[m_idx], \ - metric_functions[m_idx], metric_results[m_idx] + metric_type, metric_dtype, metric_function, metric_result = metric_types[m_idx], \ + metric_dtypes[m_idx], metric_functions[m_idx], metric_results[m_idx] + metric_values = metric_function(data) + assert metric_values.numpy().dtype == metric_dtype, \ + f"dtype {type(m_value)} returned by metric_function {metric_function} is not consistent with the metric_dtype {metric_dtype}" if metric_type == 'single_value_per_sample': - metric_values = metric_function(data) for row in range(metric_values.size()[0]): metric_result["sample_to_metric_builder"].add_item(metric_values[row].reshape(-1)) metric_result["metric_to_sample_dict"][metric_values[row].item()].append( @@ -102,7 +104,6 @@ def update_metric_results(self, data, metric_types, metric_functions, metric_res writer.writerows([metric_result["metric_to_sample_dict"][m_value]]) metric_result["metric_to_sample_dict"][m_value] = [] elif metric_type == 'accumulate_value_over_samples': - metric_values = metric_function(data) if metric_result["metric_value"] is None: metric_result["metric_value"] = metric_values else: @@ -158,7 +159,7 @@ def run_map_helper(self, thread_id): try: data = next(iterator) if self.custom_map_update is None: - self.update_metric_results(data, self.metric_types, self.metric_functions, metric_results) + self.update_metric_results(data, self.metric_types, self.metric_dtypes, self.metric_functions, metric_results) else: self.custom_map_update(data, self.metric_types, self.metric_functions, metric_results) processed_sample += self.batch_size @@ -415,3 +416,4 @@ def run_reduce(self): else: self.custom_reduce(self.dataset, self.metric_names, self.metric_types, self.save_path, self.num_workers, self.num_threads, self.num_threads_reduce) + From 796341d939d8b4c9f44a12c562a9c176535a9b57 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Wed, 14 Feb 2024 09:24:26 +0000 Subject: [PATCH 02/64] first draft --- .../data_sampling/data_analyzer.py | 167 +++++++++++++++++- 1 file changed, 165 insertions(+), 2 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index 014caf75d602..00b095885c2f 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -93,8 +93,9 @@ def 
update_metric_results(self, data, metric_types, metric_dtypes, metric_functi f"dtype {type(m_value)} returned by metric_function {metric_function} is not consistent with the metric_dtype {metric_dtype}" if metric_type == 'single_value_per_sample': for row in range(metric_values.size()[0]): - metric_result["sample_to_metric_builder"].add_item(metric_values[row].reshape(-1)) - metric_result["metric_to_sample_dict"][metric_values[row].item()].append( + value = metric_values[row].item() + metric_result["sample_to_metric_builder"].add_item(value) + metric_result["metric_to_sample_dict"][value].append( data['index'][row][0].item()) for m_value in metric_result["metric_to_sample_dict"]: if len(metric_result["metric_to_sample_dict"][m_value]) > 100: @@ -417,3 +418,165 @@ def run_reduce(self): self.custom_reduce(self.dataset, self.metric_names, self.metric_types, self.save_path, self.num_workers, self.num_threads, self.num_threads_reduce) + + + + + +class DistributedDataAnalyzer(object): + + @staticmethod + def run_map_reduce( + dataset, + batch_size=1, + metric_names=[], + metric_functions=[], + metric_types=[], + metric_dtypes=[], + save_path="./", + collate_fn=None, + comm_group=None, + ): + + # setup individual dataloaders + num_workers, worker_id = comm_group.size(), comm_group.rank() + worker_splits, _ = split_dataset( dataset, num_workers, worker_id, num_threads=1) + start_idx, end_idx = worker_splits[worker_id], worker_splits[worker_id+1] + logger.info(f"worker {worker_id}: start working on data subset {start_idx} to {end_idx}") + worker_dataset = Subset(dataset, list(range(start_idx, end_idx))) + sampler = BatchSampler(SequentialSampler(worker_dataset), batch_size=batch_size, drop_last=False) + dataloader = DataLoader(dataset=worker_dataset, batch_sampler=sampler, + num_workers=0, collate_fn=collate_fn, pin_memory=False) + + # iterate dataloader and store metric results + sample_idx = start_idx + metric_results = [ []*len(metric_names) ] + for data in dataloader: + for m_idx in range(len(metric_types)): + metric_type, metric_dtype, metric_function, metric_result = \ + metric_types[m_idx], metric_dtypes[m_idx], metric_functions[m_idx], metric_results[m_idx] + metric_values = metric_function(data) + assert metric_type == 'single_value_per_sample', f"{metric_type} not implemented." + assert torch.is_tensor(metric_values) or isinstance(metric_values, np.ndarray), \ + "metric_function must return a tensor or array" + assert metric_values.dtype == metric_dtype, \ + f"metric_function result dtype {metric_values.dtype} doesnt match metric_dtype {metric_dtype}" + if isinstance(metric_values, np.ndarray): + metric_values = torch.from_numpy(metric_values) + + for row in range(metric_values.size()[0]): + val = metric_values[row].item() + metric_result.append((sample_idx, val)) + sample_idx+=1 + + # compute dtype for sample ids + total_num_samples = len(dataset) + sample_idx_dtype = find_fit_int_dtype(0, total_num_samples - 1) + logger.info(f"Total number of data samples: {total_num_samples}.") + logger.info(f"Will use {sample_idx_dtype} to store the sample indexes.") + + metric_results = [ torch.tensor(m) for m in metric_results ] # convert to list of tensors + for m_idx in range(len(metric_names)): + + metric_result = metric_results[m_idx] + metric_name, metric_type = metric_names[m_idx], metric_types[m_idx] + assert metric_type == 'single_value_per_sample', f"{metric_type} not implemented." 
+ metric_save_path = f"{save_path}/{metric_name}/" + + # get unique values across all ranks and compute the values dtype based on min/max + ids, values = metric_result[:,0], metric_result[:,1] + value_min, value_max = DistributedDataAnalyzer.dist_min_max(values, comm_group) + metric_value_dtype = find_fit_int_dtype(value_min, value_max) + + # sample_to_metric iterated metric_results and stored all metric values in same order + sample_to_metric_fname = f"{metric_save_path}/{metric_name}_sample_to_metric" + DistributedDataAnalyzer.dist_sequential_write([ids], sample_to_metric_fname, metric_value_dtype) + + # index_to_metric outputs a list of unique values (read from an value-ordered set) + # index_to_sample outputs the list of all sample ids for each unique value + index_to_metric_fname = f"{metric_save_path}/{metric_name}_index_to_metric" + index_to_sample_fname = f"{metric_save_path}/{metric_name}_index_to_sample" + metric_result = metric_result[:,[1,0]] # swap columns + metric_result = dist_sample_sort(metric_result, comm_group) + unique_vals, sample_counts = torch.unique(metric_result[:,0], return_counts=True) + + values_buffer, samples_buffer, samples_it = [], [], 0 + for unique_v, count in zip(unique_vals, sample_counts): + values_buffer.append(unique_v.unsqueeze(0), dtype=torch.long) + samples_buffer.append(ids[samples_it:samples_it+count], dtype=type.long) + samples_it += count + DistributedDataAnalyzer.dist_sequential_write(values_buffer, index_to_metric_fname, metric_value_dtype) + DistributedDataAnalyzer.dist_sequential_write(samples_buffer, index_to_sample_fname, sample_idx_dtype) + + + @staticmethod + def dist_sequential_write(tensor_list, fname, dtype, comm_group): + """ save distributed values to files (each rank appends iteratively to the same file) """ + num_workers, worker_id = comm_group.size(), comm_group.rank() + builder = create_mmap_dataset_builder(fname, dtype) + assert isinstance(tensor_list, list), "tensor_list must be a list" + for rank in range(num_workers): + if rank == worker_id: + for tensor in tensor_list: + assert torch.is_tensor(tensor) and tensor.size()==1, "must be 1D tensor" + builder.add_item(tensor) + dist.barrier(comm_group) + close_mmap_dataset_builder(builder, fname) + + + @staticmethod + def dist_min_max(tensor, comm_group): + """ given a 1D tensor, return the min/max values across all ranks""" + assert len(tensor.size()) == 1, "tensor must be single-dimensional" + value_min, value_max = tensor.min(), tensor.max() + dist.all_reduce(value_min, op=dist.reduce_op.MIN, group=comm_group) + dist.all_reduce(value_max, op=dist.reduce_op.MAX, group=comm_group) + return value_min.item(), value_max.item() + + + @staticmethod + def dist_sample_sort(tensor, comm_group, n_samples=100): + """ perform a distributed random sort of a 2D tensor, and return the sorted partial tensor""" + + world_size, world_rank = comm_group.size(), comm_group.rank() + assert len(tensor.size()) == 2, "tensor must be 2D" + device, dims = tensor.device, tensor.size()[1] + + # 1 - Sort locally + tensor = torch.sort(tensor, dim=0)[0] + + # 2 - collect few samples per rank + idx = torch.round(torch.linspace(0, len(tensor) - 1, n_samples+1)).to(int) + samples = tensor[idx[:-1]][:,0].to(device) #only first column, all but last row + + # 2 - Allgather samples + all_samples = [torch.zeros(n_samples, device=device).contiguous()] * world_size + dist.all_gather(all_samples, samples.contiguous(), group=comm_group) + all_samples = torch.cat(all_samples, dim=0).to(device) + + # 3 - Sort all samples 
and collect the ranges of each rank as equidistant + all_samples = all_samples.sort()[0] + idx = torch.round(torch.linspace(0, len(all_samples) - 1, world_size + 1)).to(int) + ranges = all_samples[idx] # range of each rank r as ranges[r] <= x < ranges[r+1] + ranges[-1] = +torch.inf #upper limit of last rank. + + # 4 - collect elements to send to each rank, based on the rank ranges + send = [] + for rank in range(world_size): + mask = (tensor[:,0] >= ranges[rank]) & (tensor[:,0] < ranges[rank+1]) + send.append(tensor[mask]) + + # 5. all to all to communicate the sizes to be sent/recv + send_count = [ torch.tensor([len(tensor)*dims], dtype=torch.int64).to(device) for tensor in send] + recv_count = list(torch.empty([world_size], dtype=torch.int64, device=device).chunk(world_size)) + dist.all_to_all(recv_count, send_count, group=comm_group) + + # 6. all to all to communicate the elements to be sent/recv as a single tensor + send = torch.cat(send, dim=0).flatten().to(device) + recv = torch.zeros( sum(recv_count), dtype=send.dtype).to(device) + dist.all_to_all_single(recv, send, recv_count, send_count, group=comm_group) + + # 7. the received tensor is the 1D disjoint subset of the distributed tensor + return recv.view(-1, dims) + + From 07aa4b428175f808ca05fed10f1b2784b03271bc Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Wed, 14 Feb 2024 09:38:55 +0000 Subject: [PATCH 03/64] reverted to original master --- .../data_sampling/data_analyzer.py | 28 +++++++++---------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index 00b095885c2f..66fcd1761f01 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -13,6 +13,7 @@ from torch.utils.data import BatchSampler, SequentialSampler, DataLoader, Subset from deepspeed.utils import logger +import deepspeed.comm as dist from .indexed_dataset import MMapIndexedDataset from .utils import split_dataset, split_index, create_mmap_dataset_builder, close_mmap_dataset_builder, find_fit_int_dtype @@ -84,14 +85,12 @@ def init_metric_results(self, thread_id, metric_names, metric_types, metric_dtyp metric_results.append({"metric_value": metric_value, "metric_value_fname": metric_value_fname}) return metric_results - def update_metric_results(self, data, metric_types, metric_dtypes, metric_functions, metric_results): + def update_metric_results(self, data, metric_types, metric_functions, metric_results): for m_idx in range(len(metric_types)): - metric_type, metric_dtype, metric_function, metric_result = metric_types[m_idx], \ - metric_dtypes[m_idx], metric_functions[m_idx], metric_results[m_idx] - metric_values = metric_function(data) - assert metric_values.numpy().dtype == metric_dtype, \ - f"dtype {type(m_value)} returned by metric_function {metric_function} is not consistent with the metric_dtype {metric_dtype}" + metric_type, metric_function, metric_result = metric_types[m_idx], \ + metric_functions[m_idx], metric_results[m_idx] if metric_type == 'single_value_per_sample': + metric_values = metric_function(data) for row in range(metric_values.size()[0]): value = metric_values[row].item() metric_result["sample_to_metric_builder"].add_item(value) @@ -105,6 +104,7 @@ def update_metric_results(self, data, metric_types, metric_dtypes, metric_functi writer.writerows([metric_result["metric_to_sample_dict"][m_value]]) 
metric_result["metric_to_sample_dict"][m_value] = [] elif metric_type == 'accumulate_value_over_samples': + metric_values = metric_function(data) if metric_result["metric_value"] is None: metric_result["metric_value"] = metric_values else: @@ -160,7 +160,7 @@ def run_map_helper(self, thread_id): try: data = next(iterator) if self.custom_map_update is None: - self.update_metric_results(data, self.metric_types, self.metric_dtypes, self.metric_functions, metric_results) + self.update_metric_results(data, self.metric_types, self.metric_functions, metric_results) else: self.custom_map_update(data, self.metric_types, self.metric_functions, metric_results) processed_sample += self.batch_size @@ -421,8 +421,6 @@ def run_reduce(self): - - class DistributedDataAnalyzer(object): @staticmethod @@ -468,7 +466,7 @@ def run_map_reduce( val = metric_values[row].item() metric_result.append((sample_idx, val)) sample_idx+=1 - + # compute dtype for sample ids total_num_samples = len(dataset) sample_idx_dtype = find_fit_int_dtype(0, total_num_samples - 1) @@ -485,7 +483,7 @@ def run_map_reduce( # get unique values across all ranks and compute the values dtype based on min/max ids, values = metric_result[:,0], metric_result[:,1] - value_min, value_max = DistributedDataAnalyzer.dist_min_max(values, comm_group) + value_min, value_max = DistributedDataAnalyzer.dist_min_max(values, comm_group) metric_value_dtype = find_fit_int_dtype(value_min, value_max) # sample_to_metric iterated metric_results and stored all metric values in same order @@ -515,7 +513,7 @@ def dist_sequential_write(tensor_list, fname, dtype, comm_group): num_workers, worker_id = comm_group.size(), comm_group.rank() builder = create_mmap_dataset_builder(fname, dtype) assert isinstance(tensor_list, list), "tensor_list must be a list" - for rank in range(num_workers): + for rank in range(num_workers): if rank == worker_id: for tensor in tensor_list: assert torch.is_tensor(tensor) and tensor.size()==1, "must be 1D tensor" @@ -530,7 +528,7 @@ def dist_min_max(tensor, comm_group): assert len(tensor.size()) == 1, "tensor must be single-dimensional" value_min, value_max = tensor.min(), tensor.max() dist.all_reduce(value_min, op=dist.reduce_op.MIN, group=comm_group) - dist.all_reduce(value_max, op=dist.reduce_op.MAX, group=comm_group) + dist.all_reduce(value_max, op=dist.reduce_op.MAX, group=comm_group) return value_min.item(), value_max.item() @@ -558,7 +556,7 @@ def dist_sample_sort(tensor, comm_group, n_samples=100): all_samples = all_samples.sort()[0] idx = torch.round(torch.linspace(0, len(all_samples) - 1, world_size + 1)).to(int) ranges = all_samples[idx] # range of each rank r as ranges[r] <= x < ranges[r+1] - ranges[-1] = +torch.inf #upper limit of last rank. + ranges[-1] = +torch.inf #upper limit of last rank. # 4 - collect elements to send to each rank, based on the rank ranges send = [] @@ -575,7 +573,7 @@ def dist_sample_sort(tensor, comm_group, n_samples=100): send = torch.cat(send, dim=0).flatten().to(device) recv = torch.zeros( sum(recv_count), dtype=send.dtype).to(device) dist.all_to_all_single(recv, send, recv_count, send_count, group=comm_group) - + # 7. 
the received tensor is the 1D disjoint subset of the distributed tensor return recv.view(-1, dims) From 815a7897f4851332a54f82ac301cb3e53e21ce3f Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Wed, 14 Feb 2024 10:19:02 +0000 Subject: [PATCH 04/64] added metric type accumulate_value_over_samples --- .../data_sampling/data_analyzer.py | 88 ++++++++++++------- 1 file changed, 54 insertions(+), 34 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index 66fcd1761f01..87bd69cefefe 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -446,15 +446,19 @@ def run_map_reduce( dataloader = DataLoader(dataset=worker_dataset, batch_sampler=sampler, num_workers=0, collate_fn=collate_fn, pin_memory=False) - # iterate dataloader and store metric results + # set initial results list sample_idx = start_idx - metric_results = [ []*len(metric_names) ] + if metric_type not in ['single_value_per_sample', 'accumulate_value_over_samples']: + raise ValueError(f"metric_type {metric_type} not implemented.") + init_result = [] if metric_type == 'single_value_per_sample' else None + metric_results = [init_result] * len(metric_names) + + # iterate dataloader and store metric results for data in dataloader: for m_idx in range(len(metric_types)): metric_type, metric_dtype, metric_function, metric_result = \ metric_types[m_idx], metric_dtypes[m_idx], metric_functions[m_idx], metric_results[m_idx] metric_values = metric_function(data) - assert metric_type == 'single_value_per_sample', f"{metric_type} not implemented." assert torch.is_tensor(metric_values) or isinstance(metric_values, np.ndarray), \ "metric_function must return a tensor or array" assert metric_values.dtype == metric_dtype, \ @@ -462,11 +466,17 @@ def run_map_reduce( if isinstance(metric_values, np.ndarray): metric_values = torch.from_numpy(metric_values) - for row in range(metric_values.size()[0]): - val = metric_values[row].item() - metric_result.append((sample_idx, val)) - sample_idx+=1 - + if metric_type == 'single_value_per_sample': + for row in range(metric_values.size()[0]): + value = metric_values[row].item() + metric_result.append((sample_idx, value)) + sample_idx+=1 + elif metric_type == 'accumulate_value_over_samples': + if metric_result is None: + metric_result = metric_values + else: + metric_result.add_(metric_values) + # compute dtype for sample ids total_num_samples = len(dataset) sample_idx_dtype = find_fit_int_dtype(0, total_num_samples - 1) @@ -478,33 +488,42 @@ def run_map_reduce( metric_result = metric_results[m_idx] metric_name, metric_type = metric_names[m_idx], metric_types[m_idx] - assert metric_type == 'single_value_per_sample', f"{metric_type} not implemented." 
metric_save_path = f"{save_path}/{metric_name}/" - # get unique values across all ranks and compute the values dtype based on min/max - ids, values = metric_result[:,0], metric_result[:,1] - value_min, value_max = DistributedDataAnalyzer.dist_min_max(values, comm_group) - metric_value_dtype = find_fit_int_dtype(value_min, value_max) + if metric_type == 'single_value_per_sample': - # sample_to_metric iterated metric_results and stored all metric values in same order - sample_to_metric_fname = f"{metric_save_path}/{metric_name}_sample_to_metric" - DistributedDataAnalyzer.dist_sequential_write([ids], sample_to_metric_fname, metric_value_dtype) + # get unique values across all ranks and compute the values dtype based on min/max + ids, values = metric_result[:,0], metric_result[:,1] + value_min, value_max = DistributedDataAnalyzer.dist_min_max(values, comm_group) + metric_value_dtype = find_fit_int_dtype(value_min, value_max) - # index_to_metric outputs a list of unique values (read from an value-ordered set) - # index_to_sample outputs the list of all sample ids for each unique value - index_to_metric_fname = f"{metric_save_path}/{metric_name}_index_to_metric" - index_to_sample_fname = f"{metric_save_path}/{metric_name}_index_to_sample" - metric_result = metric_result[:,[1,0]] # swap columns - metric_result = dist_sample_sort(metric_result, comm_group) - unique_vals, sample_counts = torch.unique(metric_result[:,0], return_counts=True) + # sample_to_metric iterated metric_results and stored all metric values in same order + sample_to_metric_fname = f"{metric_save_path}/{metric_name}_sample_to_metric" + DistributedDataAnalyzer.dist_sequential_write([ids], sample_to_metric_fname, metric_value_dtype) - values_buffer, samples_buffer, samples_it = [], [], 0 - for unique_v, count in zip(unique_vals, sample_counts): - values_buffer.append(unique_v.unsqueeze(0), dtype=torch.long) - samples_buffer.append(ids[samples_it:samples_it+count], dtype=type.long) - samples_it += count - DistributedDataAnalyzer.dist_sequential_write(values_buffer, index_to_metric_fname, metric_value_dtype) - DistributedDataAnalyzer.dist_sequential_write(samples_buffer, index_to_sample_fname, sample_idx_dtype) + # index_to_metric outputs a list of unique values (read from an value-ordered set) + # index_to_sample outputs the list of all sample ids for each unique value + index_to_metric_fname = f"{metric_save_path}/{metric_name}_index_to_metric" + index_to_sample_fname = f"{metric_save_path}/{metric_name}_index_to_sample" + metric_result = metric_result[:,[1,0]] # swap columns + metric_result = DistributedDataAnalyzer.dist_sample_sort(metric_result, comm_group) + unique_vals, sample_counts = torch.unique(metric_result[:,0], return_counts=True) + + values_buffer, samples_buffer, samples_it = [], [], 0 + for unique_v, count in zip(unique_vals, sample_counts): + values_buffer.append(unique_v.unsqueeze(0), dtype=torch.long) + samples_buffer.append(ids[samples_it:samples_it+count], dtype=type.long) + samples_it += count + DistributedDataAnalyzer.dist_sequential_write(values_buffer, index_to_metric_fname, metric_value_dtype) + DistributedDataAnalyzer.dist_sequential_write(samples_buffer, index_to_sample_fname, sample_idx_dtype) + elif metric_type == 'accumulate_value_over_samples': + metric_value_fname = f"{metric_save_path}/{metric_name}_metric_value" + # gather the sum of all values across all ranks and write to file + dist.all_reduce(metric_result, op=dist.reduce_op.SUM, group=comm_group) + if worker_id == 0: + builder = 
create_mmap_dataset_builder(metric_value_fname, metric_value_dtype) + builder.add_item(metric_result) + close_mmap_dataset_builder(builder, metric_value_fname) @staticmethod @@ -513,7 +532,7 @@ def dist_sequential_write(tensor_list, fname, dtype, comm_group): num_workers, worker_id = comm_group.size(), comm_group.rank() builder = create_mmap_dataset_builder(fname, dtype) assert isinstance(tensor_list, list), "tensor_list must be a list" - for rank in range(num_workers): + for rank in range(num_workers): if rank == worker_id: for tensor in tensor_list: assert torch.is_tensor(tensor) and tensor.size()==1, "must be 1D tensor" @@ -528,7 +547,7 @@ def dist_min_max(tensor, comm_group): assert len(tensor.size()) == 1, "tensor must be single-dimensional" value_min, value_max = tensor.min(), tensor.max() dist.all_reduce(value_min, op=dist.reduce_op.MIN, group=comm_group) - dist.all_reduce(value_max, op=dist.reduce_op.MAX, group=comm_group) + dist.all_reduce(value_max, op=dist.reduce_op.MAX, group=comm_group) return value_min.item(), value_max.item() @@ -556,7 +575,7 @@ def dist_sample_sort(tensor, comm_group, n_samples=100): all_samples = all_samples.sort()[0] idx = torch.round(torch.linspace(0, len(all_samples) - 1, world_size + 1)).to(int) ranges = all_samples[idx] # range of each rank r as ranges[r] <= x < ranges[r+1] - ranges[-1] = +torch.inf #upper limit of last rank. + ranges[-1] = +torch.inf #upper limit of last rank. # 4 - collect elements to send to each rank, based on the rank ranges send = [] @@ -573,8 +592,9 @@ def dist_sample_sort(tensor, comm_group, n_samples=100): send = torch.cat(send, dim=0).flatten().to(device) recv = torch.zeros( sum(recv_count), dtype=send.dtype).to(device) dist.all_to_all_single(recv, send, recv_count, send_count, group=comm_group) - + # 7. 
the received tensor is the 1D disjoint subset of the distributed tensor return recv.view(-1, dims) + From 28a72e7cfad9d93d1d996671a76bcb2bc7eae044 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Wed, 14 Feb 2024 10:30:58 +0000 Subject: [PATCH 05/64] pre-commit --- .../data_sampling/data_analyzer.py | 85 +++++++++---------- 1 file changed, 39 insertions(+), 46 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index 87bd69cefefe..f7b4498275d0 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -94,8 +94,7 @@ def update_metric_results(self, data, metric_types, metric_functions, metric_res for row in range(metric_values.size()[0]): value = metric_values[row].item() metric_result["sample_to_metric_builder"].add_item(value) - metric_result["metric_to_sample_dict"][value].append( - data['index'][row][0].item()) + metric_result["metric_to_sample_dict"][value].append(data['index'][row][0].item()) for m_value in metric_result["metric_to_sample_dict"]: if len(metric_result["metric_to_sample_dict"][m_value]) > 100: metric_fname = metric_result["metric_to_sample_fname"] @@ -419,41 +418,43 @@ def run_reduce(self): self.num_threads, self.num_threads_reduce) - - class DistributedDataAnalyzer(object): @staticmethod def run_map_reduce( - dataset, - batch_size=1, - metric_names=[], - metric_functions=[], - metric_types=[], - metric_dtypes=[], - save_path="./", - collate_fn=None, - comm_group=None, - ): + dataset, + batch_size=1, + metric_names=[], + metric_functions=[], + metric_types=[], + metric_dtypes=[], + save_path="./", + collate_fn=None, + comm_group=None, + ): # setup individual dataloaders num_workers, worker_id = comm_group.size(), comm_group.rank() - worker_splits, _ = split_dataset( dataset, num_workers, worker_id, num_threads=1) - start_idx, end_idx = worker_splits[worker_id], worker_splits[worker_id+1] + worker_splits, _ = split_dataset(dataset, num_workers, worker_id, num_threads=1) + start_idx, end_idx = worker_splits[worker_id], worker_splits[worker_id + 1] logger.info(f"worker {worker_id}: start working on data subset {start_idx} to {end_idx}") worker_dataset = Subset(dataset, list(range(start_idx, end_idx))) sampler = BatchSampler(SequentialSampler(worker_dataset), batch_size=batch_size, drop_last=False) - dataloader = DataLoader(dataset=worker_dataset, batch_sampler=sampler, - num_workers=0, collate_fn=collate_fn, pin_memory=False) + dataloader = DataLoader(dataset=worker_dataset, + batch_sampler=sampler, + num_workers=0, + collate_fn=collate_fn, + pin_memory=False) # set initial results list - sample_idx = start_idx - if metric_type not in ['single_value_per_sample', 'accumulate_value_over_samples']: - raise ValueError(f"metric_type {metric_type} not implemented.") - init_result = [] if metric_type == 'single_value_per_sample' else None - metric_results = [init_result] * len(metric_names) + metric_results = [] + for metric_type in metric_types: + assert metric_type in ['single_value_per_sample', 'accumulate_value_over_samples'], \ + f"metric_type {metric_type} not implemented." 
+ metric_results.append([] if metric_type == 'single_value_per_sample' else None) # iterate dataloader and store metric results + sample_idx = start_idx for data in dataloader: for m_idx in range(len(metric_types)): metric_type, metric_dtype, metric_function, metric_result = \ @@ -462,7 +463,7 @@ def run_map_reduce( assert torch.is_tensor(metric_values) or isinstance(metric_values, np.ndarray), \ "metric_function must return a tensor or array" assert metric_values.dtype == metric_dtype, \ - f"metric_function result dtype {metric_values.dtype} doesnt match metric_dtype {metric_dtype}" + f"metric_function result dtype {metric_values.dtype} does not match metric_dtype {metric_dtype}" if isinstance(metric_values, np.ndarray): metric_values = torch.from_numpy(metric_values) @@ -470,7 +471,7 @@ def run_map_reduce( for row in range(metric_values.size()[0]): value = metric_values[row].item() metric_result.append((sample_idx, value)) - sample_idx+=1 + sample_idx += 1 elif metric_type == 'accumulate_value_over_samples': if metric_result is None: metric_result = metric_values @@ -483,7 +484,7 @@ def run_map_reduce( logger.info(f"Total number of data samples: {total_num_samples}.") logger.info(f"Will use {sample_idx_dtype} to store the sample indexes.") - metric_results = [ torch.tensor(m) for m in metric_results ] # convert to list of tensors + metric_results = [torch.tensor(m) for m in metric_results] # convert to list of tensors for m_idx in range(len(metric_names)): metric_result = metric_results[m_idx] @@ -493,7 +494,7 @@ def run_map_reduce( if metric_type == 'single_value_per_sample': # get unique values across all ranks and compute the values dtype based on min/max - ids, values = metric_result[:,0], metric_result[:,1] + ids, values = metric_result[:, 0], metric_result[:, 1] value_min, value_max = DistributedDataAnalyzer.dist_min_max(values, comm_group) metric_value_dtype = find_fit_int_dtype(value_min, value_max) @@ -505,14 +506,14 @@ def run_map_reduce( # index_to_sample outputs the list of all sample ids for each unique value index_to_metric_fname = f"{metric_save_path}/{metric_name}_index_to_metric" index_to_sample_fname = f"{metric_save_path}/{metric_name}_index_to_sample" - metric_result = metric_result[:,[1,0]] # swap columns + metric_result = metric_result[:, [1, 0]] # swap columns metric_result = DistributedDataAnalyzer.dist_sample_sort(metric_result, comm_group) - unique_vals, sample_counts = torch.unique(metric_result[:,0], return_counts=True) + unique_vals, sample_counts = torch.unique(metric_result[:, 0], return_counts=True) values_buffer, samples_buffer, samples_it = [], [], 0 for unique_v, count in zip(unique_vals, sample_counts): values_buffer.append(unique_v.unsqueeze(0), dtype=torch.long) - samples_buffer.append(ids[samples_it:samples_it+count], dtype=type.long) + samples_buffer.append(ids[samples_it:samples_it + count], dtype=type.long) samples_it += count DistributedDataAnalyzer.dist_sequential_write(values_buffer, index_to_metric_fname, metric_value_dtype) DistributedDataAnalyzer.dist_sequential_write(samples_buffer, index_to_sample_fname, sample_idx_dtype) @@ -525,7 +526,6 @@ def run_map_reduce( builder.add_item(metric_result) close_mmap_dataset_builder(builder, metric_value_fname) - @staticmethod def dist_sequential_write(tensor_list, fname, dtype, comm_group): """ save distributed values to files (each rank appends iteratively to the same file) """ @@ -535,36 +535,32 @@ def dist_sequential_write(tensor_list, fname, dtype, comm_group): for rank in range(num_workers): if 
rank == worker_id: for tensor in tensor_list: - assert torch.is_tensor(tensor) and tensor.size()==1, "must be 1D tensor" + assert torch.is_tensor(tensor) and tensor.size() == 1, "must be 1D tensor" builder.add_item(tensor) dist.barrier(comm_group) close_mmap_dataset_builder(builder, fname) - @staticmethod def dist_min_max(tensor, comm_group): """ given a 1D tensor, return the min/max values across all ranks""" - assert len(tensor.size()) == 1, "tensor must be single-dimensional" value_min, value_max = tensor.min(), tensor.max() dist.all_reduce(value_min, op=dist.reduce_op.MIN, group=comm_group) dist.all_reduce(value_max, op=dist.reduce_op.MAX, group=comm_group) return value_min.item(), value_max.item() - @staticmethod def dist_sample_sort(tensor, comm_group, n_samples=100): """ perform a distributed random sort of a 2D tensor, and return the sorted partial tensor""" world_size, world_rank = comm_group.size(), comm_group.rank() - assert len(tensor.size()) == 2, "tensor must be 2D" device, dims = tensor.device, tensor.size()[1] # 1 - Sort locally tensor = torch.sort(tensor, dim=0)[0] # 2 - collect few samples per rank - idx = torch.round(torch.linspace(0, len(tensor) - 1, n_samples+1)).to(int) - samples = tensor[idx[:-1]][:,0].to(device) #only first column, all but last row + idx = torch.round(torch.linspace(0, len(tensor) - 1, n_samples + 1)).to(int) + samples = tensor[idx[:-1]][:, 0].to(device) #only first column, all but last row # 2 - Allgather samples all_samples = [torch.zeros(n_samples, device=device).contiguous()] * world_size @@ -574,27 +570,24 @@ def dist_sample_sort(tensor, comm_group, n_samples=100): # 3 - Sort all samples and collect the ranges of each rank as equidistant all_samples = all_samples.sort()[0] idx = torch.round(torch.linspace(0, len(all_samples) - 1, world_size + 1)).to(int) - ranges = all_samples[idx] # range of each rank r as ranges[r] <= x < ranges[r+1] - ranges[-1] = +torch.inf #upper limit of last rank. + ranges = all_samples[idx] # range of each rank r as ranges[r] <= x < ranges[r+1] + ranges[-1] = +torch.inf #upper limit of last rank. # 4 - collect elements to send to each rank, based on the rank ranges send = [] for rank in range(world_size): - mask = (tensor[:,0] >= ranges[rank]) & (tensor[:,0] < ranges[rank+1]) + mask = (tensor[:, 0] >= ranges[rank]) & (tensor[:, 0] < ranges[rank + 1]) send.append(tensor[mask]) # 5. all to all to communicate the sizes to be sent/recv - send_count = [ torch.tensor([len(tensor)*dims], dtype=torch.int64).to(device) for tensor in send] + send_count = [torch.tensor([len(tensor) * dims], dtype=torch.int64).to(device) for tensor in send] recv_count = list(torch.empty([world_size], dtype=torch.int64, device=device).chunk(world_size)) dist.all_to_all(recv_count, send_count, group=comm_group) # 6. all to all to communicate the elements to be sent/recv as a single tensor send = torch.cat(send, dim=0).flatten().to(device) - recv = torch.zeros( sum(recv_count), dtype=send.dtype).to(device) + recv = torch.zeros(sum(recv_count), dtype=send.dtype).to(device) dist.all_to_all_single(recv, send, recv_count, send_count, group=comm_group) # 7. 
the received tensor is the 1D disjoint subset of the distributed tensor return recv.view(-1, dims) - - - From 38d7ce666e2263ef1af5b902af74fa9062cdfb53 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Wed, 14 Feb 2024 12:18:58 +0100 Subject: [PATCH 06/64] Update data_analyzer.py --- .../runtime/data_pipeline/data_sampling/data_analyzer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index b04ba68b3671..ed3807c9b59a 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -94,9 +94,9 @@ def update_metric_results(self, data, metric_types, metric_dtypes, metric_functi f"dtype {metric_values.numpy().dtype} returned by metric_function {metric_function} is not consistent with the metric_dtype {metric_dtype}" if metric_type == 'single_value_per_sample': for row in range(metric_values.size()[0]): - value = metric_values[row].item() - metric_result["sample_to_metric_builder"].add_item(value) - metric_result["metric_to_sample_dict"][value].append(data['index'][row][0].item()) + metric_result["sample_to_metric_builder"].add_item(metric_values[row].reshape(-1)) + metric_result["metric_to_sample_dict"][metric_values[row].item()].append( + data['index'][row][0].item()) for m_value in metric_result["metric_to_sample_dict"]: if len(metric_result["metric_to_sample_dict"][m_value]) > 100: metric_fname = metric_result["metric_to_sample_fname"] From 295fba6797526aee8b4ca475f0e90489e5fed2bb Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Wed, 14 Feb 2024 13:31:08 +0000 Subject: [PATCH 07/64] added check for single node reduce. 
added barriers --- .../data_pipeline/data_sampling/data_analyzer.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index 014caf75d602..b1f8e6aaeb24 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -36,7 +36,8 @@ def __init__(self, custom_map_init=None, custom_map_update=None, custom_map_finalize=None, - custom_reduce=None): + custom_reduce=None, + comm_group=None): super().__init__() self.dataset = dataset self.num_workers = num_workers @@ -55,6 +56,7 @@ def __init__(self, self.custom_map_update = custom_map_update self.custom_map_finalize = custom_map_finalize self.custom_reduce = custom_reduce + self.comm_group = comm_group def init_metric_results(self, thread_id, metric_names, metric_types, metric_dtypes, save_path, worker_id): metric_results = [] @@ -196,6 +198,7 @@ def run_map(self): else: assert self.num_threads == 1 self.run_map_helper(0) + dist.barrier(group=self.comm_group) def get_metric_value_percentiles(self, metric_name, num_sample_per_value, total_num_samples): logger.info(f"Checking the value percentiles of metric {metric_name}...") @@ -410,10 +413,12 @@ def merge_map_results(self, dataset, metric_names, metric_types, save_path, num_ close_mmap_dataset_builder(metric_value_builder, metric_value_fname) def run_reduce(self): - if self.custom_reduce is None: + if self.worker_id == 0: # only one node does merging of files + if self.custom_reduce is None: self.merge_map_results(self.dataset, self.metric_names, self.metric_types, self.save_path, self.num_workers, self.num_threads, self.num_threads_reduce) - else: + else: self.custom_reduce(self.dataset, self.metric_names, self.metric_types, self.save_path, self.num_workers, self.num_threads, self.num_threads_reduce) + dist.barrier(group=self.comm_group) From 4144e427aaecfbe3837b0aff63aecd9acd8998bc Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Wed, 14 Feb 2024 16:46:42 +0000 Subject: [PATCH 08/64] more bug fixes --- .../data_sampling/data_analyzer.py | 125 +++++++++--------- 1 file changed, 66 insertions(+), 59 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index ed3807c9b59a..c60a7840c235 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -417,60 +417,58 @@ def run_reduce(self): self.num_threads, self.num_threads_reduce) + class DistributedDataAnalyzer(object): @staticmethod def run_map_reduce( - dataset, - batch_size=1, - metric_names=[], - metric_functions=[], - metric_types=[], - metric_dtypes=[], - save_path="./", - collate_fn=None, - comm_group=None, - ): + dataset, + comm_group, + batch_size=1, + metric_names=[], + metric_functions=[], + metric_types=[], + save_path="./", + collate_fn=None, + device='cuda', + ): # setup individual dataloaders num_workers, worker_id = comm_group.size(), comm_group.rank() - worker_splits, _ = split_dataset(dataset, num_workers, worker_id, num_threads=1) - start_idx, end_idx = worker_splits[worker_id], worker_splits[worker_id + 1] + worker_splits, _ = split_dataset( dataset, num_workers, worker_id, num_threads=1) + start_idx, end_idx = worker_splits[worker_id] logger.info(f"worker {worker_id}: start working on data subset {start_idx} 
to {end_idx}") worker_dataset = Subset(dataset, list(range(start_idx, end_idx))) sampler = BatchSampler(SequentialSampler(worker_dataset), batch_size=batch_size, drop_last=False) - dataloader = DataLoader(dataset=worker_dataset, - batch_sampler=sampler, - num_workers=0, - collate_fn=collate_fn, - pin_memory=False) + dataloader = DataLoader(dataset=worker_dataset, batch_sampler=sampler, + num_workers=0, collate_fn=collate_fn, pin_memory=False) # set initial results list metric_results = [] for metric_type in metric_types: assert metric_type in ['single_value_per_sample', 'accumulate_value_over_samples'], \ f"metric_type {metric_type} not implemented." - metric_results.append([] if metric_type == 'single_value_per_sample' else None) + metric_results.append( [] if metric_type == 'single_value_per_sample' else None ) # iterate dataloader and store metric results sample_idx = start_idx + valid_int_dtypes = (torch.uint8, torch.int16, torch.int32, torch.int64, np.uint8, np.int16, np.int32, np.int64) for data in dataloader: for m_idx in range(len(metric_types)): - metric_type, metric_dtype, metric_function, metric_result = \ - metric_types[m_idx], metric_dtypes[m_idx], metric_functions[m_idx], metric_results[m_idx] + metric_type, metric_function, metric_result = metric_types[m_idx], metric_functions[m_idx], metric_results[m_idx] metric_values = metric_function(data) assert torch.is_tensor(metric_values) or isinstance(metric_values, np.ndarray), \ "metric_function must return a tensor or array" - assert metric_values.dtype == metric_dtype, \ - f"metric_function result dtype {metric_values.dtype} does not match metric_dtype {metric_dtype}" if isinstance(metric_values, np.ndarray): metric_values = torch.from_numpy(metric_values) + assert metric_values.dtype in valid_int_dtypes, \ + f"metric_function result dtype {metric_values.dtype} not supported. 
Supported dtypes {valid_int_dtypes}" if metric_type == 'single_value_per_sample': for row in range(metric_values.size()[0]): value = metric_values[row].item() metric_result.append((sample_idx, value)) - sample_idx += 1 + sample_idx+=1 elif metric_type == 'accumulate_value_over_samples': if metric_result is None: metric_result = metric_values @@ -483,70 +481,77 @@ def run_map_reduce( logger.info(f"Total number of data samples: {total_num_samples}.") logger.info(f"Will use {sample_idx_dtype} to store the sample indexes.") - metric_results = [torch.tensor(m) for m in metric_results] # convert to list of tensors + metric_results = [ torch.tensor(m).to(device) for m in metric_results ] for m_idx in range(len(metric_names)): - metric_result = metric_results[m_idx] - metric_name, metric_type = metric_names[m_idx], metric_types[m_idx] + metric_values, metric_name, metric_type = metric_results[m_idx], metric_names[m_idx], metric_types[m_idx] metric_save_path = f"{save_path}/{metric_name}/" - if metric_type == 'single_value_per_sample': # get unique values across all ranks and compute the values dtype based on min/max - ids, values = metric_result[:, 0], metric_result[:, 1] + ids, values = metric_values[:,0], metric_values[:,1] value_min, value_max = DistributedDataAnalyzer.dist_min_max(values, comm_group) metric_value_dtype = find_fit_int_dtype(value_min, value_max) # sample_to_metric iterated metric_results and stored all metric values in same order sample_to_metric_fname = f"{metric_save_path}/{metric_name}_sample_to_metric" - DistributedDataAnalyzer.dist_sequential_write([ids], sample_to_metric_fname, metric_value_dtype) + DistributedDataAnalyzer.file_write_ordered(ids, sample_to_metric_fname, metric_value_dtype, comm_group) # index_to_metric outputs a list of unique values (read from an value-ordered set) # index_to_sample outputs the list of all sample ids for each unique value index_to_metric_fname = f"{metric_save_path}/{metric_name}_index_to_metric" index_to_sample_fname = f"{metric_save_path}/{metric_name}_index_to_sample" - metric_result = metric_result[:, [1, 0]] # swap columns - metric_result = DistributedDataAnalyzer.dist_sample_sort(metric_result, comm_group) - unique_vals, sample_counts = torch.unique(metric_result[:, 0], return_counts=True) + metric_values = metric_values[:,[1,0]] # swap columns + metric_values = DistributedDataAnalyzer.dist_sample_sort(metric_values, comm_group) + unique_vals, sample_counts = torch.unique(metric_values[:,0], return_counts=True) - values_buffer, samples_buffer, samples_it = [], [], 0 + values = torch.tensor([ v.item() for v in unique_vals]) + DistributedDataAnalyzer.file_write_ordered(values, index_to_metric_fname, type.long, comm_group) + + samples_buffer, samples_it = [], 0 for unique_v, count in zip(unique_vals, sample_counts): - values_buffer.append(unique_v.unsqueeze(0), dtype=torch.long) - samples_buffer.append(ids[samples_it:samples_it + count], dtype=type.long) + values_buffer.append(unique_v) + samples_buffer += ids[samples_it:samples_it+count] samples_it += count - DistributedDataAnalyzer.dist_sequential_write(values_buffer, index_to_metric_fname, metric_value_dtype) - DistributedDataAnalyzer.dist_sequential_write(samples_buffer, index_to_sample_fname, sample_idx_dtype) + values_buffer = torch.cat(values_buffer, dim=0) + samples_buffer = torch.cat(samples_buffer, dim=0) + DistributedDataAnalyzer.file_write_ordered(samples_buffer, index_to_sample_fname, type.long, comm_group) elif metric_type == 'accumulate_value_over_samples': 
metric_value_fname = f"{metric_save_path}/{metric_name}_metric_value" # gather the sum of all values across all ranks and write to file - dist.all_reduce(metric_result, op=dist.reduce_op.SUM, group=comm_group) + dist.all_reduce(metric_values, op=dist.ReduceOp.SUM, group=comm_group) if worker_id == 0: builder = create_mmap_dataset_builder(metric_value_fname, metric_value_dtype) - builder.add_item(metric_result) + builder.add_item(metric_values) close_mmap_dataset_builder(builder, metric_value_fname) + @staticmethod - def dist_sequential_write(tensor_list, fname, dtype, comm_group): + def file_write_ordered(tensor, fname, numpy_dtype, comm_group): """ save distributed values to files (each rank appends iteratively to the same file) """ num_workers, worker_id = comm_group.size(), comm_group.rank() - builder = create_mmap_dataset_builder(fname, dtype) - assert isinstance(tensor_list, list), "tensor_list must be a list" - for rank in range(num_workers): - if rank == worker_id: - for tensor in tensor_list: - assert torch.is_tensor(tensor) and tensor.size() == 1, "must be 1D tensor" - builder.add_item(tensor) - dist.barrier(comm_group) - close_mmap_dataset_builder(builder, fname) + if worker_id == 0: + builder = create_mmap_dataset_builder(fname, numpy_dtype) + builder.add_item(tensor) + for src in range(1, num_workers): + tensor = dist.recv(tensor[src], src=src, group=comm_group) + builder.add_item(tensor) + close_mmap_dataset_builder(builder, fname) + else: + dist.send(tensor, dst=0, group=comm_group) + dist.barrier(comm_group) + @staticmethod def dist_min_max(tensor, comm_group): """ given a 1D tensor, return the min/max values across all ranks""" - value_min, value_max = tensor.min(), tensor.max() - dist.all_reduce(value_min, op=dist.reduce_op.MIN, group=comm_group) - dist.all_reduce(value_max, op=dist.reduce_op.MAX, group=comm_group) + value_min = tensor.min() + value_max = tensor.max() + dist.all_reduce(value_min, op=dist.ReduceOp.MIN, group=comm_group) + dist.all_reduce(value_max, op=dist.ReduceOp.MAX, group=comm_group) return value_min.item(), value_max.item() + @staticmethod def dist_sample_sort(tensor, comm_group, n_samples=100): """ perform a distributed random sort of a 2D tensor, and return the sorted partial tensor""" @@ -558,8 +563,8 @@ def dist_sample_sort(tensor, comm_group, n_samples=100): tensor = torch.sort(tensor, dim=0)[0] # 2 - collect few samples per rank - idx = torch.round(torch.linspace(0, len(tensor) - 1, n_samples + 1)).to(int) - samples = tensor[idx[:-1]][:, 0].to(device) #only first column, all but last row + idx = torch.round(torch.linspace(0, len(tensor) - 1, n_samples+1)).to(int) + samples = tensor[idx[:-1]][:,0].to(device) #only first column, all but last row # 2 - Allgather samples all_samples = [torch.zeros(n_samples, device=device).contiguous()] * world_size @@ -569,24 +574,26 @@ def dist_sample_sort(tensor, comm_group, n_samples=100): # 3 - Sort all samples and collect the ranges of each rank as equidistant all_samples = all_samples.sort()[0] idx = torch.round(torch.linspace(0, len(all_samples) - 1, world_size + 1)).to(int) - ranges = all_samples[idx] # range of each rank r as ranges[r] <= x < ranges[r+1] - ranges[-1] = +torch.inf #upper limit of last rank. + ranges = all_samples[idx] # range of each rank r as ranges[r] <= x < ranges[r+1] + ranges[-1] = +torch.inf #upper limit of last rank. 
# 4 - collect elements to send to each rank, based on the rank ranges send = [] for rank in range(world_size): - mask = (tensor[:, 0] >= ranges[rank]) & (tensor[:, 0] < ranges[rank + 1]) + mask = (tensor[:,0] >= ranges[rank]) & (tensor[:,0] < ranges[rank+1]) send.append(tensor[mask]) # 5. all to all to communicate the sizes to be sent/recv - send_count = [torch.tensor([len(tensor) * dims], dtype=torch.int64).to(device) for tensor in send] + send_count = [ torch.tensor([len(tensor)*dims], dtype=torch.int64).to(device) for tensor in send] recv_count = list(torch.empty([world_size], dtype=torch.int64, device=device).chunk(world_size)) dist.all_to_all(recv_count, send_count, group=comm_group) # 6. all to all to communicate the elements to be sent/recv as a single tensor send = torch.cat(send, dim=0).flatten().to(device) - recv = torch.zeros(sum(recv_count), dtype=send.dtype).to(device) + recv = torch.zeros( sum(recv_count), dtype=send.dtype).to(device) dist.all_to_all_single(recv, send, recv_count, send_count, group=comm_group) # 7. the received tensor is the 1D disjoint subset of the distributed tensor return recv.view(-1, dims) + + From a1e121c9f31250e5d47335021d933f1088496f1d Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Thu, 15 Feb 2024 14:43:49 +0000 Subject: [PATCH 09/64] new iteration, many bug fixes --- .../data_sampling/data_analyzer.py | 157 +++++++++++------- 1 file changed, 100 insertions(+), 57 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index c60a7840c235..f4dbf294617d 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -418,12 +418,13 @@ def run_reduce(self): + class DistributedDataAnalyzer(object): - @staticmethod - def run_map_reduce( + def __init__(self, dataset, - comm_group, + num_workers=1, + worker_id=0, batch_size=1, metric_names=[], metric_functions=[], @@ -431,21 +432,42 @@ def run_map_reduce( save_path="./", collate_fn=None, device='cuda', - ): + comm_group=None, + ) -> None: + self.dataset = dataset + self.comm_group = comm_group + self.batch_size = batch_size + self.metric_names = metric_names + self.metric_functions = metric_functions + self.metric_types = metric_types + self.save_path = save_path + self.collate_fn = collate_fn + self.device = device + + # comm_group and num_workers/worker_id are mutually exclusive + self.comm_group = comm_group + if comm_group is not None: + self.num_workers = comm_group.size() + self.worker_id = comm_group.rank() + else: + self.num_workers = num_workers + self.worker_id = worker_id + + + def run_map_reduce(self): # setup individual dataloaders - num_workers, worker_id = comm_group.size(), comm_group.rank() - worker_splits, _ = split_dataset( dataset, num_workers, worker_id, num_threads=1) - start_idx, end_idx = worker_splits[worker_id] - logger.info(f"worker {worker_id}: start working on data subset {start_idx} to {end_idx}") - worker_dataset = Subset(dataset, list(range(start_idx, end_idx))) - sampler = BatchSampler(SequentialSampler(worker_dataset), batch_size=batch_size, drop_last=False) + worker_splits, _ = split_dataset(self.dataset, self.num_workers, self.worker_id, num_threads=1) + start_idx, end_idx = worker_splits[self.worker_id] + logger.info(f"worker {self.worker_id}: start working on data subset {start_idx} to {end_idx}") + worker_dataset = Subset(self.dataset, list(range(start_idx, end_idx))) + sampler = 
BatchSampler(SequentialSampler(worker_dataset), batch_size=self.batch_size, drop_last=False) dataloader = DataLoader(dataset=worker_dataset, batch_sampler=sampler, - num_workers=0, collate_fn=collate_fn, pin_memory=False) + num_workers=0, collate_fn=self.collate_fn, pin_memory=False) # set initial results list metric_results = [] - for metric_type in metric_types: + for metric_type in self.metric_types: assert metric_type in ['single_value_per_sample', 'accumulate_value_over_samples'], \ f"metric_type {metric_type} not implemented." metric_results.append( [] if metric_type == 'single_value_per_sample' else None ) @@ -454,8 +476,9 @@ def run_map_reduce( sample_idx = start_idx valid_int_dtypes = (torch.uint8, torch.int16, torch.int32, torch.int64, np.uint8, np.int16, np.int32, np.int64) for data in dataloader: - for m_idx in range(len(metric_types)): - metric_type, metric_function, metric_result = metric_types[m_idx], metric_functions[m_idx], metric_results[m_idx] + for m_idx in range(len(self.metric_types)): + metric_type, metric_function, metric_result = \ + self.metric_types[m_idx], self.metric_functions[m_idx], metric_results[m_idx] metric_values = metric_function(data) assert torch.is_tensor(metric_values) or isinstance(metric_values, np.ndarray), \ "metric_function must return a tensor or array" @@ -476,38 +499,37 @@ def run_map_reduce( metric_result.add_(metric_values) # compute dtype for sample ids - total_num_samples = len(dataset) + total_num_samples = len(self.dataset) sample_idx_dtype = find_fit_int_dtype(0, total_num_samples - 1) logger.info(f"Total number of data samples: {total_num_samples}.") logger.info(f"Will use {sample_idx_dtype} to store the sample indexes.") - metric_results = [ torch.tensor(m).to(device) for m in metric_results ] - for m_idx in range(len(metric_names)): + metric_results = [ torch.tensor(m).to(self.device) for m in metric_results ] + for m_idx in range(len(self.metric_names)): - metric_values, metric_name, metric_type = metric_results[m_idx], metric_names[m_idx], metric_types[m_idx] - metric_save_path = f"{save_path}/{metric_name}/" + metric_values, metric_name, metric_type = \ + metric_results[m_idx], self.metric_names[m_idx], self.metric_types[m_idx] + metric_save_path = f"{self.save_path}/{metric_name}/" if metric_type == 'single_value_per_sample': # get unique values across all ranks and compute the values dtype based on min/max ids, values = metric_values[:,0], metric_values[:,1] - value_min, value_max = DistributedDataAnalyzer.dist_min_max(values, comm_group) + value_min, value_max = DistributedDataAnalyzer.dist_min_max(values, self.comm_group) metric_value_dtype = find_fit_int_dtype(value_min, value_max) # sample_to_metric iterated metric_results and stored all metric values in same order sample_to_metric_fname = f"{metric_save_path}/{metric_name}_sample_to_metric" - DistributedDataAnalyzer.file_write_ordered(ids, sample_to_metric_fname, metric_value_dtype, comm_group) + self.file_write_ordered(ids, sample_to_metric_fname, metric_value_dtype) # index_to_metric outputs a list of unique values (read from an value-ordered set) # index_to_sample outputs the list of all sample ids for each unique value index_to_metric_fname = f"{metric_save_path}/{metric_name}_index_to_metric" - index_to_sample_fname = f"{metric_save_path}/{metric_name}_index_to_sample" metric_values = metric_values[:,[1,0]] # swap columns - metric_values = DistributedDataAnalyzer.dist_sample_sort(metric_values, comm_group) + metric_values = 
DistributedDataAnalyzer.dist_sample_sort(metric_values, self.comm_group, self.num_workers) unique_vals, sample_counts = torch.unique(metric_values[:,0], return_counts=True) + self.file_write_ordered(unique_vals, index_to_metric_fname, torch.long) - values = torch.tensor([ v.item() for v in unique_vals]) - DistributedDataAnalyzer.file_write_ordered(values, index_to_metric_fname, type.long, comm_group) - + index_to_sample_fname = f"{metric_save_path}/{metric_name}_index_to_sample" samples_buffer, samples_it = [], 0 for unique_v, count in zip(unique_vals, sample_counts): values_buffer.append(unique_v) @@ -515,7 +537,7 @@ def run_map_reduce( samples_it += count values_buffer = torch.cat(values_buffer, dim=0) samples_buffer = torch.cat(samples_buffer, dim=0) - DistributedDataAnalyzer.file_write_ordered(samples_buffer, index_to_sample_fname, type.long, comm_group) + self.file_write_ordered(samples_buffer, index_to_sample_fname, torch.long) elif metric_type == 'accumulate_value_over_samples': metric_value_fname = f"{metric_save_path}/{metric_name}_metric_value" # gather the sum of all values across all ranks and write to file @@ -526,37 +548,58 @@ def run_map_reduce( close_mmap_dataset_builder(builder, metric_value_fname) - @staticmethod - def file_write_ordered(tensor, fname, numpy_dtype, comm_group): - """ save distributed values to files (each rank appends iteratively to the same file) """ - num_workers, worker_id = comm_group.size(), comm_group.rank() - if worker_id == 0: + def file_write_ordered(self, tensor, fname, numpy_dtype, sequential_comm=False): + """ save a distributed tensor to a single file, iteratively, ordered by rank """ + assert tensor.dim() == 1, "tensor must be serialized (1D)" + + # gather the sizes of all tensors to be received in rank 0. 
+ size = torch.tensor(len(tensor), dtype=torch.int64, device=self.device) + size_list = [torch.zeros(1, dtype=torch.int64, device=self.device)] * self.num_workers + dist.all_gather(size_list, size, group=self.comm_group) + assert size_list[self.worker_id]==size, "all_gather did not return the same sizes" #sanity check + + # rank 0 creates the file + if self.worker_id == 0: + os.makedirs(os.path.dirname(fname), exist_ok=True) builder = create_mmap_dataset_builder(fname, numpy_dtype) - builder.add_item(tensor) - for src in range(1, num_workers): - tensor = dist.recv(tensor[src], src=src, group=comm_group) - builder.add_item(tensor) - close_mmap_dataset_builder(builder, fname) - else: - dist.send(tensor, dst=0, group=comm_group) - dist.barrier(comm_group) + + if sequential_comm: # send, receive and write all tensors sequentially + if self.worker_id == 0: + builder.add_item(tensor.cpu()) #rank 0 writes its local tensor + for src in range(1, self.num_workers): + tensor = torch.zeros(size_list[src].item(), dtype=tensor.dtype, device=tensor.device) + dist.recv(tensor, src=src, group=self.comm_group) # receive tensor + builder.add_item(tensor.cpu()) # writes received tensor + else: + dist.send(tensor, 0, group=self.comm_group) # send tensor + else: # collective gather followed by a single write of all tensors in rank 0 + if self.worker_id == 0: + tensor_list = [torch.zeros(size_list[src].item(), dtype=tensor.dtype, device=tensor.device) for src in range(self.num_workers)] + dist.gather(tensor, tensor_list, dst=0, group=self.comm_group) + tensor_list = torch.cat(tensor_list, dim=0) + builder.add_item(tensor_list.cpu()) + else: + dist.gather(tensor, None, dst=0, group=self.comm_group) + + # rank 0 closes the file + if self.worker_id == 0: + close_mmap_dataset_builder(builder, fname) # close file + dist.barrier(self.comm_group) @staticmethod def dist_min_max(tensor, comm_group): - """ given a 1D tensor, return the min/max values across all ranks""" - value_min = tensor.min() - value_max = tensor.max() - dist.all_reduce(value_min, op=dist.ReduceOp.MIN, group=comm_group) - dist.all_reduce(value_max, op=dist.ReduceOp.MAX, group=comm_group) + """ given a distributed tensor, return the min/max values across all ranks""" + + value_min, value_max = tensor.min(), tensor.max() + dist.reduce(value_min, 0, op=dist.ReduceOp.MIN, group=comm_group) + dist.reduce(value_max, 0, op=dist.ReduceOp.MAX, group=comm_group) return value_min.item(), value_max.item() @staticmethod - def dist_sample_sort(tensor, comm_group, n_samples=100): - """ perform a distributed random sort of a 2D tensor, and return the sorted partial tensor""" - - world_size, world_rank = comm_group.size(), comm_group.rank() + def dist_sample_sort(tensor, comm_group, num_workers, n_samples=100): + """ perform a distributed random sort of a tensor, and returns the sorted partial tensor""" device, dims = tensor.device, tensor.size()[1] # 1 - Sort locally @@ -564,28 +607,28 @@ def dist_sample_sort(tensor, comm_group, n_samples=100): # 2 - collect few samples per rank idx = torch.round(torch.linspace(0, len(tensor) - 1, n_samples+1)).to(int) - samples = tensor[idx[:-1]][:,0].to(device) #only first column, all but last row + samples = tensor[idx[:-1]][:,0].contiguous().to(device) #only first column, all but last row # 2 - Allgather samples - all_samples = [torch.zeros(n_samples, device=device).contiguous()] * world_size - dist.all_gather(all_samples, samples.contiguous(), group=comm_group) + all_samples = [torch.zeros(n_samples, dtype=samples.dtype, 
device=device)] * num_workers + dist.all_gather(all_samples, samples, group=comm_group) all_samples = torch.cat(all_samples, dim=0).to(device) # 3 - Sort all samples and collect the ranges of each rank as equidistant all_samples = all_samples.sort()[0] - idx = torch.round(torch.linspace(0, len(all_samples) - 1, world_size + 1)).to(int) + idx = torch.round(torch.linspace(0, len(all_samples) - 1, num_workers + 1)).to(int) ranges = all_samples[idx] # range of each rank r as ranges[r] <= x < ranges[r+1] - ranges[-1] = +torch.inf #upper limit of last rank. + ranges[-1] += 1 # increase upper limit of last rank so that x < ranges[r+1]. # 4 - collect elements to send to each rank, based on the rank ranges send = [] - for rank in range(world_size): + for rank in range(num_workers): mask = (tensor[:,0] >= ranges[rank]) & (tensor[:,0] < ranges[rank+1]) send.append(tensor[mask]) # 5. all to all to communicate the sizes to be sent/recv - send_count = [ torch.tensor([len(tensor)*dims], dtype=torch.int64).to(device) for tensor in send] - recv_count = list(torch.empty([world_size], dtype=torch.int64, device=device).chunk(world_size)) + send_count = [ torch.tensor([len(tensor)*dims], dtype=torch.int64, device=device) for tensor in send] + recv_count = list(torch.zeros([num_workers], dtype=torch.int64, device=device).chunk(num_workers)) dist.all_to_all(recv_count, send_count, group=comm_group) # 6. all to all to communicate the elements to be sent/recv as a single tensor From e045753c944ce47317c90f383f0b5a53cb5acb4c Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Thu, 15 Feb 2024 16:37:41 +0000 Subject: [PATCH 10/64] bug fixes --- .../data_sampling/data_analyzer.py | 85 ++++++------------- 1 file changed, 26 insertions(+), 59 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index f4dbf294617d..4863df4c48e8 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -419,42 +419,7 @@ def run_reduce(self): -class DistributedDataAnalyzer(object): - - def __init__(self, - dataset, - num_workers=1, - worker_id=0, - batch_size=1, - metric_names=[], - metric_functions=[], - metric_types=[], - save_path="./", - collate_fn=None, - device='cuda', - comm_group=None, - ) -> None: - self.dataset = dataset - self.comm_group = comm_group - self.batch_size = batch_size - self.metric_names = metric_names - self.metric_functions = metric_functions - self.metric_types = metric_types - self.save_path = save_path - self.collate_fn = collate_fn - self.device = device - - # comm_group and num_workers/worker_id are mutually exclusive - self.comm_group = comm_group - if comm_group is not None: - self.num_workers = comm_group.size() - self.worker_id = comm_group.rank() - else: - self.num_workers = num_workers - self.worker_id = worker_id - - - def run_map_reduce(self): + def run_map_reduce(self): # setup individual dataloaders worker_splits, _ = split_dataset(self.dataset, self.num_workers, self.worker_id, num_threads=1) @@ -532,17 +497,17 @@ def run_map_reduce(self): index_to_sample_fname = f"{metric_save_path}/{metric_name}_index_to_sample" samples_buffer, samples_it = [], 0 for unique_v, count in zip(unique_vals, sample_counts): - values_buffer.append(unique_v) - samples_buffer += ids[samples_it:samples_it+count] + # values_buffer.append(unique_v.item()) + samples_buffer += ids[samples_it:samples_it+count.item()].tolist() samples_it += 
count - values_buffer = torch.cat(values_buffer, dim=0) - samples_buffer = torch.cat(samples_buffer, dim=0) + # values_buffer = torch.tensor(values_buffer) + samples_buffer = torch.tensor(samples_buffer) self.file_write_ordered(samples_buffer, index_to_sample_fname, torch.long) elif metric_type == 'accumulate_value_over_samples': metric_value_fname = f"{metric_save_path}/{metric_name}_metric_value" # gather the sum of all values across all ranks and write to file - dist.all_reduce(metric_values, op=dist.ReduceOp.SUM, group=comm_group) - if worker_id == 0: + dist.all_reduce(metric_values, op=dist.ReduceOp.SUM, group=self.comm_group) + if self.worker_id == 0: builder = create_mmap_dataset_builder(metric_value_fname, metric_value_dtype) builder.add_item(metric_values) close_mmap_dataset_builder(builder, metric_value_fname) @@ -554,32 +519,34 @@ def file_write_ordered(self, tensor, fname, numpy_dtype, sequential_comm=False): # gather the sizes of all tensors to be received in rank 0. size = torch.tensor(len(tensor), dtype=torch.int64, device=self.device) - size_list = [torch.zeros(1, dtype=torch.int64, device=self.device)] * self.num_workers - dist.all_gather(size_list, size, group=self.comm_group) - assert size_list[self.worker_id]==size, "all_gather did not return the same sizes" #sanity check + sizes = torch.zeros(self.num_workers, dtype=torch.int64, device=self.device) + dist.all_gather_into_tensor(sizes, size, group=self.comm_group) + assert sizes[self.worker_id]==size, "all_gather did not return the same sizes" #sanity check # rank 0 creates the file if self.worker_id == 0: os.makedirs(os.path.dirname(fname), exist_ok=True) builder = create_mmap_dataset_builder(fname, numpy_dtype) - if sequential_comm: # send, receive and write all tensors sequentially + if sequential_comm: # send, receive and write all tensors sequentially (slow, memory safe) + if self.worker_id == 0: - builder.add_item(tensor.cpu()) #rank 0 writes its local tensor - for src in range(1, self.num_workers): - tensor = torch.zeros(size_list[src].item(), dtype=tensor.dtype, device=tensor.device) - dist.recv(tensor, src=src, group=self.comm_group) # receive tensor + builder.add_item(tensor.cpu()) # write rank 0's tensor + + for src in range(1, self.num_workers): + dist.barrier() + if src == self.worker_id: + dist.send(tensor, 0, group=self.comm_group) # send tensor + elif self.worker_id == 0: + tensor = torch.zeros(sizes[src].item(), dtype=tensor.dtype, device=tensor.device) + dist.recv(tensor, src=src, group=self.comm_group) builder.add_item(tensor.cpu()) # writes received tensor - else: - dist.send(tensor, 0, group=self.comm_group) # send tensor - else: # collective gather followed by a single write of all tensors in rank 0 + + else: # all to all communication of serialized tensor (faster but requires more memory) + tensors = torch.zeros(sum(sizes).item(), dtype=tensor.dtype, device=tensor.device) + dist.all_gather_into_tensor(sizes, size, group=self.comm_group) if self.worker_id == 0: - tensor_list = [torch.zeros(size_list[src].item(), dtype=tensor.dtype, device=tensor.device) for src in range(self.num_workers)] - dist.gather(tensor, tensor_list, dst=0, group=self.comm_group) - tensor_list = torch.cat(tensor_list, dim=0) - builder.add_item(tensor_list.cpu()) - else: - dist.gather(tensor, None, dst=0, group=self.comm_group) + builder.add_item(tensors.cpu()) # rank 0 closes the file if self.worker_id == 0: From cdc838c19914cf6f4cb7801e2aac2193f36b6997 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Thu, 15 Feb 2024 
16:48:41 +0000 Subject: [PATCH 11/64] fixing previous commit --- .../data_sampling/data_analyzer.py | 48 ++++++++++++++++--- 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index 6d58bea95bcc..8a9f1cd5362b 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -424,7 +424,43 @@ def run_reduce(self): - def run_map_reduce(self): + +class DistributedDataAnalyzer(object): + + def __init__(self, + dataset, + num_workers=1, + worker_id=0, + batch_size=1, + metric_names=[], + metric_functions=[], + metric_types=[], + save_path="./", + collate_fn=None, + device='cuda', + comm_group=None, + ) -> None: + self.dataset = dataset + self.comm_group = comm_group + self.batch_size = batch_size + self.metric_names = metric_names + self.metric_functions = metric_functions + self.metric_types = metric_types + self.save_path = save_path + self.collate_fn = collate_fn + self.device = device + + # comm_group and num_workers/worker_id are mutually exclusive + self.comm_group = comm_group + if comm_group is not None: + self.num_workers = comm_group.size() + self.worker_id = comm_group.rank() + else: + self.num_workers = num_workers + self.worker_id = worker_id + + + def run_map_reduce(self): # setup individual dataloaders worker_splits, _ = split_dataset(self.dataset, self.num_workers, self.worker_id, num_threads=1) @@ -506,7 +542,7 @@ def run_map_reduce(self): samples_buffer += ids[samples_it:samples_it+count.item()].tolist() samples_it += count # values_buffer = torch.tensor(values_buffer) - samples_buffer = torch.tensor(samples_buffer) + samples_buffer = torch.tensor(samples_buffer, device=self.device) self.file_write_ordered(samples_buffer, index_to_sample_fname, torch.long) elif metric_type == 'accumulate_value_over_samples': metric_value_fname = f"{metric_save_path}/{metric_name}_metric_value" @@ -518,7 +554,7 @@ def run_map_reduce(self): close_mmap_dataset_builder(builder, metric_value_fname) - def file_write_ordered(self, tensor, fname, numpy_dtype, sequential_comm=False): + def file_write_ordered(self, tensor, fname, numpy_dtype, sequential_comm=True): """ save a distributed tensor to a single file, iteratively, ordered by rank """ assert tensor.dim() == 1, "tensor must be serialized (1D)" @@ -543,9 +579,9 @@ def file_write_ordered(self, tensor, fname, numpy_dtype, sequential_comm=False): if src == self.worker_id: dist.send(tensor, 0, group=self.comm_group) # send tensor elif self.worker_id == 0: - tensor = torch.zeros(sizes[src].item(), dtype=tensor.dtype, device=tensor.device) - dist.recv(tensor, src=src, group=self.comm_group) - builder.add_item(tensor.cpu()) # writes received tensor + recv = torch.zeros(sizes[src].item(), dtype=tensor.dtype, device=tensor.device) + dist.recv(recv, src=src, group=self.comm_group) + builder.add_item(recv.cpu()) # writes received tensor else: # all to all communication of serialized tensor (faster but requires more memory) tensors = torch.zeros(sum(sizes).item(), dtype=tensor.dtype, device=tensor.device) From 5c077104d47359ed3cb40f3cca3b7bb7ab088a49 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Fri, 16 Feb 2024 07:55:38 +0000 Subject: [PATCH 12/64] pre-commit --- .../data_sampling/data_analyzer.py | 94 +++++++++---------- 1 file changed, 45 insertions(+), 49 deletions(-) diff --git 
a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index 8a9f1cd5362b..77d2b3ede8e2 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -412,34 +412,32 @@ def merge_map_results(self, dataset, metric_names, metric_types, save_path, num_ close_mmap_dataset_builder(metric_value_builder, metric_value_fname) def run_reduce(self): - if self.worker_id == 0: # only one node does merging of files - if self.custom_reduce is None: - self.merge_map_results(self.dataset, self.metric_names, self.metric_types, self.save_path, + if self.worker_id == 0: # only one node does merging of files + if self.custom_reduce is None: + self.merge_map_results(self.dataset, self.metric_names, self.metric_types, self.save_path, + self.num_workers, self.num_threads, self.num_threads_reduce) + else: + self.custom_reduce(self.dataset, self.metric_names, self.metric_types, self.save_path, self.num_workers, self.num_threads, self.num_threads_reduce) - else: - self.custom_reduce(self.dataset, self.metric_names, self.metric_types, self.save_path, self.num_workers, - self.num_threads, self.num_threads_reduce) dist.barrier(group=self.comm_group) - - - class DistributedDataAnalyzer(object): - def __init__(self, - dataset, - num_workers=1, - worker_id=0, - batch_size=1, - metric_names=[], - metric_functions=[], - metric_types=[], - save_path="./", - collate_fn=None, - device='cuda', - comm_group=None, - ) -> None: + def __init__( + self, + dataset, + num_workers=1, + worker_id=0, + batch_size=1, + metric_names=[], + metric_functions=[], + metric_types=[], + save_path="./", + collate_fn=None, + device='cuda', + comm_group=None, + ) -> None: self.dataset = dataset self.comm_group = comm_group self.batch_size = batch_size @@ -459,7 +457,6 @@ def __init__(self, self.num_workers = num_workers self.worker_id = worker_id - def run_map_reduce(self): # setup individual dataloaders @@ -468,15 +465,18 @@ def run_map_reduce(self): logger.info(f"worker {self.worker_id}: start working on data subset {start_idx} to {end_idx}") worker_dataset = Subset(self.dataset, list(range(start_idx, end_idx))) sampler = BatchSampler(SequentialSampler(worker_dataset), batch_size=self.batch_size, drop_last=False) - dataloader = DataLoader(dataset=worker_dataset, batch_sampler=sampler, - num_workers=0, collate_fn=self.collate_fn, pin_memory=False) + dataloader = DataLoader(dataset=worker_dataset, + batch_sampler=sampler, + num_workers=0, + collate_fn=self.collate_fn, + pin_memory=False) # set initial results list metric_results = [] for metric_type in self.metric_types: assert metric_type in ['single_value_per_sample', 'accumulate_value_over_samples'], \ f"metric_type {metric_type} not implemented." 
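As a rough, standalone illustration of the per-worker sharding configured above (a toy list dataset and a hand-picked contiguous split are assumed here instead of split_dataset), each worker iterates its own slice of the dataset strictly in order, one batch at a time:

from torch.utils.data import BatchSampler, DataLoader, SequentialSampler, Subset

dataset = list(range(100))      # toy dataset of 100 samples
start_idx, end_idx = 50, 75     # contiguous shard assigned to this worker
worker_dataset = Subset(dataset, list(range(start_idx, end_idx)))
sampler = BatchSampler(SequentialSampler(worker_dataset), batch_size=8, drop_last=False)
dataloader = DataLoader(worker_dataset, batch_sampler=sampler, num_workers=0)

for batch in dataloader:        # batches arrive in dataset order: 50..57, 58..65, ...
    print(batch)
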
- metric_results.append( [] if metric_type == 'single_value_per_sample' else None ) + metric_results.append([] if metric_type == 'single_value_per_sample' else None) # iterate dataloader and store metric results sample_idx = start_idx @@ -497,7 +497,7 @@ def run_map_reduce(self): for row in range(metric_values.size()[0]): value = metric_values[row].item() metric_result.append((sample_idx, value)) - sample_idx+=1 + sample_idx += 1 elif metric_type == 'accumulate_value_over_samples': if metric_result is None: metric_result = metric_values @@ -510,7 +510,7 @@ def run_map_reduce(self): logger.info(f"Total number of data samples: {total_num_samples}.") logger.info(f"Will use {sample_idx_dtype} to store the sample indexes.") - metric_results = [ torch.tensor(m).to(self.device) for m in metric_results ] + metric_results = [torch.tensor(m).to(self.device) for m in metric_results] for m_idx in range(len(self.metric_names)): metric_values, metric_name, metric_type = \ @@ -519,7 +519,7 @@ def run_map_reduce(self): if metric_type == 'single_value_per_sample': # get unique values across all ranks and compute the values dtype based on min/max - ids, values = metric_values[:,0], metric_values[:,1] + ids, values = metric_values[:, 0], metric_values[:, 1] value_min, value_max = DistributedDataAnalyzer.dist_min_max(values, self.comm_group) metric_value_dtype = find_fit_int_dtype(value_min, value_max) @@ -530,16 +530,17 @@ def run_map_reduce(self): # index_to_metric outputs a list of unique values (read from an value-ordered set) # index_to_sample outputs the list of all sample ids for each unique value index_to_metric_fname = f"{metric_save_path}/{metric_name}_index_to_metric" - metric_values = metric_values[:,[1,0]] # swap columns - metric_values = DistributedDataAnalyzer.dist_sample_sort(metric_values, self.comm_group, self.num_workers) - unique_vals, sample_counts = torch.unique(metric_values[:,0], return_counts=True) + metric_values = metric_values[:, [1, 0]] # swap columns + metric_values = DistributedDataAnalyzer.dist_sample_sort(metric_values, self.comm_group, + self.num_workers) + unique_vals, sample_counts = torch.unique(metric_values[:, 0], return_counts=True) self.file_write_ordered(unique_vals, index_to_metric_fname, torch.long) index_to_sample_fname = f"{metric_save_path}/{metric_name}_index_to_sample" samples_buffer, samples_it = [], 0 for unique_v, count in zip(unique_vals, sample_counts): # values_buffer.append(unique_v.item()) - samples_buffer += ids[samples_it:samples_it+count.item()].tolist() + samples_buffer += ids[samples_it:samples_it + count.item()].tolist() samples_it += count # values_buffer = torch.tensor(values_buffer) samples_buffer = torch.tensor(samples_buffer, device=self.device) @@ -553,7 +554,6 @@ def run_map_reduce(self): builder.add_item(metric_values) close_mmap_dataset_builder(builder, metric_value_fname) - def file_write_ordered(self, tensor, fname, numpy_dtype, sequential_comm=True): """ save a distributed tensor to a single file, iteratively, ordered by rank """ assert tensor.dim() == 1, "tensor must be serialized (1D)" @@ -562,7 +562,7 @@ def file_write_ordered(self, tensor, fname, numpy_dtype, sequential_comm=True): size = torch.tensor(len(tensor), dtype=torch.int64, device=self.device) sizes = torch.zeros(self.num_workers, dtype=torch.int64, device=self.device) dist.all_gather_into_tensor(sizes, size, group=self.comm_group) - assert sizes[self.worker_id]==size, "all_gather did not return the same sizes" #sanity check + assert sizes[self.worker_id] == size, 
"all_gather did not return the same sizes" #sanity check # rank 0 creates the file if self.worker_id == 0: @@ -572,16 +572,16 @@ def file_write_ordered(self, tensor, fname, numpy_dtype, sequential_comm=True): if sequential_comm: # send, receive and write all tensors sequentially (slow, memory safe) if self.worker_id == 0: - builder.add_item(tensor.cpu()) # write rank 0's tensor + builder.add_item(tensor.cpu()) # write rank 0's tensor for src in range(1, self.num_workers): dist.barrier() if src == self.worker_id: - dist.send(tensor, 0, group=self.comm_group) # send tensor + dist.send(tensor, 0, group=self.comm_group) # send tensor elif self.worker_id == 0: recv = torch.zeros(sizes[src].item(), dtype=tensor.dtype, device=tensor.device) dist.recv(recv, src=src, group=self.comm_group) - builder.add_item(recv.cpu()) # writes received tensor + builder.add_item(recv.cpu()) # writes received tensor else: # all to all communication of serialized tensor (faster but requires more memory) tensors = torch.zeros(sum(sizes).item(), dtype=tensor.dtype, device=tensor.device) @@ -591,10 +591,9 @@ def file_write_ordered(self, tensor, fname, numpy_dtype, sequential_comm=True): # rank 0 closes the file if self.worker_id == 0: - close_mmap_dataset_builder(builder, fname) # close file + close_mmap_dataset_builder(builder, fname) # close file dist.barrier(self.comm_group) - @staticmethod def dist_min_max(tensor, comm_group): """ given a distributed tensor, return the min/max values across all ranks""" @@ -604,7 +603,6 @@ def dist_min_max(tensor, comm_group): dist.reduce(value_max, 0, op=dist.ReduceOp.MAX, group=comm_group) return value_min.item(), value_max.item() - @staticmethod def dist_sample_sort(tensor, comm_group, num_workers, n_samples=100): """ perform a distributed random sort of a tensor, and returns the sorted partial tensor""" @@ -614,8 +612,8 @@ def dist_sample_sort(tensor, comm_group, num_workers, n_samples=100): tensor = torch.sort(tensor, dim=0)[0] # 2 - collect few samples per rank - idx = torch.round(torch.linspace(0, len(tensor) - 1, n_samples+1)).to(int) - samples = tensor[idx[:-1]][:,0].contiguous().to(device) #only first column, all but last row + idx = torch.round(torch.linspace(0, len(tensor) - 1, n_samples + 1)).to(int) + samples = tensor[idx[:-1]][:, 0].contiguous().to(device) #only first column, all but last row # 2 - Allgather samples all_samples = [torch.zeros(n_samples, dtype=samples.dtype, device=device)] * num_workers @@ -625,26 +623,24 @@ def dist_sample_sort(tensor, comm_group, num_workers, n_samples=100): # 3 - Sort all samples and collect the ranges of each rank as equidistant all_samples = all_samples.sort()[0] idx = torch.round(torch.linspace(0, len(all_samples) - 1, num_workers + 1)).to(int) - ranges = all_samples[idx] # range of each rank r as ranges[r] <= x < ranges[r+1] + ranges = all_samples[idx] # range of each rank r as ranges[r] <= x < ranges[r+1] ranges[-1] += 1 # increase upper limit of last rank so that x < ranges[r+1]. # 4 - collect elements to send to each rank, based on the rank ranges send = [] for rank in range(num_workers): - mask = (tensor[:,0] >= ranges[rank]) & (tensor[:,0] < ranges[rank+1]) + mask = (tensor[:, 0] >= ranges[rank]) & (tensor[:, 0] < ranges[rank + 1]) send.append(tensor[mask]) # 5. 
all to all to communicate the sizes to be sent/recv - send_count = [ torch.tensor([len(tensor)*dims], dtype=torch.int64, device=device) for tensor in send] + send_count = [torch.tensor([len(tensor) * dims], dtype=torch.int64, device=device) for tensor in send] recv_count = list(torch.zeros([num_workers], dtype=torch.int64, device=device).chunk(num_workers)) dist.all_to_all(recv_count, send_count, group=comm_group) # 6. all to all to communicate the elements to be sent/recv as a single tensor send = torch.cat(send, dim=0).flatten().to(device) - recv = torch.zeros( sum(recv_count), dtype=send.dtype).to(device) + recv = torch.zeros(sum(recv_count), dtype=send.dtype).to(device) dist.all_to_all_single(recv, send, recv_count, send_count, group=comm_group) # 7. the received tensor is the 1D disjoint subset of the distributed tensor return recv.view(-1, dims) - - From f28e829b5ab970f2779638aa3a7699ed18375385 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Fri, 16 Feb 2024 09:00:37 +0000 Subject: [PATCH 13/64] recoverd master branch --- .../data_sampling/data_analyzer.py | 66 +++++++++++-------- 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index b1f8e6aaeb24..3d4d8bde7d1c 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -13,7 +13,7 @@ from torch.utils.data import BatchSampler, SequentialSampler, DataLoader, Subset from deepspeed.utils import logger -from .indexed_dataset import MMapIndexedDataset +from .indexed_dataset import MMapIndexedDataset, valid_dtypes from .utils import split_dataset, split_index, create_mmap_dataset_builder, close_mmap_dataset_builder, find_fit_int_dtype @@ -37,7 +37,7 @@ def __init__(self, custom_map_update=None, custom_map_finalize=None, custom_reduce=None, - comm_group=None): + sample_indices=None): super().__init__() self.dataset = dataset self.num_workers = num_workers @@ -56,16 +56,14 @@ def __init__(self, self.custom_map_update = custom_map_update self.custom_map_finalize = custom_map_finalize self.custom_reduce = custom_reduce - self.comm_group = comm_group + self.sample_indices = sample_indices def init_metric_results(self, thread_id, metric_names, metric_types, metric_dtypes, save_path, worker_id): metric_results = [] for m_idx in range(len(metric_names)): metric_name, metric_type, metric_dtype = metric_names[m_idx], \ metric_types[m_idx], metric_dtypes[m_idx] - assert metric_dtype not in [ - np.float64, np.double - ], "Currently floating point metric values are not supported. Please change your metric into integer values (and potentially multiply a larger coefficient to keep the precision)." + assert metric_dtype in valid_dtypes, f"metric_dtype {metric_dtype} not supported. 
Supported dtypes {valid_dtypes}" metric_save_path = f"{save_path}/{metric_name}/worker{worker_id}_thread{thread_id}/" os.makedirs(metric_save_path, exist_ok=True) if metric_type == 'single_value_per_sample': @@ -86,18 +84,34 @@ def init_metric_results(self, thread_id, metric_names, metric_types, metric_dtyp metric_results.append({"metric_value": metric_value, "metric_value_fname": metric_value_fname}) return metric_results - def update_metric_results(self, data, metric_types, metric_dtypes, metric_functions, metric_results): + def update_metric_results(self, + data, + metric_types, + metric_dtypes, + metric_functions, + metric_results, + batch_start_idx=0): for m_idx in range(len(metric_types)): metric_type, metric_dtype, metric_function, metric_result = metric_types[m_idx], \ metric_dtypes[m_idx], metric_functions[m_idx], metric_results[m_idx] metric_values = metric_function(data) - assert metric_values.numpy().dtype == metric_dtype, \ - f"dtype {type(m_value)} returned by metric_function {metric_function} is not consistent with the metric_dtype {metric_dtype}" + + assert torch.is_tensor(metric_values) or isinstance(metric_values, np.ndarray), \ + "metric_function must return a tensor or array" + assert metric_values.dtype == metric_dtype, \ + f"metric_function result dtype {metric_values.dtype} does not match metric_dtype {metric_dtype}" + if isinstance(metric_values, np.ndarray): + metric_values = torch.from_numpy(metric_values) + if metric_type == 'single_value_per_sample': for row in range(metric_values.size()[0]): + sample_idx = batch_start_idx + row # sample idx following dataset iteration order + if isinstance(data, dict) and 'index' in data: # Megatron use case, idx provided in 'index' field + sample_idx = data['index'][row][0].item() + elif self.sample_indices is not None: # user defined shuffling of indices + sample_idx = self.sample_indices[sample_idx] metric_result["sample_to_metric_builder"].add_item(metric_values[row].reshape(-1)) - metric_result["metric_to_sample_dict"][metric_values[row].item()].append( - data['index'][row][0].item()) + metric_result["metric_to_sample_dict"][metric_values[row].item()].append(sample_idx) for m_value in metric_result["metric_to_sample_dict"]: if len(metric_result["metric_to_sample_dict"][m_value]) > 100: metric_fname = metric_result["metric_to_sample_fname"] @@ -139,15 +153,12 @@ def run_map_helper(self, thread_id): f"on data subset {start_idx} to {end_idx}") thread_dataset = Subset(self.dataset, list(range(start_idx, end_idx))) sampler = BatchSampler(SequentialSampler(thread_dataset), batch_size=self.batch_size, drop_last=False) - if self.collate_fn is None: - iterator = iter(DataLoader(thread_dataset, batch_sampler=sampler, num_workers=0, pin_memory=False)) - else: - iterator = iter( - DataLoader(thread_dataset, - batch_sampler=sampler, - num_workers=0, - collate_fn=self.collate_fn, - pin_memory=False)) + iterator = iter( + DataLoader(thread_dataset, + batch_sampler=sampler, + num_workers=0, + collate_fn=self.collate_fn, + pin_memory=False)) if self.custom_map_init is None: metric_results = self.init_metric_results(thread_id, self.metric_names, self.metric_types, self.metric_dtypes, self.save_path, self.worker_id) @@ -160,10 +171,13 @@ def run_map_helper(self, thread_id): while True: try: data = next(iterator) + batch_start_idx = start_idx + processed_sample if self.custom_map_update is None: - self.update_metric_results(data, self.metric_types, self.metric_dtypes, self.metric_functions, metric_results) + self.update_metric_results(data, 
self.metric_types, self.metric_dtypes, self.metric_functions, + metric_results, batch_start_idx) else: - self.custom_map_update(data, self.metric_types, self.metric_functions, metric_results) + self.custom_map_update(data, self.metric_types, self.metric_dtypes, self.metric_functions, + metric_results, batch_start_idx) processed_sample += self.batch_size duration = (time.time() - start) / 3600.0 remain_duration = duration * total_sample / processed_sample - duration @@ -198,7 +212,6 @@ def run_map(self): else: assert self.num_threads == 1 self.run_map_helper(0) - dist.barrier(group=self.comm_group) def get_metric_value_percentiles(self, metric_name, num_sample_per_value, total_num_samples): logger.info(f"Checking the value percentiles of metric {metric_name}...") @@ -413,12 +426,9 @@ def merge_map_results(self, dataset, metric_names, metric_types, save_path, num_ close_mmap_dataset_builder(metric_value_builder, metric_value_fname) def run_reduce(self): - if self.worker_id == 0: # only one node does merging of files - if self.custom_reduce is None: + if self.custom_reduce is None: self.merge_map_results(self.dataset, self.metric_names, self.metric_types, self.save_path, self.num_workers, self.num_threads, self.num_threads_reduce) - else: + else: self.custom_reduce(self.dataset, self.metric_names, self.metric_types, self.save_path, self.num_workers, self.num_threads, self.num_threads_reduce) - dist.barrier(group=self.comm_group) - From a634787f7dc77c1c7cdde89712ba021ada13c734 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Fri, 16 Feb 2024 11:52:44 +0000 Subject: [PATCH 14/64] write sequentially to file --- .../data_sampling/data_analyzer.py | 227 ++++++++++-------- 1 file changed, 132 insertions(+), 95 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index f051210ce727..5e92594d67f8 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -213,7 +213,6 @@ def run_map(self): else: assert self.num_threads == 1 self.run_map_helper(0) - dist.barrier(group=self.comm_group) def get_metric_value_percentiles(self, metric_name, num_sample_per_value, total_num_samples): logger.info(f"Checking the value percentiles of metric {metric_name}...") @@ -428,32 +427,40 @@ def merge_map_results(self, dataset, metric_names, metric_types, save_path, num_ close_mmap_dataset_builder(metric_value_builder, metric_value_fname) def run_reduce(self): - if self.worker_id == 0: # only one node does merging of files - if self.custom_reduce is None: - self.merge_map_results(self.dataset, self.metric_names, self.metric_types, self.save_path, - self.num_workers, self.num_threads, self.num_threads_reduce) - else: - self.custom_reduce(self.dataset, self.metric_names, self.metric_types, self.save_path, + if self.custom_reduce is None: + self.merge_map_results(self.dataset, self.metric_names, self.metric_types, self.save_path, self.num_workers, self.num_threads, self.num_threads_reduce) - dist.barrier(group=self.comm_group) + else: + self.custom_reduce(self.dataset, self.metric_names, self.metric_types, self.save_path, self.num_workers, + self.num_threads, self.num_threads_reduce) + + def run_map_reduce(self, comm_group=None): + self.run_map() + # wait for the mapping operation, where all nodes outputs their own (partial) result files + dist.barrier(group=comm_group) + if self.worker_id == 0: + self.run_reduce() + # wait for the 
reduce, where rank 0 merges all (partial) files. Dataset can then be used by all nodes. + dist.barrier(group=comm_group) + class DistributedDataAnalyzer(object): - def __init__( - self, - dataset, - num_workers=1, - worker_id=0, - batch_size=1, - metric_names=[], - metric_functions=[], - metric_types=[], - save_path="./", - collate_fn=None, - device='cuda', - comm_group=None, - ) -> None: + def __init__(self, + dataset, + num_workers=1, + worker_id=0, + batch_size=1, + metric_names=[], + metric_functions=[], + metric_types=[], + save_path="./", + collate_fn=None, + device='cuda', + comm_group=None, + sample_indices=None, + ) -> None: self.dataset = dataset self.comm_group = comm_group self.batch_size = batch_size @@ -463,6 +470,7 @@ def __init__( self.save_path = save_path self.collate_fn = collate_fn self.device = device + self.sample_indices = sample_indices # comm_group and num_workers/worker_id are mutually exclusive self.comm_group = comm_group @@ -473,6 +481,7 @@ def __init__( self.num_workers = num_workers self.worker_id = worker_id + def run_map_reduce(self): # setup individual dataloaders @@ -481,24 +490,20 @@ def run_map_reduce(self): logger.info(f"worker {self.worker_id}: start working on data subset {start_idx} to {end_idx}") worker_dataset = Subset(self.dataset, list(range(start_idx, end_idx))) sampler = BatchSampler(SequentialSampler(worker_dataset), batch_size=self.batch_size, drop_last=False) - dataloader = DataLoader(dataset=worker_dataset, - batch_sampler=sampler, - num_workers=0, - collate_fn=self.collate_fn, - pin_memory=False) + dataloader = DataLoader(dataset=worker_dataset, batch_sampler=sampler, + num_workers=0, collate_fn=self.collate_fn, pin_memory=False) # set initial results list metric_results = [] for metric_type in self.metric_types: assert metric_type in ['single_value_per_sample', 'accumulate_value_over_samples'], \ f"metric_type {metric_type} not implemented." - metric_results.append([] if metric_type == 'single_value_per_sample' else None) + metric_results.append( [] if metric_type == 'single_value_per_sample' else None ) # iterate dataloader and store metric results - sample_idx = start_idx - valid_int_dtypes = (torch.uint8, torch.int16, torch.int32, torch.int64, np.uint8, np.int16, np.int32, np.int64) + batch_start_idx = start_idx for data in dataloader: - for m_idx in range(len(self.metric_types)): + for m_idx in range(len(self.metric_names)): metric_type, metric_function, metric_result = \ self.metric_types[m_idx], self.metric_functions[m_idx], metric_results[m_idx] metric_values = metric_function(data) @@ -506,19 +511,24 @@ def run_map_reduce(self): "metric_function must return a tensor or array" if isinstance(metric_values, np.ndarray): metric_values = torch.from_numpy(metric_values) - assert metric_values.dtype in valid_int_dtypes, \ - f"metric_function result dtype {metric_values.dtype} not supported. Supported dtypes {valid_int_dtypes}" + assert metric_values.dtype in valid_dtypes, \ + f"metric_function result dtype {metric_values.dtype} not supported. 
Supported dtypes {valid_dtypes}" if metric_type == 'single_value_per_sample': for row in range(metric_values.size()[0]): value = metric_values[row].item() - metric_result.append((sample_idx, value)) - sample_idx += 1 + sample_idx = batch_start_idx + row # sample idx following dataset iteration order + if isinstance(data, dict) and 'index' in data: # Megatron use case, idx provided in 'index' field + sample_idx = data['index'][row][0].item() + elif self.sample_indices is not None: # user defined shuffling of indices + sample_idx = self.sample_indices[sample_idx] + metric_result.append((value, sample_idx)) elif metric_type == 'accumulate_value_over_samples': if metric_result is None: metric_result = metric_values else: metric_result.add_(metric_values) + batch_start_idx += self.batch_size # compute dtype for sample ids total_num_samples = len(self.dataset) @@ -526,90 +536,113 @@ def run_map_reduce(self): logger.info(f"Total number of data samples: {total_num_samples}.") logger.info(f"Will use {sample_idx_dtype} to store the sample indexes.") - metric_results = [torch.tensor(m).to(self.device) for m in metric_results] - for m_idx in range(len(self.metric_names)): + # convert to list of tensors + metric_results = [ torch.tensor(m).to(self.device) for m in metric_results ] + for m_idx in range(len(self.metric_names)): metric_values, metric_name, metric_type = \ metric_results[m_idx], self.metric_names[m_idx], self.metric_types[m_idx] metric_save_path = f"{self.save_path}/{metric_name}/" if metric_type == 'single_value_per_sample': - # get unique values across all ranks and compute the values dtype based on min/max - ids, values = metric_values[:, 0], metric_values[:, 1] - value_min, value_max = DistributedDataAnalyzer.dist_min_max(values, self.comm_group) - metric_value_dtype = find_fit_int_dtype(value_min, value_max) - # sample_to_metric iterated metric_results and stored all metric values in same order + # sample_to_metric maps sample ids to metric values, as a list of metric values sample_to_metric_fname = f"{metric_save_path}/{metric_name}_sample_to_metric" - self.file_write_ordered(ids, sample_to_metric_fname, metric_value_dtype) + values = [torch.tensor([x]) for x in metric_values[:,0]] + self.file_write_ordered(values, sample_to_metric_fname, torch.long) + + # index_to_metric and index_to_sample serialize a dicitonary from metric to samples + # index_to_metric stores a key per row, index_to_sample stores the values per row + + # distributed sorting by values, gives an ordered disjoint subset of keys on nodes + metric_values = DistributedDataAnalyzer.dist_sample_sort(metric_values, self.comm_group, self.num_workers) + metric_to_sample_dict = {} + for value, sample in metric_values: + if value.item() not in metric_to_sample_dict: + metric_to_sample_dict[value.item()] = [] + metric_to_sample_dict[value.item()].append(sample.item()) + values = [torch.tensor([x]) for x in metric_to_sample_dict.keys()] + samples = [torch.tensor(metric_to_sample_dict[x]) for x in metric_to_sample_dict.keys()] + index_to_metric_fname = f"{metric_save_path}/{metric_name}_index_to_metric" #dict keys + index_to_sample_fname = f"{metric_save_path}/{metric_name}_index_to_sample" #dict values + self.file_write_ordered(values, index_to_metric_fname, torch.long) + self.file_write_ordered(samples, index_to_sample_fname, torch.long) - # index_to_metric outputs a list of unique values (read from an value-ordered set) - # index_to_sample outputs the list of all sample ids for each unique value - index_to_metric_fname = 
f"{metric_save_path}/{metric_name}_index_to_metric" - metric_values = metric_values[:, [1, 0]] # swap columns - metric_values = DistributedDataAnalyzer.dist_sample_sort(metric_values, self.comm_group, - self.num_workers) - unique_vals, sample_counts = torch.unique(metric_values[:, 0], return_counts=True) - self.file_write_ordered(unique_vals, index_to_metric_fname, torch.long) + # get unique values across all ranks and compute the values dtype based on min/max + # value_min, value_max = DistributedDataAnalyzer.dist_min_max(values, self.comm_group) + # metric_value_dtype = find_fit_int_dtype(value_min, value_max) - index_to_sample_fname = f"{metric_save_path}/{metric_name}_index_to_sample" - samples_buffer, samples_it = [], 0 - for unique_v, count in zip(unique_vals, sample_counts): - # values_buffer.append(unique_v.item()) - samples_buffer += ids[samples_it:samples_it + count.item()].tolist() - samples_it += count - # values_buffer = torch.tensor(values_buffer) - samples_buffer = torch.tensor(samples_buffer, device=self.device) - self.file_write_ordered(samples_buffer, index_to_sample_fname, torch.long) elif metric_type == 'accumulate_value_over_samples': metric_value_fname = f"{metric_save_path}/{metric_name}_metric_value" - # gather the sum of all values across all ranks and write to file dist.all_reduce(metric_values, op=dist.ReduceOp.SUM, group=self.comm_group) if self.worker_id == 0: - builder = create_mmap_dataset_builder(metric_value_fname, metric_value_dtype) + metric_values = metric_values.cpu().numpy() + builder = create_mmap_dataset_builder(metric_value_fname, torch.long) builder.add_item(metric_values) close_mmap_dataset_builder(builder, metric_value_fname) - def file_write_ordered(self, tensor, fname, numpy_dtype, sequential_comm=True): + + def file_write_ordered(self, tensor_list, fname, numpy_dtype): """ save a distributed tensor to a single file, iteratively, ordered by rank """ - assert tensor.dim() == 1, "tensor must be serialized (1D)" - # gather the sizes of all tensors to be received in rank 0. - size = torch.tensor(len(tensor), dtype=torch.int64, device=self.device) - sizes = torch.zeros(self.num_workers, dtype=torch.int64, device=self.device) + # each not has a list of rows (tensors) to be written to the file. + # we will serialize it to communicate it in one comm step. + + tkwargs = dict(dtype=torch.int64, device=self.device) + + # 1. gather on rank 0 the number of rows to be sent/recv + row_count = torch.tensor(len(tensor_list), **tkwargs) + row_counts = torch.zeros(self.num_workers, **tkwargs) + dist.all_gather_into_tensor(row_counts, row_count, group=self.comm_group) + assert row_counts[self.worker_id]==row_count, "all_gather failed" #sanity check + + # 3. gather on rank 0 the sizes of the rows to be sent/recv + # (all_gather requires all tensors to be of same size so we need pad them) + max_size = max(row_counts) + row_lens, row_len = None, torch.zeros(max_size, **tkwargs) + if self.worker_id == 0: # create padded recv buffers + row_lens = [torch.zeros(max_size, **tkwargs)]*self.num_workers + row_len[0:len(tensor_list)] = torch.tensor([len(l) for l in tensor_list], **tkwargs) + dist.gather(row_len, row_lens, dst=0, group=self.comm_group) + if self.worker_id == 0: # remove padding from buffers + row_lens = [r[:s] for r,s in zip(row_lens, row_counts)] + + # 4. 
gather on rank 0 of the total size (sum of all row lengths) to be received + size = torch.tensor(sum(row_len).item(), **tkwargs) + sizes = torch.zeros(self.num_workers, **tkwargs) + print("XXX", self.worker_id) dist.all_gather_into_tensor(sizes, size, group=self.comm_group) - assert sizes[self.worker_id] == size, "all_gather did not return the same sizes" #sanity check + assert sizes[self.worker_id]==size, "all_gather did not return the same sizes" #sanity check - # rank 0 creates the file + # method to deserializes a buffer into rows of different lengths and write them to file + def write_recv_buffer_to_file(recv_buffer, src, builder): + assert self.worker_id == 0, "only rank 0 can write to file" + for row_len in row_lens[src]: + builder.add_item(recv_buffer[:row_len].cpu()) + recv_buffer = recv_buffer[row_len:] + + # 5. rank 0 receives all tensors sequentially and writes them to the file + buffer = torch.cat(tensor_list, dim=0).to(self.device) #serialize list into buffer if self.worker_id == 0: os.makedirs(os.path.dirname(fname), exist_ok=True) builder = create_mmap_dataset_builder(fname, numpy_dtype) + write_recv_buffer_to_file(buffer, 0, builder) - if sequential_comm: # send, receive and write all tensors sequentially (slow, memory safe) - - if self.worker_id == 0: - builder.add_item(tensor.cpu()) # write rank 0's tensor - - for src in range(1, self.num_workers): - dist.barrier() - if src == self.worker_id: - dist.send(tensor, 0, group=self.comm_group) # send tensor - elif self.worker_id == 0: - recv = torch.zeros(sizes[src].item(), dtype=tensor.dtype, device=tensor.device) - dist.recv(recv, src=src, group=self.comm_group) - builder.add_item(recv.cpu()) # writes received tensor - - else: # all to all communication of serialized tensor (faster but requires more memory) - tensors = torch.zeros(sum(sizes).item(), dtype=tensor.dtype, device=tensor.device) - dist.all_gather_into_tensor(sizes, size, group=self.comm_group) - if self.worker_id == 0: - builder.add_item(tensors.cpu()) + for src in range(1, self.num_workers): + dist.barrier(group=self.comm_group) + if src == self.worker_id: + dist.send(buffer, 0, group=self.comm_group) # send tensor + elif self.worker_id == 0: + buffer = torch.zeros(sizes[src].item(), dtype=buffer.dtype, device=buffer.device) + dist.recv(buffer, src=src, group=self.comm_group) + write_recv_buffer_to_file(buffer, src, builder) # rank 0 closes the file if self.worker_id == 0: - close_mmap_dataset_builder(builder, fname) # close file + close_mmap_dataset_builder(builder, fname) # close file dist.barrier(self.comm_group) + @staticmethod def dist_min_max(tensor, comm_group): """ given a distributed tensor, return the min/max values across all ranks""" @@ -619,6 +652,7 @@ def dist_min_max(tensor, comm_group): dist.reduce(value_max, 0, op=dist.ReduceOp.MAX, group=comm_group) return value_min.item(), value_max.item() + @staticmethod def dist_sample_sort(tensor, comm_group, num_workers, n_samples=100): """ perform a distributed random sort of a tensor, and returns the sorted partial tensor""" @@ -628,8 +662,8 @@ def dist_sample_sort(tensor, comm_group, num_workers, n_samples=100): tensor = torch.sort(tensor, dim=0)[0] # 2 - collect few samples per rank - idx = torch.round(torch.linspace(0, len(tensor) - 1, n_samples + 1)).to(int) - samples = tensor[idx[:-1]][:, 0].contiguous().to(device) #only first column, all but last row + idx = torch.round(torch.linspace(0, len(tensor) - 1, n_samples+1)).to(int) + samples = tensor[idx[:-1]][:,0].contiguous().to(device) #only first 
column, all but last row # 2 - Allgather samples all_samples = [torch.zeros(n_samples, dtype=samples.dtype, device=device)] * num_workers @@ -639,24 +673,27 @@ def dist_sample_sort(tensor, comm_group, num_workers, n_samples=100): # 3 - Sort all samples and collect the ranges of each rank as equidistant all_samples = all_samples.sort()[0] idx = torch.round(torch.linspace(0, len(all_samples) - 1, num_workers + 1)).to(int) - ranges = all_samples[idx] # range of each rank r as ranges[r] <= x < ranges[r+1] + ranges = all_samples[idx] # range of each rank r as ranges[r] <= x < ranges[r+1] ranges[-1] += 1 # increase upper limit of last rank so that x < ranges[r+1]. # 4 - collect elements to send to each rank, based on the rank ranges send = [] for rank in range(num_workers): - mask = (tensor[:, 0] >= ranges[rank]) & (tensor[:, 0] < ranges[rank + 1]) + mask = (tensor[:,0] >= ranges[rank]) & (tensor[:,0] < ranges[rank+1]) send.append(tensor[mask]) # 5. all to all to communicate the sizes to be sent/recv - send_count = [torch.tensor([len(tensor) * dims], dtype=torch.int64, device=device) for tensor in send] + send_count = [ torch.tensor([len(tensor)*dims], dtype=torch.int64, device=device) for tensor in send] recv_count = list(torch.zeros([num_workers], dtype=torch.int64, device=device).chunk(num_workers)) dist.all_to_all(recv_count, send_count, group=comm_group) # 6. all to all to communicate the elements to be sent/recv as a single tensor send = torch.cat(send, dim=0).flatten().to(device) - recv = torch.zeros(sum(recv_count), dtype=send.dtype).to(device) + recv = torch.zeros( sum(recv_count), dtype=send.dtype).to(device) dist.all_to_all_single(recv, send, recv_count, send_count, group=comm_group) + del send # 7. the received tensor is the 1D disjoint subset of the distributed tensor return recv.view(-1, dims) + + From ec59f08d2534fd13c9a7fb2c8dbdaf6de2df6ee0 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Fri, 16 Feb 2024 13:32:44 +0000 Subject: [PATCH 15/64] fixes in sequential write --- .../data_sampling/data_analyzer.py | 36 ++++++++++--------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index 5e92594d67f8..f3a05b620fa8 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -445,6 +445,7 @@ def run_map_reduce(self, comm_group=None): + class DistributedDataAnalyzer(object): def __init__(self, @@ -545,33 +546,33 @@ def run_map_reduce(self): metric_save_path = f"{self.save_path}/{metric_name}/" if metric_type == 'single_value_per_sample': + # get unique values across all ranks and compute the values dtype based on min/max + # value_min, value_max = DistributedDataAnalyzer.dist_min_max(values, self.comm_group) + # metric_value_dtype = find_fit_int_dtype(value_min, value_max) # sample_to_metric maps sample ids to metric values, as a list of metric values sample_to_metric_fname = f"{metric_save_path}/{metric_name}_sample_to_metric" values = [torch.tensor([x]) for x in metric_values[:,0]] self.file_write_ordered(values, sample_to_metric_fname, torch.long) - # index_to_metric and index_to_sample serialize a dicitonary from metric to samples - # index_to_metric stores a key per row, index_to_sample stores the values per row - # distributed sorting by values, gives an ordered disjoint subset of keys on nodes metric_values = 
DistributedDataAnalyzer.dist_sample_sort(metric_values, self.comm_group, self.num_workers) - metric_to_sample_dict = {} - for value, sample in metric_values: - if value.item() not in metric_to_sample_dict: - metric_to_sample_dict[value.item()] = [] - metric_to_sample_dict[value.item()].append(sample.item()) - values = [torch.tensor([x]) for x in metric_to_sample_dict.keys()] - samples = [torch.tensor(metric_to_sample_dict[x]) for x in metric_to_sample_dict.keys()] + metric_to_samples_dict = {} + if len(metric_values)>0: + for value, sample in metric_values: + if value.item() not in metric_to_samples_dict: + metric_to_samples_dict[value.item()] = [] + metric_to_samples_dict[value.item()].append(sample.item()) + + # index_to_metric and index_to_sample serialize a dicitonary from metric to samples + # index_to_metric stores a key per row, index_to_sample stores the values per row + values = [torch.tensor([x]) for x in metric_to_samples_dict.keys()] + samples = [torch.tensor(metric_to_samples_dict[x]) for x in metric_to_samples_dict.keys()] index_to_metric_fname = f"{metric_save_path}/{metric_name}_index_to_metric" #dict keys index_to_sample_fname = f"{metric_save_path}/{metric_name}_index_to_sample" #dict values self.file_write_ordered(values, index_to_metric_fname, torch.long) self.file_write_ordered(samples, index_to_sample_fname, torch.long) - # get unique values across all ranks and compute the values dtype based on min/max - # value_min, value_max = DistributedDataAnalyzer.dist_min_max(values, self.comm_group) - # metric_value_dtype = find_fit_int_dtype(value_min, value_max) - elif metric_type == 'accumulate_value_over_samples': metric_value_fname = f"{metric_save_path}/{metric_name}_metric_value" dist.all_reduce(metric_values, op=dist.ReduceOp.SUM, group=self.comm_group) @@ -610,7 +611,6 @@ def file_write_ordered(self, tensor_list, fname, numpy_dtype): # 4. gather on rank 0 of the total size (sum of all row lengths) to be received size = torch.tensor(sum(row_len).item(), **tkwargs) sizes = torch.zeros(self.num_workers, **tkwargs) - print("XXX", self.worker_id) dist.all_gather_into_tensor(sizes, size, group=self.comm_group) assert sizes[self.worker_id]==size, "all_gather did not return the same sizes" #sanity check @@ -622,7 +622,10 @@ def write_recv_buffer_to_file(recv_buffer, src, builder): recv_buffer = recv_buffer[row_len:] # 5. rank 0 receives all tensors sequentially and writes them to the file - buffer = torch.cat(tensor_list, dim=0).to(self.device) #serialize list into buffer + if len(tensor_list) == 0: + buffer = torch.tensor([], **tkwargs) #create zero-size buffer + else: + buffer = torch.cat(tensor_list, dim=0).to(self.device) #serialize list into buffer if self.worker_id == 0: os.makedirs(os.path.dirname(fname), exist_ok=True) builder = create_mmap_dataset_builder(fname, numpy_dtype) @@ -696,4 +699,3 @@ def dist_sample_sort(tensor, comm_group, num_workers, n_samples=100): # 7. 
the received tensor is the 1D disjoint subset of the distributed tensor return recv.view(-1, dims) - From ea0d65f558c937a87ce7575fa8a464df032a21bd Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Fri, 16 Feb 2024 13:33:45 +0000 Subject: [PATCH 16/64] pre-commit hooks --- .../data_sampling/data_analyzer.py | 92 +++++++++---------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index 3f69373dea56..1725786a4428 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -442,25 +442,25 @@ def run_map_reduce(self, comm_group=None): self.run_reduce() # wait for the reduce, where rank 0 merges all (partial) files. Dataset can then be used by all nodes. dist.barrier(group=comm_group) - class DistributedDataAnalyzer(object): - def __init__(self, - dataset, - num_workers=1, - worker_id=0, - batch_size=1, - metric_names=[], - metric_functions=[], - metric_types=[], - save_path="./", - collate_fn=None, - device='cuda', - comm_group=None, - sample_indices=None, - ) -> None: + def __init__( + self, + dataset, + num_workers=1, + worker_id=0, + batch_size=1, + metric_names=[], + metric_functions=[], + metric_types=[], + save_path="./", + collate_fn=None, + device='cuda', + comm_group=None, + sample_indices=None, + ) -> None: self.dataset = dataset self.comm_group = comm_group self.batch_size = batch_size @@ -481,7 +481,6 @@ def __init__(self, self.num_workers = num_workers self.worker_id = worker_id - def run_map_reduce(self): # setup individual dataloaders @@ -490,15 +489,18 @@ def run_map_reduce(self): logger.info(f"worker {self.worker_id}: start working on data subset {start_idx} to {end_idx}") worker_dataset = Subset(self.dataset, list(range(start_idx, end_idx))) sampler = BatchSampler(SequentialSampler(worker_dataset), batch_size=self.batch_size, drop_last=False) - dataloader = DataLoader(dataset=worker_dataset, batch_sampler=sampler, - num_workers=0, collate_fn=self.collate_fn, pin_memory=False) + dataloader = DataLoader(dataset=worker_dataset, + batch_sampler=sampler, + num_workers=0, + collate_fn=self.collate_fn, + pin_memory=False) # set initial results list metric_results = [] for metric_type in self.metric_types: assert metric_type in ['single_value_per_sample', 'accumulate_value_over_samples'], \ f"metric_type {metric_type} not implemented." 
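For orientation, two hypothetical metric_functions matching the two supported metric_types; the batch layout (a dict holding an 'input_ids' tensor of shape batch x seq_len), the padding id 0 and the vocabulary size are illustrative assumptions, the only contract relied on being that each call returns an integer-typed tensor:

import torch

def seqlen_metric(batch):
    # 'single_value_per_sample': one integer per sample, here the non-padding token count
    return (batch['input_ids'] != 0).sum(dim=1).to(torch.long)

def token_count_metric(batch):
    # 'accumulate_value_over_samples': a fixed-shape tensor summed over batches (and later over ranks),
    # here a histogram of token ids over an assumed vocabulary of 32000 entries
    return torch.bincount(batch['input_ids'].flatten(), minlength=32000).to(torch.long)
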
- metric_results.append( [] if metric_type == 'single_value_per_sample' else None ) + metric_results.append([] if metric_type == 'single_value_per_sample' else None) # iterate dataloader and store metric results batch_start_idx = start_idx @@ -518,7 +520,8 @@ def run_map_reduce(self): for row in range(metric_values.size()[0]): value = metric_values[row].item() sample_idx = batch_start_idx + row # sample idx following dataset iteration order - if isinstance(data, dict) and 'index' in data: # Megatron use case, idx provided in 'index' field + if isinstance(data, + dict) and 'index' in data: # Megatron use case, idx provided in 'index' field sample_idx = data['index'][row][0].item() elif self.sample_indices is not None: # user defined shuffling of indices sample_idx = self.sample_indices[sample_idx] @@ -537,7 +540,7 @@ def run_map_reduce(self): logger.info(f"Will use {sample_idx_dtype} to store the sample indexes.") # convert to list of tensors - metric_results = [ torch.tensor(m).to(self.device) for m in metric_results ] + metric_results = [torch.tensor(m).to(self.device) for m in metric_results] for m_idx in range(len(self.metric_names)): metric_values, metric_name, metric_type = \ @@ -551,13 +554,14 @@ def run_map_reduce(self): # sample_to_metric maps sample ids to metric values, as a list of metric values sample_to_metric_fname = f"{metric_save_path}/{metric_name}_sample_to_metric" - values = [torch.tensor([x]) for x in metric_values[:,0]] + values = [torch.tensor([x]) for x in metric_values[:, 0]] self.file_write_ordered(values, sample_to_metric_fname, torch.long) # distributed sorting by values, gives an ordered disjoint subset of keys on nodes - metric_values = DistributedDataAnalyzer.dist_sample_sort(metric_values, self.comm_group, self.num_workers) + metric_values = DistributedDataAnalyzer.dist_sample_sort(metric_values, self.comm_group, + self.num_workers) metric_to_samples_dict = {} - if len(metric_values)>0: + if len(metric_values) > 0: for value, sample in metric_values: if value.item() not in metric_to_samples_dict: metric_to_samples_dict[value.item()] = [] @@ -567,8 +571,8 @@ def run_map_reduce(self): # index_to_metric stores a key per row, index_to_sample stores the values per row values = [torch.tensor([x]) for x in metric_to_samples_dict.keys()] samples = [torch.tensor(metric_to_samples_dict[x]) for x in metric_to_samples_dict.keys()] - index_to_metric_fname = f"{metric_save_path}/{metric_name}_index_to_metric" #dict keys - index_to_sample_fname = f"{metric_save_path}/{metric_name}_index_to_sample" #dict values + index_to_metric_fname = f"{metric_save_path}/{metric_name}_index_to_metric" #dict keys + index_to_sample_fname = f"{metric_save_path}/{metric_name}_index_to_sample" #dict values self.file_write_ordered(values, index_to_metric_fname, torch.long) self.file_write_ordered(samples, index_to_sample_fname, torch.long) @@ -581,7 +585,6 @@ def run_map_reduce(self): builder.add_item(metric_values) close_mmap_dataset_builder(builder, metric_value_fname) - def file_write_ordered(self, tensor_list, fname, numpy_dtype): """ save a distributed tensor to a single file, iteratively, ordered by rank """ @@ -594,24 +597,24 @@ def file_write_ordered(self, tensor_list, fname, numpy_dtype): row_count = torch.tensor(len(tensor_list), **tkwargs) row_counts = torch.zeros(self.num_workers, **tkwargs) dist.all_gather_into_tensor(row_counts, row_count, group=self.comm_group) - assert row_counts[self.worker_id]==row_count, "all_gather failed" #sanity check + assert row_counts[self.worker_id] == 
row_count, "all_gather failed" #sanity check # 3. gather on rank 0 the sizes of the rows to be sent/recv # (all_gather requires all tensors to be of same size so we need pad them) max_size = max(row_counts) row_lens, row_len = None, torch.zeros(max_size, **tkwargs) - if self.worker_id == 0: # create padded recv buffers - row_lens = [torch.zeros(max_size, **tkwargs)]*self.num_workers + if self.worker_id == 0: # create padded recv buffers + row_lens = [torch.zeros(max_size, **tkwargs)] * self.num_workers row_len[0:len(tensor_list)] = torch.tensor([len(l) for l in tensor_list], **tkwargs) dist.gather(row_len, row_lens, dst=0, group=self.comm_group) - if self.worker_id == 0: # remove padding from buffers - row_lens = [r[:s] for r,s in zip(row_lens, row_counts)] + if self.worker_id == 0: # remove padding from buffers + row_lens = [r[:s] for r, s in zip(row_lens, row_counts)] # 4. gather on rank 0 of the total size (sum of all row lengths) to be received size = torch.tensor(sum(row_len).item(), **tkwargs) sizes = torch.zeros(self.num_workers, **tkwargs) dist.all_gather_into_tensor(sizes, size, group=self.comm_group) - assert sizes[self.worker_id]==size, "all_gather did not return the same sizes" #sanity check + assert sizes[self.worker_id] == size, "all_gather did not return the same sizes" #sanity check # method to deserializes a buffer into rows of different lengths and write them to file def write_recv_buffer_to_file(recv_buffer, src, builder): @@ -622,9 +625,9 @@ def write_recv_buffer_to_file(recv_buffer, src, builder): # 5. rank 0 receives all tensors sequentially and writes them to the file if len(tensor_list) == 0: - buffer = torch.tensor([], **tkwargs) #create zero-size buffer + buffer = torch.tensor([], **tkwargs) #create zero-size buffer else: - buffer = torch.cat(tensor_list, dim=0).to(self.device) #serialize list into buffer + buffer = torch.cat(tensor_list, dim=0).to(self.device) #serialize list into buffer if self.worker_id == 0: os.makedirs(os.path.dirname(fname), exist_ok=True) builder = create_mmap_dataset_builder(fname, numpy_dtype) @@ -633,7 +636,7 @@ def write_recv_buffer_to_file(recv_buffer, src, builder): for src in range(1, self.num_workers): dist.barrier(group=self.comm_group) if src == self.worker_id: - dist.send(buffer, 0, group=self.comm_group) # send tensor + dist.send(buffer, 0, group=self.comm_group) # send tensor elif self.worker_id == 0: buffer = torch.zeros(sizes[src].item(), dtype=buffer.dtype, device=buffer.device) dist.recv(buffer, src=src, group=self.comm_group) @@ -641,10 +644,9 @@ def write_recv_buffer_to_file(recv_buffer, src, builder): # rank 0 closes the file if self.worker_id == 0: - close_mmap_dataset_builder(builder, fname) # close file + close_mmap_dataset_builder(builder, fname) # close file dist.barrier(self.comm_group) - @staticmethod def dist_min_max(tensor, comm_group): """ given a distributed tensor, return the min/max values across all ranks""" @@ -654,7 +656,6 @@ def dist_min_max(tensor, comm_group): dist.reduce(value_max, 0, op=dist.ReduceOp.MAX, group=comm_group) return value_min.item(), value_max.item() - @staticmethod def dist_sample_sort(tensor, comm_group, num_workers, n_samples=100): """ perform a distributed random sort of a tensor, and returns the sorted partial tensor""" @@ -664,8 +665,8 @@ def dist_sample_sort(tensor, comm_group, num_workers, n_samples=100): tensor = torch.sort(tensor, dim=0)[0] # 2 - collect few samples per rank - idx = torch.round(torch.linspace(0, len(tensor) - 1, n_samples+1)).to(int) - samples = 
tensor[idx[:-1]][:,0].contiguous().to(device) #only first column, all but last row + idx = torch.round(torch.linspace(0, len(tensor) - 1, n_samples + 1)).to(int) + samples = tensor[idx[:-1]][:, 0].contiguous().to(device) #only first column, all but last row # 2 - Allgather samples all_samples = [torch.zeros(n_samples, dtype=samples.dtype, device=device)] * num_workers @@ -675,26 +676,25 @@ def dist_sample_sort(tensor, comm_group, num_workers, n_samples=100): # 3 - Sort all samples and collect the ranges of each rank as equidistant all_samples = all_samples.sort()[0] idx = torch.round(torch.linspace(0, len(all_samples) - 1, num_workers + 1)).to(int) - ranges = all_samples[idx] # range of each rank r as ranges[r] <= x < ranges[r+1] + ranges = all_samples[idx] # range of each rank r as ranges[r] <= x < ranges[r+1] ranges[-1] += 1 # increase upper limit of last rank so that x < ranges[r+1]. # 4 - collect elements to send to each rank, based on the rank ranges send = [] for rank in range(num_workers): - mask = (tensor[:,0] >= ranges[rank]) & (tensor[:,0] < ranges[rank+1]) + mask = (tensor[:, 0] >= ranges[rank]) & (tensor[:, 0] < ranges[rank + 1]) send.append(tensor[mask]) # 5. all to all to communicate the sizes to be sent/recv - send_count = [ torch.tensor([len(tensor)*dims], dtype=torch.int64, device=device) for tensor in send] + send_count = [torch.tensor([len(tensor) * dims], dtype=torch.int64, device=device) for tensor in send] recv_count = list(torch.zeros([num_workers], dtype=torch.int64, device=device).chunk(num_workers)) dist.all_to_all(recv_count, send_count, group=comm_group) # 6. all to all to communicate the elements to be sent/recv as a single tensor send = torch.cat(send, dim=0).flatten().to(device) - recv = torch.zeros( sum(recv_count), dtype=send.dtype).to(device) + recv = torch.zeros(sum(recv_count), dtype=send.dtype).to(device) dist.all_to_all_single(recv, send, recv_count, send_count, group=comm_group) del send # 7. 
the received tensor is the 1D disjoint subset of the distributed tensor return recv.view(-1, dims) - From 56a953386d9b7c42c91814d7cc7e985ddfa1c20b Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Sun, 18 Feb 2024 08:39:18 +0000 Subject: [PATCH 17/64] added main as example --- .../data_sampling/data_analyzer.py | 199 ++++++++++++------ 1 file changed, 135 insertions(+), 64 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index 1725786a4428..795cde884bf9 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -12,7 +12,7 @@ import torch from torch.utils.data import BatchSampler, SequentialSampler, DataLoader, Subset -from deepspeed.utils import logger +from deepspeed.utils import logger, groups import deepspeed.comm as dist from .indexed_dataset import MMapIndexedDataset, valid_dtypes from .utils import split_dataset, split_index, create_mmap_dataset_builder, close_mmap_dataset_builder, find_fit_int_dtype @@ -449,8 +449,6 @@ class DistributedDataAnalyzer(object): def __init__( self, dataset, - num_workers=1, - worker_id=0, batch_size=1, metric_names=[], metric_functions=[], @@ -462,7 +460,6 @@ def __init__( sample_indices=None, ) -> None: self.dataset = dataset - self.comm_group = comm_group self.batch_size = batch_size self.metric_names = metric_names self.metric_functions = metric_functions @@ -471,15 +468,11 @@ def __init__( self.collate_fn = collate_fn self.device = device self.sample_indices = sample_indices - - # comm_group and num_workers/worker_id are mutually exclusive - self.comm_group = comm_group - if comm_group is not None: - self.num_workers = comm_group.size() - self.worker_id = comm_group.rank() - else: - self.num_workers = num_workers - self.worker_id = worker_id + self.comm_group = comm_group or groups._clone_world_group() + self.num_workers = self.comm_group.size() + self.worker_id = self.comm_group.rank() + if self.worker_id == 0: + logger.info(f"Data analyzer initialized with {self.num_workers} workers.") def run_map_reduce(self): @@ -506,8 +499,7 @@ def run_map_reduce(self): batch_start_idx = start_idx for data in dataloader: for m_idx in range(len(self.metric_names)): - metric_type, metric_function, metric_result = \ - self.metric_types[m_idx], self.metric_functions[m_idx], metric_results[m_idx] + metric_type, metric_function = self.metric_types[m_idx], self.metric_functions[m_idx] metric_values = metric_function(data) assert torch.is_tensor(metric_values) or isinstance(metric_values, np.ndarray), \ "metric_function must return a tensor or array" @@ -525,12 +517,13 @@ def run_map_reduce(self): sample_idx = data['index'][row][0].item() elif self.sample_indices is not None: # user defined shuffling of indices sample_idx = self.sample_indices[sample_idx] - metric_result.append((value, sample_idx)) + metric_results[m_idx].append((value, sample_idx)) elif metric_type == 'accumulate_value_over_samples': - if metric_result is None: - metric_result = metric_values + metric_values = metric_values.sum(dim=0) # sum over batch + if metric_results[m_idx] is None: + metric_results[m_idx] = metric_values else: - metric_result.add_(metric_values) + metric_results[m_idx].add_(metric_values) batch_start_idx += self.batch_size # compute dtype for sample ids @@ -549,7 +542,7 @@ def run_map_reduce(self): if metric_type == 'single_value_per_sample': # get unique values across all ranks and 
compute the values dtype based on min/max - # value_min, value_max = DistributedDataAnalyzer.dist_min_max(values, self.comm_group) + # value_min, value_max = Dist.min_max(values, self.comm_group) # metric_value_dtype = find_fit_int_dtype(value_min, value_max) # sample_to_metric maps sample ids to metric values, as a list of metric values @@ -558,8 +551,7 @@ def run_map_reduce(self): self.file_write_ordered(values, sample_to_metric_fname, torch.long) # distributed sorting by values, gives an ordered disjoint subset of keys on nodes - metric_values = DistributedDataAnalyzer.dist_sample_sort(metric_values, self.comm_group, - self.num_workers) + metric_values = Dist.sample_sort(metric_values, self.comm_group, self.num_workers) metric_to_samples_dict = {} if len(metric_values) > 0: for value, sample in metric_values: @@ -576,17 +568,25 @@ def run_map_reduce(self): self.file_write_ordered(values, index_to_metric_fname, torch.long) self.file_write_ordered(samples, index_to_sample_fname, torch.long) + if self.worker_id == 0: + DataAnalyzer.output_index_to_sample_percentile(index_to_sample_fname, index_to_metric_fname, + metric_name, metric_save_path, total_num_samples, + sample_idx_dtype) + dist.barrier(self.comm_group) + elif metric_type == 'accumulate_value_over_samples': metric_value_fname = f"{metric_save_path}/{metric_name}_metric_value" - dist.all_reduce(metric_values, op=dist.ReduceOp.SUM, group=self.comm_group) + # metric_values = metric_values.sum(dim=0) + dist.reduce(metric_values, dst=0, op=dist.ReduceOp.SUM, group=self.comm_group) if self.worker_id == 0: metric_values = metric_values.cpu().numpy() builder = create_mmap_dataset_builder(metric_value_fname, torch.long) builder.add_item(metric_values) close_mmap_dataset_builder(builder, metric_value_fname) + dist.barrier(self.comm_group) def file_write_ordered(self, tensor_list, fname, numpy_dtype): - """ save a distributed tensor to a single file, iteratively, ordered by rank """ + """ save a distributed list of tensors to a file, by one rank, iteratively """ # each not has a list of rows (tensors) to be written to the file. # we will serialize it to communicate it in one comm step. @@ -594,61 +594,61 @@ def file_write_ordered(self, tensor_list, fname, numpy_dtype): tkwargs = dict(dtype=torch.int64, device=self.device) # 1. gather on rank 0 the number of rows to be sent/recv - row_count = torch.tensor(len(tensor_list), **tkwargs) + row_count = torch.tensor([len(tensor_list)], **tkwargs) row_counts = torch.zeros(self.num_workers, **tkwargs) dist.all_gather_into_tensor(row_counts, row_count, group=self.comm_group) - assert row_counts[self.worker_id] == row_count, "all_gather failed" #sanity check - - # 3. gather on rank 0 the sizes of the rows to be sent/recv - # (all_gather requires all tensors to be of same size so we need pad them) - max_size = max(row_counts) - row_lens, row_len = None, torch.zeros(max_size, **tkwargs) - if self.worker_id == 0: # create padded recv buffers - row_lens = [torch.zeros(max_size, **tkwargs)] * self.num_workers - row_len[0:len(tensor_list)] = torch.tensor([len(l) for l in tensor_list], **tkwargs) - dist.gather(row_len, row_lens, dst=0, group=self.comm_group) - if self.worker_id == 0: # remove padding from buffers - row_lens = [r[:s] for r, s in zip(row_lens, row_counts)] + assert row_counts[self.worker_id] == row_count == len(tensor_list), "all_gather failed" + + # 2. 
gather on rank 0 the sizes of the rows to be sent/recv + row_len = torch.tensor([len(l) for l in tensor_list], **tkwargs) + row_lens = Dist.gather_v(row_len, 0, self.comm_group, self.num_workers, self.worker_id) # 4. gather on rank 0 of the total size (sum of all row lengths) to be received - size = torch.tensor(sum(row_len).item(), **tkwargs) + size = torch.tensor([sum(row_len).item()], **tkwargs) sizes = torch.zeros(self.num_workers, **tkwargs) dist.all_gather_into_tensor(sizes, size, group=self.comm_group) - assert sizes[self.worker_id] == size, "all_gather did not return the same sizes" #sanity check + assert sizes[self.worker_id] == size.item(), "all_gather did not return the same sizes" #sanity check # method to deserializes a buffer into rows of different lengths and write them to file - def write_recv_buffer_to_file(recv_buffer, src, builder): + def write_buffer_to_file(buff, src, builder): assert self.worker_id == 0, "only rank 0 can write to file" for row_len in row_lens[src]: - builder.add_item(recv_buffer[:row_len].cpu()) - recv_buffer = recv_buffer[row_len:] + builder.add_item(buff[:row_len].cpu()) + print("--", src, fname, buff[:row_len].cpu()) + buff = buff[row_len:] - # 5. rank 0 receives all tensors sequentially and writes them to the file - if len(tensor_list) == 0: - buffer = torch.tensor([], **tkwargs) #create zero-size buffer - else: - buffer = torch.cat(tensor_list, dim=0).to(self.device) #serialize list into buffer + # 5. rank 0 prepares output folder and file if self.worker_id == 0: os.makedirs(os.path.dirname(fname), exist_ok=True) builder = create_mmap_dataset_builder(fname, numpy_dtype) - write_recv_buffer_to_file(buffer, 0, builder) - for src in range(1, self.num_workers): + # iterate through ranks that have data to be sent/recv/written + for src in [rank for rank, count in enumerate(row_counts) if count > 0]: + + if self.worker_id == 0: + if src == 0: # rank 0's write its own data + buffer = torch.cat(tensor_list, dim=0).to(self.device) + write_buffer_to_file(buffer, 0, builder) + else: # rank 0 receives other rank's data and writes it + buffer = torch.zeros(sizes[src].item(), dtype=buffer.dtype, device=buffer.device) + dist.recv(buffer, src=src, group=self.comm_group) + write_buffer_to_file(buffer, src, builder) + elif self.worker_id == src: # current rank sends data to rank 0 + buffer = torch.cat(tensor_list, dim=0).to(self.device) + dist.send(buffer, 0, group=self.comm_group) dist.barrier(group=self.comm_group) - if src == self.worker_id: - dist.send(buffer, 0, group=self.comm_group) # send tensor - elif self.worker_id == 0: - buffer = torch.zeros(sizes[src].item(), dtype=buffer.dtype, device=buffer.device) - dist.recv(buffer, src=src, group=self.comm_group) - write_recv_buffer_to_file(buffer, src, builder) # rank 0 closes the file if self.worker_id == 0: close_mmap_dataset_builder(builder, fname) # close file dist.barrier(self.comm_group) + +class Dist: + """ auxiliary class to perform distributed operations on tensors""" + @staticmethod - def dist_min_max(tensor, comm_group): + def min_max(tensor, comm_group): """ given a distributed tensor, return the min/max values across all ranks""" value_min, value_max = tensor.min(), tensor.max() @@ -657,12 +657,36 @@ def dist_min_max(tensor, comm_group): return value_min.item(), value_max.item() @staticmethod - def dist_sample_sort(tensor, comm_group, num_workers, n_samples=100): + def gather_v(tensor, dst, comm_group, num_workers, worker_id): + """ MPI_Alltoallv. 
gather tensors of variable sizes in a single rank """ + + # gather on rank 0 the number of rows to be sent/recv + size = torch.tensor([len(tensor)], dtype=torch.int64, device=tensor.device) + sizes = torch.zeros(num_workers, dtype=torch.int64, device=tensor.device) + dist.all_gather_into_tensor(sizes, size, group=comm_group) + assert sizes[worker_id] == size, "all_gather failed" + + # all_gather requires all tensors to be of same size so we need to pad them + max_size = max(sizes).item() + buffer = torch.zeros(max_size, dtype=tensor.dtype, device=tensor.device) + buffer[0:size] = torch.tensor(tensor, dtype=tensor.dtype, device=tensor.device) + buffer_list = None + if worker_id == 0: # create padded recv buffers + buffer_list = [torch.zeros(max_size, dtype=torch.int64, device=tensor.device)] * num_workers + dist.gather(buffer, buffer_list, dst=dst, group=comm_group) + + # revert padding and return value + if worker_id == 0: + buffer_list = [r[:s.item()] for r, s in zip(buffer_list, sizes)] + return buffer_list + + @staticmethod + def sample_sort(tensor, comm_group, num_workers, n_samples=100): """ perform a distributed random sort of a tensor, and returns the sorted partial tensor""" device, dims = tensor.device, tensor.size()[1] - # 1 - Sort locally - tensor = torch.sort(tensor, dim=0)[0] + # 1 - sort rows by first column, then second column, then third, etc... + tensor = torch.tensor(sorted(tensor.tolist()), dtype=tensor.dtype, device=tensor.device) # 2 - collect few samples per rank idx = torch.round(torch.linspace(0, len(tensor) - 1, n_samples + 1)).to(int) @@ -686,15 +710,62 @@ def dist_sample_sort(tensor, comm_group, num_workers, n_samples=100): send.append(tensor[mask]) # 5. all to all to communicate the sizes to be sent/recv - send_count = [torch.tensor([len(tensor) * dims], dtype=torch.int64, device=device) for tensor in send] - recv_count = list(torch.zeros([num_workers], dtype=torch.int64, device=device).chunk(num_workers)) + send_count = [torch.tensor([len(s) * dims], dtype=torch.int64, device=device) for s in send] + recv_count = list(torch.empty([num_workers], dtype=torch.int64, device=device).chunk(num_workers)) dist.all_to_all(recv_count, send_count, group=comm_group) - # 6. all to all to communicate the elements to be sent/recv as a single tensor + # 6. all-to-all-v to communicate the elements to be sent/recv as a single tensor send = torch.cat(send, dim=0).flatten().to(device) recv = torch.zeros(sum(recv_count), dtype=send.dtype).to(device) + send_count = [s.item() for s in send_count] # convert to list of ints + recv_count = [r.item() for r in recv_count] dist.all_to_all_single(recv, send, recv_count, send_count, group=comm_group) del send - # 7. the received tensor is the 1D disjoint subset of the distributed tensor - return recv.view(-1, dims) + # 7. the received tensor is the 1D disjoint subset of the distributed tensor. + # We will recover the original dimensionality and sort it by columns again. 
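+        # Illustrative example (assumed toy values): with two workers, rank 0 holding
+        # rows [(3, 0), (1, 1)] and rank 1 holding [(2, 2), (4, 3)], key ranges of
+        # roughly [1, 3) and [3, 5) leave rank 0 with [(1, 1), (2, 2)] and rank 1 with
+        # [(3, 0), (4, 3)]; concatenating the per-rank results in rank order yields
+        # the globally sorted (value, sample_idx) rows.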
+ recv = recv.view(-1, dims) + recv = torch.tensor(sorted(recv.tolist()), dtype=recv.dtype, device=recv.device) + return recv + + +if __name__ == "__main__": + + # from deepspeed.utils.groups import _get_data_parallel_group, _WORLD_GROUP + if not dist.is_initialized(): + dist.init_distributed() + # comm_group = dist.new_group(ranks=range(3)) + + class Dataset(torch.utils.data.Dataset): + + def __init__(self, size=20): + self.values = [1001 + x % 6 for x in range(size)] + self.size = size + + def __len__(self): + return self.size + + def __getitem__(self, idx): + return self.values[idx] + + id = lambda t: torch.tensor(t).to(torch.int64) + kwargs = dict( + dataset=Dataset(), + batch_size=3, + metric_names=["mod", "sum"], + metric_functions=[id, id], + metric_types=['single_value_per_sample', 'accumulate_value_over_samples'], + ) + + DistributedDataAnalyzer( + comm_group=None, + save_path="./output_dist", + # device=f"cuda:{int(os.environ['LOCAL_RANK'])}", + **kwargs, + ).run_map_reduce() + + DataAnalyzer(num_threads=2, + num_threads_reduce=2, + metric_dtypes=[torch.int64, torch.int64], + save_path="./output_disk", + **kwargs).run_map_reduce() From 6788af553078c52059cd7a3dec3dba5f7f86479f Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Sun, 18 Feb 2024 22:29:39 +0100 Subject: [PATCH 18/64] Update data_analyzer.py --- deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index 596bd6fbe459..b2d3b92cd594 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -658,7 +658,7 @@ def min_max(tensor, comm_group): @staticmethod def gather_v(tensor, dst, comm_group, num_workers, worker_id): - """ MPI_Alltoallv. gather tensors of variable sizes in a single rank """ + """ Same as MPI_Gatherv. Gathers tensors of variable sizes in a single rank """ # gather on rank 0 the number of rows to be sent/recv size = torch.tensor([len(tensor)], dtype=torch.int64, device=tensor.device) From bd61d9c201cb7750c05daf7f456bf5200cd4169f Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Mon, 19 Feb 2024 00:46:21 +0000 Subject: [PATCH 19/64] first working version. 
idx files differ --- .../data_sampling/data_analyzer.py | 153 +++++++++++------- 1 file changed, 97 insertions(+), 56 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index 596bd6fbe459..3489937d48f4 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -449,6 +449,8 @@ class DistributedDataAnalyzer(object): def __init__( self, dataset, + num_workers=1, + worker_id=0, batch_size=1, metric_names=[], metric_functions=[], @@ -468,9 +470,20 @@ def __init__( self.collate_fn = collate_fn self.device = device self.sample_indices = sample_indices - self.comm_group = comm_group or groups._clone_world_group() - self.num_workers = self.comm_group.size() - self.worker_id = self.comm_group.rank() + + if not dist.is_initialized(): + dist.init_distributed() + + # comm_group and worker_id+num_workers are mutually exclusive + if comm_group is not None: + self.comm_group = comm_group + self.num_workers = self.comm_group.size() + self.worker_id = self.comm_group.rank() + else: + self.comm_group = groups._clone_world_group() + self.num_workers = self.comm_group.size() + self.worker_id = self.comm_group.rank() + if self.worker_id == 0: logger.info(f"Data analyzer initialized with {self.num_workers} workers.") @@ -519,7 +532,6 @@ def run_map_reduce(self): sample_idx = self.sample_indices[sample_idx] metric_results[m_idx].append((value, sample_idx)) elif metric_type == 'accumulate_value_over_samples': - metric_values = metric_values.sum(dim=0) # sum over batch if metric_results[m_idx] is None: metric_results[m_idx] = metric_values else: @@ -539,16 +551,21 @@ def run_map_reduce(self): metric_values, metric_name, metric_type = \ metric_results[m_idx], self.metric_names[m_idx], self.metric_types[m_idx] metric_save_path = f"{self.save_path}/{metric_name}/" + os.makedirs(metric_save_path, exist_ok=True) + if metric_type == 'single_value_per_sample': - # get unique values across all ranks and compute the values dtype based on min/max - # value_min, value_max = Dist.min_max(values, self.comm_group) - # metric_value_dtype = find_fit_int_dtype(value_min, value_max) + # Compute sample and metric value dtypes based on range + values, samples = metric_values[:, 0], metric_values[:, 1] + value_min, value_max = Dist.min_max(values, self.comm_group) + sample_min, sample_max = Dist.min_max(samples, self.comm_group) + metric_value_dtype = find_fit_int_dtype(value_min, value_max) + sample_value_dtype = find_fit_int_dtype(sample_min, sample_max) # sample_to_metric maps sample ids to metric values, as a list of metric values sample_to_metric_fname = f"{metric_save_path}/{metric_name}_sample_to_metric" values = [torch.tensor([x]) for x in metric_values[:, 0]] - self.file_write_ordered(values, sample_to_metric_fname, torch.long) + self.file_write_ordered(values, sample_to_metric_fname, metric_value_dtype) # distributed sorting by values, gives an ordered disjoint subset of keys on nodes metric_values = Dist.sample_sort(metric_values, self.comm_group, self.num_workers) @@ -565,8 +582,8 @@ def run_map_reduce(self): samples = [torch.tensor(metric_to_samples_dict[x]) for x in metric_to_samples_dict.keys()] index_to_metric_fname = f"{metric_save_path}/{metric_name}_index_to_metric" #dict keys index_to_sample_fname = f"{metric_save_path}/{metric_name}_index_to_sample" #dict values - self.file_write_ordered(values, index_to_metric_fname, 
torch.long) - self.file_write_ordered(samples, index_to_sample_fname, torch.long) + self.file_write_ordered(values, index_to_metric_fname, metric_value_dtype) + self.file_write_ordered(samples, index_to_sample_fname, sample_value_dtype) if self.worker_id == 0: DataAnalyzer.output_index_to_sample_percentile(index_to_sample_fname, index_to_metric_fname, @@ -576,12 +593,12 @@ def run_map_reduce(self): elif metric_type == 'accumulate_value_over_samples': metric_value_fname = f"{metric_save_path}/{metric_name}_metric_value" - # metric_values = metric_values.sum(dim=0) dist.reduce(metric_values, dst=0, op=dist.ReduceOp.SUM, group=self.comm_group) + metric_value_dtype = find_fit_int_dtype(metric_values.min(), metric_values.max()) + if self.worker_id == 0: - metric_values = metric_values.cpu().numpy() - builder = create_mmap_dataset_builder(metric_value_fname, torch.long) - builder.add_item(metric_values) + builder = create_mmap_dataset_builder(metric_value_fname, metric_value_dtype) + builder.add_item(metric_values.cpu()) close_mmap_dataset_builder(builder, metric_value_fname) dist.barrier(self.comm_group) @@ -614,7 +631,6 @@ def write_buffer_to_file(buff, src, builder): assert self.worker_id == 0, "only rank 0 can write to file" for row_len in row_lens[src]: builder.add_item(buff[:row_len].cpu()) - print("--", src, fname, buff[:row_len].cpu()) buff = buff[row_len:] # 5. rank 0 prepares output folder and file @@ -625,18 +641,18 @@ def write_buffer_to_file(buff, src, builder): # iterate through ranks that have data to be sent/recv/written for src in [rank for rank, count in enumerate(row_counts) if count > 0]: - if self.worker_id == 0: - if src == 0: # rank 0's write its own data - buffer = torch.cat(tensor_list, dim=0).to(self.device) - write_buffer_to_file(buffer, 0, builder) - else: # rank 0 receives other rank's data and writes it - buffer = torch.zeros(sizes[src].item(), dtype=buffer.dtype, device=buffer.device) - dist.recv(buffer, src=src, group=self.comm_group) - write_buffer_to_file(buffer, src, builder) + dist.barrier(group=self.comm_group) + if self.worker_id == 0 and src == 0: # rank 0's write its own data + buffer = torch.cat(tensor_list, dim=0).to(self.device) + write_buffer_to_file(buffer, 0, builder) + elif self.worker_id == 0 and src > 0: # rank 0 receives other rank's data and writes it + buffer = torch.empty(sizes[src].item(), dtype=buffer.dtype, device=buffer.device) + err = dist.recv(buffer, src=src, group=self.comm_group, tag=src) + assert err == src and len(buffer) > 0, "recv failed" + write_buffer_to_file(buffer, src, builder) elif self.worker_id == src: # current rank sends data to rank 0 buffer = torch.cat(tensor_list, dim=0).to(self.device) - dist.send(buffer, 0, group=self.comm_group) - dist.barrier(group=self.comm_group) + dist.send(buffer, 0, group=self.comm_group, tag=src) # rank 0 closes the file if self.worker_id == 0: @@ -660,7 +676,7 @@ def min_max(tensor, comm_group): def gather_v(tensor, dst, comm_group, num_workers, worker_id): """ MPI_Alltoallv. 
gather tensors of variable sizes in a single rank """ - # gather on rank 0 the number of rows to be sent/recv + # gather the number of rows to be sent/recv size = torch.tensor([len(tensor)], dtype=torch.int64, device=tensor.device) sizes = torch.zeros(num_workers, dtype=torch.int64, device=tensor.device) dist.all_gather_into_tensor(sizes, size, group=comm_group) @@ -668,11 +684,11 @@ def gather_v(tensor, dst, comm_group, num_workers, worker_id): # all_gather requires all tensors to be of same size so we need to pad them max_size = max(sizes).item() - buffer = torch.zeros(max_size, dtype=tensor.dtype, device=tensor.device) + buffer = torch.empty(max_size, dtype=tensor.dtype, device=tensor.device) buffer[0:size] = torch.tensor(tensor, dtype=tensor.dtype, device=tensor.device) buffer_list = None if worker_id == 0: # create padded recv buffers - buffer_list = [torch.zeros(max_size, dtype=torch.int64, device=tensor.device)] * num_workers + buffer_list = [torch.empty(max_size, dtype=tensor.dtype, device=tensor.device) for _ in range(num_workers)] dist.gather(buffer, buffer_list, dst=dst, group=comm_group) # revert padding and return value @@ -693,7 +709,7 @@ def sample_sort(tensor, comm_group, num_workers, n_samples=100): samples = tensor[idx[:-1]][:, 0].contiguous().to(device) #only first column, all but last row # 2 - Allgather samples - all_samples = [torch.zeros(n_samples, dtype=samples.dtype, device=device)] * num_workers + all_samples = [torch.zeros(n_samples, dtype=samples.dtype, device=device) for _ in range(num_workers)] dist.all_gather(all_samples, samples, group=comm_group) all_samples = torch.cat(all_samples, dim=0).to(device) @@ -729,14 +745,59 @@ def sample_sort(tensor, comm_group, num_workers, n_samples=100): return recv -if __name__ == "__main__": +def sanity_check(dataset): + """ given a dataset, compare file and memory based data analyser""" + + id = lambda t: torch.tensor(t).to(torch.int64) # identity + batch_sum = lambda t: id(t).sum() #sum batch + kwargs = dict( + dataset=dataset, + batch_size=3, + metric_names=["mod", "batch_sum"], + metric_functions=[id, batch_sum], + metric_types=['single_value_per_sample', 'accumulate_value_over_samples'], + ) + + dda = DistributedDataAnalyzer( + save_path="./output_dist", + worker_id=int(os.environ['RANK']), + num_workers=int(os.environ['WORLD_SIZE']), + device=f"cuda:{int(os.environ['LOCAL_RANK'])}", + **kwargs, + ) + start_time = time.time() + dda.run_map_reduce() + if dda.worker_id == 0: + print("DistributedDataAnalyzer runtime: %s seconds " % (time.time() - start_time)) + + da = DataAnalyzer(num_threads=2, + num_threads_reduce=2, + metric_dtypes=[torch.int64, torch.int64], + save_path="./output_disk", + **kwargs) + start_time = time.time() + da.run_map_reduce() + if da.worker_id == 0: + print("DataAnalyzer runtime: %s seconds " % (time.time() - start_time)) + + output_paths = [ + "batch_sum/batch_sum_metric_value.bin", "batch_sum/batch_sum_metric_value.idx", "mod/mod_index_to_metric.bin", + "mod/mod_index_to_metric.idx", "mod/mod_index_to_sample.bin", "mod/mod_index_to_sample.idx", + "mod/mod_index_to_sample_percentile_merged.bin", "mod/mod_index_to_sample_percentile_merged.idx", + "mod/mod_sample_to_metric.bin", "mod/mod_sample_to_metric.idx" + ] + + if dda.worker_id == 0: + for path in output_paths: + with open(os.path.join(da.save_path, path), 'rb') as f1, \ + open(os.path.join(dda.save_path, path), 'rb') as f2: + if f1.read() != f2.read(): + print(f"files {path} are not identical.") + - # from deepspeed.utils.groups import 
_get_data_parallel_group, _WORLD_GROUP - if not dist.is_initialized(): - dist.init_distributed() - # comm_group = dist.new_group(ranks=range(3)) +if __name__ == "__main__": - class Dataset(torch.utils.data.Dataset): + class DummyDataset(torch.utils.data.Dataset): def __init__(self, size=20): self.values = [1001 + x % 6 for x in range(size)] @@ -748,24 +809,4 @@ def __len__(self): def __getitem__(self, idx): return self.values[idx] - id = lambda t: torch.tensor(t).to(torch.int64) - kwargs = dict( - dataset=Dataset(), - batch_size=3, - metric_names=["mod", "sum"], - metric_functions=[id, id], - metric_types=['single_value_per_sample', 'accumulate_value_over_samples'], - ) - - DistributedDataAnalyzer( - comm_group=None, - save_path="./output_dist", - # device=f"cuda:{int(os.environ['LOCAL_RANK'])}", - **kwargs, - ).run_map_reduce() - - DataAnalyzer(num_threads=2, - num_threads_reduce=2, - metric_dtypes=[torch.int64, torch.int64], - save_path="./output_disk", - **kwargs).run_map_reduce() + sanity_check(DummyDataset()) From 8bf0e63528818a08955299f6946e2e9e09790291 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Mon, 19 Feb 2024 09:49:11 +0000 Subject: [PATCH 20/64] added missing static function --- .../data_sampling/data_analyzer.py | 67 +++++++++++-------- 1 file changed, 39 insertions(+), 28 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index 3eb736986b36..44f43a9eaee3 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -96,11 +96,10 @@ def update_metric_results(self, metric_type, metric_dtype, metric_function, metric_result = metric_types[m_idx], \ metric_dtypes[m_idx], metric_functions[m_idx], metric_results[m_idx] metric_values = metric_function(data) - assert torch.is_tensor(metric_values) or isinstance(metric_values, np.ndarray), \ - "metric_function must return a tensor or array" + "metric_function must return a tensor or array" assert metric_values.dtype == metric_dtype, \ - f"metric_function result dtype {metric_values.dtype} does not match metric_dtype {metric_dtype}" + f"metric_function result dtype {metric_values.dtype} does not match metric_dtype {metric_dtype}" if isinstance(metric_values, np.ndarray): metric_values = torch.from_numpy(metric_values) @@ -385,27 +384,12 @@ def merge_map_results(self, dataset, metric_names, metric_types, save_path, num_ index_to_metric_builder.merge_file_(chunk_im_fname) close_mmap_dataset_builder(index_to_sample_builder, index_to_sample_fname) close_mmap_dataset_builder(index_to_metric_builder, index_to_metric_fname) - num_sample_per_value = {} - index_to_sample = MMapIndexedDataset(index_to_sample_fname, skip_warmup=True) - index_to_metric = MMapIndexedDataset(index_to_metric_fname, skip_warmup=True) - index_to_sample_merged_fname = f"{metric_save_path}/{metric_name}_index_to_sample_percentile_merged" - index_to_sample_merged_builder = create_mmap_dataset_builder(index_to_sample_merged_fname, - sample_idx_dtype) - for v_idx in range(len(index_to_sample)): - if v_idx > 0: - assert index_to_metric[v_idx] > index_to_metric[v_idx - 1] - num_sample_per_value[index_to_metric[v_idx][0]] = len(index_to_sample[v_idx]) - assert sum(num_sample_per_value.values()) == total_num_samples - merge_step = max(1, len(index_to_sample) // 100) - for v_idx in range(0, len(index_to_sample), merge_step): - merged_samples = np.copy( - 
np.concatenate(index_to_sample[v_idx:min(len(index_to_sample), (v_idx + merge_step))], - axis=None)) - index_to_sample_merged_builder.add_item( - torch.tensor(merged_samples.astype(np.int64), dtype=torch.long)) - logger.info(f"Finished merging index_to_sample {v_idx} to {v_idx+merge_step}.") - close_mmap_dataset_builder(index_to_sample_merged_builder, index_to_sample_merged_fname) + + num_sample_per_value = DataAnalyzer.output_index_to_sample_percentile( + index_to_sample_fname, index_to_metric_fname, metric_name, metric_save_path, total_num_samples, + sample_idx_dtype) self.get_metric_value_percentiles(metric_name, num_sample_per_value, total_num_samples) + elif metric_type == 'accumulate_value_over_samples': metric_save_path = f"{save_path}/{metric_name}/" metric_value = None @@ -426,6 +410,29 @@ def merge_map_results(self, dataset, metric_names, metric_types, save_path, num_ metric_value_builder.add_item(torch.tensor(metric_value.astype(np.int64), dtype=torch.long)) close_mmap_dataset_builder(metric_value_builder, metric_value_fname) + @staticmethod + def output_index_to_sample_percentile(index_to_sample_fname, index_to_metric_fname, metric_name, metric_save_path, + total_num_samples, sample_idx_dtype): + """ read index_to_metric and index_to_sample files and write distribution to percentage_merged_file """ + num_sample_per_value = {} + index_to_sample = MMapIndexedDataset(index_to_sample_fname, skip_warmup=True) + index_to_metric = MMapIndexedDataset(index_to_metric_fname, skip_warmup=True) + index_to_sample_merged_fname = f"{metric_save_path}/{metric_name}_index_to_sample_percentile_merged" + index_to_sample_merged_builder = create_mmap_dataset_builder(index_to_sample_merged_fname, sample_idx_dtype) + for v_idx in range(len(index_to_sample)): + if v_idx > 0: + assert index_to_metric[v_idx] > index_to_metric[v_idx - 1] + num_sample_per_value[index_to_metric[v_idx][0]] = len(index_to_sample[v_idx]) + assert sum(list(num_sample_per_value.values())) == total_num_samples + merge_step = max(1, len(index_to_sample) // 100) + for v_idx in range(0, len(index_to_sample), merge_step): + merged_samples = np.copy( + np.concatenate(index_to_sample[v_idx:min(len(index_to_sample), (v_idx + merge_step))], axis=None)) + index_to_sample_merged_builder.add_item(torch.tensor(merged_samples.astype(np.int64), dtype=torch.long)) + logger.info(f"Finished merging index_to_sample {v_idx} to {v_idx+merge_step}.") + close_mmap_dataset_builder(index_to_sample_merged_builder, index_to_sample_merged_fname) + return num_sample_per_value + def run_reduce(self): if self.custom_reduce is None: self.merge_map_results(self.dataset, self.metric_names, self.metric_types, self.save_path, @@ -435,6 +442,10 @@ def run_reduce(self): self.num_threads, self.num_threads_reduce) def run_map_reduce(self, comm_group=None): + + if not dist.is_initialized(): + dist.init_distributed() + self.run_map() # wait for the mapping operation, where all nodes outputs their own (partial) result files dist.barrier(group=comm_group) @@ -603,7 +614,7 @@ def run_map_reduce(self): dist.barrier(self.comm_group) def file_write_ordered(self, tensor_list, fname, numpy_dtype): - """ save a distributed list of tensors to a file, by one rank, iteratively """ + """ MPI_file_write_ordered extended to write a list of tensors, by one rank, iteratively """ # each not has a list of rows (tensors) to be written to the file. # we will serialize it to communicate it in one comm step. 
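For intuition, the rank-ordered write that file_write_ordered builds out of padded gathers and point-to-point sends can be reduced to the minimal sketch below. This is an illustration only, with assumptions not made by the patch: it calls torch.distributed.gather_object directly (rather than deepspeed.comm with fixed-dtype buffers), expects an already-initialized gloo process group and CPU int64 row tensors, and writes raw bytes instead of using the mmap dataset builder; write_ordered_sketch is a hypothetical helper name.

import torch
import torch.distributed as dist

def write_ordered_sketch(rows, fname):
    # rows: this rank's list of 1-D int64 CPU tensors, one tensor per output row
    rank, world_size = dist.get_rank(), dist.get_world_size()
    gathered = [None] * world_size if rank == 0 else None
    dist.gather_object(rows, gathered, dst=0)  # collect every rank's row list on rank 0
    if rank == 0:
        with open(fname, "wb") as fout:
            for src in range(world_size):  # preserve rank order, as file_write_ordered does
                for row in gathered[src]:
                    fout.write(row.numpy().tobytes())
    dist.barrier()

One likely reason for the more involved protocol in the patch is that communicating fixed-dtype tensors avoids per-row pickling and lets the transfers use the same backend as the rest of the pipeline (e.g. NCCL on GPU-resident buffers).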
@@ -674,7 +685,7 @@ def min_max(tensor, comm_group): @staticmethod def gather_v(tensor, dst, comm_group, num_workers, worker_id): - """ Same as MPI_Gatherv. Gathers tensors of variable sizes in a single rank """ + """ MPI_Gatherv. gather tensors of variable sizes in a single rank """ # gather the number of rows to be sent/recv size = torch.tensor([len(tensor)], dtype=torch.int64, device=tensor.device) @@ -745,7 +756,7 @@ def sample_sort(tensor, comm_group, num_workers, n_samples=100): return recv -def sanity_check(dataset): +def test_compare_both_data_analyzers(dataset): """ given a dataset, compare file and memory based data analyser""" id = lambda t: torch.tensor(t).to(torch.int64) # identity @@ -797,7 +808,7 @@ def sanity_check(dataset): if __name__ == "__main__": - class DummyDataset(torch.utils.data.Dataset): + class TestDataset(torch.utils.data.Dataset): def __init__(self, size=20): self.values = [1001 + x % 6 for x in range(size)] @@ -809,4 +820,4 @@ def __len__(self): def __getitem__(self, idx): return self.values[idx] - sanity_check(DummyDataset()) + test_compare_both_data_analyzers(TestDataset()) From e5a7eb0fb341a5116420b3f404cd0674e6dec1e1 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Mon, 19 Feb 2024 09:50:55 +0000 Subject: [PATCH 21/64] removed/added breaklines to match base code --- deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index 44f43a9eaee3..435b1dc3600a 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -96,6 +96,7 @@ def update_metric_results(self, metric_type, metric_dtype, metric_function, metric_result = metric_types[m_idx], \ metric_dtypes[m_idx], metric_functions[m_idx], metric_results[m_idx] metric_values = metric_function(data) + assert torch.is_tensor(metric_values) or isinstance(metric_values, np.ndarray), \ "metric_function must return a tensor or array" assert metric_values.dtype == metric_dtype, \ @@ -389,7 +390,6 @@ def merge_map_results(self, dataset, metric_names, metric_types, save_path, num_ index_to_sample_fname, index_to_metric_fname, metric_name, metric_save_path, total_num_samples, sample_idx_dtype) self.get_metric_value_percentiles(metric_name, num_sample_per_value, total_num_samples) - elif metric_type == 'accumulate_value_over_samples': metric_save_path = f"{save_path}/{metric_name}/" metric_value = None From 3b8014fd0e7e022884fe43f04ce03e2070e33fae Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Mon, 19 Feb 2024 09:52:39 +0000 Subject: [PATCH 22/64] corrected comment --- deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index 435b1dc3600a..33c769012dab 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -413,7 +413,7 @@ def merge_map_results(self, dataset, metric_names, metric_types, save_path, num_ @staticmethod def output_index_to_sample_percentile(index_to_sample_fname, index_to_metric_fname, metric_name, metric_save_path, total_num_samples, sample_idx_dtype): - """ read index_to_metric and index_to_sample files and write 
distribution to percentage_merged_file """ + """ read index_to_metric and index_to_sample files and write distribution to index_to_sample_percentage_merged """ num_sample_per_value = {} index_to_sample = MMapIndexedDataset(index_to_sample_fname, skip_warmup=True) index_to_metric = MMapIndexedDataset(index_to_metric_fname, skip_warmup=True) From 5a4268799709a95cb74c7c49a3e7c59c328986f0 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Mon, 19 Feb 2024 10:01:01 +0000 Subject: [PATCH 23/64] imports --- .../data_pipeline/data_sampling/data_analyzer.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index 33c769012dab..8416c3259866 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -14,8 +14,8 @@ from deepspeed.utils import logger, groups import deepspeed.comm as dist -from .indexed_dataset import MMapIndexedDataset, valid_dtypes -from .utils import split_dataset, split_index, create_mmap_dataset_builder, close_mmap_dataset_builder, find_fit_int_dtype +from deepspeed.runtime.data_pipeline.data_sampling.indexed_dataset import MMapIndexedDataset, valid_dtypes +from deepspeed.runtime.data_pipeline.data_sampling.utils import split_dataset, split_index, create_mmap_dataset_builder, close_mmap_dataset_builder, find_fit_int_dtype class DataAnalyzer(object): @@ -792,9 +792,10 @@ def test_compare_both_data_analyzers(dataset): print("DataAnalyzer runtime: %s seconds " % (time.time() - start_time)) output_paths = [ - "batch_sum/batch_sum_metric_value.bin", "batch_sum/batch_sum_metric_value.idx", "mod/mod_index_to_metric.bin", - "mod/mod_index_to_metric.idx", "mod/mod_index_to_sample.bin", "mod/mod_index_to_sample.idx", - "mod/mod_index_to_sample_percentile_merged.bin", "mod/mod_index_to_sample_percentile_merged.idx", + "batch_sum/batch_sum_metric_value.bin", "batch_sum/batch_sum_metric_value.idx", \ + "mod/mod_index_to_metric.bin", "mod/mod_index_to_metric.idx", \ + "mod/mod_index_to_sample.bin", "mod/mod_index_to_sample.idx", \ + "mod/mod_index_to_sample_percentile_merged.bin", "mod/mod_index_to_sample_percentile_merged.idx", \ "mod/mod_sample_to_metric.bin", "mod/mod_sample_to_metric.idx" ] From cdaad3622b5c824de52d742e55a1375c4255fc7c Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Mon, 19 Feb 2024 10:50:09 +0000 Subject: [PATCH 24/64] removed main --- .../data_sampling/data_analyzer.py | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index 8416c3259866..60183b7b77fb 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -756,9 +756,25 @@ def sample_sort(tensor, comm_group, num_workers, n_samples=100): return recv -def test_compare_both_data_analyzers(dataset): +def text_both_data_analyzers(dataset=None): """ given a dataset, compare file and memory based data analyser""" + if dataset is None: + + class TestDataset(torch.utils.data.Dataset): + + def __init__(self, size=20): + self.values = [1001 + x % 6 for x in range(size)] + self.size = size + + def __len__(self): + return self.size + + def __getitem__(self, idx): + return self.values[idx] + + dataset = TestDataset() + id = lambda t: 
torch.tensor(t).to(torch.int64) # identity batch_sum = lambda t: id(t).sum() #sum batch kwargs = dict( @@ -807,18 +823,3 @@ def test_compare_both_data_analyzers(dataset): print(f"files {path} are not identical.") -if __name__ == "__main__": - - class TestDataset(torch.utils.data.Dataset): - - def __init__(self, size=20): - self.values = [1001 + x % 6 for x in range(size)] - self.size = size - - def __len__(self): - return self.size - - def __getitem__(self, idx): - return self.values[idx] - - test_compare_both_data_analyzers(TestDataset()) From b3d406206c73d60a9fd203ff67c0d62366edc7b0 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Mon, 19 Feb 2024 11:05:50 +0000 Subject: [PATCH 25/64] reverted main --- .../data_sampling/data_analyzer.py | 33 +++++++++---------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index 60183b7b77fb..8416c3259866 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -756,25 +756,9 @@ def sample_sort(tensor, comm_group, num_workers, n_samples=100): return recv -def text_both_data_analyzers(dataset=None): +def test_compare_both_data_analyzers(dataset): """ given a dataset, compare file and memory based data analyser""" - if dataset is None: - - class TestDataset(torch.utils.data.Dataset): - - def __init__(self, size=20): - self.values = [1001 + x % 6 for x in range(size)] - self.size = size - - def __len__(self): - return self.size - - def __getitem__(self, idx): - return self.values[idx] - - dataset = TestDataset() - id = lambda t: torch.tensor(t).to(torch.int64) # identity batch_sum = lambda t: id(t).sum() #sum batch kwargs = dict( @@ -823,3 +807,18 @@ def __getitem__(self, idx): print(f"files {path} are not identical.") +if __name__ == "__main__": + + class TestDataset(torch.utils.data.Dataset): + + def __init__(self, size=20): + self.values = [1001 + x % 6 for x in range(size)] + self.size = size + + def __len__(self): + return self.size + + def __getitem__(self, idx): + return self.values[idx] + + test_compare_both_data_analyzers(TestDataset()) From 7cabfa2ab48c0ca6fa6b975a2db580e7de00236c Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Mon, 19 Feb 2024 13:59:51 +0000 Subject: [PATCH 26/64] bug fix in sample calculation --- .../data_pipeline/data_sampling/data_analyzer.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index 8416c3259866..bae32bbc5ccf 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -179,7 +179,7 @@ def run_map_helper(self, thread_id): else: self.custom_map_update(data, self.metric_types, self.metric_dtypes, self.metric_functions, metric_results, batch_start_idx) - processed_sample += self.batch_size + processed_sample += len(data) duration = (time.time() - start) / 3600.0 remain_duration = duration * total_sample / processed_sample - duration logger.info( @@ -536,8 +536,7 @@ def run_map_reduce(self): for row in range(metric_values.size()[0]): value = metric_values[row].item() sample_idx = batch_start_idx + row # sample idx following dataset iteration order - if isinstance(data, - dict) and 'index' in data: # Megatron use case, idx provided in 
'index' field + if isinstance(data, dict) and 'index' in data: # Megatron use case sample_idx = data['index'][row][0].item() elif self.sample_indices is not None: # user defined shuffling of indices sample_idx = self.sample_indices[sample_idx] @@ -547,7 +546,7 @@ def run_map_reduce(self): metric_results[m_idx] = metric_values else: metric_results[m_idx].add_(metric_values) - batch_start_idx += self.batch_size + batch_start_idx += len(data) # compute dtype for sample ids total_num_samples = len(self.dataset) @@ -716,8 +715,8 @@ def sample_sort(tensor, comm_group, num_workers, n_samples=100): tensor = torch.tensor(sorted(tensor.tolist()), dtype=tensor.dtype, device=tensor.device) # 2 - collect few samples per rank - idx = torch.round(torch.linspace(0, len(tensor) - 1, n_samples + 1)).to(int) - samples = tensor[idx[:-1]][:, 0].contiguous().to(device) #only first column, all but last row + idx = torch.round(torch.linspace(0, len(tensor) - 1, n_samples)).to(int) + samples = tensor[idx][:, 0].contiguous().to(device) #only first column, all but last row # 2 - Allgather samples all_samples = [torch.zeros(n_samples, dtype=samples.dtype, device=device) for _ in range(num_workers)] From 62f68dd1f25595e5994a8dc65a166885eeb1e934 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Mon, 19 Feb 2024 14:05:05 +0000 Subject: [PATCH 27/64] added worker_an and num_worker to kwargs --- .../runtime/data_pipeline/data_sampling/data_analyzer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index bae32bbc5ccf..db2297fdd355 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -763,6 +763,8 @@ def test_compare_both_data_analyzers(dataset): kwargs = dict( dataset=dataset, batch_size=3, + worker_id=int(os.environ['RANK']), + num_workers=int(os.environ['WORLD_SIZE']), metric_names=["mod", "batch_sum"], metric_functions=[id, batch_sum], metric_types=['single_value_per_sample', 'accumulate_value_over_samples'], @@ -770,8 +772,6 @@ def test_compare_both_data_analyzers(dataset): dda = DistributedDataAnalyzer( save_path="./output_dist", - worker_id=int(os.environ['RANK']), - num_workers=int(os.environ['WORLD_SIZE']), device=f"cuda:{int(os.environ['LOCAL_RANK'])}", **kwargs, ) From 6d35e454b6d05975f7919a0a97fbccbf0e285601 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Mon, 19 Feb 2024 14:42:12 +0000 Subject: [PATCH 28/64] removed dist.initialize ()from DataAnalyzer.run_map_reduce --- .../runtime/data_pipeline/data_sampling/data_analyzer.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index db2297fdd355..5da11097319d 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -442,10 +442,6 @@ def run_reduce(self): self.num_threads, self.num_threads_reduce) def run_map_reduce(self, comm_group=None): - - if not dist.is_initialized(): - dist.init_distributed() - self.run_map() # wait for the mapping operation, where all nodes outputs their own (partial) result files dist.barrier(group=comm_group) From be91d37c9558eff0991b3021b049cbe395620806 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Tue, 20 Feb 2024 23:20:44 +0000 Subject: [PATCH 29/64] first 
iteration --- .../data_sampling/data_analyzer.py | 191 ++++++++++++++++-- 1 file changed, 172 insertions(+), 19 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index 5da11097319d..14e59b49dcb9 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -492,7 +492,7 @@ def __init__( self.worker_id = self.comm_group.rank() if self.worker_id == 0: - logger.info(f"Data analyzer initialized with {self.num_workers} workers.") + logger.info(f"Distributed data analyzer initialized with {self.num_workers} workers.") def run_map_reduce(self): @@ -751,7 +751,147 @@ def sample_sort(tensor, comm_group, num_workers, n_samples=100): return recv -def test_compare_both_data_analyzers(dataset): +class SerialDataAnalyzer(object): + + def __init__( + self, + dataset, + batch_size=1, + num_threads=4, #num_workrs in DataLoader + prefetch_factor=4, + metric_names=[], + metric_functions=[], + metric_types=[], + save_path="./", + collate_fn=None, + sample_indices=None, + ) -> None: + self.dataset = dataset + self.batch_size = batch_size + self.num_threads = num_threads + self.prefetch_factor = prefetch_factor + self.metric_names = metric_names + self.metric_functions = metric_functions + self.metric_types = metric_types + self.save_path = save_path + self.collate_fn = collate_fn + self.sample_indices = sample_indices + logger.info(f"Serial data analyzer initialized.") + + def run_map_reduce(self): + + dataloader = DataLoader(dataset=self.dataset, + num_workers=self.num_threads, + prefetch_factor=self.prefetch_factor, + collate_fn=self.collate_fn, + pin_memory=False) + + # set initial results list + metric_results = [] + for metric_type in self.metric_types: + assert metric_type in ['single_value_per_sample', 'accumulate_value_over_samples'], \ + f"metric_type {metric_type} not implemented." + if metric_type == 'single_value_per_sample': + metric_results.append({'values_list': [], 'metric_to_samples_dict': {}}) + else: + metric_results.append(None) + + # update results list + processed_samples = 0 + for data in dataloader: + for m_idx in range(len(self.metric_names)): + metric_type, metric_function = self.metric_types[m_idx], self.metric_functions[m_idx] + metric_values = metric_function(data) + assert torch.is_tensor(metric_values) or isinstance(metric_values, np.ndarray), \ + "metric_function must return a tensor or array" + if isinstance(metric_values, np.ndarray): + metric_values = torch.from_numpy(metric_values) + assert metric_values.dtype in valid_dtypes, \ + f"metric_function result dtype {metric_values.dtype} not supported. 
Supported dtypes {valid_dtypes}" + + if metric_type == 'single_value_per_sample': + metric_to_samples_dict = metric_results[m_idx]['metric_to_samples_dict'] + values = metric_results[m_idx]['values_list'] + for row in range(metric_values.size()[0]): + value = metric_values[row].item() + sample_idx = processed_samples + row # sample idx following dataset iteration order + if isinstance(data, dict) and 'index' in data: # Megatron use case + sample_idx = data['index'][row][0].item() + elif self.sample_indices is not None: # user defined shuffling of indices + sample_idx = self.sample_indices[sample_idx] + if value not in metric_to_samples_dict: + metric_to_samples_dict[value] = [] + metric_to_samples_dict[value].append(sample_idx) + values.append(value) + elif metric_type == 'accumulate_value_over_samples': + if metric_results[m_idx] is None: + metric_results[m_idx] = metric_values + else: + metric_results[m_idx].add_(metric_values) + processed_samples += len(data) + + # convert lists to arrays to same memory + # compute dtype for sample ids + total_num_samples = len(self.dataset) + sample_idx_dtype = find_fit_int_dtype(0, total_num_samples - 1) + logger.info(f"Total number of data samples: {total_num_samples}.") + logger.info(f"Will use {sample_idx_dtype} to store the sample indexes.") + + for m_idx in range(len(self.metric_names)): + metric_values, metric_name, metric_type = \ + metric_results[m_idx], self.metric_names[m_idx], self.metric_types[m_idx] + metric_save_path = f"{self.save_path}/{metric_name}/" + os.makedirs(metric_save_path, exist_ok=True) + + if metric_type == 'single_value_per_sample': + + # Compute metric value dtypes based on range + values = metric_results[m_idx]['values_list'] + metric_value_dtype = find_fit_int_dtype(min(values), max(values)) + + # sample_to_metric maps sample ids to metric values, as a list of metric values + sample_to_metric_fname = f"{metric_save_path}/{metric_name}_sample_to_metric" + values = [torch.tensor([x], device='cpu') for x in values] + self.file_write(values, sample_to_metric_fname, metric_value_dtype) + + # Compute sample dtypes based on range + metric_to_samples_dict = metric_values['metric_to_samples_dict'] + values, samples = metric_to_samples_dict.keys(), metric_to_samples_dict.values() + sample_value_dtype = find_fit_int_dtype(min([min(x) for x in samples]), max([max(x) for x in samples])) + + # index_to_metric and index_to_sample serialize a dicitonary from metric to samples + # index_to_metric stores a key per row, index_to_sample stores the values per row + values = [torch.tensor([x]) for x in metric_to_samples_dict.keys()] + samples = [torch.tensor(x) for x in metric_to_samples_dict.values()] + index_to_metric_fname = f"{metric_save_path}/{metric_name}_index_to_metric" #dict keys + index_to_sample_fname = f"{metric_save_path}/{metric_name}_index_to_sample" #dict values + self.file_write(values, index_to_metric_fname, metric_value_dtype) + self.file_write(samples, index_to_sample_fname, sample_value_dtype) + + DataAnalyzer.output_index_to_sample_percentile(index_to_sample_fname, index_to_metric_fname, + metric_name, metric_save_path, total_num_samples, + sample_idx_dtype) + + elif metric_type == 'accumulate_value_over_samples': + metric_value_fname = f"{metric_save_path}/{metric_name}_metric_value" + metric_value_dtype = find_fit_int_dtype(metric_values.min(), metric_values.max()) + self.file_write([metric_values], metric_value_fname, metric_value_dtype) + + def file_write(self, tensor_list, fname, numpy_dtype): + """ write a list 
of tensors to a file """ + + # prepares output folder and file + os.makedirs(os.path.dirname(fname), exist_ok=True) + builder = create_mmap_dataset_builder(fname, numpy_dtype) + + # iterate through tensors and write them + for tensor in tensor_list: + builder.add_item(tensor) + + close_mmap_dataset_builder(builder, fname) # close file + + +def test_compare_data_analyzers(dataset, num_threads=8): """ given a dataset, compare file and memory based data analyser""" id = lambda t: torch.tensor(t).to(torch.int64) # identity @@ -759,31 +899,42 @@ def test_compare_both_data_analyzers(dataset): kwargs = dict( dataset=dataset, batch_size=3, - worker_id=int(os.environ['RANK']), - num_workers=int(os.environ['WORLD_SIZE']), metric_names=["mod", "batch_sum"], metric_functions=[id, batch_sum], metric_types=['single_value_per_sample', 'accumulate_value_over_samples'], ) + worker_id = int(os.environ['RANK']) + num_workers = int(os.environ['WORLD_SIZE']) + + # run Serial Data Analyzer (with on single CPU-memory storage of map-reduce) + start_time = time.time() + if worker_id == 0: + sda = SerialDataAnalyzer(save_path="./output_sda", num_threads=num_threads, **kwargs) + sda.run_map_reduce() + print("SerialDataAnalyzer runtime: %s seconds " % (time.time() - start_time)) + # run Distributed Data Analyzer (with distributed CUDA-memory storage of map-reduce) + start_time = time.time() dda = DistributedDataAnalyzer( - save_path="./output_dist", + save_path="./output_dda", device=f"cuda:{int(os.environ['LOCAL_RANK'])}", - **kwargs, + **kwargs | dict(worker_id=worker_id, num_workers=num_workers), ) - start_time = time.time() dda.run_map_reduce() - if dda.worker_id == 0: + if worker_id == 0: print("DistributedDataAnalyzer runtime: %s seconds " % (time.time() - start_time)) - da = DataAnalyzer(num_threads=2, - num_threads_reduce=2, - metric_dtypes=[torch.int64, torch.int64], - save_path="./output_disk", - **kwargs) + # run regular Data Analyzer (with shared disk storage of map-reduce) start_time = time.time() + da = DataAnalyzer( + num_threads=num_threads, + num_threads_reduce=num_threads, + metric_dtypes=[torch.int64, torch.int64], + save_path="./output_da", + **kwargs | dict(worker_id=worker_id, num_workers=num_workers), + ) da.run_map_reduce() - if da.worker_id == 0: + if worker_id == 0: print("DataAnalyzer runtime: %s seconds " % (time.time() - start_time)) output_paths = [ @@ -794,11 +945,13 @@ def test_compare_both_data_analyzers(dataset): "mod/mod_sample_to_metric.bin", "mod/mod_sample_to_metric.idx" ] - if dda.worker_id == 0: + if worker_id == 0: for path in output_paths: with open(os.path.join(da.save_path, path), 'rb') as f1, \ - open(os.path.join(dda.save_path, path), 'rb') as f2: - if f1.read() != f2.read(): + open(os.path.join(dda.save_path, path), 'rb') as f2, \ + open(os.path.join(sda.save_path, path), 'rb') as f3: + f1c, f2c, f3c = f1.read(), f2.read(), f3.read() + if f1c != f2c or f2c != f3c: print(f"files {path} are not identical.") @@ -806,7 +959,7 @@ def test_compare_both_data_analyzers(dataset): class TestDataset(torch.utils.data.Dataset): - def __init__(self, size=20): + def __init__(self, size=200000): self.values = [1001 + x % 6 for x in range(size)] self.size = size @@ -816,4 +969,4 @@ def __len__(self): def __getitem__(self, idx): return self.values[idx] - test_compare_both_data_analyzers(TestDataset()) + test_compare_data_analyzers(TestDataset()) From 5fd054686ce2e178158d3d05ece4094665c552e0 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Wed, 21 Feb 2024 12:20:00 +0000 Subject: 
[PATCH 30/64] updated with add_items --- .../data_sampling/data_analyzer.py | 50 ++++++++++--------- .../data_sampling/indexed_dataset.py | 10 +++- 2 files changed, 36 insertions(+), 24 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index 14e59b49dcb9..7f0de003a916 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -12,7 +12,7 @@ import torch from torch.utils.data import BatchSampler, SequentialSampler, DataLoader, Subset -from deepspeed.utils import logger, groups +from deepspeed.utils import logger import deepspeed.comm as dist from deepspeed.runtime.data_pipeline.data_sampling.indexed_dataset import MMapIndexedDataset, valid_dtypes from deepspeed.runtime.data_pipeline.data_sampling.utils import split_dataset, split_index, create_mmap_dataset_builder, close_mmap_dataset_builder, find_fit_int_dtype @@ -482,12 +482,12 @@ def __init__( dist.init_distributed() # comm_group and worker_id+num_workers are mutually exclusive - if comm_group is not None: - self.comm_group = comm_group - self.num_workers = self.comm_group.size() - self.worker_id = self.comm_group.rank() + self.comm_group = comm_group + if self.comm_group is None: + # self.comm_group = deepspeed.utils.groups._clone_world_group() + self.num_workers = num_workers + self.worker_id = worker_id else: - self.comm_group = groups._clone_world_group() self.num_workers = self.comm_group.size() self.worker_id = self.comm_group.rank() @@ -601,7 +601,6 @@ def run_map_reduce(self): metric_value_fname = f"{metric_save_path}/{metric_name}_metric_value" dist.reduce(metric_values, dst=0, op=dist.ReduceOp.SUM, group=self.comm_group) metric_value_dtype = find_fit_int_dtype(metric_values.min(), metric_values.max()) - if self.worker_id == 0: builder = create_mmap_dataset_builder(metric_value_fname, metric_value_dtype) builder.add_item(metric_values.cpu()) @@ -635,9 +634,18 @@ def file_write_ordered(self, tensor_list, fname, numpy_dtype): # method to deserializes a buffer into rows of different lengths and write them to file def write_buffer_to_file(buff, src, builder): assert self.worker_id == 0, "only rank 0 can write to file" + + # # write one buffer at a time + # for row_len in row_lens[src]: + # builder.add_item(buff[:row_len].cpu()) + # buff = buff[row_len:] + + # collect all buffers and write them all at once + buffer_list = [] for row_len in row_lens[src]: - builder.add_item(buff[:row_len].cpu()) + buffer_list.append(buff[:row_len].cpu()) buff = buff[row_len:] + builder.add_items(buffer_list) # 5. 
rank 0 prepares output folder and file if self.worker_id == 0: @@ -757,8 +765,7 @@ def __init__( self, dataset, batch_size=1, - num_threads=4, #num_workrs in DataLoader - prefetch_factor=4, + num_threads=4, metric_names=[], metric_functions=[], metric_types=[], @@ -769,7 +776,6 @@ def __init__( self.dataset = dataset self.batch_size = batch_size self.num_threads = num_threads - self.prefetch_factor = prefetch_factor self.metric_names = metric_names self.metric_functions = metric_functions self.metric_types = metric_types @@ -782,7 +788,6 @@ def run_map_reduce(self): dataloader = DataLoader(dataset=self.dataset, num_workers=self.num_threads, - prefetch_factor=self.prefetch_factor, collate_fn=self.collate_fn, pin_memory=False) @@ -883,22 +888,18 @@ def file_write(self, tensor_list, fname, numpy_dtype): # prepares output folder and file os.makedirs(os.path.dirname(fname), exist_ok=True) builder = create_mmap_dataset_builder(fname, numpy_dtype) - - # iterate through tensors and write them - for tensor in tensor_list: - builder.add_item(tensor) - + builder.add_items(tensor_list) close_mmap_dataset_builder(builder, fname) # close file -def test_compare_data_analyzers(dataset, num_threads=8): +def test_compare_data_analyzers(dataset, num_threads=16): """ given a dataset, compare file and memory based data analyser""" id = lambda t: torch.tensor(t).to(torch.int64) # identity batch_sum = lambda t: id(t).sum() #sum batch kwargs = dict( dataset=dataset, - batch_size=3, + batch_size=2**10, metric_names=["mod", "batch_sum"], metric_functions=[id, batch_sum], metric_types=['single_value_per_sample', 'accumulate_value_over_samples'], @@ -937,6 +938,7 @@ def test_compare_data_analyzers(dataset, num_threads=8): if worker_id == 0: print("DataAnalyzer runtime: %s seconds " % (time.time() - start_time)) + # check that all output files match output_paths = [ "batch_sum/batch_sum_metric_value.bin", "batch_sum/batch_sum_metric_value.idx", \ "mod/mod_index_to_metric.bin", "mod/mod_index_to_metric.idx", \ @@ -951,16 +953,18 @@ def test_compare_data_analyzers(dataset, num_threads=8): open(os.path.join(dda.save_path, path), 'rb') as f2, \ open(os.path.join(sda.save_path, path), 'rb') as f3: f1c, f2c, f3c = f1.read(), f2.read(), f3.read() - if f1c != f2c or f2c != f3c: - print(f"files {path} are not identical.") + if f1c != f2c: + print(f"DataAnalyzer and DistributedDataAnalyzer {path} are not identical.") + if f2c != f3c: + print(f"DistributedDataAnalyzer and SerialDataAnalyzer {path} are not identical.") if __name__ == "__main__": class TestDataset(torch.utils.data.Dataset): - def __init__(self, size=200000): - self.values = [1001 + x % 6 for x in range(size)] + def __init__(self, size=20000): + self.values = [1001 + x % 37 for x in range(size)] self.size = size def __len__(self): diff --git a/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py b/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py index 7a6963bc27eb..b300ff4aab89 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py @@ -98,7 +98,7 @@ def write_longs(f, a): f.write(np.array(a, dtype=np.int64)) -# valid metric_dtypes as numpy and torch types +# valid dtypes for metric_values and their corresponding numpy/torch types dtypes = { 1: (np.uint8, torch.uint8), 2: (np.int8, torch.int8), @@ -581,10 +581,18 @@ def __init__(self, out_file, dtype=np.int64): self._doc_idx = [0] def add_item(self, tensor): + """ write the tensor to the file and 
update its size in the index""" np_array = np.array(tensor.numpy(), dtype=self._dtype) self._data_file.write(np_array.tobytes(order='C')) self._sizes.append(np_array.size) + def add_items(self, tensor_list): + """ write a list of tensors to the file and update their sizes in the index""" + np_arrays = [np.array(t.numpy(), dtype=self._dtype) for t in tensor_list] + self._data_file.writelines([arr.tobytes(order='C') for arr in np_arrays]) + for arr in np_arrays: + self._sizes.append(arr.size) + def add_item_numpy(self, np_array): if np_array.dtype != self._dtype: np_array = np_array.astype(self._dtype) From 4f23873aeda2b35dd9a0df60293c6243a3480170 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Thu, 7 Mar 2024 13:01:20 +0000 Subject: [PATCH 31/64] first iteration, testing --- .../variable_batch_size_and_lr.py | 426 ++++++++++++++++++ 1 file changed, 426 insertions(+) create mode 100644 deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py new file mode 100644 index 000000000000..cde883546a16 --- /dev/null +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py @@ -0,0 +1,426 @@ +import random +import torch +from deepspeed.utils import logger +from torch.utils.data import DistributedSampler +from torch.optim.lr_scheduler import LRScheduler +from torch.utils.data import DataLoader +import deepspeed + + +# see https://github.com/facebookresearch/fairseq/blob/b5a039c292facba9c73f59ff34621ec131d82341/fairseq/data/data_utils.py#L282 +# see how to set new batch size here: +# https://github.com/microsoft/DeepSpeed/issues/2798#issuecomment-1435475061 +# engine.set_train_micro_batch_size and set_train_batch_size (only changes grad acc steps) in +# https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/runtime/engine.py +# TODO we need same batch size per GPU per grad step! + + +def batch_by_size( + metric_values, + max_metric_value_per_batch, + sample_ids=None, + min_batch_size=1, + max_batch_size=None, + shuffle_metric_values=False, + order_by_metric_value=False, + batch_size_multiple=1, + required_microbatches_of_same_size=False, + ): + + """ + Yield mini-batches of indices bucketed by size. Batches may contain sequences of different lengths. + Similar to "Attention is all you need", Section 5.1: + "Sentence pairs were batched together by approximate sequence length. Each training batch + contained a set of sentence pairs containing approximately X source tokens and X target tokens" + + Arguments: + - `metric_values`: a list of difficulties (metric values) for every sample in the dataset; + - `max_metric_value_per_batch`: upper cap in total difficulty in a batch; + - `sample_ids`: user-defined ids of the samples in metric_values. If not provided, + automatically assigns a sequential order; + - `min_batch_size`: smallest allowed size of a batch; + - `min_batch_size`: largest allowed size of a batch; + - `shuffle_metric_values`: shuffle metric values before packing samples into batches; + - `order_by_metric_value`: order samples by ascending metric values before packing into batches; + - `batch_size_multiple`: total batch count should divide the final number of batches, with + remaining batches being dropped. 
+ Useful for data parallelism (where `batch_size_multiple`=`num_data_loaders`) and gradient + accumulation (where `batch_size_multiple`=`num_data_loaders`*`gradient_accumulation_steps`). + - `required_microbatches_of_same_size`: enable if each mini-batch (in a total of `batch_size_multiple` + micro-batches per batch), should have all micro-batches with the same batch size. + Required for pipeline parallelism (as activation shapes is uniform across mini-batches), or + in regular data parallelism if we want the same number of samples per accumulation step. + + Returns a list of the ids of each micro-batch and a list of effective batch sizes. + """ + + assert not shuffle_metric_values or not order_by_metric_value, \ + "either sort_metric_values or shuffle_metric_values can be True, not both." + + sample_ids = sample_ids or list(range(len(metric_values))) + metrics = list(zip(metric_values, sample_ids)) + + if shuffle_metric_values: + random.shuffle(metrics) + if order_by_metric_value: + metrics = sorted(metrics) + + # go through metrics and warn user and filter samples that alone exceed the max batch threshold + long_ids = [ idx for val, idx in metrics if val>max_metric_value_per_batch ] + if len(long_ids)>0: + logger.warning(f"Data indices {long_ids} ignored as metrics exceed {max_metric_value_per_batch}.") + logger.info(f"Original dataset length: {len(metrics)}. New dataset length: {len(long_ids)}") + metrics = [ m for m in metrics if m[1] not in long_ids ] + + def is_microbatch_valid(metrics): + if len(metrics) < min_batch_size: return False # insufficient sample count + if max_batch_size and len(metrics)>max_batch_size: return False # too many samples + if sum([m[0] for m in metrics]) > max_metric_value_per_batch: return False # exceeds max + return True + + # go through all samples and pack then in microbatches of metric sums below the threshold + # `required_microbatches_of_same_size` means all minibatches in a batch must be of equal size + equal_size_multiple = batch_size_multiple if required_microbatches_of_same_size else 1 + microbatches = [] + batch_init = 0 + while batch_init < len(metrics): + + # we iterate over possible effective batch sizes (groups of microbatches of same size) + for batch_size in range(equal_size_multiple, len(metrics), equal_size_multiple): + + # attempt effective batch + batch = metrics[batch_init:batch_init+batch_size] + + # pick interleaved samples for each microbatch to help with load balancing + # (in the ordered use case), and to replicate what the distributed sampler does. + microbatch = [ batch[b::equal_size_multiple] for b in range(equal_size_multiple) ] + + # if they are all valid micro-batches, keep them until you find longer mbatches, if any + is_batch_valid = all([is_microbatch_valid(mb) for mb in microbatch] ) + if not is_batch_valid: + break + + if not is_batch_valid: batch_size -= equal_size_multiple #ignore last iteration (not valid) + batch = metrics[batch_init:batch_init+batch_size] + microbatch = [ batch[b::equal_size_multiple] for b in range(equal_size_multiple) ] + batch_init += sum( [ len(l) for l in microbatch ] ) + microbatches += microbatch + + # make sure we give the same number of batches to each dataloader by trimming the dataset + microbatches = microbatches[:len(microbatches) - len(microbatches) % batch_size_multiple] + + #compute the effective batch size for each microbatch. 
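To make the greedy packing idea above easier to follow outside of diff form, here is a minimal standalone sketch of the same principle, placed before the effective-batch-size bookkeeping that follows: keep adding sample ids to a micro-batch until the next sample would push the summed metric (e.g. sequence length) over the cap, then start a new micro-batch. This is an illustration of the intent, not the DeepSpeed implementation; the helper name `greedy_batch_by_metric` and the toy values are assumptions.

def greedy_batch_by_metric(metric_values, max_metric_value_per_batch, min_batch_size=1):
    """Pack sample ids into micro-batches whose summed metric stays under the cap."""
    microbatches, current, current_sum = [], [], 0
    for sample_id, value in enumerate(metric_values):
        if value > max_metric_value_per_batch:
            continue  # a sample that alone exceeds the cap is skipped, as in the filter above
        if current and current_sum + value > max_metric_value_per_batch:
            if len(current) >= min_batch_size:
                microbatches.append(current)
            current, current_sum = [], 0
        current.append(sample_id)
        current_sum += value
    if len(current) >= min_batch_size:
        microbatches.append(current)
    return microbatches

# example: sequence lengths as the difficulty metric, cap of 10 "tokens" per micro-batch
print(greedy_batch_by_metric([4, 3, 5, 2, 8, 1], max_metric_value_per_batch=10))
# -> [[0, 1], [2, 3], [4, 5]]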
+ effective_batch_sizes, sample_ids = [], [] + for rank in range(0, len(microbatches), batch_size_multiple): + microbatch = microbatches[rank: rank+batch_size_multiple] + batch_size = sum([len(mb) for mb in microbatch]) + effective_batch_sizes += [batch_size]*len(microbatch) + sample_ids += [ [m[1] for m in metrics] for metrics in microbatch] + + # return the sample ids of each microbatch, and their effective batch size + assert len(effective_batch_sizes) == len(sample_ids) + return sample_ids, effective_batch_sizes + + +def scale_lr(effective_batch_size, batch_size, base_lr=1, method="linear"): + """ given a reference lr and batch_size, compute the new LR for a given batch size """ + if method == "linear": + # Linear Scaling Rule: "When the minibatch size is multiplied by k, multiply the learning, + # rate by k" (Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour, Goyal et al) + return base_lr * batch_size / effective_batch_size + if method == "sqrt": + # Square Root scaling: "when multiplying the batch size by k, multiply the learning rate + # by √k, to keep the variance in the gradient expectation constant" + # (A. Krizhevsky. One weird trick for parallelizing convolutional neural networks) + return base_lr * torch.sqrt(batch_size / effective_batch_size) + raise ValueError("Unknown scaling method: {}".format(method)) + + +def dataloader_for_variable_batch_size(dataset, + microbatch_sample_ids, dataloader_rank, dataloader_num_replicas, dataloader_collate_fn, + dataloader_num_workers=2, dataloader_pin_memory=False, deepspeed_engine=None): + + # equidistantly distribute the microbatches across the replicas in an interleaved fashion. + sampler = DistributedSampler( + dataset=microbatch_sample_ids, + num_replicas=dataloader_num_replicas, + rank=dataloader_rank, + shuffle=False, + drop_last=False, + ) + + # collate function applies wraps user defined collate function to the variable batch data + def collate_fn_wrapper(batch_sample_ids, dataset, collate_fn=None): + # batch is a list of sample ids per microbatch + assert len(batch_sample_ids)==1, "only 1 element should be returned by the sampler." + batch_data = [dataset[idx] for idx in batch_sample_ids[0]] + return collate_fn(batch_data) if collate_fn else batch_data + + collate_fn = lambda b: collate_fn_wrapper(b, dataset, dataloader_collate_fn) + if deepspeed_engine is None: + return DataLoader( + dataset=microbatch_sample_ids, + sampler=sampler, + num_workers = dataloader_num_workers, + collate_fn = collate_fn, + pin_memory=dataloader_pin_memory, + ) + else: + deepspeed_engine.deepspeed_io(dataset, + batch_size=1, + pin_memory=dataloader_pin_memory, + data_sampler=sampler, + collate_fn=collate_fn, + num_local_io_workers=dataloader_num_workers) + + + +def lr_scheduler_for_variable_batch_size( + effective_batch_size, batch_size_per_epoch_fn, lr_scaling_method='linear', + lr_scheduler_class=LRScheduler, **lr_scheduler_kwargs): + """ + returns a class that inherits from `lr_scheduler_class` and provides a scaled + learning rate for batches of different sizes. + + Arguments: + - `effective_batch_size`: the batch size that the base_LR refers to; + - `lr_scaling_method`: method to use to scale LR - see `scale_lr()`; + - `batch_size_per_epoch_fn`: a function that returns the batch size for a given epoch; + - `lr_scheduler_class`: the class to inherit from (default: `LRScheduler`). It not provided, + will use the constant LR `optimizer.lr` as the LR value instead ; + returns: + - the class that inherits from `lr_scheduler_class`. 
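The dataloader construction above hinges on a trick that is easier to see in isolation: the `DataLoader` is built over the list of micro-batch sample-id lists (one element per micro-batch, `batch_size=1`), and the wrapped collate function maps those ids back to real samples of the user dataset. A hedged, single-replica sketch with toy data and without the `DistributedSampler`:

import torch
from torch.utils.data import DataLoader

dataset = [torch.full((n,), float(n)) for n in (4, 3, 5, 2, 8, 1)]  # toy variable-length samples
microbatch_sample_ids = [[0, 1], [2, 3], [4, 5]]  # e.g. the output of the packing step

def collate_ids_to_samples(batch_of_id_lists):
    assert len(batch_of_id_lists) == 1, "batch_size=1 yields exactly one micro-batch per iteration"
    samples = [dataset[i] for i in batch_of_id_lists[0]]
    return torch.nn.utils.rnn.pad_sequence(samples, batch_first=True)

loader = DataLoader(microbatch_sample_ids, batch_size=1, collate_fn=collate_ids_to_samples)
for microbatch in loader:
    print(microbatch.shape)  # one padded micro-batch per iteration, with a variable batch size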
+ """ + assert issubclass(lr_scheduler_class, LRScheduler), \ + "lr_scheduler should be a subclass of LRScheduler" + + class VariableBatchSizeLR(lr_scheduler_class): + + def __init__(self, **lr_scheduler_kwargs): + super().__init__(**lr_scheduler_kwargs) + self.batch_size_per_epoch_fn = batch_size_per_epoch_fn + self.effective_batch_size = effective_batch_size + self.lr_scaling_method = lr_scaling_method + self.unscaled_lrs = self.get_last_lr()[:] # first epoch LRs, cloned + + def state_dict(self): + return {'base': super().state_dict(), + 'effective_batch_size': self.effective_batch_size, + 'lr_scaling_method': self.lr_scaling_method, + 'unscaled_lrs': self.unscaled_lrs, + } + + + def load_state_dict(self, state_dict): + super().load_state_dict(state_dict['base']) + self.effective_batch_size = state_dict['effective_batch_size'] + self.lr_scaling_method = state_dict['lr_scaling_method'] + self.unscaled_lrs = state_dict['unscaled_lrs'] + + + def step(self, epoch=None): + + # call the base scheduler's step method to get LR for next epoch + # note: optimizer.step preceeds lr_scheduler.step(), so the stepping workflow is: + # init: lr_scheduler.step(0) --> set LR for epoch 0 + # epoch 0: optimizer.step(); lr_scheduler.step(1) --> set LR for epoch 1 + # epoch 1: optimizer.step(); lr_scheduler.step(2) --> set LR for epoch 2 + + if lr_scheduler_class!=LRScheduler: #use LR scheduler + + # reset unscaled LRs (to the original scheduler's one) for the current epoch + for param_group, lr in zip(self.optimizer.param_groups, self.unscaled_lrs): + param_group['lr'] = lr + + super().step(epoch) # set lr, _step_count and last_epoch (for next epoch), _last_lr + self.unscaled_lrs = self.get_last_lr()[:] # backup next epoch LRs, cloned + + else: + + # replicate step(): set LR (constant), _step_count, last_epoch and _last_lr + for param_group, lr in zip(self.optimizer.param_groups, self.base_lrs): + param_group['lr'] = lr + + self._step_count += 1 + self.last_epoch = self.last_epoch+1 if epoch is None else epoch + self._last_lr = [lr]*len(self.optimizer.param_groups) + + # scale the learning rate for next epoch for each parameter group + batch_size = self.batch_size_per_epoch_fn(self.last_epoch) + lr_multiplier = scale_lr(self.effective_batch_size, batch_size, lr_scaling_method=lr_scaling_method) + for param_group in self.optimizer.param_groups: + param_group['lr'] *= lr_multiplier + + return VariableBatchSizeLR(**lr_scheduler_kwargs) + + +def get_dataloader_and_lr_scheduler_for_variable_batch_size( + dataset, + dataset_metric_values, + max_metric_value_per_batch, + sample_ids=None, + lr_scaling_method="linear", + min_batch_size=1, + max_batch_size=None, + shuffle_metric_values=False, + order_by_metric_value=False, + gradient_accumulation_steps=1, + pipeline_parallelism=False, + dataloader_rank=0, + dataloader_num_replicas=1, + dataloader_num_workers=0, + dataloader_collate_fn=None, + dataloader_pin_memory=False, + lr_scheduler_class=None, + lr_scheduler_kwargs={}, + deepspeed_engine=None, +): + # pipelining in DeepSpeed takes the first micro-batch activation shape as reference. + # So we need to make sure batch size remains contant across all microbatches in a batch. + required_microbatches_of_same_size = pipeline_parallelism + effective_batch_size = dataloader_num_replicas*gradient_accumulation_steps + + # batch_by_size returns the effective batch size and the sample ids for each microbatch. 
+ # We will use the sample ids to retrieve the batches from the dataset, and + # the effective batch size to retrive the scaled learning rate for each batch + microbatch_sample_ids, microbatch_batch_sizes = batch_by_size( + metric_values=dataset_metric_values, + max_metric_value_per_batch=max_metric_value_per_batch, + sample_ids=sample_ids, + min_batch_size=min_batch_size, + max_batch_size=max_batch_size, + shuffle_metric_values=shuffle_metric_values, + order_by_metric_value=order_by_metric_value, + batch_size_multiple=effective_batch_size, + required_microbatches_of_same_size=required_microbatches_of_same_size, + ) + + dataloader = dataloader_for_variable_batch_size( + dataset=dataset, + microbatch_sample_ids=microbatch_sample_ids, + dataloader_rank=dataloader_rank, + dataloader_num_replicas=dataloader_num_replicas, + dataloader_collate_fn=dataloader_collate_fn, + dataloader_num_workers=dataloader_num_workers, + dataloader_pin_memory=dataloader_pin_memory, + deepspeed_engine=deepspeed_engine, + ) + + if lr_scheduler_class is None: + return dataloader + + lr_scheduler = lr_scheduler_for_variable_batch_size( + effective_batch_size=effective_batch_size, + batch_size_per_epoch_fn=lambda epoch: microbatch_batch_sizes[epoch], + lr_scaling_method=lr_scaling_method, + lr_scheduler_class=lr_scheduler_class, **lr_scheduler_kwargs) + + return dataloader, lr_scheduler + + +if __name__ == "__main__": + + # A small example/test on how to use this module + + from torch.utils.data import Dataset + class TestData(Dataset): + """ A test dataset with sequences of random length, and their sum as the target""" + def __init__(self, seq_count, min_seq_len=1, max_seq_len=21): + self.seqs = [ torch.ones(random.randrange(min_seq_len,max_seq_len)) for _ in range(seq_count) ] + + __len__ = lambda self: len(self.seqs) + __getitem__ = lambda self, idx: [self.seqs[idx], self.seqs[idx].sum()] + + # collate_fn merges sequences, padded to the max length, or trimmed/paded to a value + @staticmethod + def collate_fn(batch, max_seq_len=None): + # if max_seq_len in enforces, trim/pad them to the max_len specified + if max_seq_len is not None: + for i, (seq, _) in enumerate(batch): + batch[i][0] = torch.nn.ConstantPad1d((0, max_seq_len - seq.shape[0]), 0)(seq) + seqs, labels = zip(*batch) + padded = torch.nn.utils.rnn.pad_sequence(seqs, batch_first=True, padding_value=0) + labels = torch.tensor(labels) + return padded, labels + + + import torch.nn as nn + import torch.nn.functional as F + class TestFeedForward(nn.Module): + + def __init__(self): + super(TestFeedForward, self).__init__() + # an affine operation: y = Wx + b + self.fc1 = nn.Linear(max_seq_len, 128) + self.fc2 = nn.Linear(128, 128) + + def forward(self, x): + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + return x.sum(dim=1) + + + max_seq_len=20 + dataset = TestData(seq_count=30, min_seq_len=5, max_seq_len=max_seq_len) + max_metric_value_per_batch=50 + dataloader_num_workers=2 + gradient_accumulation_steps=2 + effective_batch_size=dataloader_num_workers*gradient_accumulation_steps + base_lr=1, + metric_values = [ len(s) for s in dataset] + gradient_accumulation_steps=2 + + model = TestFeedForward() + optimizer = torch.optim.SGD(model.parameters(), lr=1) + criterion = torch.nn.MSELoss() + lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1) + + dataloader = get_dataloader_and_lr_scheduler_for_variable_batch_size( + dataset=dataset, + dataset_metric_values=metric_values, + max_metric_value_per_batch=max_metric_value_per_batch, + 
dataloader_rank=0, + dataloader_num_replicas=1, + sample_ids=None, + pipeline_parallelism=False, + lr_scaling_method="linear", + min_batch_size=1, + max_batch_size=None, + shuffle_metric_values=False, + order_by_metric_value=False, + gradient_accumulation_steps=gradient_accumulation_steps, + dataloader_num_workers=0, + dataloader_collate_fn=lambda b : TestData.collate_fn(b, max_seq_len=max_seq_len), + dataloader_pin_memory=False, + lr_scheduler_class=None, + lr_scheduler_kwargs={}, + ) + + # test with PyTorch + for epoch in range(2): + with torch.set_grad_enabled(True): + for minibatch_id in range(len(dataloader)//gradient_accumulation_steps): + for microbatch_id in range(gradient_accumulation_steps): + inputs, label = next(iter(dataloader)) + outputs = model(inputs) + loss = criterion(outputs, label) + loss.backward() + print(f"Epoch {epoch}, minibatch {minibatch_id}, microbatch {microbatch_id}, batch size {len(inputs)}, loss {loss.item()}, LRs {lr_scheduler.get_last_lr()}") + optimizer.step() + optimizer.zero_grad() + + # Test with DeepSpeed + engine, optimizer, _, _ = deepspeed.initialize ( + model=model, optimizer=optimizer, lr_scheduler=lr_scheduler) + engine.training_dataloader = dataloader + # engine.training_dataloader = engine.deepspeed_io() + + for epoch in range(2): + for minibatch_id in range(len(dataloader)//gradient_accumulation_steps): + inputs, label = next(iter(dataloader)) + loss = engine(inputs) + engine.backward(loss) + engine.step() + print(f"Epoch {epoch}, minibatch {minibatch_id}, microbatch {microbatch_id}, batch size {len(inputs)}, loss {loss.item()}, LRs {lr_scheduler.get_last_lr()}") + + \ No newline at end of file From f732a8f8cf6b656870041e82ee2334d602a43888 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Thu, 7 Mar 2024 16:17:04 +0000 Subject: [PATCH 32/64] bug fixes with batch sizes vs metrics --- .../variable_batch_size_and_lr.py | 213 ++++++++++-------- 1 file changed, 115 insertions(+), 98 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py index cde883546a16..ae47f17099ff 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py @@ -2,6 +2,7 @@ import torch from deepspeed.utils import logger from torch.utils.data import DistributedSampler +from torch.optim import Optimizer from torch.optim.lr_scheduler import LRScheduler from torch.utils.data import DataLoader import deepspeed @@ -23,7 +24,8 @@ def batch_by_size( max_batch_size=None, shuffle_metric_values=False, order_by_metric_value=False, - batch_size_multiple=1, + dataloader_num_replicas=1, + gradient_accumulation_steps=1, required_microbatches_of_same_size=False, ): @@ -42,10 +44,8 @@ def batch_by_size( - `min_batch_size`: largest allowed size of a batch; - `shuffle_metric_values`: shuffle metric values before packing samples into batches; - `order_by_metric_value`: order samples by ascending metric values before packing into batches; - - `batch_size_multiple`: total batch count should divide the final number of batches, with - remaining batches being dropped. - Useful for data parallelism (where `batch_size_multiple`=`num_data_loaders`) and gradient - accumulation (where `batch_size_multiple`=`num_data_loaders`*`gradient_accumulation_steps`). 
+ - `dataloader_num_replicas`: number of dataloaders + - `gradient_accumulation_steps`: number of gradient accumulation steps; - `required_microbatches_of_same_size`: enable if each mini-batch (in a total of `batch_size_multiple` micro-batches per batch), should have all micro-batches with the same batch size. Required for pipeline parallelism (as activation shapes is uniform across mini-batches), or @@ -80,7 +80,8 @@ def is_microbatch_valid(metrics): # go through all samples and pack then in microbatches of metric sums below the threshold # `required_microbatches_of_same_size` means all minibatches in a batch must be of equal size - equal_size_multiple = batch_size_multiple if required_microbatches_of_same_size else 1 + num_microbatches_per_batch = dataloader_num_replicas * gradient_accumulation_steps + equal_size_multiple = num_microbatches_per_batch if required_microbatches_of_same_size else 1 microbatches = [] batch_init = 0 while batch_init < len(metrics): @@ -106,20 +107,22 @@ def is_microbatch_valid(metrics): batch_init += sum( [ len(l) for l in microbatch ] ) microbatches += microbatch - # make sure we give the same number of batches to each dataloader by trimming the dataset - microbatches = microbatches[:len(microbatches) - len(microbatches) % batch_size_multiple] + # make sure we give the same number of (micro-)batches to each dataloader by trimming dataset + microbatches = microbatches[:len(microbatches) - len(microbatches) % num_microbatches_per_batch] #compute the effective batch size for each microbatch. - effective_batch_sizes, sample_ids = [], [] - for rank in range(0, len(microbatches), batch_size_multiple): - microbatch = microbatches[rank: rank+batch_size_multiple] + batch_sizes, batch_metrics, microbatch_sample_ids = [], [], [] + for rank in range(0, len(microbatches), num_microbatches_per_batch): + microbatch = microbatches[rank: rank+num_microbatches_per_batch] batch_size = sum([len(mb) for mb in microbatch]) - effective_batch_sizes += [batch_size]*len(microbatch) - sample_ids += [ [m[1] for m in metrics] for metrics in microbatch] - - # return the sample ids of each microbatch, and their effective batch size - assert len(effective_batch_sizes) == len(sample_ids) - return sample_ids, effective_batch_sizes + batch_metric = sum([m[0] for m in microbatch[0]]) + batch_sizes.append(batch_size) + batch_metrics.append(batch_metric) + microbatch_sample_ids += [ [m[1] for m in metrics] for metrics in microbatch] + + # return the sample ids of each microbatch, and the batch sizes + assert len(batch_sizes) == len(microbatch_sample_ids)//num_microbatches_per_batch + return microbatch_sample_ids, batch_sizes, batch_metrics def scale_lr(effective_batch_size, batch_size, base_lr=1, method="linear"): @@ -174,49 +177,53 @@ def collate_fn_wrapper(batch_sample_ids, dataset, collate_fn=None): num_local_io_workers=dataloader_num_workers) +class StubLRScheduler(LRScheduler): + """ a stub LR scheduler that does not change the LR, keeps it constant """ + def get_lr(self) -> float: + return self.base_lrs def lr_scheduler_for_variable_batch_size( - effective_batch_size, batch_size_per_epoch_fn, lr_scaling_method='linear', - lr_scheduler_class=LRScheduler, **lr_scheduler_kwargs): + effective_batch_size, batch_sizes, lr_scaling_method='linear', + optimizer=None, lr_scheduler_class=None, **lr_scheduler_kwargs): """ - returns a class that inherits from `lr_scheduler_class` and provides a scaled - learning rate for batches of different sizes. 
+ returns a class that provides an LR scheduler that scales learning rate at every + epoch taking into account the batch size of each epoch. + If learning rate is constant, ie no LR scheduler, then `optimizer` must be provided. + Otherwise, the base `LRScheduler` must be provided as `lr_scheduler_class`. Arguments: - `effective_batch_size`: the batch size that the base_LR refers to; - `lr_scaling_method`: method to use to scale LR - see `scale_lr()`; - - `batch_size_per_epoch_fn`: a function that returns the batch size for a given epoch; - - `lr_scheduler_class`: the class to inherit from (default: `LRScheduler`). It not provided, - will use the constant LR `optimizer.lr` as the LR value instead ; - returns: - - the class that inherits from `lr_scheduler_class`. + - `batch_sizes`: the effective batch size of each batch in the dataloader; + - `optimizer` and `lr_scheduler_class`: the base LR scheduler. It not provided, + will use the constant LRs from the optimizer's param groups instead. If provided, + the initialization of the scheduler will be done with `lr_scheduler_kwargs`. + + Returns the new LRScheduler """ - assert issubclass(lr_scheduler_class, LRScheduler), \ - "lr_scheduler should be a subclass of LRScheduler" - class VariableBatchSizeLR(lr_scheduler_class): + class VariableBatchSizeLR(lr_scheduler_class or StubLRScheduler): - def __init__(self, **lr_scheduler_kwargs): - super().__init__(**lr_scheduler_kwargs) - self.batch_size_per_epoch_fn = batch_size_per_epoch_fn + def __init__(self, optimizer, **lr_scheduler_kwargs): + self.batch_sizes = batch_sizes self.effective_batch_size = effective_batch_size self.lr_scaling_method = lr_scaling_method - self.unscaled_lrs = self.get_last_lr()[:] # first epoch LRs, cloned + self.unscaled_lrs = [p['lr'] for p in optimizer.param_groups] + super().__init__(optimizer=optimizer, **lr_scheduler_kwargs) def state_dict(self): return {'base': super().state_dict(), 'effective_batch_size': self.effective_batch_size, 'lr_scaling_method': self.lr_scaling_method, 'unscaled_lrs': self.unscaled_lrs, - } - + 'batch_sizes': self.batch_sizes} def load_state_dict(self, state_dict): super().load_state_dict(state_dict['base']) self.effective_batch_size = state_dict['effective_batch_size'] self.lr_scaling_method = state_dict['lr_scaling_method'] self.unscaled_lrs = state_dict['unscaled_lrs'] - + self.batch_sizes = state_dict['batch_sizes'] def step(self, epoch=None): @@ -226,38 +233,37 @@ def step(self, epoch=None): # epoch 0: optimizer.step(); lr_scheduler.step(1) --> set LR for epoch 1 # epoch 1: optimizer.step(); lr_scheduler.step(2) --> set LR for epoch 2 - if lr_scheduler_class!=LRScheduler: #use LR scheduler - - # reset unscaled LRs (to the original scheduler's one) for the current epoch - for param_group, lr in zip(self.optimizer.param_groups, self.unscaled_lrs): - param_group['lr'] = lr - - super().step(epoch) # set lr, _step_count and last_epoch (for next epoch), _last_lr - self.unscaled_lrs = self.get_last_lr()[:] # backup next epoch LRs, cloned - - else: - - # replicate step(): set LR (constant), _step_count, last_epoch and _last_lr - for param_group, lr in zip(self.optimizer.param_groups, self.base_lrs): - param_group['lr'] = lr - - self._step_count += 1 - self.last_epoch = self.last_epoch+1 if epoch is None else epoch - self._last_lr = [lr]*len(self.optimizer.param_groups) - + # reset unscaled LRs (to the original scheduler's one) for the current epoch + for param_group, lr in zip(self.optimizer.param_groups, self.unscaled_lrs): + param_group['lr'] 
= lr + + super().step(epoch) # set lr, _step_count and last_epoch (for next epoch), _last_lr + self.unscaled_lrs = self.get_last_lr()[:] # backup next epoch LRs, cloned + # scale the learning rate for next epoch for each parameter group - batch_size = self.batch_size_per_epoch_fn(self.last_epoch) - lr_multiplier = scale_lr(self.effective_batch_size, batch_size, lr_scaling_method=lr_scaling_method) + batch_size = self.batch_sizes[self.last_epoch] + lr_multiplier = scale_lr(self.effective_batch_size, batch_size, method=lr_scaling_method) for param_group in self.optimizer.param_groups: param_group['lr'] *= lr_multiplier - return VariableBatchSizeLR(**lr_scheduler_kwargs) + + if lr_scheduler_class is None: + assert optimizer is not None, "optimizer must be provided if lr_scheduler_class is not" + else: + assert issubclass(lr_scheduler_class, LRScheduler), "lr_scheduler should be a LRScheduler" + + if optimizer is None: + assert lr_scheduler_class is not None, "lr_scheduler_class must be provided if optimizer is not" + optimizer = lr_scheduler_kwargs['optimizer'] + + return VariableBatchSizeLR(optimizer=optimizer, **lr_scheduler_kwargs) def get_dataloader_and_lr_scheduler_for_variable_batch_size( dataset, dataset_metric_values, max_metric_value_per_batch, + base_batch_size, sample_ids=None, lr_scaling_method="linear", min_batch_size=1, @@ -271,19 +277,18 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size( dataloader_num_workers=0, dataloader_collate_fn=None, dataloader_pin_memory=False, + optimizer=None, lr_scheduler_class=None, lr_scheduler_kwargs={}, deepspeed_engine=None, ): - # pipelining in DeepSpeed takes the first micro-batch activation shape as reference. - # So we need to make sure batch size remains contant across all microbatches in a batch. - required_microbatches_of_same_size = pipeline_parallelism - effective_batch_size = dataloader_num_replicas*gradient_accumulation_steps # batch_by_size returns the effective batch size and the sample ids for each microbatch. - # We will use the sample ids to retrieve the batches from the dataset, and - # the effective batch size to retrive the scaled learning rate for each batch - microbatch_sample_ids, microbatch_batch_sizes = batch_by_size( + # We will use the sample ids to retrieve the batches from the dataset, + # and the effective batch size to retrieve the scaled learning rate for each batch + # Note: pipelining in DeepSpeed takes the first micro-batch activation shape as reference. + # So we need to make sure batch size remains contant across all microbatches in a batch. 
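Since the per-batch LR rescaling in `step()` above is the core of the scheduler, here is a self-contained sketch of the two scaling rules it relies on. Note that this sketch uses `math.sqrt` on plain Python scalars for simplicity, whereas the patched `scale_lr` passes the float ratio to `torch.sqrt`, which expects a tensor input; the scalar square root below is used purely for illustration.

import math

def scaled_lr(base_lr, base_batch_size, batch_size, method="linear"):
    if method == "linear":  # Goyal et al.: multiply the LR by the batch-size ratio
        return base_lr * batch_size / base_batch_size
    if method == "sqrt":    # Krizhevsky: multiply the LR by the square root of the ratio
        return base_lr * math.sqrt(batch_size / base_batch_size)
    raise ValueError(f"unknown scaling method {method}")

# e.g. a base LR of 1e-3 tuned for batch size 8, applied to a packed batch of 20 samples:
print(scaled_lr(1e-3, 8, 20, "linear"))  # 0.0025
print(scaled_lr(1e-3, 8, 20, "sqrt"))    # ~0.00158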
+ microbatch_sample_ids, batch_sizes, batch_metrics = batch_by_size( metric_values=dataset_metric_values, max_metric_value_per_batch=max_metric_value_per_batch, sample_ids=sample_ids, @@ -291,8 +296,9 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size( max_batch_size=max_batch_size, shuffle_metric_values=shuffle_metric_values, order_by_metric_value=order_by_metric_value, - batch_size_multiple=effective_batch_size, - required_microbatches_of_same_size=required_microbatches_of_same_size, + dataloader_num_replicas=dataloader_num_replicas, + gradient_accumulation_steps=gradient_accumulation_steps, + required_microbatches_of_same_size=pipeline_parallelism, ) dataloader = dataloader_for_variable_batch_size( @@ -306,14 +312,13 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size( deepspeed_engine=deepspeed_engine, ) - if lr_scheduler_class is None: - return dataloader - lr_scheduler = lr_scheduler_for_variable_batch_size( - effective_batch_size=effective_batch_size, - batch_size_per_epoch_fn=lambda epoch: microbatch_batch_sizes[epoch], + effective_batch_size=base_batch_size, + batch_sizes=batch_sizes, lr_scaling_method=lr_scaling_method, - lr_scheduler_class=lr_scheduler_class, **lr_scheduler_kwargs) + optimizer=optimizer, + lr_scheduler_class=lr_scheduler_class, + **lr_scheduler_kwargs) return dataloader, lr_scheduler @@ -343,7 +348,6 @@ def collate_fn(batch, max_seq_len=None): labels = torch.tensor(labels) return padded, labels - import torch.nn as nn import torch.nn.functional as F class TestFeedForward(nn.Module): @@ -360,24 +364,22 @@ def forward(self, x): return x.sum(dim=1) - max_seq_len=20 - dataset = TestData(seq_count=30, min_seq_len=5, max_seq_len=max_seq_len) + max_seq_len=15 + dataset = TestData(seq_count=100, min_seq_len=5, max_seq_len=max_seq_len) max_metric_value_per_batch=50 dataloader_num_workers=2 gradient_accumulation_steps=2 - effective_batch_size=dataloader_num_workers*gradient_accumulation_steps base_lr=1, - metric_values = [ len(s) for s in dataset] + base_batch_size=8 gradient_accumulation_steps=2 - model = TestFeedForward() - optimizer = torch.optim.SGD(model.parameters(), lr=1) - criterion = torch.nn.MSELoss() - lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1) + optimizer = torch.optim.Adam(model.parameters(), lr=1) - dataloader = get_dataloader_and_lr_scheduler_for_variable_batch_size( + metric_values = [ len(s[0]) for s in dataset] # difficulty = input sequence length + dataloader, lr_scheduler = get_dataloader_and_lr_scheduler_for_variable_batch_size( dataset=dataset, dataset_metric_values=metric_values, + base_batch_size=base_batch_size, max_metric_value_per_batch=max_metric_value_per_batch, dataloader_rank=0, dataloader_num_replicas=1, @@ -392,35 +394,50 @@ def forward(self, x): dataloader_num_workers=0, dataloader_collate_fn=lambda b : TestData.collate_fn(b, max_seq_len=max_seq_len), dataloader_pin_memory=False, - lr_scheduler_class=None, - lr_scheduler_kwargs={}, + optimizer=optimizer, + # lr_scheduler_class=torch.optim.lr_scheduler.StepLR, + # lr_scheduler_kwargs=dict(optimizer=optimizer, step_size=1, gamma=0.1), ) # test with PyTorch - for epoch in range(2): - with torch.set_grad_enabled(True): - for minibatch_id in range(len(dataloader)//gradient_accumulation_steps): + with torch.set_grad_enabled(True): + for epoch in range(10): + for batch_id in range(len(dataloader)//gradient_accumulation_steps): for microbatch_id in range(gradient_accumulation_steps): inputs, label = next(iter(dataloader)) outputs = 
model(inputs) - loss = criterion(outputs, label) + loss = F.mse_loss(outputs, label) loss.backward() - print(f"Epoch {epoch}, minibatch {minibatch_id}, microbatch {microbatch_id}, batch size {len(inputs)}, loss {loss.item()}, LRs {lr_scheduler.get_last_lr()}") + print(f"Epoch {epoch}, batch {batch_id}, microbatch {microbatch_id}, batch size {len(inputs)}, loss {loss.item()}, LRs {lr_scheduler.get_last_lr()}") optimizer.step() - optimizer.zero_grad() + optimizer.zero_grad() + lr_scheduler.step() # Test with DeepSpeed - engine, optimizer, _, _ = deepspeed.initialize ( + config = { + "train_batch_size": base_batch_size, + "gradient_accumulation_steps": gradient_accumulation_steps, + "optimizer": { + "type": "Adam", + "params": { + "lr": base_lr, + } + }, + } + engine, optimizer, _, _ = deepspeed.initialize(config=config, model=model, optimizer=optimizer, lr_scheduler=lr_scheduler) engine.training_dataloader = dataloader # engine.training_dataloader = engine.deepspeed_io() for epoch in range(2): - for minibatch_id in range(len(dataloader)//gradient_accumulation_steps): - inputs, label = next(iter(dataloader)) - loss = engine(inputs) - engine.backward(loss) - engine.step() - print(f"Epoch {epoch}, minibatch {minibatch_id}, microbatch {microbatch_id}, batch size {len(inputs)}, loss {loss.item()}, LRs {lr_scheduler.get_last_lr()}") + for batch_id in range(len(dataloader)//gradient_accumulation_steps): + for microbatch_id in range(gradient_accumulation_steps): + inputs, labels = next(iter(dataloader)) + inputs, labels = inputs.to("cuda"), labels.to("cuda") + outputs = engine(inputs) + loss = F.mse_loss(outputs, labels) + engine.backward(loss) + engine.step() + print(f"Epoch {epoch}, batch {batch_id}, microbatch {microbatch_id}, batch size {len(inputs)}, loss {loss.item()}, LRs {lr_scheduler.get_last_lr()}") \ No newline at end of file From 550ab31468a554b1edd481da7f1a9dfb4eb7d917 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Thu, 7 Mar 2024 17:23:06 +0000 Subject: [PATCH 33/64] deepspeed_io support --- .../variable_batch_size_and_lr.py | 151 ++++++++++-------- 1 file changed, 87 insertions(+), 64 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py index ae47f17099ff..2f79de5d5c1e 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py @@ -27,6 +27,7 @@ def batch_by_size( dataloader_num_replicas=1, gradient_accumulation_steps=1, required_microbatches_of_same_size=False, + verbose=False, ): """ @@ -116,32 +117,35 @@ def is_microbatch_valid(metrics): microbatch = microbatches[rank: rank+num_microbatches_per_batch] batch_size = sum([len(mb) for mb in microbatch]) batch_metric = sum([m[0] for m in microbatch[0]]) + batch_sample_ids = [ [m[1] for m in metrics] for metrics in microbatch] batch_sizes.append(batch_size) batch_metrics.append(batch_metric) - microbatch_sample_ids += [ [m[1] for m in metrics] for metrics in microbatch] + microbatch_sample_ids += batch_sample_ids + if verbose: + print(f"Batch size {batch_size} samples, metric value {batch_metric}, samples: {batch_sample_ids}") # return the sample ids of each microbatch, and the batch sizes assert len(batch_sizes) == len(microbatch_sample_ids)//num_microbatches_per_batch return microbatch_sample_ids, batch_sizes, batch_metrics -def scale_lr(effective_batch_size, batch_size, base_lr=1, 
method="linear"): +def scale_lr(base_batch_size, batch_size, base_lr=1, method="linear"): """ given a reference lr and batch_size, compute the new LR for a given batch size """ if method == "linear": # Linear Scaling Rule: "When the minibatch size is multiplied by k, multiply the learning, # rate by k" (Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour, Goyal et al) - return base_lr * batch_size / effective_batch_size + return base_lr * batch_size / base_batch_size if method == "sqrt": # Square Root scaling: "when multiplying the batch size by k, multiply the learning rate # by √k, to keep the variance in the gradient expectation constant" # (A. Krizhevsky. One weird trick for parallelizing convolutional neural networks) - return base_lr * torch.sqrt(batch_size / effective_batch_size) + return base_lr * torch.sqrt(batch_size / base_batch_size) raise ValueError("Unknown scaling method: {}".format(method)) def dataloader_for_variable_batch_size(dataset, microbatch_sample_ids, dataloader_rank, dataloader_num_replicas, dataloader_collate_fn, - dataloader_num_workers=2, dataloader_pin_memory=False, deepspeed_engine=None): + dataloader_num_workers=2, dataloader_pin_memory=False): # equidistantly distribute the microbatches across the replicas in an interleaved fashion. sampler = DistributedSampler( @@ -160,21 +164,23 @@ def collate_fn_wrapper(batch_sample_ids, dataset, collate_fn=None): return collate_fn(batch_data) if collate_fn else batch_data collate_fn = lambda b: collate_fn_wrapper(b, dataset, dataloader_collate_fn) - if deepspeed_engine is None: - return DataLoader( + + dataloader = DataLoader( dataset=microbatch_sample_ids, sampler=sampler, num_workers = dataloader_num_workers, collate_fn = collate_fn, pin_memory=dataloader_pin_memory, ) - else: - deepspeed_engine.deepspeed_io(dataset, - batch_size=1, - pin_memory=dataloader_pin_memory, - data_sampler=sampler, - collate_fn=collate_fn, - num_local_io_workers=dataloader_num_workers) + + deepspeed_io_kwargs = dict(dataset=dataset, + batch_size=1, + pin_memory=dataloader_pin_memory, + data_sampler=sampler, + collate_fn=collate_fn, + num_local_io_workers=dataloader_num_workers) + + return dataloader, deepspeed_io_kwargs class StubLRScheduler(LRScheduler): @@ -183,7 +189,7 @@ def get_lr(self) -> float: return self.base_lrs def lr_scheduler_for_variable_batch_size( - effective_batch_size, batch_sizes, lr_scaling_method='linear', + base_batch_size, batch_sizes, lr_scaling_method='linear', optimizer=None, lr_scheduler_class=None, **lr_scheduler_kwargs): """ returns a class that provides an LR scheduler that scales learning rate at every @@ -192,7 +198,7 @@ def lr_scheduler_for_variable_batch_size( Otherwise, the base `LRScheduler` must be provided as `lr_scheduler_class`. Arguments: - - `effective_batch_size`: the batch size that the base_LR refers to; + - `base_batch_size`: the batch size that the base LR in the optimizer or scheduler refers to; - `lr_scaling_method`: method to use to scale LR - see `scale_lr()`; - `batch_sizes`: the effective batch size of each batch in the dataloader; - `optimizer` and `lr_scheduler_class`: the base LR scheduler. 
It not provided, @@ -206,21 +212,23 @@ class VariableBatchSizeLR(lr_scheduler_class or StubLRScheduler): def __init__(self, optimizer, **lr_scheduler_kwargs): self.batch_sizes = batch_sizes - self.effective_batch_size = effective_batch_size + self.base_batch_size = base_batch_size self.lr_scaling_method = lr_scaling_method self.unscaled_lrs = [p['lr'] for p in optimizer.param_groups] super().__init__(optimizer=optimizer, **lr_scheduler_kwargs) def state_dict(self): - return {'base': super().state_dict(), - 'effective_batch_size': self.effective_batch_size, - 'lr_scaling_method': self.lr_scaling_method, - 'unscaled_lrs': self.unscaled_lrs, - 'batch_sizes': self.batch_sizes} + return { + 'base': super().state_dict(), + 'base_batch_size': self.base_batch_size, + 'lr_scaling_method': self.lr_scaling_method, + 'unscaled_lrs': self.unscaled_lrs, + 'batch_sizes': self.batch_sizes + } def load_state_dict(self, state_dict): super().load_state_dict(state_dict['base']) - self.effective_batch_size = state_dict['effective_batch_size'] + self.base_batch_size = state_dict['base_batch_size'] self.lr_scaling_method = state_dict['lr_scaling_method'] self.unscaled_lrs = state_dict['unscaled_lrs'] self.batch_sizes = state_dict['batch_sizes'] @@ -236,15 +244,21 @@ def step(self, epoch=None): # reset unscaled LRs (to the original scheduler's one) for the current epoch for param_group, lr in zip(self.optimizer.param_groups, self.unscaled_lrs): param_group['lr'] = lr + self._last_lr = [group['lr'] for group in self.optimizer.param_groups] super().step(epoch) # set lr, _step_count and last_epoch (for next epoch), _last_lr self.unscaled_lrs = self.get_last_lr()[:] # backup next epoch LRs, cloned - # scale the learning rate for next epoch for each parameter group - batch_size = self.batch_sizes[self.last_epoch] - lr_multiplier = scale_lr(self.effective_batch_size, batch_size, method=lr_scaling_method) + # scale the learning rate for next epoch for each parameter group. + # if we reach the last element, assume looping of data, ie refer to the first element + if self.last_epoch % len(self.batch_sizes) == 0: + print("RESET") + batch_size = self.batch_sizes[self.last_epoch % len(self.batch_sizes)] + lr_multiplier = scale_lr(self.base_batch_size, batch_size, method=lr_scaling_method) for param_group in self.optimizer.param_groups: param_group['lr'] *= lr_multiplier + self._last_lr = [group['lr'] for group in self.optimizer.param_groups] + print(f"LRs: {self.unscaled_lrs}, scaled by {lr_multiplier}, scaled LR: {self.get_last_lr()}") if lr_scheduler_class is None: @@ -280,7 +294,7 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size( optimizer=None, lr_scheduler_class=None, lr_scheduler_kwargs={}, - deepspeed_engine=None, + verbose=False, ): # batch_by_size returns the effective batch size and the sample ids for each microbatch. 
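As a small standalone illustration of the wrapper pattern that `VariableBatchSizeLR` implements above: remember the unscaled LRs, restore them before the base scheduler steps, then multiply each parameter group's LR by the ratio of the upcoming packed batch size to the reference batch size. This is a simplified sketch with a plain `StepLR`, a single parameter group and toy batch sizes, not the DeepSpeed class.

import torch

model = torch.nn.Linear(4, 1)
opt = torch.optim.SGD(model.parameters(), lr=0.1)
base_sched = torch.optim.lr_scheduler.StepLR(opt, step_size=2, gamma=0.5)

base_batch_size = 8
batch_sizes = [8, 12, 4, 16]  # toy per-batch sizes produced by the packing step
unscaled_lrs = [g["lr"] for g in opt.param_groups]

for batch_id, bs in enumerate(batch_sizes):
    loss = model(torch.randn(bs, 4)).sum()  # stand-in for the real forward/backward
    loss.backward()
    opt.step()
    opt.zero_grad()
    for group, lr in zip(opt.param_groups, unscaled_lrs):
        group["lr"] = lr                     # restore unscaled LRs before the base scheduler steps
    base_sched.step()                        # the base schedule advances on unscaled LRs
    unscaled_lrs = [g["lr"] for g in opt.param_groups]
    next_bs = batch_sizes[(batch_id + 1) % len(batch_sizes)]
    for group in opt.param_groups:
        group["lr"] *= next_bs / base_batch_size  # rescale for the size of the next packed batch
    print(batch_id, [round(g["lr"], 5) for g in opt.param_groups])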
@@ -299,9 +313,10 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size( dataloader_num_replicas=dataloader_num_replicas, gradient_accumulation_steps=gradient_accumulation_steps, required_microbatches_of_same_size=pipeline_parallelism, + verbose=verbose, ) - dataloader = dataloader_for_variable_batch_size( + dataloader, deepspeed_io_kwargs = dataloader_for_variable_batch_size( dataset=dataset, microbatch_sample_ids=microbatch_sample_ids, dataloader_rank=dataloader_rank, @@ -309,18 +324,17 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size( dataloader_collate_fn=dataloader_collate_fn, dataloader_num_workers=dataloader_num_workers, dataloader_pin_memory=dataloader_pin_memory, - deepspeed_engine=deepspeed_engine, ) lr_scheduler = lr_scheduler_for_variable_batch_size( - effective_batch_size=base_batch_size, + base_batch_size=base_batch_size, batch_sizes=batch_sizes, lr_scaling_method=lr_scaling_method, optimizer=optimizer, lr_scheduler_class=lr_scheduler_class, **lr_scheduler_kwargs) - return dataloader, lr_scheduler + return dataloader, lr_scheduler, deepspeed_io_kwargs if __name__ == "__main__": @@ -366,17 +380,16 @@ def forward(self, x): max_seq_len=15 dataset = TestData(seq_count=100, min_seq_len=5, max_seq_len=max_seq_len) - max_metric_value_per_batch=50 + max_metric_value_per_batch=40 dataloader_num_workers=2 gradient_accumulation_steps=2 - base_lr=1, base_batch_size=8 - gradient_accumulation_steps=2 - model = TestFeedForward() - optimizer = torch.optim.Adam(model.parameters(), lr=1) + model = TestFeedForward().to("cuda") + base_lr=1e-3 + optimizer = torch.optim.Adam(model.parameters(), lr=base_lr) metric_values = [ len(s[0]) for s in dataset] # difficulty = input sequence length - dataloader, lr_scheduler = get_dataloader_and_lr_scheduler_for_variable_batch_size( + dataloader, lr_scheduler, deepspeed_io_kwargs = get_dataloader_and_lr_scheduler_for_variable_batch_size( dataset=dataset, dataset_metric_values=metric_values, base_batch_size=base_batch_size, @@ -397,47 +410,57 @@ def forward(self, x): optimizer=optimizer, # lr_scheduler_class=torch.optim.lr_scheduler.StepLR, # lr_scheduler_kwargs=dict(optimizer=optimizer, step_size=1, gamma=0.1), + verbose=True, ) # test with PyTorch + dataloader_it = iter(dataloader) with torch.set_grad_enabled(True): for epoch in range(10): - for batch_id in range(len(dataloader)//gradient_accumulation_steps): - for microbatch_id in range(gradient_accumulation_steps): - inputs, label = next(iter(dataloader)) - outputs = model(inputs) - loss = F.mse_loss(outputs, label) - loss.backward() - print(f"Epoch {epoch}, batch {batch_id}, microbatch {microbatch_id}, batch size {len(inputs)}, loss {loss.item()}, LRs {lr_scheduler.get_last_lr()}") - optimizer.step() - optimizer.zero_grad() - lr_scheduler.step() + try: + for batch_id in range(len(dataloader)//gradient_accumulation_steps): + for microbatch_id in range(gradient_accumulation_steps): + inputs, labels = next(dataloader_it) + inputs, labels = inputs.to("cuda"), labels.to("cuda") + outputs = model(inputs) + loss = F.mse_loss(outputs, labels) + loss.backward() + print(f"Epoch {epoch}, batch {batch_id}, microbatch {microbatch_id}, loss {loss.item()}, LRs {lr_scheduler.get_last_lr()}") + optimizer.step() + optimizer.zero_grad() + lr_scheduler.step() + except StopIteration: + # if we run out of data, we restart from the very first batch + dataloader_it = iter(dataloader) + continue # Test with DeepSpeed config = { "train_batch_size": base_batch_size, "gradient_accumulation_steps": 
gradient_accumulation_steps, - "optimizer": { - "type": "Adam", - "params": { - "lr": base_lr, - } - }, + "optimizer": { "type": "Adam", "params": { "lr": base_lr, } }, } engine, optimizer, _, _ = deepspeed.initialize(config=config, model=model, optimizer=optimizer, lr_scheduler=lr_scheduler) - engine.training_dataloader = dataloader + # engine.training_dataloader = dataloader + engine.deepspeed_io(**deepspeed_io_kwargs) # engine.training_dataloader = engine.deepspeed_io() - for epoch in range(2): - for batch_id in range(len(dataloader)//gradient_accumulation_steps): - for microbatch_id in range(gradient_accumulation_steps): - inputs, labels = next(iter(dataloader)) - inputs, labels = inputs.to("cuda"), labels.to("cuda") - outputs = engine(inputs) - loss = F.mse_loss(outputs, labels) - engine.backward(loss) - engine.step() - print(f"Epoch {epoch}, batch {batch_id}, microbatch {microbatch_id}, batch size {len(inputs)}, loss {loss.item()}, LRs {lr_scheduler.get_last_lr()}") + dataloader_it = iter(engine.training_dataloader) + for epoch in range(10): + try: + for batch_id in range(len(engine.training_dataloader)//gradient_accumulation_steps): + for microbatch_id in range(gradient_accumulation_steps): + inputs, labels = next(dataloader_it) + inputs, labels = inputs.to("cuda"), labels.to("cuda") + outputs = engine(inputs) + loss = F.mse_loss(outputs, labels) + engine.backward(loss) + engine.step() + print(f"Epoch {epoch}, batch {batch_id}, microbatch {microbatch_id}, loss {loss.item()}, LRs {lr_scheduler.get_last_lr()}") + except StopIteration: + # if we run out of data, we restart from the very first batch + dataloader_it = iter(engine.training_dataloader) + continue \ No newline at end of file From b7f25204c0fe2a4a5527979aeeae12d1c4b3accb Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Thu, 7 Mar 2024 23:44:24 +0000 Subject: [PATCH 34/64] bug fixes --- .../variable_batch_size_and_lr.py | 96 +++++++++---------- 1 file changed, 45 insertions(+), 51 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py index 2f79de5d5c1e..b14d3cd4fe85 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py @@ -2,9 +2,9 @@ import torch from deepspeed.utils import logger from torch.utils.data import DistributedSampler -from torch.optim import Optimizer from torch.optim.lr_scheduler import LRScheduler from torch.utils.data import DataLoader +import torch.nn.functional as F import deepspeed @@ -166,19 +166,19 @@ def collate_fn_wrapper(batch_sample_ids, dataset, collate_fn=None): collate_fn = lambda b: collate_fn_wrapper(b, dataset, dataloader_collate_fn) dataloader = DataLoader( - dataset=microbatch_sample_ids, - sampler=sampler, - num_workers = dataloader_num_workers, - collate_fn = collate_fn, - pin_memory=dataloader_pin_memory, - ) + dataset=microbatch_sample_ids, + sampler=sampler, + num_workers = dataloader_num_workers, + collate_fn = collate_fn, + pin_memory=dataloader_pin_memory,) - deepspeed_io_kwargs = dict(dataset=dataset, - batch_size=1, - pin_memory=dataloader_pin_memory, - data_sampler=sampler, - collate_fn=collate_fn, - num_local_io_workers=dataloader_num_workers) + deepspeed_io_kwargs = dict( + dataset=microbatch_sample_ids, + batch_size=1, + pin_memory=dataloader_pin_memory, + data_sampler=sampler, + collate_fn=collate_fn, + 
num_local_io_workers=dataloader_num_workers,) return dataloader, deepspeed_io_kwargs @@ -189,7 +189,7 @@ def get_lr(self) -> float: return self.base_lrs def lr_scheduler_for_variable_batch_size( - base_batch_size, batch_sizes, lr_scaling_method='linear', + base_batch_size, batch_sizes, dataloader, lr_scaling_method='linear', optimizer=None, lr_scheduler_class=None, **lr_scheduler_kwargs): """ returns a class that provides an LR scheduler that scales learning rate at every @@ -214,7 +214,8 @@ def __init__(self, optimizer, **lr_scheduler_kwargs): self.batch_sizes = batch_sizes self.base_batch_size = base_batch_size self.lr_scaling_method = lr_scaling_method - self.unscaled_lrs = [p['lr'] for p in optimizer.param_groups] + self.dataloader = dataloader + self._last_lr = [p['lr'] for p in optimizer.param_groups] super().__init__(optimizer=optimizer, **lr_scheduler_kwargs) def state_dict(self): @@ -222,18 +223,19 @@ def state_dict(self): 'base': super().state_dict(), 'base_batch_size': self.base_batch_size, 'lr_scaling_method': self.lr_scaling_method, - 'unscaled_lrs': self.unscaled_lrs, - 'batch_sizes': self.batch_sizes + 'batch_sizes': self.batch_sizes, } def load_state_dict(self, state_dict): super().load_state_dict(state_dict['base']) self.base_batch_size = state_dict['base_batch_size'] self.lr_scaling_method = state_dict['lr_scaling_method'] - self.unscaled_lrs = state_dict['unscaled_lrs'] self.batch_sizes = state_dict['batch_sizes'] - def step(self, epoch=None): + def get_lr(self): + return [group['lr'] for group in self.optimizer.param_groups] + + def step(self, epoch=0): # call the base scheduler's step method to get LR for next epoch # note: optimizer.step preceeds lr_scheduler.step(), so the stepping workflow is: @@ -242,24 +244,21 @@ def step(self, epoch=None): # epoch 1: optimizer.step(); lr_scheduler.step(2) --> set LR for epoch 2 # reset unscaled LRs (to the original scheduler's one) for the current epoch - for param_group, lr in zip(self.optimizer.param_groups, self.unscaled_lrs): - param_group['lr'] = lr - self._last_lr = [group['lr'] for group in self.optimizer.param_groups] + for param_group, lr in zip(self.optimizer.param_groups, self._last_lr): + param_group['lr'] = lr # reset to last epoch's original/unscaled LR - super().step(epoch) # set lr, _step_count and last_epoch (for next epoch), _last_lr - self.unscaled_lrs = self.get_last_lr()[:] # backup next epoch LRs, cloned + super().step(epoch) # set unscaled lr, _step_count, last_epoch, _last_lr for new epoch # scale the learning rate for next epoch for each parameter group. 
- # if we reach the last element, assume looping of data, ie refer to the first element - if self.last_epoch % len(self.batch_sizes) == 0: - print("RESET") - batch_size = self.batch_sizes[self.last_epoch % len(self.batch_sizes)] + batch_size = self.batch_sizes[epoch % len(self.batch_sizes)] lr_multiplier = scale_lr(self.base_batch_size, batch_size, method=lr_scaling_method) for param_group in self.optimizer.param_groups: - param_group['lr'] *= lr_multiplier - self._last_lr = [group['lr'] for group in self.optimizer.param_groups] - print(f"LRs: {self.unscaled_lrs}, scaled by {lr_multiplier}, scaled LR: {self.get_last_lr()}") + param_group['lr'] *= lr_multiplier #set scale LR for new epoch + if self.verbose: + print(f"Batch id {epoch}, unscaled LR: {self._last_lr}, scaled LR: {self.get_lr()}") + + #### main loop: double check arguments and returns correctly-instantiated LR scheduler if lr_scheduler_class is None: assert optimizer is not None, "optimizer must be provided if lr_scheduler_class is not" @@ -293,7 +292,7 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size( dataloader_pin_memory=False, optimizer=None, lr_scheduler_class=None, - lr_scheduler_kwargs={}, + lr_scheduler_kwargs={'verbose':False}, verbose=False, ): @@ -331,6 +330,7 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size( batch_sizes=batch_sizes, lr_scaling_method=lr_scaling_method, optimizer=optimizer, + dataloader=dataloader, lr_scheduler_class=lr_scheduler_class, **lr_scheduler_kwargs) @@ -338,11 +338,9 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size( if __name__ == "__main__": - # A small example/test on how to use this module - from torch.utils.data import Dataset - class TestData(Dataset): + class TestData(torch.utils.data.Dataset): """ A test dataset with sequences of random length, and their sum as the target""" def __init__(self, seq_count, min_seq_len=1, max_seq_len=21): self.seqs = [ torch.ones(random.randrange(min_seq_len,max_seq_len)) for _ in range(seq_count) ] @@ -362,15 +360,13 @@ def collate_fn(batch, max_seq_len=None): labels = torch.tensor(labels) return padded, labels - import torch.nn as nn - import torch.nn.functional as F - class TestFeedForward(nn.Module): + class TestFeedForward(torch.nn.Module): def __init__(self): super(TestFeedForward, self).__init__() # an affine operation: y = Wx + b - self.fc1 = nn.Linear(max_seq_len, 128) - self.fc2 = nn.Linear(128, 128) + self.fc1 = torch.nn.Linear(max_seq_len, 128) + self.fc2 = torch.nn.Linear(128, 128) def forward(self, x): x = F.relu(self.fc1(x)) @@ -379,7 +375,7 @@ def forward(self, x): max_seq_len=15 - dataset = TestData(seq_count=100, min_seq_len=5, max_seq_len=max_seq_len) + dataset = TestData(seq_count=30, min_seq_len=5, max_seq_len=max_seq_len) max_metric_value_per_batch=40 dataloader_num_workers=2 gradient_accumulation_steps=2 @@ -410,7 +406,6 @@ def forward(self, x): optimizer=optimizer, # lr_scheduler_class=torch.optim.lr_scheduler.StepLR, # lr_scheduler_kwargs=dict(optimizer=optimizer, step_size=1, gamma=0.1), - verbose=True, ) # test with PyTorch @@ -425,12 +420,12 @@ def forward(self, x): outputs = model(inputs) loss = F.mse_loss(outputs, labels) loss.backward() - print(f"Epoch {epoch}, batch {batch_id}, microbatch {microbatch_id}, loss {loss.item()}, LRs {lr_scheduler.get_last_lr()}") + print(f"Epoch {epoch}, batch {batch_id}, microbatch {microbatch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}") optimizer.step() optimizer.zero_grad() - lr_scheduler.step() + lr_scheduler.step(epoch=batch_id+1) except 
StopIteration: - # if we run out of data, we restart from the very first batch + # if we run out of data, we restart the dataloader and LR scheduler dataloader_it = iter(dataloader) continue @@ -442,9 +437,8 @@ def forward(self, x): } engine, optimizer, _, _ = deepspeed.initialize(config=config, model=model, optimizer=optimizer, lr_scheduler=lr_scheduler) - # engine.training_dataloader = dataloader - engine.deepspeed_io(**deepspeed_io_kwargs) - # engine.training_dataloader = engine.deepspeed_io() + # engine.training_dataloader = dataloader #use this or the deepspeed_io() + engine.training_dataloader = engine.deepspeed_io(**deepspeed_io_kwargs) dataloader_it = iter(engine.training_dataloader) for epoch in range(10): @@ -456,10 +450,10 @@ def forward(self, x): outputs = engine(inputs) loss = F.mse_loss(outputs, labels) engine.backward(loss) - engine.step() - print(f"Epoch {epoch}, batch {batch_id}, microbatch {microbatch_id}, loss {loss.item()}, LRs {lr_scheduler.get_last_lr()}") + print(f"Epoch {epoch}, batch {batch_id}, microbatch {microbatch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}") + engine.step(lr_kwargs={'epoch': batch_id+1}) except StopIteration: - # if we run out of data, we restart from the very first batch + # if we run out of data, we restart the dataloader and LR scheduler dataloader_it = iter(engine.training_dataloader) continue From 6452d441cdc9c172f3ba4e19733bddbdbfd0812a Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Fri, 8 Mar 2024 00:06:36 +0000 Subject: [PATCH 35/64] better comment --- .../data_pipeline/data_sampling/variable_batch_size_and_lr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py index b14d3cd4fe85..1e4174a25b1f 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py @@ -132,7 +132,7 @@ def is_microbatch_valid(metrics): def scale_lr(base_batch_size, batch_size, base_lr=1, method="linear"): """ given a reference lr and batch_size, compute the new LR for a given batch size """ if method == "linear": - # Linear Scaling Rule: "When the minibatch size is multiplied by k, multiply the learning, + # Linear Scaling Rule: "When the minibatch size is multiplied by k, multiply the learning # rate by k" (Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour, Goyal et al) return base_lr * batch_size / base_batch_size if method == "sqrt": From 42accd1418788adb34c11b319ef3021a48fb17c9 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Fri, 8 Mar 2024 00:09:42 +0000 Subject: [PATCH 36/64] recovered files from master --- .../data_sampling/indexed_dataset.py | 48 ++++++++----------- 1 file changed, 19 insertions(+), 29 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py b/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py index b300ff4aab89..60115fa6efef 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py @@ -98,26 +98,25 @@ def write_longs(f, a): f.write(np.array(a, dtype=np.int64)) -# valid dtypes for metric_values and their corresponding numpy/torch types dtypes = { - 1: (np.uint8, torch.uint8), - 2: (np.int8, torch.int8), - 3: (np.int16, torch.int16), - 4: (np.int32, torch.int32), - 5: (np.int64, torch.int64), - 6: 
(np.uint16, None), - 7: (np.uint32, None), - 8: (np.uint64, None), + 1: np.uint8, + 2: np.int8, + 3: np.int16, + 4: np.int32, + 5: np.int64, + 6: np.float64, + 7: np.double, + 8: np.uint16, + 9: np.uint32, + 10: np.uint64 } -valid_dtypes = set([dt[0] for dt in dtypes.values()] + [dt[1] for dt in dtypes.values() if dt[1] is not None]) - def code(dtype): - for c, (np_dt, torch_dt) in dtypes.items(): - if dtype in [np_dt, torch_dt]: - return c - raise ValueError(f"{dtype} not supported. Supported types: {valid_dtypes}") + for k in dtypes.keys(): + if dtypes[k] == dtype: + return k + raise ValueError(dtype) def index_file_path(prefix_path): @@ -154,7 +153,7 @@ def read_index(self, path): version = f.read(8) assert struct.unpack(' Date: Fri, 8 Mar 2024 09:13:28 +0000 Subject: [PATCH 37/64] bug fixrs on LR scheduler reset --- .../data_sampling/data_analyzer.py | 47 ++++----- .../variable_batch_size_and_lr.py | 95 ++++++++++++------- 2 files changed, 83 insertions(+), 59 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index 10a1a9e4eac0..3d4d8bde7d1c 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -384,10 +384,26 @@ def merge_map_results(self, dataset, metric_names, metric_types, save_path, num_ index_to_metric_builder.merge_file_(chunk_im_fname) close_mmap_dataset_builder(index_to_sample_builder, index_to_sample_fname) close_mmap_dataset_builder(index_to_metric_builder, index_to_metric_fname) - - num_sample_per_value = DataAnalyzer.output_index_to_sample_percentile( - index_to_sample_fname, index_to_metric_fname, metric_name, metric_save_path, total_num_samples, - sample_idx_dtype) + num_sample_per_value = {} + index_to_sample = MMapIndexedDataset(index_to_sample_fname, skip_warmup=True) + index_to_metric = MMapIndexedDataset(index_to_metric_fname, skip_warmup=True) + index_to_sample_merged_fname = f"{metric_save_path}/{metric_name}_index_to_sample_percentile_merged" + index_to_sample_merged_builder = create_mmap_dataset_builder(index_to_sample_merged_fname, + sample_idx_dtype) + for v_idx in range(len(index_to_sample)): + if v_idx > 0: + assert index_to_metric[v_idx] > index_to_metric[v_idx - 1] + num_sample_per_value[index_to_metric[v_idx][0]] = len(index_to_sample[v_idx]) + assert sum(num_sample_per_value.values()) == total_num_samples + merge_step = max(1, len(index_to_sample) // 100) + for v_idx in range(0, len(index_to_sample), merge_step): + merged_samples = np.copy( + np.concatenate(index_to_sample[v_idx:min(len(index_to_sample), (v_idx + merge_step))], + axis=None)) + index_to_sample_merged_builder.add_item( + torch.tensor(merged_samples.astype(np.int64), dtype=torch.long)) + logger.info(f"Finished merging index_to_sample {v_idx} to {v_idx+merge_step}.") + close_mmap_dataset_builder(index_to_sample_merged_builder, index_to_sample_merged_fname) self.get_metric_value_percentiles(metric_name, num_sample_per_value, total_num_samples) elif metric_type == 'accumulate_value_over_samples': metric_save_path = f"{save_path}/{metric_name}/" @@ -409,29 +425,6 @@ def merge_map_results(self, dataset, metric_names, metric_types, save_path, num_ metric_value_builder.add_item(torch.tensor(metric_value.astype(np.int64), dtype=torch.long)) close_mmap_dataset_builder(metric_value_builder, metric_value_fname) - @staticmethod - def output_index_to_sample_percentile(index_to_sample_fname, 
index_to_metric_fname, metric_name, metric_save_path, - total_num_samples, sample_idx_dtype): - """ read index_to_metric and index_to_sample files and write distribution to index_to_sample_percentage_merged """ - num_sample_per_value = {} - index_to_sample = MMapIndexedDataset(index_to_sample_fname, skip_warmup=True) - index_to_metric = MMapIndexedDataset(index_to_metric_fname, skip_warmup=True) - index_to_sample_merged_fname = f"{metric_save_path}/{metric_name}_index_to_sample_percentile_merged" - index_to_sample_merged_builder = create_mmap_dataset_builder(index_to_sample_merged_fname, sample_idx_dtype) - for v_idx in range(len(index_to_sample)): - if v_idx > 0: - assert index_to_metric[v_idx] > index_to_metric[v_idx - 1] - num_sample_per_value[index_to_metric[v_idx][0]] = len(index_to_sample[v_idx]) - assert sum(list(num_sample_per_value.values())) == total_num_samples - merge_step = max(1, len(index_to_sample) // 100) - for v_idx in range(0, len(index_to_sample), merge_step): - merged_samples = np.copy( - np.concatenate(index_to_sample[v_idx:min(len(index_to_sample), (v_idx + merge_step))], axis=None)) - index_to_sample_merged_builder.add_item(torch.tensor(merged_samples.astype(np.int64), dtype=torch.long)) - logger.info(f"Finished merging index_to_sample {v_idx} to {v_idx+merge_step}.") - close_mmap_dataset_builder(index_to_sample_merged_builder, index_to_sample_merged_fname) - return num_sample_per_value - def run_reduce(self): if self.custom_reduce is None: self.merge_map_results(self.dataset, self.metric_names, self.metric_types, self.save_path, diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py index 1e4174a25b1f..6897d75bdef4 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py @@ -189,8 +189,9 @@ def get_lr(self) -> float: return self.base_lrs def lr_scheduler_for_variable_batch_size( - base_batch_size, batch_sizes, dataloader, lr_scaling_method='linear', - optimizer=None, lr_scheduler_class=None, **lr_scheduler_kwargs): + base_batch_size, batch_sizes, dataloader, batch_metrics, + lr_scaling_method='linear', optimizer=None, lr_scheduler_class=None, + **lr_scheduler_kwargs): """ returns a class that provides an LR scheduler that scales learning rate at every epoch taking into account the batch size of each epoch. 
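# Editor's aside (illustrative sketch only, not part of this patch): the scaling this
# scheduler applies per batch follows the linear rule implemented by scale_lr() above;
# the reference values below are hypothetical and only show the resulting multipliers.
base_lr, base_batch_size = 1e-3, 8
for batch_size in (4, 8, 16):
    scaled_lr = base_lr * batch_size / base_batch_size  # 0.0005, 0.001, 0.002
    print(f"batch size {batch_size}: base LR {base_lr} scaled to {scaled_lr}")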
@@ -212,12 +213,13 @@ class VariableBatchSizeLR(lr_scheduler_class or StubLRScheduler): def __init__(self, optimizer, **lr_scheduler_kwargs): self.batch_sizes = batch_sizes + self.batch_metrics = batch_metrics self.base_batch_size = base_batch_size self.lr_scaling_method = lr_scaling_method self.dataloader = dataloader self._last_lr = [p['lr'] for p in optimizer.param_groups] super().__init__(optimizer=optimizer, **lr_scheduler_kwargs) - + def state_dict(self): return { 'base': super().state_dict(), @@ -235,27 +237,28 @@ def load_state_dict(self, state_dict): def get_lr(self): return [group['lr'] for group in self.optimizer.param_groups] - def step(self, epoch=0): - + def step(self, epoch=None): # call the base scheduler's step method to get LR for next epoch - # note: optimizer.step preceeds lr_scheduler.step(), so the stepping workflow is: + # Note: optimizer.step preceeds lr_scheduler.step(), so the stepping workflow is: # init: lr_scheduler.step(0) --> set LR for epoch 0 # epoch 0: optimizer.step(); lr_scheduler.step(1) --> set LR for epoch 1 # epoch 1: optimizer.step(); lr_scheduler.step(2) --> set LR for epoch 2 # reset unscaled LRs (to the original scheduler's one) for the current epoch - for param_group, lr in zip(self.optimizer.param_groups, self._last_lr): - param_group['lr'] = lr # reset to last epoch's original/unscaled LR + # Note: epoch==0: reset LR scheduler; epoch==None: scale LR for next epoch; + unscaled_lrs = self.base_lrs if epoch==0 else self._last_lr + for group, lr in zip(self.optimizer.param_groups, unscaled_lrs): + group['lr'] = lr super().step(epoch) # set unscaled lr, _step_count, last_epoch, _last_lr for new epoch # scale the learning rate for next epoch for each parameter group. - batch_size = self.batch_sizes[epoch % len(self.batch_sizes)] - lr_multiplier = scale_lr(self.base_batch_size, batch_size, method=lr_scaling_method) - for param_group in self.optimizer.param_groups: - param_group['lr'] *= lr_multiplier #set scale LR for new epoch + batch_size = self.batch_sizes[self.last_epoch % len(self.batch_sizes)] + for group in self.optimizer.param_groups: + group['lr'] = scale_lr(self.base_batch_size, batch_size, group['lr'], lr_scaling_method) + if self.verbose: - print(f"Batch id {epoch}, unscaled LR: {self._last_lr}, scaled LR: {self.get_lr()}") + print(f"Batch id {self.last_epoch}, unscaled LR: {unscaled_lrs}, scaled LR: {self.get_lr()}") #### main loop: double check arguments and returns correctly-instantiated LR scheduler @@ -328,6 +331,7 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size( lr_scheduler = lr_scheduler_for_variable_batch_size( base_batch_size=base_batch_size, batch_sizes=batch_sizes, + batch_metrics=batch_metrics, lr_scaling_method=lr_scaling_method, optimizer=optimizer, dataloader=dataloader, @@ -408,28 +412,55 @@ def forward(self, x): # lr_scheduler_kwargs=dict(optimizer=optimizer, step_size=1, gamma=0.1), ) - # test with PyTorch - dataloader_it = iter(dataloader) + # PyTorch example iterating whole dataset in one epoch with torch.set_grad_enabled(True): - for epoch in range(10): - try: - for batch_id in range(len(dataloader)//gradient_accumulation_steps): - for microbatch_id in range(gradient_accumulation_steps): - inputs, labels = next(dataloader_it) - inputs, labels = inputs.to("cuda"), labels.to("cuda") - outputs = model(inputs) - loss = F.mse_loss(outputs, labels) - loss.backward() - print(f"Epoch {epoch}, batch {batch_id}, microbatch {microbatch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}") + for epoch in 
range(2): + for sample_idx, (inputs, labels) in enumerate(dataloader): + batch_id = sample_idx // gradient_accumulation_steps + microbatch_id = sample_idx % gradient_accumulation_steps + inputs, labels = inputs.to("cuda"), labels.to("cuda") + outputs = model(inputs) + loss = F.mse_loss(outputs, labels) + loss.backward() + if (microbatch_id+1) % gradient_accumulation_steps == 0: + print(f"Epoch {epoch}, batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}") optimizer.step() optimizer.zero_grad() - lr_scheduler.step(epoch=batch_id+1) - except StopIteration: - # if we run out of data, we restart the dataloader and LR scheduler - dataloader_it = iter(dataloader) - continue + lr_scheduler.step() - # Test with DeepSpeed + + # Pytorch example with loop around data. + # To handle loop-around data, we either pass the batch id as epoch value + # to the scheduler step (option 1 below) or reset the LR scheduler (option 2) + dataloader_it = iter(dataloader) + sample_idx, num_sentences_processed, num_tokens_processed = 0, 0, 0 + while True: + try: + inputs, labels = next(dataloader_it) + inputs, labels = inputs.to("cuda"), labels.to("cuda") + outputs = model(inputs) + loss = F.mse_loss(outputs, labels) + loss.backward() + batch_id = sample_idx // gradient_accumulation_steps + microbatch_id = sample_idx % gradient_accumulation_steps + num_sentences_processed += lr_scheduler.batch_sizes[batch_id] + num_tokens_processed += lr_scheduler.batch_metrics[batch_id] + sample_idx += 1 + if (microbatch_id+1) % gradient_accumulation_steps == 0: + print(f"Batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, size {lr_scheduler.batch_sizes[batch_id]}, metric {lr_scheduler.batch_metrics[batch_id]}") + optimizer.step() + optimizer.zero_grad() + lr_scheduler.step(epoch=batch_id+1) # option 1: specify next batch + + # stop after updating model for 100 sentences or 1000 tokens + if num_sentences_processed>=100 or num_tokens_processed>=1000: + break + except StopIteration: + dataloader_it = iter(dataloader) + sample_idx = 0 + lr_scheduler.step(0) # option 2: reset scheduler + + # DeepSpeed example config = { "train_batch_size": base_batch_size, "gradient_accumulation_steps": gradient_accumulation_steps, @@ -455,6 +486,6 @@ def forward(self, x): except StopIteration: # if we run out of data, we restart the dataloader and LR scheduler dataloader_it = iter(engine.training_dataloader) - continue + lr_scheduler.step(0) \ No newline at end of file From f6c5c18d77c0eea6a08c3eb0fc44e2ef2dfa218e Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Fri, 8 Mar 2024 09:18:20 +0000 Subject: [PATCH 38/64] master in line with remote --- .../data_sampling/indexed_dataset.py | 50 +++++++++++-------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py b/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py index 60115fa6efef..1c56f5f503c5 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py @@ -98,25 +98,26 @@ def write_longs(f, a): f.write(np.array(a, dtype=np.int64)) +# valid metric_dtypes as numpy and torch types dtypes = { - 1: np.uint8, - 2: np.int8, - 3: np.int16, - 4: np.int32, - 5: np.int64, - 6: np.float64, - 7: np.double, - 8: np.uint16, - 9: np.uint32, - 10: np.uint64 + 1: (np.uint8, torch.uint8), + 2: (np.int8, torch.int8), + 3: (np.int16, torch.int16), + 4: (np.int32, torch.int32), + 5: (np.int64, torch.int64), + 6: 
(np.uint16, None), + 7: (np.uint32, None), + 8: (np.uint64, None), } +valid_dtypes = set([dt[0] for dt in dtypes.values()] + [dt[1] for dt in dtypes.values() if dt[1] is not None]) + def code(dtype): - for k in dtypes.keys(): - if dtypes[k] == dtype: - return k - raise ValueError(dtype) + for c, (np_dt, torch_dt) in dtypes.items(): + if dtype in [np_dt, torch_dt]: + return c + raise ValueError(f"{dtype} not supported. Supported types: {valid_dtypes}") def index_file_path(prefix_path): @@ -153,7 +154,7 @@ def read_index(self, path): version = f.read(8) assert struct.unpack(' Date: Fri, 8 Mar 2024 09:22:52 +0000 Subject: [PATCH 39/64] removed 2 files that are not part of commit --- .../data_sampling/data_analyzer.py | 434 ------------ .../data_sampling/indexed_dataset.py | 627 ------------------ 2 files changed, 1061 deletions(-) delete mode 100644 deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py delete mode 100644 deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py deleted file mode 100644 index 3d4d8bde7d1c..000000000000 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ /dev/null @@ -1,434 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team - -import os -from collections import defaultdict -import csv -import time -from multiprocessing import Process, Manager -import numpy as np -import torch -from torch.utils.data import BatchSampler, SequentialSampler, DataLoader, Subset - -from deepspeed.utils import logger -from .indexed_dataset import MMapIndexedDataset, valid_dtypes -from .utils import split_dataset, split_index, create_mmap_dataset_builder, close_mmap_dataset_builder, find_fit_int_dtype - - -class DataAnalyzer(object): - - def __init__(self, - dataset, - num_workers=1, - worker_id=0, - num_threads=1, - num_threads_reduce=1, - specific_threads=[], - batch_size=1, - metric_names=[], - metric_functions=[], - metric_types=[], - metric_dtypes=[], - save_path="./", - collate_fn=None, - custom_map_init=None, - custom_map_update=None, - custom_map_finalize=None, - custom_reduce=None, - sample_indices=None): - super().__init__() - self.dataset = dataset - self.num_workers = num_workers - self.worker_id = worker_id - self.num_threads = num_threads - self.num_threads_reduce = num_threads_reduce - self.specific_threads = specific_threads - self.batch_size = batch_size - self.metric_names = metric_names - self.metric_functions = metric_functions - self.metric_types = metric_types - self.metric_dtypes = metric_dtypes - self.save_path = save_path - self.collate_fn = collate_fn - self.custom_map_init = custom_map_init - self.custom_map_update = custom_map_update - self.custom_map_finalize = custom_map_finalize - self.custom_reduce = custom_reduce - self.sample_indices = sample_indices - - def init_metric_results(self, thread_id, metric_names, metric_types, metric_dtypes, save_path, worker_id): - metric_results = [] - for m_idx in range(len(metric_names)): - metric_name, metric_type, metric_dtype = metric_names[m_idx], \ - metric_types[m_idx], metric_dtypes[m_idx] - assert metric_dtype in valid_dtypes, f"metric_dtype {metric_dtype} not supported. 
Supported dtypes {valid_dtypes}" - metric_save_path = f"{save_path}/{metric_name}/worker{worker_id}_thread{thread_id}/" - os.makedirs(metric_save_path, exist_ok=True) - if metric_type == 'single_value_per_sample': - sample_to_metric_fname = f"{metric_save_path}/{metric_name}_sample_to_metric" - sample_to_metric_builder = create_mmap_dataset_builder(sample_to_metric_fname, metric_dtype) - metric_to_sample_fname = f"{metric_save_path}/{metric_name}_metric_to_sample" - os.system(f"rm -rf {metric_to_sample_fname}*") - metric_to_sample_dict = defaultdict(list) - metric_results.append({ - "sample_to_metric_fname": sample_to_metric_fname, - "sample_to_metric_builder": sample_to_metric_builder, - "metric_to_sample_fname": metric_to_sample_fname, - "metric_to_sample_dict": metric_to_sample_dict - }) - elif metric_type == 'accumulate_value_over_samples': - metric_value = None - metric_value_fname = f"{metric_save_path}/{metric_name}_metric_value" - metric_results.append({"metric_value": metric_value, "metric_value_fname": metric_value_fname}) - return metric_results - - def update_metric_results(self, - data, - metric_types, - metric_dtypes, - metric_functions, - metric_results, - batch_start_idx=0): - for m_idx in range(len(metric_types)): - metric_type, metric_dtype, metric_function, metric_result = metric_types[m_idx], \ - metric_dtypes[m_idx], metric_functions[m_idx], metric_results[m_idx] - metric_values = metric_function(data) - - assert torch.is_tensor(metric_values) or isinstance(metric_values, np.ndarray), \ - "metric_function must return a tensor or array" - assert metric_values.dtype == metric_dtype, \ - f"metric_function result dtype {metric_values.dtype} does not match metric_dtype {metric_dtype}" - if isinstance(metric_values, np.ndarray): - metric_values = torch.from_numpy(metric_values) - - if metric_type == 'single_value_per_sample': - for row in range(metric_values.size()[0]): - sample_idx = batch_start_idx + row # sample idx following dataset iteration order - if isinstance(data, dict) and 'index' in data: # Megatron use case, idx provided in 'index' field - sample_idx = data['index'][row][0].item() - elif self.sample_indices is not None: # user defined shuffling of indices - sample_idx = self.sample_indices[sample_idx] - metric_result["sample_to_metric_builder"].add_item(metric_values[row].reshape(-1)) - metric_result["metric_to_sample_dict"][metric_values[row].item()].append(sample_idx) - for m_value in metric_result["metric_to_sample_dict"]: - if len(metric_result["metric_to_sample_dict"][m_value]) > 100: - metric_fname = metric_result["metric_to_sample_fname"] - with open(f"{metric_fname}_{m_value}.csv", 'a') as f: - writer = csv.writer(f) - writer.writerows([metric_result["metric_to_sample_dict"][m_value]]) - metric_result["metric_to_sample_dict"][m_value] = [] - elif metric_type == 'accumulate_value_over_samples': - if metric_result["metric_value"] is None: - metric_result["metric_value"] = metric_values - else: - metric_result["metric_value"].add_(metric_values) - - def finalize_metric_results(self, metric_types, metric_dtypes, metric_results): - for m_idx in range(len(metric_types)): - metric_type, metric_dtype, metric_result = metric_types[m_idx], \ - metric_dtypes[m_idx], metric_results[m_idx] - if metric_type == 'single_value_per_sample': - metric_fname = metric_result["sample_to_metric_fname"] - close_mmap_dataset_builder(metric_result["sample_to_metric_builder"], metric_fname) - for m_value in metric_result["metric_to_sample_dict"]: - if 
len(metric_result["metric_to_sample_dict"][m_value]) > 0: - metric_fname = metric_result["metric_to_sample_fname"] - with open(f"{metric_fname}_{m_value}.csv", 'a') as f: - writer = csv.writer(f) - writer.writerows([metric_result["metric_to_sample_dict"][m_value]]) - metric_result["metric_to_sample_dict"][m_value] = [] - elif metric_type == 'accumulate_value_over_samples': - if metric_result["metric_value"] is not None: - metric_value_builder = create_mmap_dataset_builder(metric_result["metric_value_fname"], - metric_dtype) - metric_value_builder.add_item(metric_result["metric_value"].reshape(-1)) - close_mmap_dataset_builder(metric_value_builder, metric_result["metric_value_fname"]) - - def run_map_helper(self, thread_id): - start_idx, end_idx = self.thread_splits[thread_id][0], \ - self.thread_splits[thread_id][1] - logger.info(f"worker {self.worker_id} thread {thread_id}: start working " \ - f"on data subset {start_idx} to {end_idx}") - thread_dataset = Subset(self.dataset, list(range(start_idx, end_idx))) - sampler = BatchSampler(SequentialSampler(thread_dataset), batch_size=self.batch_size, drop_last=False) - iterator = iter( - DataLoader(thread_dataset, - batch_sampler=sampler, - num_workers=0, - collate_fn=self.collate_fn, - pin_memory=False)) - if self.custom_map_init is None: - metric_results = self.init_metric_results(thread_id, self.metric_names, self.metric_types, - self.metric_dtypes, self.save_path, self.worker_id) - else: - metric_results = self.custom_map_init(thread_id, self.metric_names, self.metric_types, self.metric_dtypes, - self.save_path, self.worker_id) - total_sample = len(thread_dataset) - processed_sample = 0 - start = time.time() - while True: - try: - data = next(iterator) - batch_start_idx = start_idx + processed_sample - if self.custom_map_update is None: - self.update_metric_results(data, self.metric_types, self.metric_dtypes, self.metric_functions, - metric_results, batch_start_idx) - else: - self.custom_map_update(data, self.metric_types, self.metric_dtypes, self.metric_functions, - metric_results, batch_start_idx) - processed_sample += self.batch_size - duration = (time.time() - start) / 3600.0 - remain_duration = duration * total_sample / processed_sample - duration - logger.info( - f"worker {self.worker_id} thread {thread_id}: {processed_sample} " \ - f"out of {total_sample} processed in {duration:.2f} hr, " \ - f"estimated to finish in {remain_duration:.2f} hr") - except StopIteration: - logger.info(f"worker {self.worker_id} thread {thread_id}: reach end of file") - break - if self.custom_map_finalize is None: - self.finalize_metric_results(self.metric_types, self.metric_dtypes, metric_results) - else: - self.custom_map_finalize(self.metric_types, self.metric_dtypes, metric_results) - logger.info(f"worker {self.worker_id} thread {thread_id}: finished") - - def run_map(self): - self.worker_splits, self.thread_splits = split_dataset(self.dataset, self.num_workers, self.worker_id, - self.num_threads) - if len(self.specific_threads) > 0: - threads_to_run = self.specific_threads - else: - threads_to_run = list(range(self.num_threads)) - if self.num_threads > 1: - p = [] - for thread in threads_to_run: - p.append(Process(target=self.run_map_helper, args=(thread, ))) - p[thread].start() - - for thread in threads_to_run: - p[thread].join() - else: - assert self.num_threads == 1 - self.run_map_helper(0) - - def get_metric_value_percentiles(self, metric_name, num_sample_per_value, total_num_samples): - logger.info(f"Checking the value percentiles of metric 
{metric_name}...") - processed_samples = 0 - current_percentile = 5 - for key in sorted(num_sample_per_value.keys()): - processed_samples += num_sample_per_value[key] - if processed_samples >= total_num_samples * current_percentile / 100.0: - logger.info(f"Metric {metric_name} {current_percentile}th percentile: {key}") - current_percentile += 5 - - def merge_gather_map_stats(self, num_workers, num_threads, num_threads_reduce, t_idx_reduce, metric_save_path, - metric_name, return_dict): - results = [] - for w_idx in range(num_workers): - for t_idx in range(num_threads): - if (w_idx * num_threads + t_idx) % num_threads_reduce == t_idx_reduce: - w_metric_save_path = f"{metric_save_path}/worker{w_idx}_thread{t_idx}/" - w_sample_to_metric_fname = f"{w_metric_save_path}/{metric_name}_sample_to_metric" - w_sample_to_metric = MMapIndexedDataset(w_sample_to_metric_fname, skip_warmup=True) - unique_v = list(np.unique(w_sample_to_metric)) - sample_to_metric_count = len(w_sample_to_metric) - logger.info(f"Finished gathering map stats from worker {w_idx} thread {t_idx}.") - results.append([unique_v, sample_to_metric_count]) - return_dict[t_idx_reduce] = results - - def merge_sample_to_metric(self, t_idx_reduce, metric_save_path, metric_name, metric_value_dtype, - map_worker_thread): - sample_to_metric_fname = f"{metric_save_path}/{metric_name}_sample_to_metric_thread{t_idx_reduce}" - sample_to_metric_builder = create_mmap_dataset_builder(sample_to_metric_fname, metric_value_dtype) - for w_t in map_worker_thread: - w_metric_save_path = f"{metric_save_path}/worker{w_t[0]}_thread{w_t[1]}/" - w_sample_to_metric_fname = f"{w_metric_save_path}/{metric_name}_sample_to_metric" - w_data = MMapIndexedDataset(w_sample_to_metric_fname, skip_warmup=True) - for row in range(len(w_data)): - sample_to_metric_builder.add_item(torch.tensor(w_data[row].astype(np.int64), dtype=torch.long)) - logger.info(f"Finished merge_sample_to_metric from worker {w_t[0]} thread {w_t[1]}.") - close_mmap_dataset_builder(sample_to_metric_builder, sample_to_metric_fname) - - def merge_metric_to_sample(self, t_idx_reduce, metric_save_path, metric_name, sample_idx_dtype, metric_value_dtype, - unique_metric_values, num_workers, num_threads): - index_to_sample_fname = f"{metric_save_path}/{metric_name}_index_to_sample_thread{t_idx_reduce}" - index_to_sample_builder = create_mmap_dataset_builder(index_to_sample_fname, sample_idx_dtype) - index_to_metric_fname = f"{metric_save_path}/{metric_name}_index_to_metric_thread{t_idx_reduce}" - index_to_metric_builder = create_mmap_dataset_builder(index_to_metric_fname, metric_value_dtype) - for unique_v in unique_metric_values: - samples = [] - for w_idx in range(num_workers): - for t_idx in range(num_threads): - w_metric_save_path = f"{metric_save_path}/worker{w_idx}_thread{t_idx}/" - w_metric_to_sample_fname = f"{w_metric_save_path}/{metric_name}_metric_to_sample_{unique_v}.csv" - if os.path.isfile(w_metric_to_sample_fname): - with open(w_metric_to_sample_fname, 'r') as f: - datareader = csv.reader(f) - for row in datareader: - samples += [int(x) for x in row] - index_to_sample_builder.add_item(torch.tensor(samples, dtype=torch.long)) - index_to_metric_builder.add_item(torch.tensor([unique_v], dtype=torch.long)) - logger.info(f"Finished reducing metric {metric_name} value {unique_v}.") - close_mmap_dataset_builder(index_to_sample_builder, index_to_sample_fname) - close_mmap_dataset_builder(index_to_metric_builder, index_to_metric_fname) - - def merge_map_results(self, dataset, metric_names, 
metric_types, save_path, num_workers, num_threads, - num_threads_reduce): - total_num_samples = len(dataset) - sample_idx_dtype = find_fit_int_dtype(0, total_num_samples - 1) - logger.info( - f"Total number of data samples: {total_num_samples}. Will use {sample_idx_dtype} to store the sample indexes." - ) - for m_idx in range(len(metric_names)): - metric_name, metric_type = metric_names[m_idx], metric_types[m_idx] - if metric_type == 'single_value_per_sample': - metric_save_path = f"{save_path}/{metric_name}/" - sample_to_metric_count = 0 - unique_metric_values = set([]) - manager = Manager() - return_dict = manager.dict() - p = [] - for t_idx_reduce in range(num_threads_reduce): - p.append( - Process(target=self.merge_gather_map_stats, - args=( - num_workers, - num_threads, - num_threads_reduce, - t_idx_reduce, - metric_save_path, - metric_name, - return_dict, - ))) - p[t_idx_reduce].start() - for t_idx_reduce in range(num_threads_reduce): - p[t_idx_reduce].join() - for t_idx_reduce in range(num_threads_reduce): - results = return_dict[t_idx_reduce] - for res in results: - unique_metric_values = unique_metric_values.union(set(res[0])) - sample_to_metric_count += res[1] - value_max = max(unique_metric_values) - value_min = min(unique_metric_values) - assert sample_to_metric_count == total_num_samples, "The number of samples in map result files are not correct. It's possible that some map worker didn't finish successfully." - metric_value_dtype = find_fit_int_dtype(value_min, value_max) - logger.info( - f"Metric {metric_name} has values between {value_min} and {value_max}. Will use {metric_value_dtype} to store the metric values." - ) - - # sample_to_metric - map_worker_thread = [] - for w_idx in range(num_workers): - for t_idx in range(num_threads): - map_worker_thread.append([w_idx, t_idx]) - thread_splits = split_index(0, len(map_worker_thread), num_threads_reduce) - p = [] - for t_idx_reduce in range(num_threads_reduce): - start_idx, end_idx = thread_splits[t_idx_reduce][0], thread_splits[t_idx_reduce][1] - p.append( - Process(target=self.merge_sample_to_metric, - args=( - t_idx_reduce, - metric_save_path, - metric_name, - metric_value_dtype, - map_worker_thread[start_idx:end_idx], - ))) - p[t_idx_reduce].start() - for t_idx_reduce in range(num_threads_reduce): - p[t_idx_reduce].join() - - sample_to_metric_fname = f"{metric_save_path}/{metric_name}_sample_to_metric" - sample_to_metric_builder = create_mmap_dataset_builder(sample_to_metric_fname, metric_value_dtype) - for t_idx_reduce in range(num_threads_reduce): - chunk_fname = f"{metric_save_path}/{metric_name}_sample_to_metric_thread{t_idx_reduce}" - logger.info(f"Merging file {chunk_fname}") - sample_to_metric_builder.merge_file_(chunk_fname) - close_mmap_dataset_builder(sample_to_metric_builder, sample_to_metric_fname) - sample_to_metric = MMapIndexedDataset(sample_to_metric_fname, skip_warmup=True) - assert len(sample_to_metric) == total_num_samples - - # metric_to_sample - unique_metric_values = list(sorted(unique_metric_values)) - thread_splits = split_index(0, len(unique_metric_values), num_threads_reduce) - p = [] - for t_idx_reduce in range(num_threads_reduce): - start_idx, end_idx = thread_splits[t_idx_reduce][0], thread_splits[t_idx_reduce][1] - p.append( - Process(target=self.merge_metric_to_sample, - args=( - t_idx_reduce, - metric_save_path, - metric_name, - sample_idx_dtype, - metric_value_dtype, - unique_metric_values[start_idx:end_idx], - num_workers, - num_threads, - ))) - p[t_idx_reduce].start() - for t_idx_reduce 
in range(num_threads_reduce): - p[t_idx_reduce].join() - index_to_sample_fname = f"{metric_save_path}/{metric_name}_index_to_sample" - index_to_sample_builder = create_mmap_dataset_builder(index_to_sample_fname, sample_idx_dtype) - index_to_metric_fname = f"{metric_save_path}/{metric_name}_index_to_metric" - index_to_metric_builder = create_mmap_dataset_builder(index_to_metric_fname, metric_value_dtype) - for t_idx_reduce in range(num_threads_reduce): - chunk_is_fname = f"{metric_save_path}/{metric_name}_index_to_sample_thread{t_idx_reduce}" - logger.info(f"Merging file {chunk_is_fname}") - index_to_sample_builder.merge_file_(chunk_is_fname) - chunk_im_fname = f"{metric_save_path}/{metric_name}_index_to_metric_thread{t_idx_reduce}" - logger.info(f"Merging file {chunk_im_fname}") - index_to_metric_builder.merge_file_(chunk_im_fname) - close_mmap_dataset_builder(index_to_sample_builder, index_to_sample_fname) - close_mmap_dataset_builder(index_to_metric_builder, index_to_metric_fname) - num_sample_per_value = {} - index_to_sample = MMapIndexedDataset(index_to_sample_fname, skip_warmup=True) - index_to_metric = MMapIndexedDataset(index_to_metric_fname, skip_warmup=True) - index_to_sample_merged_fname = f"{metric_save_path}/{metric_name}_index_to_sample_percentile_merged" - index_to_sample_merged_builder = create_mmap_dataset_builder(index_to_sample_merged_fname, - sample_idx_dtype) - for v_idx in range(len(index_to_sample)): - if v_idx > 0: - assert index_to_metric[v_idx] > index_to_metric[v_idx - 1] - num_sample_per_value[index_to_metric[v_idx][0]] = len(index_to_sample[v_idx]) - assert sum(num_sample_per_value.values()) == total_num_samples - merge_step = max(1, len(index_to_sample) // 100) - for v_idx in range(0, len(index_to_sample), merge_step): - merged_samples = np.copy( - np.concatenate(index_to_sample[v_idx:min(len(index_to_sample), (v_idx + merge_step))], - axis=None)) - index_to_sample_merged_builder.add_item( - torch.tensor(merged_samples.astype(np.int64), dtype=torch.long)) - logger.info(f"Finished merging index_to_sample {v_idx} to {v_idx+merge_step}.") - close_mmap_dataset_builder(index_to_sample_merged_builder, index_to_sample_merged_fname) - self.get_metric_value_percentiles(metric_name, num_sample_per_value, total_num_samples) - elif metric_type == 'accumulate_value_over_samples': - metric_save_path = f"{save_path}/{metric_name}/" - metric_value = None - for w_idx in range(num_workers): - for t_idx in range(num_threads): - w_metric_save_path = f"{metric_save_path}/worker{w_idx}_thread{t_idx}/" - w_metric_value_fname = f"{w_metric_save_path}/{metric_name}_metric_value" - w_metric_value = MMapIndexedDataset(w_metric_value_fname, skip_warmup=True) - if metric_value is None: - metric_value = np.copy(w_metric_value[0]) - else: - metric_value += np.copy(w_metric_value[0]) - value_max = int(max(metric_value)) - value_min = int(min(metric_value)) - metric_value_dtype = find_fit_int_dtype(value_min, value_max) - metric_value_fname = f"{metric_save_path}/{metric_name}_metric_value" - metric_value_builder = create_mmap_dataset_builder(metric_value_fname, metric_value_dtype) - metric_value_builder.add_item(torch.tensor(metric_value.astype(np.int64), dtype=torch.long)) - close_mmap_dataset_builder(metric_value_builder, metric_value_fname) - - def run_reduce(self): - if self.custom_reduce is None: - self.merge_map_results(self.dataset, self.metric_names, self.metric_types, self.save_path, - self.num_workers, self.num_threads, self.num_threads_reduce) - else: - 
self.custom_reduce(self.dataset, self.metric_names, self.metric_types, self.save_path, self.num_workers, - self.num_threads, self.num_threads_reduce) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py b/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py deleted file mode 100644 index 1c56f5f503c5..000000000000 --- a/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py +++ /dev/null @@ -1,627 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team -""" -Part of this code was adopted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/data/indexed_dataset.py -""" - -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -# copied from fairseq/fairseq/data/indexed_dataset.py -# Removed IndexedRawTextDataset since it relied on Fairseq dictionary -# other slight modifications to remove fairseq dependencies -# Added document index to index file and made it accessible. -# An empty sentence no longer separates documents. - -# Some of the fixes/improvements are adopted from -# https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/main/megatron/data/indexed_dataset.py - -from functools import lru_cache -import os -import shutil -import struct -from itertools import accumulate - -import numpy as np -import torch - - -def __best_fitting_dtype(vocab_size=None): - if vocab_size is not None and vocab_size < 65500: - return np.uint16 - else: - return np.int32 - - -def get_available_dataset_impl(): - return ['lazy', 'cached', 'mmap'] - - -def infer_dataset_impl(path): - if IndexedDataset.exists(path): - with open(index_file_path(path), 'rb') as f: - magic = f.read(8) - if magic == IndexedDataset._HDR_MAGIC: - return 'cached' - elif magic == MMapIndexedDataset.Index._HDR_MAGIC[:8]: - return 'mmap' - else: - return None - else: - print(f"Dataset does not exist: {path}") - print("Path should be a basename that both .idx and .bin can be appended to get full filenames.") - return None - - -def make_builder(out_file, impl, vocab_size=None): - if impl == 'mmap': - return MMapIndexedDatasetBuilder(out_file, dtype=__best_fitting_dtype(vocab_size)) - else: - return IndexedDatasetBuilder(out_file) - - -def make_dataset(path, impl, skip_warmup=False): - if not IndexedDataset.exists(path): - print(f"Dataset does not exist: {path}") - print("Path should be a basename that both .idx and .bin can be appended to get full filenames.") - return None - if impl == 'infer': - impl = infer_dataset_impl(path) - if impl == 'lazy' and IndexedDataset.exists(path): - return IndexedDataset(path) - elif impl == 'cached' and IndexedDataset.exists(path): - return IndexedCachedDataset(path) - elif impl == 'mmap' and MMapIndexedDataset.exists(path): - return MMapIndexedDataset(path, skip_warmup) - print(f"Unknown dataset implementation: {impl}") - return None - - -def dataset_exists(path, impl): - if impl == 'mmap': - return MMapIndexedDataset.exists(path) - else: - return IndexedDataset.exists(path) - - -def read_longs(f, n): - a = np.empty(n, dtype=np.int64) - f.readinto(a) - return a - - -def write_longs(f, a): - f.write(np.array(a, dtype=np.int64)) - - -# valid metric_dtypes as numpy and torch types -dtypes = { - 1: (np.uint8, torch.uint8), - 2: (np.int8, torch.int8), - 3: (np.int16, torch.int16), - 4: (np.int32, torch.int32), - 5: (np.int64, torch.int64), - 6: (np.uint16, None), - 7: 
(np.uint32, None), - 8: (np.uint64, None), -} - -valid_dtypes = set([dt[0] for dt in dtypes.values()] + [dt[1] for dt in dtypes.values() if dt[1] is not None]) - - -def code(dtype): - for c, (np_dt, torch_dt) in dtypes.items(): - if dtype in [np_dt, torch_dt]: - return c - raise ValueError(f"{dtype} not supported. Supported types: {valid_dtypes}") - - -def index_file_path(prefix_path): - return prefix_path + '.idx' - - -def data_file_path(prefix_path): - return prefix_path + '.bin' - - -def create_doc_idx(sizes): - doc_idx = [0] - for i, s in enumerate(sizes): - if s == 0: - doc_idx.append(i + 1) - return doc_idx - - -class IndexedDataset(torch.utils.data.Dataset): - """Loader for IndexedDataset""" - _HDR_MAGIC = b'TNTIDX\x00\x00' - - def __init__(self, path): - super().__init__() - self.path = path - self.data_file = None - self.read_index(path) - - def read_index(self, path): - with open(index_file_path(path), 'rb') as f: - magic = f.read(8) - assert magic == self._HDR_MAGIC, ('Index file doesn\'t match expected format. ' - 'Make sure that --dataset-impl is configured properly.') - version = f.read(8) - assert struct.unpack('= self._len: - raise IndexError('index out of range') - - def __del__(self): - if self.data_file: - self.data_file.close() - - # @lru_cache(maxsize=8) - def __getitem__(self, idx): - if not self.data_file: - self.read_data(self.path) - if isinstance(idx, int): - i = idx - self.check_index(i) - tensor_size = self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]] - a = np.empty(tensor_size, dtype=self.dtype) - self.data_file.seek(self.data_offsets[i] * self.element_size) - self.data_file.readinto(a) - return a - elif isinstance(idx, slice): - start, stop, step = idx.indices(len(self)) - if step != 1: - raise ValueError("Slices into indexed_dataset must be contiguous") - sizes = self.sizes[self.dim_offsets[start]:self.dim_offsets[stop]] - size = sum(sizes) - a = np.empty(size, dtype=self.dtype) - self.data_file.seek(self.data_offsets[start] * self.element_size) - self.data_file.readinto(a) - offsets = list(accumulate(sizes)) - sents = np.split(a, offsets[:-1]) - return sents - - def __len__(self): - return self._len - - def num_tokens(self, index): - return self.sizes[index] - - def size(self, index): - return self.sizes[index] - - @staticmethod - def exists(path): - return (os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path))) - - @property - def supports_prefetch(self): - return False # avoid prefetching to save memory - - -class IndexedCachedDataset(IndexedDataset): - - def __init__(self, path): - super().__init__(path) - self.cache = None - self.cache_index = {} - - @property - def supports_prefetch(self): - return True - - def prefetch(self, indices): - if all(i in self.cache_index for i in indices): - return - if not self.data_file: - self.read_data(self.path) - indices = sorted(set(indices)) - total_size = 0 - for i in indices: - total_size += self.data_offsets[i + 1] - self.data_offsets[i] - self.cache = np.empty(total_size, dtype=self.dtype) - ptx = 0 - self.cache_index.clear() - for i in indices: - self.cache_index[i] = ptx - size = self.data_offsets[i + 1] - self.data_offsets[i] - a = self.cache[ptx:ptx + size] - self.data_file.seek(self.data_offsets[i] * self.element_size) - self.data_file.readinto(a) - ptx += size - if self.data_file: - # close and delete data file after prefetch so we can pickle - self.data_file.close() - self.data_file = None - - # @lru_cache(maxsize=8) - def __getitem__(self, idx): - if isinstance(idx, int): - 
i = idx - self.check_index(i) - tensor_size = self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]] - a = np.empty(tensor_size, dtype=self.dtype) - ptx = self.cache_index[i] - np.copyto(a, self.cache[ptx:ptx + a.size]) - return a - elif isinstance(idx, slice): - # Hack just to make this work, can optimizer later if necessary - sents = [] - for i in range(*idx.indices(len(self))): - sents.append(self[i]) - return sents - - -class IndexedDatasetBuilder(object): - - def __init__(self, out_file, dtype=np.int32): - self.out_file = open(out_file, 'wb') - self.dtype = dtype - self.data_offsets = [0] - self.dim_offsets = [0] - self.sizes = [] - self.element_size = self.dtype().itemsize - self.doc_idx = [0] - - def add_item(self, tensor): - bytes = self.out_file.write(np.array(tensor.numpy(), dtype=self.dtype)) - self.data_offsets.append(self.data_offsets[-1] + bytes / self.element_size) - for s in tensor.size(): - self.sizes.append(s) - self.dim_offsets.append(self.dim_offsets[-1] + len(tensor.size())) - - def end_document(self): - self.doc_idx.append(len(self.sizes)) - - def merge_file_(self, another_file): - index = IndexedDataset(another_file) - assert index.dtype == self.dtype - - doc_offset = len(self.sizes) - - begin = self.data_offsets[-1] - for data_offset in index.data_offsets[1:]: - self.data_offsets.append(begin + data_offset) - self.sizes.extend(index.sizes) - begin = self.dim_offsets[-1] - for dim_offset in index.dim_offsets[1:]: - self.dim_offsets.append(begin + dim_offset) - self.doc_idx.extend((doc_offset + index.doc_idx)[1:]) - - with open(data_file_path(another_file), 'rb') as f: - while True: - data = f.read(1024) - if data: - self.out_file.write(data) - else: - break - - def finalize(self, index_file): - self.out_file.close() - index = open(index_file, 'wb') - index.write(b'TNTIDX\x00\x00') - index.write(struct.pack(' [0, 10, 30, 35] - if arr.size > 1: - arr[1:] = arr[:-1] - if arr.size > 0: - arr[0] = 0 - - -def get_pointers_with_total(sizes, elemsize, dtype): - """Return a numpy array of type np.dtype giving the byte offsets. - - Multiplies values in the sizes array by elemsize (bytes), - and then computes an exclusive scan to get byte offsets. - Returns the total number of bytes as second item in a tuple. 
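# Editor's aside (illustrative only, not part of this patch): for example, with
# sizes=[3, 5, 2] and elemsize=4, the per-item byte sizes are [12, 20, 8], their
# inclusive cumsum is [12, 32, 40], so the exclusive-scan byte offsets returned are
# [0, 12, 32] together with the total of 40 bytes.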
- """ - - # scale values in sizes array by elemsize to get sizes in bytes - pointers = np.array(sizes, dtype=dtype) - pointers *= elemsize - np.cumsum(pointers, axis=0, out=pointers) - - # get total number of bytes from all sizes (last element) - bytes_last = pointers[-1] if len(sizes) > 0 else 0 - - # convert to byte offsets - exscan_from_cumsum_(pointers) - - return pointers, bytes_last - - -class MMapIndexedDataset(torch.utils.data.Dataset): - - class Index(object): - _HDR_MAGIC = b'MMIDIDX\x00\x00' - - @classmethod - def writer(cls, path, dtype): - - class _Writer(object): - - def __enter__(self): - self._file = open(path, 'wb') - - self._file.write(cls._HDR_MAGIC) - self._file.write(struct.pack(' Date: Fri, 8 Mar 2024 09:26:22 +0000 Subject: [PATCH 40/64] added files removed accidentaly --- .../data_sampling/data_analyzer.py | 828 ++++++++++++++++++ .../data_sampling/indexed_dataset.py | 627 +++++++++++++ 2 files changed, 1455 insertions(+) create mode 100644 deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py create mode 100644 deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py new file mode 100644 index 000000000000..7088df223bd8 --- /dev/null +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -0,0 +1,828 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import os +from collections import defaultdict +import csv +import time +from multiprocessing import Process, Manager +import numpy as np +import torch +from torch.utils.data import BatchSampler, SequentialSampler, DataLoader, Subset + +from deepspeed.utils import logger +import deepspeed.comm as dist +from deepspeed.runtime.data_pipeline.data_sampling.indexed_dataset import MMapIndexedDataset, valid_dtypes +from deepspeed.runtime.data_pipeline.data_sampling.utils import split_dataset, split_index, create_mmap_dataset_builder, close_mmap_dataset_builder, find_fit_int_dtype + + +class DataAnalyzer(object): + + def __init__(self, + dataset, + num_workers=1, + worker_id=0, + num_threads=1, + num_threads_reduce=1, + specific_threads=[], + batch_size=1, + metric_names=[], + metric_functions=[], + metric_types=[], + metric_dtypes=[], + save_path="./", + collate_fn=None, + custom_map_init=None, + custom_map_update=None, + custom_map_finalize=None, + custom_reduce=None, + sample_indices=None): + super().__init__() + self.dataset = dataset + self.num_workers = num_workers + self.worker_id = worker_id + self.num_threads = num_threads + self.num_threads_reduce = num_threads_reduce + self.specific_threads = specific_threads + self.batch_size = batch_size + self.metric_names = metric_names + self.metric_functions = metric_functions + self.metric_types = metric_types + self.metric_dtypes = metric_dtypes + self.save_path = save_path + self.collate_fn = collate_fn + self.custom_map_init = custom_map_init + self.custom_map_update = custom_map_update + self.custom_map_finalize = custom_map_finalize + self.custom_reduce = custom_reduce + self.sample_indices = sample_indices + + def init_metric_results(self, thread_id, metric_names, metric_types, metric_dtypes, save_path, worker_id): + metric_results = [] + for m_idx in range(len(metric_names)): + metric_name, metric_type, metric_dtype = metric_names[m_idx], \ + metric_types[m_idx], metric_dtypes[m_idx] + assert metric_dtype in valid_dtypes, f"metric_dtype 
{metric_dtype} not supported. Supported dtypes {valid_dtypes}" + metric_save_path = f"{save_path}/{metric_name}/worker{worker_id}_thread{thread_id}/" + os.makedirs(metric_save_path, exist_ok=True) + if metric_type == 'single_value_per_sample': + sample_to_metric_fname = f"{metric_save_path}/{metric_name}_sample_to_metric" + sample_to_metric_builder = create_mmap_dataset_builder(sample_to_metric_fname, metric_dtype) + metric_to_sample_fname = f"{metric_save_path}/{metric_name}_metric_to_sample" + os.system(f"rm -rf {metric_to_sample_fname}*") + metric_to_sample_dict = defaultdict(list) + metric_results.append({ + "sample_to_metric_fname": sample_to_metric_fname, + "sample_to_metric_builder": sample_to_metric_builder, + "metric_to_sample_fname": metric_to_sample_fname, + "metric_to_sample_dict": metric_to_sample_dict + }) + elif metric_type == 'accumulate_value_over_samples': + metric_value = None + metric_value_fname = f"{metric_save_path}/{metric_name}_metric_value" + metric_results.append({"metric_value": metric_value, "metric_value_fname": metric_value_fname}) + return metric_results + + def update_metric_results(self, + data, + metric_types, + metric_dtypes, + metric_functions, + metric_results, + batch_start_idx=0): + for m_idx in range(len(metric_types)): + metric_type, metric_dtype, metric_function, metric_result = metric_types[m_idx], \ + metric_dtypes[m_idx], metric_functions[m_idx], metric_results[m_idx] + metric_values = metric_function(data) + + assert torch.is_tensor(metric_values) or isinstance(metric_values, np.ndarray), \ + "metric_function must return a tensor or array" + assert metric_values.dtype == metric_dtype, \ + f"metric_function result dtype {metric_values.dtype} does not match metric_dtype {metric_dtype}" + if isinstance(metric_values, np.ndarray): + metric_values = torch.from_numpy(metric_values) + + if metric_type == 'single_value_per_sample': + for row in range(metric_values.size()[0]): + sample_idx = batch_start_idx + row # sample idx following dataset iteration order + if isinstance(data, dict) and 'index' in data: # Megatron use case, idx provided in 'index' field + sample_idx = data['index'][row][0].item() + elif self.sample_indices is not None: # user defined shuffling of indices + sample_idx = self.sample_indices[sample_idx] + metric_result["sample_to_metric_builder"].add_item(metric_values[row].reshape(-1)) + metric_result["metric_to_sample_dict"][metric_values[row].item()].append(sample_idx) + for m_value in metric_result["metric_to_sample_dict"]: + if len(metric_result["metric_to_sample_dict"][m_value]) > 100: + metric_fname = metric_result["metric_to_sample_fname"] + with open(f"{metric_fname}_{m_value}.csv", 'a') as f: + writer = csv.writer(f) + writer.writerows([metric_result["metric_to_sample_dict"][m_value]]) + metric_result["metric_to_sample_dict"][m_value] = [] + elif metric_type == 'accumulate_value_over_samples': + if metric_result["metric_value"] is None: + metric_result["metric_value"] = metric_values + else: + metric_result["metric_value"].add_(metric_values) + + def finalize_metric_results(self, metric_types, metric_dtypes, metric_results): + for m_idx in range(len(metric_types)): + metric_type, metric_dtype, metric_result = metric_types[m_idx], \ + metric_dtypes[m_idx], metric_results[m_idx] + if metric_type == 'single_value_per_sample': + metric_fname = metric_result["sample_to_metric_fname"] + close_mmap_dataset_builder(metric_result["sample_to_metric_builder"], metric_fname) + for m_value in metric_result["metric_to_sample_dict"]: + if 
len(metric_result["metric_to_sample_dict"][m_value]) > 0: + metric_fname = metric_result["metric_to_sample_fname"] + with open(f"{metric_fname}_{m_value}.csv", 'a') as f: + writer = csv.writer(f) + writer.writerows([metric_result["metric_to_sample_dict"][m_value]]) + metric_result["metric_to_sample_dict"][m_value] = [] + elif metric_type == 'accumulate_value_over_samples': + if metric_result["metric_value"] is not None: + metric_value_builder = create_mmap_dataset_builder(metric_result["metric_value_fname"], + metric_dtype) + metric_value_builder.add_item(metric_result["metric_value"].reshape(-1)) + close_mmap_dataset_builder(metric_value_builder, metric_result["metric_value_fname"]) + + def run_map_helper(self, thread_id): + start_idx, end_idx = self.thread_splits[thread_id][0], \ + self.thread_splits[thread_id][1] + logger.info(f"worker {self.worker_id} thread {thread_id}: start working " \ + f"on data subset {start_idx} to {end_idx}") + thread_dataset = Subset(self.dataset, list(range(start_idx, end_idx))) + sampler = BatchSampler(SequentialSampler(thread_dataset), batch_size=self.batch_size, drop_last=False) + iterator = iter( + DataLoader(thread_dataset, + batch_sampler=sampler, + num_workers=0, + collate_fn=self.collate_fn, + pin_memory=False)) + if self.custom_map_init is None: + metric_results = self.init_metric_results(thread_id, self.metric_names, self.metric_types, + self.metric_dtypes, self.save_path, self.worker_id) + else: + metric_results = self.custom_map_init(thread_id, self.metric_names, self.metric_types, self.metric_dtypes, + self.save_path, self.worker_id) + total_sample = len(thread_dataset) + processed_sample = 0 + start = time.time() + while True: + try: + data = next(iterator) + batch_start_idx = start_idx + processed_sample + if self.custom_map_update is None: + self.update_metric_results(data, self.metric_types, self.metric_dtypes, self.metric_functions, + metric_results, batch_start_idx) + else: + self.custom_map_update(data, self.metric_types, self.metric_dtypes, self.metric_functions, + metric_results, batch_start_idx) + processed_sample += len(data) + duration = (time.time() - start) / 3600.0 + remain_duration = duration * total_sample / processed_sample - duration + logger.info( + f"worker {self.worker_id} thread {thread_id}: {processed_sample} " \ + f"out of {total_sample} processed in {duration:.2f} hr, " \ + f"estimated to finish in {remain_duration:.2f} hr") + except StopIteration: + logger.info(f"worker {self.worker_id} thread {thread_id}: reach end of file") + break + if self.custom_map_finalize is None: + self.finalize_metric_results(self.metric_types, self.metric_dtypes, metric_results) + else: + self.custom_map_finalize(self.metric_types, self.metric_dtypes, metric_results) + logger.info(f"worker {self.worker_id} thread {thread_id}: finished") + + def run_map(self): + self.worker_splits, self.thread_splits = split_dataset(self.dataset, self.num_workers, self.worker_id, + self.num_threads) + if len(self.specific_threads) > 0: + threads_to_run = self.specific_threads + else: + threads_to_run = list(range(self.num_threads)) + if self.num_threads > 1: + p = [] + for thread in threads_to_run: + p.append(Process(target=self.run_map_helper, args=(thread, ))) + p[thread].start() + + for thread in threads_to_run: + p[thread].join() + else: + assert self.num_threads == 1 + self.run_map_helper(0) + + def get_metric_value_percentiles(self, metric_name, num_sample_per_value, total_num_samples): + logger.info(f"Checking the value percentiles of metric 
{metric_name}...") + processed_samples = 0 + current_percentile = 5 + for key in sorted(num_sample_per_value.keys()): + processed_samples += num_sample_per_value[key] + if processed_samples >= total_num_samples * current_percentile / 100.0: + logger.info(f"Metric {metric_name} {current_percentile}th percentile: {key}") + current_percentile += 5 + + def merge_gather_map_stats(self, num_workers, num_threads, num_threads_reduce, t_idx_reduce, metric_save_path, + metric_name, return_dict): + results = [] + for w_idx in range(num_workers): + for t_idx in range(num_threads): + if (w_idx * num_threads + t_idx) % num_threads_reduce == t_idx_reduce: + w_metric_save_path = f"{metric_save_path}/worker{w_idx}_thread{t_idx}/" + w_sample_to_metric_fname = f"{w_metric_save_path}/{metric_name}_sample_to_metric" + w_sample_to_metric = MMapIndexedDataset(w_sample_to_metric_fname, skip_warmup=True) + unique_v = list(np.unique(w_sample_to_metric)) + sample_to_metric_count = len(w_sample_to_metric) + logger.info(f"Finished gathering map stats from worker {w_idx} thread {t_idx}.") + results.append([unique_v, sample_to_metric_count]) + return_dict[t_idx_reduce] = results + + def merge_sample_to_metric(self, t_idx_reduce, metric_save_path, metric_name, metric_value_dtype, + map_worker_thread): + sample_to_metric_fname = f"{metric_save_path}/{metric_name}_sample_to_metric_thread{t_idx_reduce}" + sample_to_metric_builder = create_mmap_dataset_builder(sample_to_metric_fname, metric_value_dtype) + for w_t in map_worker_thread: + w_metric_save_path = f"{metric_save_path}/worker{w_t[0]}_thread{w_t[1]}/" + w_sample_to_metric_fname = f"{w_metric_save_path}/{metric_name}_sample_to_metric" + w_data = MMapIndexedDataset(w_sample_to_metric_fname, skip_warmup=True) + for row in range(len(w_data)): + sample_to_metric_builder.add_item(torch.tensor(w_data[row].astype(np.int64), dtype=torch.long)) + logger.info(f"Finished merge_sample_to_metric from worker {w_t[0]} thread {w_t[1]}.") + close_mmap_dataset_builder(sample_to_metric_builder, sample_to_metric_fname) + + def merge_metric_to_sample(self, t_idx_reduce, metric_save_path, metric_name, sample_idx_dtype, metric_value_dtype, + unique_metric_values, num_workers, num_threads): + index_to_sample_fname = f"{metric_save_path}/{metric_name}_index_to_sample_thread{t_idx_reduce}" + index_to_sample_builder = create_mmap_dataset_builder(index_to_sample_fname, sample_idx_dtype) + index_to_metric_fname = f"{metric_save_path}/{metric_name}_index_to_metric_thread{t_idx_reduce}" + index_to_metric_builder = create_mmap_dataset_builder(index_to_metric_fname, metric_value_dtype) + for unique_v in unique_metric_values: + samples = [] + for w_idx in range(num_workers): + for t_idx in range(num_threads): + w_metric_save_path = f"{metric_save_path}/worker{w_idx}_thread{t_idx}/" + w_metric_to_sample_fname = f"{w_metric_save_path}/{metric_name}_metric_to_sample_{unique_v}.csv" + if os.path.isfile(w_metric_to_sample_fname): + with open(w_metric_to_sample_fname, 'r') as f: + datareader = csv.reader(f) + for row in datareader: + samples += [int(x) for x in row] + index_to_sample_builder.add_item(torch.tensor(samples, dtype=torch.long)) + index_to_metric_builder.add_item(torch.tensor([unique_v], dtype=torch.long)) + logger.info(f"Finished reducing metric {metric_name} value {unique_v}.") + close_mmap_dataset_builder(index_to_sample_builder, index_to_sample_fname) + close_mmap_dataset_builder(index_to_metric_builder, index_to_metric_fname) + + def merge_map_results(self, dataset, metric_names, 
metric_types, save_path, num_workers, num_threads, + num_threads_reduce): + total_num_samples = len(dataset) + sample_idx_dtype = find_fit_int_dtype(0, total_num_samples - 1) + logger.info( + f"Total number of data samples: {total_num_samples}. Will use {sample_idx_dtype} to store the sample indexes." + ) + for m_idx in range(len(metric_names)): + metric_name, metric_type = metric_names[m_idx], metric_types[m_idx] + if metric_type == 'single_value_per_sample': + metric_save_path = f"{save_path}/{metric_name}/" + sample_to_metric_count = 0 + unique_metric_values = set([]) + manager = Manager() + return_dict = manager.dict() + p = [] + for t_idx_reduce in range(num_threads_reduce): + p.append( + Process(target=self.merge_gather_map_stats, + args=( + num_workers, + num_threads, + num_threads_reduce, + t_idx_reduce, + metric_save_path, + metric_name, + return_dict, + ))) + p[t_idx_reduce].start() + for t_idx_reduce in range(num_threads_reduce): + p[t_idx_reduce].join() + for t_idx_reduce in range(num_threads_reduce): + results = return_dict[t_idx_reduce] + for res in results: + unique_metric_values = unique_metric_values.union(set(res[0])) + sample_to_metric_count += res[1] + value_max = max(unique_metric_values) + value_min = min(unique_metric_values) + assert sample_to_metric_count == total_num_samples, "The number of samples in map result files are not correct. It's possible that some map worker didn't finish successfully." + metric_value_dtype = find_fit_int_dtype(value_min, value_max) + logger.info( + f"Metric {metric_name} has values between {value_min} and {value_max}. Will use {metric_value_dtype} to store the metric values." + ) + + # sample_to_metric + map_worker_thread = [] + for w_idx in range(num_workers): + for t_idx in range(num_threads): + map_worker_thread.append([w_idx, t_idx]) + thread_splits = split_index(0, len(map_worker_thread), num_threads_reduce) + p = [] + for t_idx_reduce in range(num_threads_reduce): + start_idx, end_idx = thread_splits[t_idx_reduce][0], thread_splits[t_idx_reduce][1] + p.append( + Process(target=self.merge_sample_to_metric, + args=( + t_idx_reduce, + metric_save_path, + metric_name, + metric_value_dtype, + map_worker_thread[start_idx:end_idx], + ))) + p[t_idx_reduce].start() + for t_idx_reduce in range(num_threads_reduce): + p[t_idx_reduce].join() + + sample_to_metric_fname = f"{metric_save_path}/{metric_name}_sample_to_metric" + sample_to_metric_builder = create_mmap_dataset_builder(sample_to_metric_fname, metric_value_dtype) + for t_idx_reduce in range(num_threads_reduce): + chunk_fname = f"{metric_save_path}/{metric_name}_sample_to_metric_thread{t_idx_reduce}" + logger.info(f"Merging file {chunk_fname}") + sample_to_metric_builder.merge_file_(chunk_fname) + close_mmap_dataset_builder(sample_to_metric_builder, sample_to_metric_fname) + sample_to_metric = MMapIndexedDataset(sample_to_metric_fname, skip_warmup=True) + assert len(sample_to_metric) == total_num_samples + + # metric_to_sample + unique_metric_values = list(sorted(unique_metric_values)) + thread_splits = split_index(0, len(unique_metric_values), num_threads_reduce) + p = [] + for t_idx_reduce in range(num_threads_reduce): + start_idx, end_idx = thread_splits[t_idx_reduce][0], thread_splits[t_idx_reduce][1] + p.append( + Process(target=self.merge_metric_to_sample, + args=( + t_idx_reduce, + metric_save_path, + metric_name, + sample_idx_dtype, + metric_value_dtype, + unique_metric_values[start_idx:end_idx], + num_workers, + num_threads, + ))) + p[t_idx_reduce].start() + for t_idx_reduce 
in range(num_threads_reduce): + p[t_idx_reduce].join() + index_to_sample_fname = f"{metric_save_path}/{metric_name}_index_to_sample" + index_to_sample_builder = create_mmap_dataset_builder(index_to_sample_fname, sample_idx_dtype) + index_to_metric_fname = f"{metric_save_path}/{metric_name}_index_to_metric" + index_to_metric_builder = create_mmap_dataset_builder(index_to_metric_fname, metric_value_dtype) + for t_idx_reduce in range(num_threads_reduce): + chunk_is_fname = f"{metric_save_path}/{metric_name}_index_to_sample_thread{t_idx_reduce}" + logger.info(f"Merging file {chunk_is_fname}") + index_to_sample_builder.merge_file_(chunk_is_fname) + chunk_im_fname = f"{metric_save_path}/{metric_name}_index_to_metric_thread{t_idx_reduce}" + logger.info(f"Merging file {chunk_im_fname}") + index_to_metric_builder.merge_file_(chunk_im_fname) + close_mmap_dataset_builder(index_to_sample_builder, index_to_sample_fname) + close_mmap_dataset_builder(index_to_metric_builder, index_to_metric_fname) + + num_sample_per_value = DataAnalyzer.output_index_to_sample_percentile( + index_to_sample_fname, index_to_metric_fname, metric_name, metric_save_path, total_num_samples, + sample_idx_dtype) + self.get_metric_value_percentiles(metric_name, num_sample_per_value, total_num_samples) + elif metric_type == 'accumulate_value_over_samples': + metric_save_path = f"{save_path}/{metric_name}/" + metric_value = None + for w_idx in range(num_workers): + for t_idx in range(num_threads): + w_metric_save_path = f"{metric_save_path}/worker{w_idx}_thread{t_idx}/" + w_metric_value_fname = f"{w_metric_save_path}/{metric_name}_metric_value" + w_metric_value = MMapIndexedDataset(w_metric_value_fname, skip_warmup=True) + if metric_value is None: + metric_value = np.copy(w_metric_value[0]) + else: + metric_value += np.copy(w_metric_value[0]) + value_max = int(max(metric_value)) + value_min = int(min(metric_value)) + metric_value_dtype = find_fit_int_dtype(value_min, value_max) + metric_value_fname = f"{metric_save_path}/{metric_name}_metric_value" + metric_value_builder = create_mmap_dataset_builder(metric_value_fname, metric_value_dtype) + metric_value_builder.add_item(torch.tensor(metric_value.astype(np.int64), dtype=torch.long)) + close_mmap_dataset_builder(metric_value_builder, metric_value_fname) + + @staticmethod + def output_index_to_sample_percentile(index_to_sample_fname, index_to_metric_fname, metric_name, metric_save_path, + total_num_samples, sample_idx_dtype): + """ read index_to_metric and index_to_sample files and write distribution to index_to_sample_percentage_merged """ + num_sample_per_value = {} + index_to_sample = MMapIndexedDataset(index_to_sample_fname, skip_warmup=True) + index_to_metric = MMapIndexedDataset(index_to_metric_fname, skip_warmup=True) + index_to_sample_merged_fname = f"{metric_save_path}/{metric_name}_index_to_sample_percentile_merged" + index_to_sample_merged_builder = create_mmap_dataset_builder(index_to_sample_merged_fname, sample_idx_dtype) + for v_idx in range(len(index_to_sample)): + if v_idx > 0: + assert index_to_metric[v_idx] > index_to_metric[v_idx - 1] + num_sample_per_value[index_to_metric[v_idx][0]] = len(index_to_sample[v_idx]) + assert sum(list(num_sample_per_value.values())) == total_num_samples + merge_step = max(1, len(index_to_sample) // 100) + for v_idx in range(0, len(index_to_sample), merge_step): + merged_samples = np.copy( + np.concatenate(index_to_sample[v_idx:min(len(index_to_sample), (v_idx + merge_step))], axis=None)) + 
index_to_sample_merged_builder.add_item(torch.tensor(merged_samples.astype(np.int64), dtype=torch.long)) + logger.info(f"Finished merging index_to_sample {v_idx} to {v_idx+merge_step}.") + close_mmap_dataset_builder(index_to_sample_merged_builder, index_to_sample_merged_fname) + return num_sample_per_value + + def run_reduce(self): + if self.custom_reduce is None: + self.merge_map_results(self.dataset, self.metric_names, self.metric_types, self.save_path, + self.num_workers, self.num_threads, self.num_threads_reduce) + else: + self.custom_reduce(self.dataset, self.metric_names, self.metric_types, self.save_path, self.num_workers, + self.num_threads, self.num_threads_reduce) + + def run_map_reduce(self, comm_group=None): + self.run_map() + # wait for the mapping operation, where all nodes outputs their own (partial) result files + dist.barrier(group=comm_group) + if self.worker_id == 0: + self.run_reduce() + # wait for the reduce, where rank 0 merges all (partial) files. Dataset can then be used by all nodes. + dist.barrier(group=comm_group) + + +class DistributedDataAnalyzer(object): + + def __init__( + self, + dataset, + num_workers=1, + worker_id=0, + batch_size=1, + metric_names=[], + metric_functions=[], + metric_types=[], + save_path="./", + collate_fn=None, + device='cuda', + comm_group=None, + sample_indices=None, + ) -> None: + self.dataset = dataset + self.batch_size = batch_size + self.metric_names = metric_names + self.metric_functions = metric_functions + self.metric_types = metric_types + self.save_path = save_path + self.collate_fn = collate_fn + self.device = device + self.sample_indices = sample_indices + + if not dist.is_initialized(): + dist.init_distributed() + + # comm_group and worker_id+num_workers are mutually exclusive + self.comm_group = comm_group + if self.comm_group is None: + # self.comm_group = deepspeed.utils.groups._clone_world_group() + self.num_workers = num_workers + self.worker_id = worker_id + else: + self.num_workers = self.comm_group.size() + self.worker_id = self.comm_group.rank() + + if self.worker_id == 0: + logger.info(f"Distributed data analyzer initialized with {self.num_workers} workers.") + + def run_map_reduce(self): + + # setup individual dataloaders + worker_splits, _ = split_dataset(self.dataset, self.num_workers, self.worker_id, num_threads=1) + start_idx, end_idx = worker_splits[self.worker_id] + logger.info(f"worker {self.worker_id}: start working on data subset {start_idx} to {end_idx}") + worker_dataset = Subset(self.dataset, list(range(start_idx, end_idx))) + sampler = BatchSampler(SequentialSampler(worker_dataset), batch_size=self.batch_size, drop_last=False) + dataloader = DataLoader(dataset=worker_dataset, + batch_sampler=sampler, + num_workers=0, + collate_fn=self.collate_fn, + pin_memory=False) + + # set initial results list + metric_results = [] + for metric_type in self.metric_types: + assert metric_type in ['single_value_per_sample', 'accumulate_value_over_samples'], \ + f"metric_type {metric_type} not implemented." 
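+            # a 'single_value_per_sample' metric collects a list of (value, sample_id) pairs,
+            # one per sample; an 'accumulate_value_over_samples' metric keeps a single running
+            # tensor that is summed in place over batches and reduced across ranks afterwards.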
+ metric_results.append([] if metric_type == 'single_value_per_sample' else None) + + # iterate dataloader and store metric results + batch_start_idx = start_idx + for data in dataloader: + for m_idx in range(len(self.metric_names)): + metric_type, metric_function = self.metric_types[m_idx], self.metric_functions[m_idx] + metric_values = metric_function(data) + assert torch.is_tensor(metric_values) or isinstance(metric_values, np.ndarray), \ + "metric_function must return a tensor or array" + if isinstance(metric_values, np.ndarray): + metric_values = torch.from_numpy(metric_values) + assert metric_values.dtype in valid_dtypes, \ + f"metric_function result dtype {metric_values.dtype} not supported. Supported dtypes {valid_dtypes}" + + if metric_type == 'single_value_per_sample': + for row in range(metric_values.size()[0]): + value = metric_values[row].item() + sample_idx = batch_start_idx + row # sample idx following dataset iteration order + if isinstance(data, dict) and 'index' in data: # Megatron use case + sample_idx = data['index'][row][0].item() + elif self.sample_indices is not None: # user defined shuffling of indices + sample_idx = self.sample_indices[sample_idx] + metric_results[m_idx].append((value, sample_idx)) + elif metric_type == 'accumulate_value_over_samples': + if metric_results[m_idx] is None: + metric_results[m_idx] = metric_values + else: + metric_results[m_idx].add_(metric_values) + batch_start_idx += len(data) + + # compute dtype for sample ids + total_num_samples = len(self.dataset) + sample_idx_dtype = find_fit_int_dtype(0, total_num_samples - 1) + logger.info(f"Total number of data samples: {total_num_samples}.") + logger.info(f"Will use {sample_idx_dtype} to store the sample indexes.") + + # convert to list of tensors + metric_results = [torch.tensor(m).to(self.device) for m in metric_results] + + for m_idx in range(len(self.metric_names)): + metric_values, metric_name, metric_type = \ + metric_results[m_idx], self.metric_names[m_idx], self.metric_types[m_idx] + metric_save_path = f"{self.save_path}/{metric_name}/" + os.makedirs(metric_save_path, exist_ok=True) + + if metric_type == 'single_value_per_sample': + + # Compute sample and metric value dtypes based on range + values, samples = metric_values[:, 0], metric_values[:, 1] + value_min, value_max = Dist.min_max(values, self.comm_group) + sample_min, sample_max = Dist.min_max(samples, self.comm_group) + metric_value_dtype = find_fit_int_dtype(value_min, value_max) + sample_value_dtype = find_fit_int_dtype(sample_min, sample_max) + + # sample_to_metric maps sample ids to metric values, as a list of metric values + sample_to_metric_fname = f"{metric_save_path}/{metric_name}_sample_to_metric" + values = [torch.tensor([x]) for x in metric_values[:, 0]] + self.file_write_ordered(values, sample_to_metric_fname, metric_value_dtype) + + # distributed sorting by values, gives an ordered disjoint subset of keys on nodes + metric_values = Dist.sample_sort(metric_values, self.comm_group, self.num_workers) + metric_to_samples_dict = {} + if len(metric_values) > 0: + for value, sample in metric_values: + if value.item() not in metric_to_samples_dict: + metric_to_samples_dict[value.item()] = [] + metric_to_samples_dict[value.item()].append(sample.item()) + + # index_to_metric and index_to_sample serialize a dicitonary from metric to samples + # index_to_metric stores a key per row, index_to_sample stores the values per row + values = [torch.tensor([x]) for x in metric_to_samples_dict.keys()] + samples = 
[torch.tensor(metric_to_samples_dict[x]) for x in metric_to_samples_dict.keys()] + index_to_metric_fname = f"{metric_save_path}/{metric_name}_index_to_metric" #dict keys + index_to_sample_fname = f"{metric_save_path}/{metric_name}_index_to_sample" #dict values + self.file_write_ordered(values, index_to_metric_fname, metric_value_dtype) + self.file_write_ordered(samples, index_to_sample_fname, sample_value_dtype) + + if self.worker_id == 0: + DataAnalyzer.output_index_to_sample_percentile(index_to_sample_fname, index_to_metric_fname, + metric_name, metric_save_path, total_num_samples, + sample_idx_dtype) + dist.barrier(self.comm_group) + + elif metric_type == 'accumulate_value_over_samples': + metric_value_fname = f"{metric_save_path}/{metric_name}_metric_value" + dist.reduce(metric_values, dst=0, op=dist.ReduceOp.SUM, group=self.comm_group) + metric_value_dtype = find_fit_int_dtype(metric_values.min(), metric_values.max()) + + if self.worker_id == 0: + builder = create_mmap_dataset_builder(metric_value_fname, metric_value_dtype) + builder.add_item(metric_values.cpu()) + close_mmap_dataset_builder(builder, metric_value_fname) + dist.barrier(self.comm_group) + + def file_write_ordered(self, tensor_list, fname, numpy_dtype): + """ MPI_file_write_ordered extended to write a list of tensors, by one rank, iteratively """ + + # each not has a list of rows (tensors) to be written to the file. + # we will serialize it to communicate it in one comm step. + + tkwargs = dict(dtype=torch.int64, device=self.device) + + # 1. gather on rank 0 the number of rows to be sent/recv + row_count = torch.tensor([len(tensor_list)], **tkwargs) + row_counts = torch.zeros(self.num_workers, **tkwargs) + dist.all_gather_into_tensor(row_counts, row_count, group=self.comm_group) + assert row_counts[self.worker_id] == row_count == len(tensor_list), "all_gather failed" + + # 2. gather on rank 0 the sizes of the rows to be sent/recv + row_len = torch.tensor([len(l) for l in tensor_list], **tkwargs) + row_lens = Dist.gather_v(row_len, 0, self.comm_group, self.num_workers, self.worker_id) + + # 4. gather on rank 0 of the total size (sum of all row lengths) to be received + size = torch.tensor([sum(row_len).item()], **tkwargs) + sizes = torch.zeros(self.num_workers, **tkwargs) + dist.all_gather_into_tensor(sizes, size, group=self.comm_group) + assert sizes[self.worker_id] == size.item(), "all_gather did not return the same sizes" #sanity check + + # method to deserializes a buffer into rows of different lengths and write them to file + def write_buffer_to_file(buff, src, builder): + assert self.worker_id == 0, "only rank 0 can write to file" + + # # write one buffer at a time + # for row_len in row_lens[src]: + # builder.add_item(buff[:row_len].cpu()) + # buff = buff[row_len:] + + # collect all buffers and write them all at once + buffer_list = [] + for row_len in row_lens[src]: + buffer_list.append(buff[:row_len].cpu()) + buff = buff[row_len:] + builder.add_items(buffer_list) + + # 5. 
rank 0 prepares output folder and file + if self.worker_id == 0: + os.makedirs(os.path.dirname(fname), exist_ok=True) + builder = create_mmap_dataset_builder(fname, numpy_dtype) + + # iterate through ranks that have data to be sent/recv/written + for src in [rank for rank, count in enumerate(row_counts) if count > 0]: + + dist.barrier(group=self.comm_group) + if self.worker_id == 0 and src == 0: # rank 0's write its own data + buffer = torch.cat(tensor_list, dim=0).to(self.device) + write_buffer_to_file(buffer, 0, builder) + elif self.worker_id == 0 and src > 0: # rank 0 receives other rank's data and writes it + buffer = torch.empty(sizes[src].item(), dtype=buffer.dtype, device=buffer.device) + err = dist.recv(buffer, src=src, group=self.comm_group, tag=src) + assert err == src and len(buffer) > 0, "recv failed" + write_buffer_to_file(buffer, src, builder) + elif self.worker_id == src: # current rank sends data to rank 0 + buffer = torch.cat(tensor_list, dim=0).to(self.device) + dist.send(buffer, 0, group=self.comm_group, tag=src) + + # rank 0 closes the file + if self.worker_id == 0: + close_mmap_dataset_builder(builder, fname) # close file + dist.barrier(self.comm_group) + + +class Dist: + """ auxiliary class to perform distributed operations on tensors""" + + @staticmethod + def min_max(tensor, comm_group): + """ given a distributed tensor, return the min/max values across all ranks""" + + value_min, value_max = tensor.min(), tensor.max() + dist.reduce(value_min, 0, op=dist.ReduceOp.MIN, group=comm_group) + dist.reduce(value_max, 0, op=dist.ReduceOp.MAX, group=comm_group) + return value_min.item(), value_max.item() + + @staticmethod + def gather_v(tensor, dst, comm_group, num_workers, worker_id): + """ MPI_Gatherv. gather tensors of variable sizes in a single rank """ + + # gather the number of rows to be sent/recv + size = torch.tensor([len(tensor)], dtype=torch.int64, device=tensor.device) + sizes = torch.zeros(num_workers, dtype=torch.int64, device=tensor.device) + dist.all_gather_into_tensor(sizes, size, group=comm_group) + assert sizes[worker_id] == size, "all_gather failed" + + # all_gather requires all tensors to be of same size so we need to pad them + max_size = max(sizes).item() + buffer = torch.empty(max_size, dtype=tensor.dtype, device=tensor.device) + buffer[0:size] = torch.tensor(tensor, dtype=tensor.dtype, device=tensor.device) + buffer_list = None + if worker_id == 0: # create padded recv buffers + buffer_list = [torch.empty(max_size, dtype=tensor.dtype, device=tensor.device) for _ in range(num_workers)] + dist.gather(buffer, buffer_list, dst=dst, group=comm_group) + + # revert padding and return value + if worker_id == 0: + buffer_list = [r[:s.item()] for r, s in zip(buffer_list, sizes)] + return buffer_list + + @staticmethod + def sample_sort(tensor, comm_group, num_workers, n_samples=100): + """ perform a distributed random sort of a tensor, and returns the sorted partial tensor""" + device, dims = tensor.device, tensor.size()[1] + + # 1 - sort rows by first column, then second column, then third, etc... 
+ tensor = torch.tensor(sorted(tensor.tolist()), dtype=tensor.dtype, device=tensor.device) + + # 2 - collect few samples per rank + idx = torch.round(torch.linspace(0, len(tensor) - 1, n_samples)).to(int) + samples = tensor[idx][:, 0].contiguous().to(device) #only first column, all but last row + + # 2 - Allgather samples + all_samples = [torch.zeros(n_samples, dtype=samples.dtype, device=device) for _ in range(num_workers)] + dist.all_gather(all_samples, samples, group=comm_group) + all_samples = torch.cat(all_samples, dim=0).to(device) + + # 3 - Sort all samples and collect the ranges of each rank as equidistant + all_samples = all_samples.sort()[0] + idx = torch.round(torch.linspace(0, len(all_samples) - 1, num_workers + 1)).to(int) + ranges = all_samples[idx] # range of each rank r as ranges[r] <= x < ranges[r+1] + ranges[-1] += 1 # increase upper limit of last rank so that x < ranges[r+1]. + + # 4 - collect elements to send to each rank, based on the rank ranges + send = [] + for rank in range(num_workers): + mask = (tensor[:, 0] >= ranges[rank]) & (tensor[:, 0] < ranges[rank + 1]) + send.append(tensor[mask]) + + # 5. all to all to communicate the sizes to be sent/recv + send_count = [torch.tensor([len(s) * dims], dtype=torch.int64, device=device) for s in send] + recv_count = list(torch.empty([num_workers], dtype=torch.int64, device=device).chunk(num_workers)) + dist.all_to_all(recv_count, send_count, group=comm_group) + + # 6. all-to-all-v to communicate the elements to be sent/recv as a single tensor + send = torch.cat(send, dim=0).flatten().to(device) + recv = torch.zeros(sum(recv_count), dtype=send.dtype).to(device) + send_count = [s.item() for s in send_count] # convert to list of ints + recv_count = [r.item() for r in recv_count] + dist.all_to_all_single(recv, send, recv_count, send_count, group=comm_group) + del send + + # 7. the received tensor is the 1D disjoint subset of the distributed tensor. + # We will recover the original dimensionality and sort it by columns again. 
+ recv = recv.view(-1, dims) + recv = torch.tensor(sorted(recv.tolist()), dtype=recv.dtype, device=recv.device) + return recv + + +def test_compare_both_data_analyzers(dataset): + """ given a dataset, compare file and memory based data analyser""" + + id = lambda t: torch.tensor(t).to(torch.int64) # identity + batch_sum = lambda t: id(t).sum() #sum batch + kwargs = dict( + dataset=dataset, + batch_size=3, + worker_id=int(os.environ['RANK']), + num_workers=int(os.environ['WORLD_SIZE']), + metric_names=["mod", "batch_sum"], + metric_functions=[id, batch_sum], + metric_types=['single_value_per_sample', 'accumulate_value_over_samples'], + ) + + dda = DistributedDataAnalyzer( + save_path="./output_dist", + device=f"cuda:{int(os.environ['LOCAL_RANK'])}", + **kwargs, + ) + start_time = time.time() + dda.run_map_reduce() + if dda.worker_id == 0: + print("DistributedDataAnalyzer runtime: %s seconds " % (time.time() - start_time)) + + da = DataAnalyzer(num_threads=2, + num_threads_reduce=2, + metric_dtypes=[torch.int64, torch.int64], + save_path="./output_disk", + **kwargs) + start_time = time.time() + da.run_map_reduce() + if da.worker_id == 0: + print("DataAnalyzer runtime: %s seconds " % (time.time() - start_time)) + + output_paths = [ + "batch_sum/batch_sum_metric_value.bin", "batch_sum/batch_sum_metric_value.idx", \ + "mod/mod_index_to_metric.bin", "mod/mod_index_to_metric.idx", \ + "mod/mod_index_to_sample.bin", "mod/mod_index_to_sample.idx", \ + "mod/mod_index_to_sample_percentile_merged.bin", "mod/mod_index_to_sample_percentile_merged.idx", \ + "mod/mod_sample_to_metric.bin", "mod/mod_sample_to_metric.idx" + ] + + if dda.worker_id == 0: + for path in output_paths: + with open(os.path.join(da.save_path, path), 'rb') as f1, \ + open(os.path.join(dda.save_path, path), 'rb') as f2: + if f1.read() != f2.read(): + print(f"files {path} are not identical.") + + +if __name__ == "__main__": + + class TestDataset(torch.utils.data.Dataset): + + def __init__(self, size=20): + self.values = [1001 + x % 6 for x in range(size)] + self.size = size + + def __len__(self): + return self.size + + def __getitem__(self, idx): + return self.values[idx] + + test_compare_both_data_analyzers(TestDataset()) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py b/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py new file mode 100644 index 000000000000..453e6ba6039d --- /dev/null +++ b/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py @@ -0,0 +1,627 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Part of this code was adopted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/data/indexed_dataset.py +""" + +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +# copied from fairseq/fairseq/data/indexed_dataset.py +# Removed IndexedRawTextDataset since it relied on Fairseq dictionary +# other slight modifications to remove fairseq dependencies +# Added document index to index file and made it accessible. +# An empty sentence no longer separates documents. 
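+# Minimal usage sketch of the builder/reader pair defined below (the file prefix 'metric' is
+# illustrative only; the MMap variants follow the same add_item/finalize/read pattern):
+#   builder = IndexedDatasetBuilder(data_file_path('metric'), dtype=np.int32)   # writes 'metric.bin'
+#   builder.add_item(torch.tensor([3, 1, 4]))
+#   builder.finalize(index_file_path('metric'))                                 # writes 'metric.idx'
+#   dataset = IndexedDataset('metric')   # dataset[0] -> array([3, 1, 4], dtype=int32)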
+ +# Some of the fixes/improvements are adopted from +# https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/main/megatron/data/indexed_dataset.py + +from functools import lru_cache +import os +import shutil +import struct +from itertools import accumulate + +import numpy as np +import torch + + +def __best_fitting_dtype(vocab_size=None): + if vocab_size is not None and vocab_size < 65500: + return np.uint16 + else: + return np.int32 + + +def get_available_dataset_impl(): + return ['lazy', 'cached', 'mmap'] + + +def infer_dataset_impl(path): + if IndexedDataset.exists(path): + with open(index_file_path(path), 'rb') as f: + magic = f.read(8) + if magic == IndexedDataset._HDR_MAGIC: + return 'cached' + elif magic == MMapIndexedDataset.Index._HDR_MAGIC[:8]: + return 'mmap' + else: + return None + else: + print(f"Dataset does not exist: {path}") + print("Path should be a basename that both .idx and .bin can be appended to get full filenames.") + return None + + +def make_builder(out_file, impl, vocab_size=None): + if impl == 'mmap': + return MMapIndexedDatasetBuilder(out_file, dtype=__best_fitting_dtype(vocab_size)) + else: + return IndexedDatasetBuilder(out_file) + + +def make_dataset(path, impl, skip_warmup=False): + if not IndexedDataset.exists(path): + print(f"Dataset does not exist: {path}") + print("Path should be a basename that both .idx and .bin can be appended to get full filenames.") + return None + if impl == 'infer': + impl = infer_dataset_impl(path) + if impl == 'lazy' and IndexedDataset.exists(path): + return IndexedDataset(path) + elif impl == 'cached' and IndexedDataset.exists(path): + return IndexedCachedDataset(path) + elif impl == 'mmap' and MMapIndexedDataset.exists(path): + return MMapIndexedDataset(path, skip_warmup) + print(f"Unknown dataset implementation: {impl}") + return None + + +def dataset_exists(path, impl): + if impl == 'mmap': + return MMapIndexedDataset.exists(path) + else: + return IndexedDataset.exists(path) + + +def read_longs(f, n): + a = np.empty(n, dtype=np.int64) + f.readinto(a) + return a + + +def write_longs(f, a): + f.write(np.array(a, dtype=np.int64)) + + +# valid metric_dtypes as numpy and torch types +dtypes = { + 1: (np.uint8, torch.uint8), + 2: (np.int8, torch.int8), + 3: (np.int16, torch.int16), + 4: (np.int32, torch.int32), + 5: (np.int64, torch.int64), + 6: (np.uint16, None), + 7: (np.uint32, None), + 8: (np.uint64, None), +} + +valid_dtypes = set([dt[0] for dt in dtypes.values()] + [dt[1] for dt in dtypes.values() if dt[1] is not None]) + + +def code(dtype): + for c, (np_dt, torch_dt) in dtypes.items(): + if dtype in [np_dt, torch_dt]: + return c + raise ValueError(f"{dtype} not supported. Supported types: {valid_dtypes}") + + +def index_file_path(prefix_path): + return prefix_path + '.idx' + + +def data_file_path(prefix_path): + return prefix_path + '.bin' + + +def create_doc_idx(sizes): + doc_idx = [0] + for i, s in enumerate(sizes): + if s == 0: + doc_idx.append(i + 1) + return doc_idx + + +class IndexedDataset(torch.utils.data.Dataset): + """Loader for IndexedDataset""" + _HDR_MAGIC = b'TNTIDX\x00\x00' + + def __init__(self, path): + super().__init__() + self.path = path + self.data_file = None + self.read_index(path) + + def read_index(self, path): + with open(index_file_path(path), 'rb') as f: + magic = f.read(8) + assert magic == self._HDR_MAGIC, ('Index file doesn\'t match expected format. 
' + 'Make sure that --dataset-impl is configured properly.') + version = f.read(8) + assert struct.unpack('= self._len: + raise IndexError('index out of range') + + def __del__(self): + if self.data_file: + self.data_file.close() + + # @lru_cache(maxsize=8) + def __getitem__(self, idx): + if not self.data_file: + self.read_data(self.path) + if isinstance(idx, int): + i = idx + self.check_index(i) + tensor_size = self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]] + a = np.empty(tensor_size, dtype=self.dtype) + self.data_file.seek(self.data_offsets[i] * self.element_size) + self.data_file.readinto(a) + return a + elif isinstance(idx, slice): + start, stop, step = idx.indices(len(self)) + if step != 1: + raise ValueError("Slices into indexed_dataset must be contiguous") + sizes = self.sizes[self.dim_offsets[start]:self.dim_offsets[stop]] + size = sum(sizes) + a = np.empty(size, dtype=self.dtype) + self.data_file.seek(self.data_offsets[start] * self.element_size) + self.data_file.readinto(a) + offsets = list(accumulate(sizes)) + sents = np.split(a, offsets[:-1]) + return sents + + def __len__(self): + return self._len + + def num_tokens(self, index): + return self.sizes[index] + + def size(self, index): + return self.sizes[index] + + @staticmethod + def exists(path): + return (os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path))) + + @property + def supports_prefetch(self): + return False # avoid prefetching to save memory + + +class IndexedCachedDataset(IndexedDataset): + + def __init__(self, path): + super().__init__(path) + self.cache = None + self.cache_index = {} + + @property + def supports_prefetch(self): + return True + + def prefetch(self, indices): + if all(i in self.cache_index for i in indices): + return + if not self.data_file: + self.read_data(self.path) + indices = sorted(set(indices)) + total_size = 0 + for i in indices: + total_size += self.data_offsets[i + 1] - self.data_offsets[i] + self.cache = np.empty(total_size, dtype=self.dtype) + ptx = 0 + self.cache_index.clear() + for i in indices: + self.cache_index[i] = ptx + size = self.data_offsets[i + 1] - self.data_offsets[i] + a = self.cache[ptx:ptx + size] + self.data_file.seek(self.data_offsets[i] * self.element_size) + self.data_file.readinto(a) + ptx += size + if self.data_file: + # close and delete data file after prefetch so we can pickle + self.data_file.close() + self.data_file = None + + # @lru_cache(maxsize=8) + def __getitem__(self, idx): + if isinstance(idx, int): + i = idx + self.check_index(i) + tensor_size = self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]] + a = np.empty(tensor_size, dtype=self.dtype) + ptx = self.cache_index[i] + np.copyto(a, self.cache[ptx:ptx + a.size]) + return a + elif isinstance(idx, slice): + # Hack just to make this work, can optimizer later if necessary + sents = [] + for i in range(*idx.indices(len(self))): + sents.append(self[i]) + return sents + + +class IndexedDatasetBuilder(object): + + def __init__(self, out_file, dtype=np.int32): + self.out_file = open(out_file, 'wb') + self.dtype = dtype + self.data_offsets = [0] + self.dim_offsets = [0] + self.sizes = [] + self.element_size = self.dtype().itemsize + self.doc_idx = [0] + + def add_item(self, tensor): + bytes = self.out_file.write(np.array(tensor.numpy(), dtype=self.dtype)) + self.data_offsets.append(self.data_offsets[-1] + bytes / self.element_size) + for s in tensor.size(): + self.sizes.append(s) + self.dim_offsets.append(self.dim_offsets[-1] + len(tensor.size())) + + def 
end_document(self): + self.doc_idx.append(len(self.sizes)) + + def merge_file_(self, another_file): + index = IndexedDataset(another_file) + assert index.dtype == self.dtype + + doc_offset = len(self.sizes) + + begin = self.data_offsets[-1] + for data_offset in index.data_offsets[1:]: + self.data_offsets.append(begin + data_offset) + self.sizes.extend(index.sizes) + begin = self.dim_offsets[-1] + for dim_offset in index.dim_offsets[1:]: + self.dim_offsets.append(begin + dim_offset) + self.doc_idx.extend((doc_offset + index.doc_idx)[1:]) + + with open(data_file_path(another_file), 'rb') as f: + while True: + data = f.read(1024) + if data: + self.out_file.write(data) + else: + break + + def finalize(self, index_file): + self.out_file.close() + index = open(index_file, 'wb') + index.write(b'TNTIDX\x00\x00') + index.write(struct.pack(' [0, 10, 30, 35] + if arr.size > 1: + arr[1:] = arr[:-1] + if arr.size > 0: + arr[0] = 0 + + +def get_pointers_with_total(sizes, elemsize, dtype): + """Return a numpy array of type np.dtype giving the byte offsets. + + Multiplies values in the sizes array by elemsize (bytes), + and then computes an exclusive scan to get byte offsets. + Returns the total number of bytes as second item in a tuple. + """ + + # scale values in sizes array by elemsize to get sizes in bytes + pointers = np.array(sizes, dtype=dtype) + pointers *= elemsize + np.cumsum(pointers, axis=0, out=pointers) + + # get total number of bytes from all sizes (last element) + bytes_last = pointers[-1] if len(sizes) > 0 else 0 + + # convert to byte offsets + exscan_from_cumsum_(pointers) + + return pointers, bytes_last + + +class MMapIndexedDataset(torch.utils.data.Dataset): + + class Index(object): + _HDR_MAGIC = b'MMIDIDX\x00\x00' + + @classmethod + def writer(cls, path, dtype): + + class _Writer(object): + + def __enter__(self): + self._file = open(path, 'wb') + + self._file.write(cls._HDR_MAGIC) + self._file.write(struct.pack(' Date: Fri, 8 Mar 2024 14:43:20 +0000 Subject: [PATCH 41/64] pipepile parallelism missing. all good --- .../variable_batch_size_and_lr.py | 204 +++++++++--------- 1 file changed, 98 insertions(+), 106 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py index 6897d75bdef4..6efd3d13cecb 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py @@ -1,19 +1,14 @@ import random +import os import torch -from deepspeed.utils import logger -from torch.utils.data import DistributedSampler from torch.optim.lr_scheduler import LRScheduler -from torch.utils.data import DataLoader +from torch.utils.data import DataLoader, DistributedSampler +from torch.nn.parallel import DistributedDataParallel as DDP +import torch.distributed as dist import torch.nn.functional as F import deepspeed - - -# see https://github.com/facebookresearch/fairseq/blob/b5a039c292facba9c73f59ff34621ec131d82341/fairseq/data/data_utils.py#L282 -# see how to set new batch size here: -# https://github.com/microsoft/DeepSpeed/issues/2798#issuecomment-1435475061 -# engine.set_train_micro_batch_size and set_train_batch_size (only changes grad acc steps) in -# https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/runtime/engine.py -# TODO we need same batch size per GPU per grad step! 
+from deepspeed.utils import logger +from deepspeed.pipe import PipelineModule def batch_by_size( @@ -28,6 +23,7 @@ def batch_by_size( gradient_accumulation_steps=1, required_microbatches_of_same_size=False, verbose=False, + seed=0, ): """ @@ -62,7 +58,8 @@ def batch_by_size( metrics = list(zip(metric_values, sample_ids)) if shuffle_metric_values: - random.shuffle(metrics) + metric_random = random.Random(seed) + metric_random.shuffle(metrics) if order_by_metric_value: metrics = sorted(metrics) @@ -74,7 +71,7 @@ def batch_by_size( metrics = [ m for m in metrics if m[1] not in long_ids ] def is_microbatch_valid(metrics): - if len(metrics) < min_batch_size: return False # insufficient sample count + if min_batch_size and len(metrics)max_batch_size: return False # too many samples if sum([m[0] for m in metrics]) > max_metric_value_per_batch: return False # exceeds max return True @@ -341,13 +338,17 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size( return dataloader, lr_scheduler, deepspeed_io_kwargs + + +########## Main includes few examples on how to use this module ############### + if __name__ == "__main__": - # A small example/test on how to use this module class TestData(torch.utils.data.Dataset): """ A test dataset with sequences of random length, and their sum as the target""" - def __init__(self, seq_count, min_seq_len=1, max_seq_len=21): - self.seqs = [ torch.ones(random.randrange(min_seq_len,max_seq_len)) for _ in range(seq_count) ] + def __init__(self, seq_count, min_seq_len=1, max_seq_len=21, seed=0): + data_random = random.Random(seed) + self.seqs = [ torch.ones(data_random.randrange(min_seq_len,max_seq_len)) for _ in range(seq_count) ] __len__ = lambda self: len(self.seqs) __getitem__ = lambda self, idx: [self.seqs[idx], self.seqs[idx].sum()] @@ -365,127 +366,118 @@ def collate_fn(batch, max_seq_len=None): return padded, labels class TestFeedForward(torch.nn.Module): + """ a test feedforward model """ def __init__(self): super(TestFeedForward, self).__init__() - # an affine operation: y = Wx + b self.fc1 = torch.nn.Linear(max_seq_len, 128) self.fc2 = torch.nn.Linear(128, 128) + self.fc3 = torch.nn.Linear(128, 128) + self.fc4 = torch.nn.Linear(128, 128) def forward(self, x): x = F.relu(self.fc1(x)) x = F.relu(self.fc2(x)) - return x.sum(dim=1) + x = F.relu(self.fc3(x)) + x = F.relu(self.fc4(x)) + return x.sum() + def to_layers(self): + return [self.fc1, self.fc2, self.fc3, self.fc4, lambda x: x.sum()] + + dataloader_rank=int(os.environ.get('RANK',0)) + dataloader_num_replicas=int(os.environ.get('WORLD_SIZE',1)) + device_id=int(os.environ.get('LOCAL_RANK',0)) + device = f"cuda:{device_id}" max_seq_len=15 - dataset = TestData(seq_count=30, min_seq_len=5, max_seq_len=max_seq_len) max_metric_value_per_batch=40 - dataloader_num_workers=2 - gradient_accumulation_steps=2 - base_batch_size=8 - model = TestFeedForward().to("cuda") + base_batch_size = 8 base_lr=1e-3 - optimizer = torch.optim.Adam(model.parameters(), lr=base_lr) + gradient_accumulation_steps=base_batch_size//dataloader_num_replicas + pipeline_parallelism=True + order_by_metric_value=True #enable for curriculum + + dist.init_process_group(backend='nccl') + model = TestFeedForward().to(device) + dataset = TestData(seq_count=300, min_seq_len=5, max_seq_len=max_seq_len) + model_ddp = DDP(model, device_ids=[device]) + optimizer = torch.optim.Adam(model_ddp.parameters(), lr=1e-3) metric_values = [ len(s[0]) for s in dataset] # difficulty = input sequence length dataloader, lr_scheduler, deepspeed_io_kwargs = 
get_dataloader_and_lr_scheduler_for_variable_batch_size( - dataset=dataset, - dataset_metric_values=metric_values, - base_batch_size=base_batch_size, - max_metric_value_per_batch=max_metric_value_per_batch, - dataloader_rank=0, - dataloader_num_replicas=1, - sample_ids=None, - pipeline_parallelism=False, - lr_scaling_method="linear", - min_batch_size=1, - max_batch_size=None, - shuffle_metric_values=False, - order_by_metric_value=False, - gradient_accumulation_steps=gradient_accumulation_steps, - dataloader_num_workers=0, - dataloader_collate_fn=lambda b : TestData.collate_fn(b, max_seq_len=max_seq_len), - dataloader_pin_memory=False, - optimizer=optimizer, - # lr_scheduler_class=torch.optim.lr_scheduler.StepLR, - # lr_scheduler_kwargs=dict(optimizer=optimizer, step_size=1, gamma=0.1), + dataset=dataset, + dataset_metric_values=metric_values, + base_batch_size=base_batch_size, + max_metric_value_per_batch=max_metric_value_per_batch, + dataloader_rank=dataloader_rank, + dataloader_num_replicas=dataloader_num_replicas, + pipeline_parallelism=pipeline_parallelism, + lr_scaling_method="linear", + order_by_metric_value=order_by_metric_value, + gradient_accumulation_steps=gradient_accumulation_steps, + dataloader_num_workers=0, + dataloader_collate_fn=lambda b : TestData.collate_fn(b, max_seq_len=max_seq_len), + optimizer=optimizer, + # lr_scheduler_class=torch.optim.lr_scheduler.StepLR, + # lr_scheduler_kwargs=dict(optimizer=optimizer, step_size=1, gamma=0.1), ) # PyTorch example iterating whole dataset in one epoch - with torch.set_grad_enabled(True): - for epoch in range(2): - for sample_idx, (inputs, labels) in enumerate(dataloader): - batch_id = sample_idx // gradient_accumulation_steps - microbatch_id = sample_idx % gradient_accumulation_steps - inputs, labels = inputs.to("cuda"), labels.to("cuda") - outputs = model(inputs) - loss = F.mse_loss(outputs, labels) - loss.backward() - if (microbatch_id+1) % gradient_accumulation_steps == 0: - print(f"Epoch {epoch}, batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}") - optimizer.step() - optimizer.zero_grad() - lr_scheduler.step() - - - # Pytorch example with loop around data. 
- # To handle loop-around data, we either pass the batch id as epoch value - # to the scheduler step (option 1 below) or reset the LR scheduler (option 2) - dataloader_it = iter(dataloader) - sample_idx, num_sentences_processed, num_tokens_processed = 0, 0, 0 - while True: - try: - inputs, labels = next(dataloader_it) - inputs, labels = inputs.to("cuda"), labels.to("cuda") - outputs = model(inputs) + for epoch in range(2): + for sample_idx, (inputs, labels) in enumerate(dataloader): + batch_id = sample_idx // gradient_accumulation_steps + batch_id = sample_idx % gradient_accumulation_steps + inputs, labels = inputs.to(device), labels.to(device) + outputs = model_ddp(inputs) loss = F.mse_loss(outputs, labels) loss.backward() - batch_id = sample_idx // gradient_accumulation_steps - microbatch_id = sample_idx % gradient_accumulation_steps - num_sentences_processed += lr_scheduler.batch_sizes[batch_id] - num_tokens_processed += lr_scheduler.batch_metrics[batch_id] - sample_idx += 1 - if (microbatch_id+1) % gradient_accumulation_steps == 0: - print(f"Batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, size {lr_scheduler.batch_sizes[batch_id]}, metric {lr_scheduler.batch_metrics[batch_id]}") + if (batch_id+1) % gradient_accumulation_steps == 0: + if dataloader_rank==0: + print(f"rank {dataloader_rank}, batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}") optimizer.step() optimizer.zero_grad() - lr_scheduler.step(epoch=batch_id+1) # option 1: specify next batch - - # stop after updating model for 100 sentences or 1000 tokens - if num_sentences_processed>=100 or num_tokens_processed>=1000: - break - except StopIteration: - dataloader_it = iter(dataloader) - sample_idx = 0 - lr_scheduler.step(0) # option 2: reset scheduler + lr_scheduler.step() + + dist.destroy_process_group() # DeepSpeed example config = { "train_batch_size": base_batch_size, "gradient_accumulation_steps": gradient_accumulation_steps, - "optimizer": { "type": "Adam", "params": { "lr": base_lr, } }, + "optimizer": { "type": "Adam", "params": { "lr": base_lr } }, } - engine, optimizer, _, _ = deepspeed.initialize(config=config, + + engine, optimizer, _, lr_scheduler = deepspeed.initialize(config=config, model=model, optimizer=optimizer, lr_scheduler=lr_scheduler) # engine.training_dataloader = dataloader #use this or the deepspeed_io() engine.training_dataloader = engine.deepspeed_io(**deepspeed_io_kwargs) - dataloader_it = iter(engine.training_dataloader) - for epoch in range(10): - try: - for batch_id in range(len(engine.training_dataloader)//gradient_accumulation_steps): - for microbatch_id in range(gradient_accumulation_steps): - inputs, labels = next(dataloader_it) - inputs, labels = inputs.to("cuda"), labels.to("cuda") - outputs = engine(inputs) - loss = F.mse_loss(outputs, labels) - engine.backward(loss) - print(f"Epoch {epoch}, batch {batch_id}, microbatch {microbatch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}") - engine.step(lr_kwargs={'epoch': batch_id+1}) - except StopIteration: - # if we run out of data, we restart the dataloader and LR scheduler - dataloader_it = iter(engine.training_dataloader) - lr_scheduler.step(0) - - \ No newline at end of file + lr_scheduler.step(0) # reset LR scheduler + for epoch in range(2): + for sample_idx, (inputs, labels) in enumerate(dataloader): + batch_id = sample_idx // gradient_accumulation_steps + batch_id = sample_idx % gradient_accumulation_steps + inputs, labels = inputs.to(device), labels.to(device) + outputs = engine(inputs) 
+ loss = F.mse_loss(outputs, labels) + engine.backward(loss) + if dataloader_rank==0: + print(f"rank {dataloader_rank}, batch {batch_id}, microbatch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}") + engine.step() + + # Deepspeed example for pipeline parallelism + if pipeline_parallelism: + model = PipelineModule(layers=model.to_layers(), num_stages=2) + engine, optimizer, _, lr_scheduler = deepspeed.initialize(config=config, + model=model, optimizer=optimizer, lr_scheduler=lr_scheduler) + # engine.training_dataloader = dataloader #use this or the deepspeed_io() + engine.training_dataloader = engine.deepspeed_io(**deepspeed_io_kwargs) + + dataloader_it = iter(dataloader) # reset dataloader + lr_scheduler.step(0) # reset LR scheduler + for epoch in range(2): + for batch_id in range(len(dataloader)//gradient_accumulation_steps): + loss = engine.train_batch(data_iter=dataloader_it) + if dataloader_rank==0: + print(f"rank {dataloader_rank}, batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}") From 3658080e36cd581b2c032cb7974d8140593b2fce Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Fri, 8 Mar 2024 17:19:05 +0000 Subject: [PATCH 42/64] pre-commit hooks --- .../variable_batch_size_and_lr.py | 377 ++++++++++-------- 1 file changed, 201 insertions(+), 176 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py index 6efd3d13cecb..6bdc1e210144 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py @@ -1,10 +1,15 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import random import os import torch from torch.optim.lr_scheduler import LRScheduler from torch.utils.data import DataLoader, DistributedSampler from torch.nn.parallel import DistributedDataParallel as DDP -import torch.distributed as dist +from torch import distributed as torch_dist import torch.nn.functional as F import deepspeed from deepspeed.utils import logger @@ -24,11 +29,10 @@ def batch_by_size( required_microbatches_of_same_size=False, verbose=False, seed=0, - ): - +): """ Yield mini-batches of indices bucketed by size. Batches may contain sequences of different lengths. - Similar to "Attention is all you need", Section 5.1: + Similar to "Attention is all you need", Section 5.1: "Sentence pairs were batched together by approximate sequence length. Each training batch contained a set of sentence pairs containing approximately X source tokens and X target tokens" @@ -54,7 +58,7 @@ def batch_by_size( assert not shuffle_metric_values or not order_by_metric_value, \ "either sort_metric_values or shuffle_metric_values can be True, not both." 
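+    # the function returns one list of sample ids per microbatch plus, per effective batch,
+    # its size and the sum/max of the metric; lr_scheduler_for_variable_batch_size uses the
+    # per-batch sizes to rescale the learning rate of each effective batch.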
- sample_ids = sample_ids or list(range(len(metric_values))) + sample_ids = sample_ids or list(range(len(metric_values))) metrics = list(zip(metric_values, sample_ids)) if shuffle_metric_values: @@ -64,18 +68,18 @@ def batch_by_size( metrics = sorted(metrics) # go through metrics and warn user and filter samples that alone exceed the max batch threshold - long_ids = [ idx for val, idx in metrics if val>max_metric_value_per_batch ] - if len(long_ids)>0: + long_ids = [idx for val, idx in metrics if val > max_metric_value_per_batch] + if len(long_ids) > 0: logger.warning(f"Data indices {long_ids} ignored as metrics exceed {max_metric_value_per_batch}.") logger.info(f"Original dataset length: {len(metrics)}. New dataset length: {len(long_ids)}") - metrics = [ m for m in metrics if m[1] not in long_ids ] + metrics = [m for m in metrics if m[1] not in long_ids] def is_microbatch_valid(metrics): - if min_batch_size and len(metrics)max_batch_size: return False # too many samples - if sum([m[0] for m in metrics]) > max_metric_value_per_batch: return False # exceeds max + if min_batch_size and len(metrics) < min_batch_size: return False # insufficient sample count + if max_batch_size and len(metrics) > max_batch_size: return False # too many samples + if sum([m[0] for m in metrics]) > max_metric_value_per_batch: return False # exceeds max return True - + # go through all samples and pack then in microbatches of metric sums below the threshold # `required_microbatches_of_same_size` means all minibatches in a batch must be of equal size num_microbatches_per_batch = dataloader_num_replicas * gradient_accumulation_steps @@ -88,43 +92,45 @@ def is_microbatch_valid(metrics): for batch_size in range(equal_size_multiple, len(metrics), equal_size_multiple): # attempt effective batch - batch = metrics[batch_init:batch_init+batch_size] + batch = metrics[batch_init:batch_init + batch_size] # pick interleaved samples for each microbatch to help with load balancing # (in the ordered use case), and to replicate what the distributed sampler does. - microbatch = [ batch[b::equal_size_multiple] for b in range(equal_size_multiple) ] + microbatch = [batch[b::equal_size_multiple] for b in range(equal_size_multiple)] # if they are all valid micro-batches, keep them until you find longer mbatches, if any - is_batch_valid = all([is_microbatch_valid(mb) for mb in microbatch] ) + is_batch_valid = all([is_microbatch_valid(mb) for mb in microbatch]) if not is_batch_valid: break - if not is_batch_valid: batch_size -= equal_size_multiple #ignore last iteration (not valid) - batch = metrics[batch_init:batch_init+batch_size] - microbatch = [ batch[b::equal_size_multiple] for b in range(equal_size_multiple) ] - batch_init += sum( [ len(l) for l in microbatch ] ) + if not is_batch_valid: batch_size -= equal_size_multiple #ignore last iteration (not valid) + batch = metrics[batch_init:batch_init + batch_size] + microbatch = [batch[b::equal_size_multiple] for b in range(equal_size_multiple)] + batch_init += sum([len(l) for l in microbatch]) microbatches += microbatch # make sure we give the same number of (micro-)batches to each dataloader by trimming dataset microbatches = microbatches[:len(microbatches) - len(microbatches) % num_microbatches_per_batch] #compute the effective batch size for each microbatch. 
- batch_sizes, batch_metrics, microbatch_sample_ids = [], [], [] + batch_sizes, batch_sum_metrics, batch_max_metrics, microbatch_sample_ids = [], [], [], [] for rank in range(0, len(microbatches), num_microbatches_per_batch): - microbatch = microbatches[rank: rank+num_microbatches_per_batch] + microbatch = microbatches[rank:rank + num_microbatches_per_batch] batch_size = sum([len(mb) for mb in microbatch]) - batch_metric = sum([m[0] for m in microbatch[0]]) - batch_sample_ids = [ [m[1] for m in metrics] for metrics in microbatch] + batch_sum_metric = sum([m[0] for m in microbatch[0]]) + batch_max_metric = max([m[0] for m in microbatch[0]]) + batch_sample_ids = [[m[1] for m in metrics] for metrics in microbatch] batch_sizes.append(batch_size) - batch_metrics.append(batch_metric) + batch_sum_metrics.append(batch_sum_metric) + batch_max_metrics.append(batch_max_metric) microbatch_sample_ids += batch_sample_ids if verbose: - print(f"Batch size {batch_size} samples, metric value {batch_metric}, samples: {batch_sample_ids}") + print(f"Batch size {batch_size} samples, metric value {batch_sum_metric}, samples: {batch_sample_ids}") + + # return the sample ids of each microbatch, and the batch sizes + assert len(batch_sizes) == len(microbatch_sample_ids) // num_microbatches_per_batch + return microbatch_sample_ids, batch_sizes, batch_sum_metrics, batch_max_metrics - # return the sample ids of each microbatch, and the batch sizes - assert len(batch_sizes) == len(microbatch_sample_ids)//num_microbatches_per_batch - return microbatch_sample_ids, batch_sizes, batch_metrics - def scale_lr(base_batch_size, batch_size, base_lr=1, method="linear"): """ given a reference lr and batch_size, compute the new LR for a given batch size """ @@ -134,61 +140,73 @@ def scale_lr(base_batch_size, batch_size, base_lr=1, method="linear"): return base_lr * batch_size / base_batch_size if method == "sqrt": # Square Root scaling: "when multiplying the batch size by k, multiply the learning rate - # by √k, to keep the variance in the gradient expectation constant" + # by √k, to keep the variance in the gradient expectation constant" # (A. Krizhevsky. One weird trick for parallelizing convolutional neural networks) return base_lr * torch.sqrt(batch_size / base_batch_size) raise ValueError("Unknown scaling method: {}".format(method)) -def dataloader_for_variable_batch_size(dataset, - microbatch_sample_ids, dataloader_rank, dataloader_num_replicas, dataloader_collate_fn, - dataloader_num_workers=2, dataloader_pin_memory=False): - - # equidistantly distribute the microbatches across the replicas in an interleaved fashion. - sampler = DistributedSampler( - dataset=microbatch_sample_ids, - num_replicas=dataloader_num_replicas, - rank=dataloader_rank, - shuffle=False, - drop_last=False, - ) - - # collate function applies wraps user defined collate function to the variable batch data - def collate_fn_wrapper(batch_sample_ids, dataset, collate_fn=None): - # batch is a list of sample ids per microbatch - assert len(batch_sample_ids)==1, "only 1 element should be returned by the sampler." 
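As a quick numerical check of the two scaling rules implemented by scale_lr above, restated here with plain Python floats (the patched function applies the same formulas to the optimizer's base LR):

import math

# Standalone re-statement of the linear and square-root LR scaling rules.
def scaled_lr(base_lr, base_batch_size, batch_size, method="linear"):
    if method == "linear":   # LR grows proportionally to the batch size
        return base_lr * batch_size / base_batch_size
    if method == "sqrt":     # LR grows with the square root of the growth factor
        return base_lr * math.sqrt(batch_size / base_batch_size)
    raise ValueError(f"unknown method {method}")

print(scaled_lr(1e-3, base_batch_size=8, batch_size=32, method="linear"))  # 0.004
print(scaled_lr(1e-3, base_batch_size=8, batch_size=32, method="sqrt"))    # 0.002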
- batch_data = [dataset[idx] for idx in batch_sample_ids[0]] - return collate_fn(batch_data) if collate_fn else batch_data - - collate_fn = lambda b: collate_fn_wrapper(b, dataset, dataloader_collate_fn) - - dataloader = DataLoader( - dataset=microbatch_sample_ids, - sampler=sampler, - num_workers = dataloader_num_workers, - collate_fn = collate_fn, - pin_memory=dataloader_pin_memory,) - - deepspeed_io_kwargs = dict( - dataset=microbatch_sample_ids, - batch_size=1, - pin_memory=dataloader_pin_memory, - data_sampler=sampler, - collate_fn=collate_fn, - num_local_io_workers=dataloader_num_workers,) - - return dataloader, deepspeed_io_kwargs - +def dataloader_for_variable_batch_size(dataset, + microbatch_sample_ids, + dataloader_rank, + dataloader_num_replicas, + dataloader_collate_fn, + dataloader_num_workers=2, + dataloader_pin_memory=False): + + # equidistantly distribute the microbatches across the replicas in an interleaved fashion. + sampler = DistributedSampler( + dataset=microbatch_sample_ids, + num_replicas=dataloader_num_replicas, + rank=dataloader_rank, + shuffle=False, + drop_last=False, + ) + + # collate function applies wraps user defined collate function to the variable batch data + def collate_fn_wrapper(batch_sample_ids, dataset, collate_fn=None): + # batch is a list of sample ids per microbatch + assert len(batch_sample_ids) == 1, "only 1 element should be returned by the sampler." + batch_data = [dataset[idx] for idx in batch_sample_ids[0]] + return collate_fn(batch_data) if collate_fn else batch_data + + collate_fn = lambda b: collate_fn_wrapper(b, dataset, dataloader_collate_fn) + + dataloader = DataLoader( + dataset=microbatch_sample_ids, + sampler=sampler, + num_workers=dataloader_num_workers, + collate_fn=collate_fn, + pin_memory=dataloader_pin_memory, + ) + + deepspeed_io_kwargs = dict( + dataset=microbatch_sample_ids, + batch_size=1, + pin_memory=dataloader_pin_memory, + data_sampler=sampler, + collate_fn=collate_fn, + num_local_io_workers=dataloader_num_workers, + ) + + return dataloader, deepspeed_io_kwargs + class StubLRScheduler(LRScheduler): """ a stub LR scheduler that does not change the LR, keeps it constant """ + def get_lr(self) -> float: return self.base_lrs -def lr_scheduler_for_variable_batch_size( - base_batch_size, batch_sizes, dataloader, batch_metrics, - lr_scaling_method='linear', optimizer=None, lr_scheduler_class=None, - **lr_scheduler_kwargs): + +def lr_scheduler_for_variable_batch_size(base_batch_size, + batch_sizes, + dataloader, + batch_metrics, + lr_scaling_method='linear', + optimizer=None, + lr_scheduler_class=None, + **lr_scheduler_kwargs): """ returns a class that provides an LR scheduler that scales learning rate at every epoch taking into account the batch size of each epoch. 
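Before the scheduler subclass defined in the following hunks, here is a compact sketch of the behaviour it is meant to provide: after each optimizer.step(), the learning rate for the next effective batch is the base LR rescaled by that batch's size. The names and numbers below are purely illustrative:

import torch

# Toy parameter/optimizer; in practice batch_sizes comes from batch_by_size.
param = torch.nn.Parameter(torch.zeros(1))
base_lr, base_batch_size = 1e-3, 8
batch_sizes = [8, 12, 4, 16]
optimizer = torch.optim.SGD([param], lr=base_lr)

for step, batch_size in enumerate(batch_sizes):
    # ... forward/backward for this effective batch would happen here ...
    optimizer.step()                       # optimizer.step() precedes the LR update
    next_bs = batch_sizes[(step + 1) % len(batch_sizes)]
    for group in optimizer.param_groups:   # linear scaling, as in scale_lr
        group['lr'] = base_lr * next_bs / base_batch_size
    print(step, optimizer.param_groups[0]['lr'])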
@@ -216,14 +234,14 @@ def __init__(self, optimizer, **lr_scheduler_kwargs): self.dataloader = dataloader self._last_lr = [p['lr'] for p in optimizer.param_groups] super().__init__(optimizer=optimizer, **lr_scheduler_kwargs) - + def state_dict(self): return { 'base': super().state_dict(), 'base_batch_size': self.base_batch_size, 'lr_scaling_method': self.lr_scaling_method, 'batch_sizes': self.batch_sizes, - } + } def load_state_dict(self, state_dict): super().load_state_dict(state_dict['base']) @@ -236,18 +254,18 @@ def get_lr(self): def step(self, epoch=None): # call the base scheduler's step method to get LR for next epoch - # Note: optimizer.step preceeds lr_scheduler.step(), so the stepping workflow is: + # Note: optimizer.step preecceds lr_scheduler.step(), so the stepping workflow is: # init: lr_scheduler.step(0) --> set LR for epoch 0 # epoch 0: optimizer.step(); lr_scheduler.step(1) --> set LR for epoch 1 # epoch 1: optimizer.step(); lr_scheduler.step(2) --> set LR for epoch 2 # reset unscaled LRs (to the original scheduler's one) for the current epoch # Note: epoch==0: reset LR scheduler; epoch==None: scale LR for next epoch; - unscaled_lrs = self.base_lrs if epoch==0 else self._last_lr + unscaled_lrs = self.base_lrs if epoch == 0 else self._last_lr for group, lr in zip(self.optimizer.param_groups, unscaled_lrs): - group['lr'] = lr + group['lr'] = lr - super().step(epoch) # set unscaled lr, _step_count, last_epoch, _last_lr for new epoch + super().step(epoch) # set unscaled lr, _step_count, last_epoch, _last_lr for new epoch # scale the learning rate for next epoch for each parameter group. batch_size = self.batch_sizes[self.last_epoch % len(self.batch_sizes)] @@ -257,7 +275,6 @@ def step(self, epoch=None): if self.verbose: print(f"Batch id {self.last_epoch}, unscaled LR: {unscaled_lrs}, scaled LR: {self.get_lr()}") - #### main loop: double check arguments and returns correctly-instantiated LR scheduler if lr_scheduler_class is None: @@ -273,71 +290,68 @@ def step(self, epoch=None): def get_dataloader_and_lr_scheduler_for_variable_batch_size( - dataset, - dataset_metric_values, - max_metric_value_per_batch, - base_batch_size, - sample_ids=None, - lr_scaling_method="linear", - min_batch_size=1, - max_batch_size=None, - shuffle_metric_values=False, - order_by_metric_value=False, - gradient_accumulation_steps=1, - pipeline_parallelism=False, - dataloader_rank=0, - dataloader_num_replicas=1, - dataloader_num_workers=0, - dataloader_collate_fn=None, - dataloader_pin_memory=False, - optimizer=None, - lr_scheduler_class=None, - lr_scheduler_kwargs={'verbose':False}, - verbose=False, + dataset, + dataset_metric_values, + max_metric_value_per_batch, + base_batch_size, + sample_ids=None, + lr_scaling_method="linear", + min_batch_size=1, + max_batch_size=None, + shuffle_metric_values=False, + order_by_metric_value=False, + gradient_accumulation_steps=1, + pipeline_parallelism=False, + dataloader_rank=0, + dataloader_num_replicas=1, + dataloader_num_workers=0, + dataloader_collate_fn=None, + dataloader_pin_memory=False, + optimizer=None, + lr_scheduler_class=None, + lr_scheduler_kwargs={'verbose': False}, + verbose=False, ): - # batch_by_size returns the effective batch size and the sample ids for each microbatch. - # We will use the sample ids to retrieve the batches from the dataset, - # and the effective batch size to retrieve the scaled learning rate for each batch - # Note: pipelining in DeepSpeed takes the first micro-batch activation shape as reference. 
- # So we need to make sure batch size remains contant across all microbatches in a batch. - microbatch_sample_ids, batch_sizes, batch_metrics = batch_by_size( - metric_values=dataset_metric_values, - max_metric_value_per_batch=max_metric_value_per_batch, - sample_ids=sample_ids, - min_batch_size=min_batch_size, - max_batch_size=max_batch_size, - shuffle_metric_values=shuffle_metric_values, - order_by_metric_value=order_by_metric_value, - dataloader_num_replicas=dataloader_num_replicas, - gradient_accumulation_steps=gradient_accumulation_steps, - required_microbatches_of_same_size=pipeline_parallelism, - verbose=verbose, - ) - - dataloader, deepspeed_io_kwargs = dataloader_for_variable_batch_size( - dataset=dataset, - microbatch_sample_ids=microbatch_sample_ids, - dataloader_rank=dataloader_rank, - dataloader_num_replicas=dataloader_num_replicas, - dataloader_collate_fn=dataloader_collate_fn, - dataloader_num_workers=dataloader_num_workers, - dataloader_pin_memory=dataloader_pin_memory, - ) - - lr_scheduler = lr_scheduler_for_variable_batch_size( - base_batch_size=base_batch_size, - batch_sizes=batch_sizes, - batch_metrics=batch_metrics, - lr_scaling_method=lr_scaling_method, - optimizer=optimizer, - dataloader=dataloader, - lr_scheduler_class=lr_scheduler_class, - **lr_scheduler_kwargs) - - return dataloader, lr_scheduler, deepspeed_io_kwargs + # batch_by_size returns the effective batch size and the sample ids for each microbatch. + # We will use the sample ids to retrieve the batches from the dataset, + # and the effective batch size to retrieve the scaled learning rate for each batch + # Note: pipelining in DeepSpeed takes the first micro-batch activation shape as reference. + # So we need to make sure batch size remains constant across all microbatches in a batch. 
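As a quick illustration of the constraint stated in the note above, a hypothetical helper (not part of this patch) that verifies every micro-batch inside an effective batch holds the same number of samples:

# Hypothetical sanity check: group micro-batches per effective batch and assert
# they all contain the same number of samples (uniform activation shapes).
def check_same_size_microbatches(microbatch_sample_ids, num_microbatches_per_batch):
    for i in range(0, len(microbatch_sample_ids), num_microbatches_per_batch):
        group = microbatch_sample_ids[i:i + num_microbatches_per_batch]
        sizes = {len(mb) for mb in group}
        assert len(sizes) == 1, f"micro-batches {group} differ in size: {sizes}"

check_same_size_microbatches([[0, 2], [1, 3], [4, 6], [5, 7]], num_microbatches_per_batch=2)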
+ microbatch_sample_ids, batch_sizes, batch_sum_metrics, batch_max_metrics = batch_by_size( + metric_values=dataset_metric_values, + max_metric_value_per_batch=max_metric_value_per_batch, + sample_ids=sample_ids, + min_batch_size=min_batch_size, + max_batch_size=max_batch_size, + shuffle_metric_values=shuffle_metric_values, + order_by_metric_value=order_by_metric_value, + dataloader_num_replicas=dataloader_num_replicas, + gradient_accumulation_steps=gradient_accumulation_steps, + required_microbatches_of_same_size=pipeline_parallelism, + verbose=verbose, + ) + + dataloader, deepspeed_io_kwargs = dataloader_for_variable_batch_size( + dataset=dataset, + microbatch_sample_ids=microbatch_sample_ids, + dataloader_rank=dataloader_rank, + dataloader_num_replicas=dataloader_num_replicas, + dataloader_collate_fn=dataloader_collate_fn, + dataloader_num_workers=dataloader_num_workers, + dataloader_pin_memory=dataloader_pin_memory, + ) + lr_scheduler = lr_scheduler_for_variable_batch_size(base_batch_size=base_batch_size, + batch_sizes=batch_sizes, + batch_metrics=batch_sum_metrics, + lr_scaling_method=lr_scaling_method, + optimizer=optimizer, + dataloader=dataloader, + lr_scheduler_class=lr_scheduler_class, + **lr_scheduler_kwargs) + return dataloader, lr_scheduler, deepspeed_io_kwargs ########## Main includes few examples on how to use this module ############### @@ -346,10 +360,11 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size( class TestData(torch.utils.data.Dataset): """ A test dataset with sequences of random length, and their sum as the target""" + def __init__(self, seq_count, min_seq_len=1, max_seq_len=21, seed=0): - data_random = random.Random(seed) - self.seqs = [ torch.ones(data_random.randrange(min_seq_len,max_seq_len)) for _ in range(seq_count) ] - + data_random = random.Random(seed) + self.seqs = [torch.ones(data_random.randrange(min_seq_len, max_seq_len)) for _ in range(seq_count)] + __len__ = lambda self: len(self.seqs) __getitem__ = lambda self, idx: [self.seqs[idx], self.seqs[idx].sum()] @@ -385,26 +400,25 @@ def forward(self, x): def to_layers(self): return [self.fc1, self.fc2, self.fc3, self.fc4, lambda x: x.sum()] - - dataloader_rank=int(os.environ.get('RANK',0)) - dataloader_num_replicas=int(os.environ.get('WORLD_SIZE',1)) - device_id=int(os.environ.get('LOCAL_RANK',0)) + dataloader_rank = int(os.environ.get('RANK', 0)) + dataloader_num_replicas = int(os.environ.get('WORLD_SIZE', 1)) + device_id = int(os.environ.get('LOCAL_RANK', 0)) device = f"cuda:{device_id}" - max_seq_len=15 - max_metric_value_per_batch=40 + max_seq_len = 15 + max_metric_value_per_batch = 40 base_batch_size = 8 - base_lr=1e-3 - gradient_accumulation_steps=base_batch_size//dataloader_num_replicas - pipeline_parallelism=True - order_by_metric_value=True #enable for curriculum + base_lr = 1e-3 + gradient_accumulation_steps = base_batch_size // dataloader_num_replicas + pipeline_parallelism = True + order_by_metric_value = True #enable for curriculum - dist.init_process_group(backend='nccl') + torch_dist.init_process_group(backend='nccl') model = TestFeedForward().to(device) dataset = TestData(seq_count=300, min_seq_len=5, max_seq_len=max_seq_len) model_ddp = DDP(model, device_ids=[device]) optimizer = torch.optim.Adam(model_ddp.parameters(), lr=1e-3) - metric_values = [ len(s[0]) for s in dataset] # difficulty = input sequence length + metric_values = [len(s[0]) for s in dataset] # difficulty = input sequence length dataloader, lr_scheduler, deepspeed_io_kwargs = 
get_dataloader_and_lr_scheduler_for_variable_batch_size( dataset=dataset, dataset_metric_values=metric_values, @@ -417,12 +431,12 @@ def to_layers(self): order_by_metric_value=order_by_metric_value, gradient_accumulation_steps=gradient_accumulation_steps, dataloader_num_workers=0, - dataloader_collate_fn=lambda b : TestData.collate_fn(b, max_seq_len=max_seq_len), + dataloader_collate_fn=lambda b: TestData.collate_fn(b, max_seq_len=max_seq_len), optimizer=optimizer, # lr_scheduler_class=torch.optim.lr_scheduler.StepLR, # lr_scheduler_kwargs=dict(optimizer=optimizer, step_size=1, gamma=0.1), ) - + # PyTorch example iterating whole dataset in one epoch for epoch in range(2): for sample_idx, (inputs, labels) in enumerate(dataloader): @@ -432,28 +446,35 @@ def to_layers(self): outputs = model_ddp(inputs) loss = F.mse_loss(outputs, labels) loss.backward() - if (batch_id+1) % gradient_accumulation_steps == 0: - if dataloader_rank==0: - print(f"rank {dataloader_rank}, batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}") + if (batch_id + 1) % gradient_accumulation_steps == 0: + if dataloader_rank == 0: + print(f"batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}") optimizer.step() - optimizer.zero_grad() + optimizer.zero_grad() lr_scheduler.step() - dist.destroy_process_group() - + torch_dist.destroy_process_group() + # DeepSpeed example config = { "train_batch_size": base_batch_size, "gradient_accumulation_steps": gradient_accumulation_steps, - "optimizer": { "type": "Adam", "params": { "lr": base_lr } }, + "optimizer": { + "type": "Adam", + "params": { + "lr": base_lr + } + }, } engine, optimizer, _, lr_scheduler = deepspeed.initialize(config=config, - model=model, optimizer=optimizer, lr_scheduler=lr_scheduler) + model=model, + optimizer=optimizer, + lr_scheduler=lr_scheduler) # engine.training_dataloader = dataloader #use this or the deepspeed_io() engine.training_dataloader = engine.deepspeed_io(**deepspeed_io_kwargs) - lr_scheduler.step(0) # reset LR scheduler + lr_scheduler.step(0) # reset LR scheduler for epoch in range(2): for sample_idx, (inputs, labels) in enumerate(dataloader): batch_id = sample_idx // gradient_accumulation_steps @@ -462,22 +483,26 @@ def to_layers(self): outputs = engine(inputs) loss = F.mse_loss(outputs, labels) engine.backward(loss) - if dataloader_rank==0: - print(f"rank {dataloader_rank}, batch {batch_id}, microbatch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}") + if dataloader_rank == 0: + print( + f"batch {batch_id}, batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}" + ) engine.step() # Deepspeed example for pipeline parallelism if pipeline_parallelism: model = PipelineModule(layers=model.to_layers(), num_stages=2) engine, optimizer, _, lr_scheduler = deepspeed.initialize(config=config, - model=model, optimizer=optimizer, lr_scheduler=lr_scheduler) + model=model, + optimizer=optimizer, + lr_scheduler=lr_scheduler) # engine.training_dataloader = dataloader #use this or the deepspeed_io() engine.training_dataloader = engine.deepspeed_io(**deepspeed_io_kwargs) - - dataloader_it = iter(dataloader) # reset dataloader - lr_scheduler.step(0) # reset LR scheduler + + dataloader_it = iter(dataloader) # reset dataloader + lr_scheduler.step(0) # reset LR scheduler for epoch in range(2): - for batch_id in range(len(dataloader)//gradient_accumulation_steps): + for batch_id in range(len(dataloader) // gradient_accumulation_steps): loss = 
engine.train_batch(data_iter=dataloader_it) - if dataloader_rank==0: - print(f"rank {dataloader_rank}, batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}") + if dataloader_rank == 0: + print(f"batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}") From 713f87b1a3b86c1e31d705ae86783f2a3fcaae30 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Tue, 12 Mar 2024 10:02:27 +0000 Subject: [PATCH 43/64] fixed collate_fn to include padding --- .../variable_batch_size_and_lr.py | 290 ++++-------------- .../variable_batch_size_and_lr_test.py | 170 ++++++++++ 2 files changed, 238 insertions(+), 222 deletions(-) create mode 100644 deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_test.py diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py index 6bdc1e210144..c996033d77d3 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py @@ -4,26 +4,21 @@ # DeepSpeed Team import random -import os import torch from torch.optim.lr_scheduler import LRScheduler from torch.utils.data import DataLoader, DistributedSampler from torch.nn.parallel import DistributedDataParallel as DDP -from torch import distributed as torch_dist -import torch.nn.functional as F -import deepspeed from deepspeed.utils import logger -from deepspeed.pipe import PipelineModule def batch_by_size( - metric_values, - max_metric_value_per_batch, + seqlens, + max_tokens_per_batch, sample_ids=None, min_batch_size=1, max_batch_size=None, - shuffle_metric_values=False, - order_by_metric_value=False, + shuffle_seqlens=False, + order_by_seqlen=False, dataloader_num_replicas=1, gradient_accumulation_steps=1, required_microbatches_of_same_size=False, @@ -37,47 +32,49 @@ def batch_by_size( contained a set of sentence pairs containing approximately X source tokens and X target tokens" Arguments: - - `metric_values`: a list of difficulties (metric values) for every sample in the dataset; - - `max_metric_value_per_batch`: upper cap in total difficulty in a batch; - - `sample_ids`: user-defined ids of the samples in metric_values. If not provided, + - `seqlens`: a list of difficulties (metric values) for every sample in the dataset; + - `max_tokens_per_batch`: upper cap in total difficulty in a batch; + - `sample_ids`: user-defined ids of the samples in seqlens. If not provided, automatically assigns a sequential order; - `min_batch_size`: smallest allowed size of a batch; - `min_batch_size`: largest allowed size of a batch; - - `shuffle_metric_values`: shuffle metric values before packing samples into batches; - - `order_by_metric_value`: order samples by ascending metric values before packing into batches; + - `shuffle_seqlens`: shuffle metric values before packing samples into batches; + - `order_by_seqlen`: order samples by ascending metric values before packing into batches; - `dataloader_num_replicas`: number of dataloaders - `gradient_accumulation_steps`: number of gradient accumulation steps; - `required_microbatches_of_same_size`: enable if each mini-batch (in a total of `batch_size_multiple` - micro-batches per batch), should have all micro-batches with the same batch size. 
- Required for pipeline parallelism (as activation shapes is uniform across mini-batches), or - in regular data parallelism if we want the same number of samples per accumulation step. + micro-batches per batch), should have all micro-batches with the same batch size ie the same + number of sentences. - Returns a list of the ids of each micro-batch and a list of effective batch sizes. + Returns: + - `microbatch_ids`: list of tuple of batch id and samples ids per microbatch + - `batch_sizes`: the effective batch size of each batch, used for to compute the scaled LR + - `batch_max_seqlens`: the max seqlen across all microbatches in a batch """ - assert not shuffle_metric_values or not order_by_metric_value, \ - "either sort_metric_values or shuffle_metric_values can be True, not both." + assert not shuffle_seqlens or not order_by_seqlen, \ + "either sort_seqlens or shuffle_seqlens can be True, not both." - sample_ids = sample_ids or list(range(len(metric_values))) - metrics = list(zip(metric_values, sample_ids)) + sample_ids = sample_ids or list(range(len(seqlens))) + metrics = list(zip(seqlens, sample_ids)) - if shuffle_metric_values: + if shuffle_seqlens: metric_random = random.Random(seed) metric_random.shuffle(metrics) - if order_by_metric_value: + if order_by_seqlen: metrics = sorted(metrics) # go through metrics and warn user and filter samples that alone exceed the max batch threshold - long_ids = [idx for val, idx in metrics if val > max_metric_value_per_batch] + long_ids = [idx for val, idx in metrics if val > max_tokens_per_batch] if len(long_ids) > 0: - logger.warning(f"Data indices {long_ids} ignored as metrics exceed {max_metric_value_per_batch}.") + logger.warning(f"Data indices {long_ids} ignored as metrics exceed {max_tokens_per_batch}.") logger.info(f"Original dataset length: {len(metrics)}. New dataset length: {len(long_ids)}") metrics = [m for m in metrics if m[1] not in long_ids] def is_microbatch_valid(metrics): if min_batch_size and len(metrics) < min_batch_size: return False # insufficient sample count if max_batch_size and len(metrics) > max_batch_size: return False # too many samples - if sum([m[0] for m in metrics]) > max_metric_value_per_batch: return False # exceeds max + if sum([m[0] for m in metrics]) > max_tokens_per_batch: return False # exceeds max return True # go through all samples and pack then in microbatches of metric sums below the threshold @@ -113,23 +110,24 @@ def is_microbatch_valid(metrics): microbatches = microbatches[:len(microbatches) - len(microbatches) % num_microbatches_per_batch] #compute the effective batch size for each microbatch. 
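The packing above hinges on a three-part validity test: enough samples, not too many samples, and a total token count within budget. A standalone restatement under a hypothetical name, convenient for quick experimentation:

# A micro-batch (here just a list of sequence lengths) is valid when it has
# enough samples, not too many samples, and its token count fits the budget.
def microbatch_is_valid(seqlens, max_tokens_per_batch, min_batch_size=1, max_batch_size=None):
    if len(seqlens) < min_batch_size:
        return False
    if max_batch_size is not None and len(seqlens) > max_batch_size:
        return False
    return sum(seqlens) <= max_tokens_per_batch

print(microbatch_is_valid([5, 7], max_tokens_per_batch=12))                  # True
print(microbatch_is_valid([5, 7, 3], max_tokens_per_batch=12))               # False: 15 tokens
print(microbatch_is_valid([5], max_tokens_per_batch=12, min_batch_size=2))   # False: too few samples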
- batch_sizes, batch_sum_metrics, batch_max_metrics, microbatch_sample_ids = [], [], [], [] + batch_sizes, batch_max_seqlens, microbatch_ids = [], [], [] for rank in range(0, len(microbatches), num_microbatches_per_batch): + batch_id = rank // num_microbatches_per_batch microbatch = microbatches[rank:rank + num_microbatches_per_batch] batch_size = sum([len(mb) for mb in microbatch]) - batch_sum_metric = sum([m[0] for m in microbatch[0]]) - batch_max_metric = max([m[0] for m in microbatch[0]]) - batch_sample_ids = [[m[1] for m in metrics] for metrics in microbatch] + batch_max_seqlen = max([m[0] for m in microbatch[0]]) + mb_sample_ids = [[m[1] for m in metrics] for metrics in microbatch] batch_sizes.append(batch_size) - batch_sum_metrics.append(batch_sum_metric) - batch_max_metrics.append(batch_max_metric) - microbatch_sample_ids += batch_sample_ids + batch_max_seqlens.append(batch_max_seqlen) + microbatch_ids.append( (batch_id, mb_sample_ids) ) + n_tokens_in_batch = sum([m[0] for m in microbatch[0]]) + assert n_tokens_in_batch <= max_tokens_per_batch if verbose: - print(f"Batch size {batch_size} samples, metric value {batch_sum_metric}, samples: {batch_sample_ids}") + print(f"Batch size {batch_size} samples, {n_tokens_in_batch} tokens, samples: {mb_sample_ids}") # return the sample ids of each microbatch, and the batch sizes - assert len(batch_sizes) == len(microbatch_sample_ids) // num_microbatches_per_batch - return microbatch_sample_ids, batch_sizes, batch_sum_metrics, batch_max_metrics + assert len(batch_sizes) == len(microbatch_ids) // num_microbatches_per_batch + return microbatch_ids, batch_sizes, batch_max_seqlens def scale_lr(base_batch_size, batch_size, base_lr=1, method="linear"): @@ -143,20 +141,24 @@ def scale_lr(base_batch_size, batch_size, base_lr=1, method="linear"): # by √k, to keep the variance in the gradient expectation constant" # (A. Krizhevsky. One weird trick for parallelizing convolutional neural networks) return base_lr * torch.sqrt(batch_size / base_batch_size) + elif method == None or method.upper() == "NONE": + return base_lr raise ValueError("Unknown scaling method: {}".format(method)) def dataloader_for_variable_batch_size(dataset, - microbatch_sample_ids, + microbatch_ids, + batch_max_seqlens, dataloader_rank, dataloader_num_replicas, - dataloader_collate_fn, + dataloader_collate_fn=None, dataloader_num_workers=2, - dataloader_pin_memory=False): + dataloader_pin_memory=False, + required_return_of_batch_max_seqlen=False): # equidistantly distribute the microbatches across the replicas in an interleaved fashion. sampler = DistributedSampler( - dataset=microbatch_sample_ids, + dataset=microbatch_ids, num_replicas=dataloader_num_replicas, rank=dataloader_rank, shuffle=False, @@ -164,16 +166,19 @@ def dataloader_for_variable_batch_size(dataset, ) # collate function applies wraps user defined collate function to the variable batch data - def collate_fn_wrapper(batch_sample_ids, dataset, collate_fn=None): - # batch is a list of sample ids per microbatch - assert len(batch_sample_ids) == 1, "only 1 element should be returned by the sampler." - batch_data = [dataset[idx] for idx in batch_sample_ids[0]] - return collate_fn(batch_data) if collate_fn else batch_data + def collate_fn_wrapper(list_microbatch_ids, dataset, collate_fn=None): + assert len(list_microbatch_ids) == 1, "only 1 element should be returned by the sampler." 
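The batch_max_seqlens values returned above exist so that a collate function can pad every micro-batch of an effective batch to one common length, keeping activation shapes uniform (e.g. for pipeline parallelism). A hedged sketch of such a collate step; the real collate function lives in the test file added by this patch:

import torch

# Pad (or trim) a micro-batch of 1D sequences to a fixed max_seqlen so that all
# micro-batches of the same effective batch share the same activation shape.
def collate_to_fixed_len(seqs, max_seqlen, padding_value=0):
    out = torch.full((len(seqs), max_seqlen), padding_value, dtype=seqs[0].dtype)
    for i, seq in enumerate(seqs):
        length = min(len(seq), max_seqlen)
        out[i, :length] = seq[:length]
    return out

seqs = [torch.ones(3), torch.ones(5)]
print(collate_to_fixed_len(seqs, max_seqlen=6).shape)  # torch.Size([2, 6])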
+ batch_id, microbatch_sample_ids = list_microbatch_ids[0] + batch = [dataset[idx] for idx in microbatch_sample_ids] + if required_return_of_batch_max_seqlen: + max_seqlen = batch_max_seqlens[batch_id] + return collate_fn(batch, max_seqlen) if collate_fn else (batch, max_seqlen) + return collate_fn(batch) if collate_fn else batch collate_fn = lambda b: collate_fn_wrapper(b, dataset, dataloader_collate_fn) dataloader = DataLoader( - dataset=microbatch_sample_ids, + dataset=microbatch_ids, sampler=sampler, num_workers=dataloader_num_workers, collate_fn=collate_fn, @@ -181,7 +186,7 @@ def collate_fn_wrapper(batch_sample_ids, dataset, collate_fn=None): ) deepspeed_io_kwargs = dict( - dataset=microbatch_sample_ids, + dataset=microbatch_ids, batch_size=1, pin_memory=dataloader_pin_memory, data_sampler=sampler, @@ -202,7 +207,6 @@ def get_lr(self) -> float: def lr_scheduler_for_variable_batch_size(base_batch_size, batch_sizes, dataloader, - batch_metrics, lr_scaling_method='linear', optimizer=None, lr_scheduler_class=None, @@ -228,7 +232,6 @@ class VariableBatchSizeLR(lr_scheduler_class or StubLRScheduler): def __init__(self, optimizer, **lr_scheduler_kwargs): self.batch_sizes = batch_sizes - self.batch_metrics = batch_metrics self.base_batch_size = base_batch_size self.lr_scaling_method = lr_scaling_method self.dataloader = dataloader @@ -291,17 +294,16 @@ def step(self, epoch=None): def get_dataloader_and_lr_scheduler_for_variable_batch_size( dataset, - dataset_metric_values, - max_metric_value_per_batch, + dataset_seqlens, + max_seqlen_per_batch, base_batch_size, sample_ids=None, lr_scaling_method="linear", min_batch_size=1, max_batch_size=None, - shuffle_metric_values=False, - order_by_metric_value=False, + shuffle_seqlens=False, + order_by_seqlen=False, gradient_accumulation_steps=1, - pipeline_parallelism=False, dataloader_rank=0, dataloader_num_replicas=1, dataloader_num_workers=0, @@ -310,41 +312,39 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size( optimizer=None, lr_scheduler_class=None, lr_scheduler_kwargs={'verbose': False}, + required_microbatches_of_same_size=False, + required_return_of_batch_max_seqlen=False, verbose=False, ): - # batch_by_size returns the effective batch size and the sample ids for each microbatch. - # We will use the sample ids to retrieve the batches from the dataset, - # and the effective batch size to retrieve the scaled learning rate for each batch - # Note: pipelining in DeepSpeed takes the first micro-batch activation shape as reference. - # So we need to make sure batch size remains constant across all microbatches in a batch. 
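For reference, the interleaved per-rank assignment produced by the DistributedSampler in dataloader_for_variable_batch_size above can be inspected directly; a small standalone check assuming two replicas:

from torch.utils.data import DistributedSampler

# With shuffle=False, DistributedSampler hands rank r the micro-batch ids
# r, r + num_replicas, r + 2*num_replicas, ... (padding the list if needed).
microbatch_ids = list(range(6))  # stand-in for the per-micro-batch sample id lists
for rank in range(2):
    sampler = DistributedSampler(microbatch_ids, num_replicas=2, rank=rank,
                                 shuffle=False, drop_last=False)
    print(rank, list(sampler))   # rank 0 -> [0, 2, 4], rank 1 -> [1, 3, 5]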
- microbatch_sample_ids, batch_sizes, batch_sum_metrics, batch_max_metrics = batch_by_size( - metric_values=dataset_metric_values, - max_metric_value_per_batch=max_metric_value_per_batch, + microbatch_ids, batch_sizes, batch_max_seqlens = batch_by_size( + seqlens=dataset_seqlens, + max_tokens_per_batch=max_seqlen_per_batch, sample_ids=sample_ids, min_batch_size=min_batch_size, max_batch_size=max_batch_size, - shuffle_metric_values=shuffle_metric_values, - order_by_metric_value=order_by_metric_value, + shuffle_seqlens=shuffle_seqlens, + order_by_seqlen=order_by_seqlen, dataloader_num_replicas=dataloader_num_replicas, gradient_accumulation_steps=gradient_accumulation_steps, - required_microbatches_of_same_size=pipeline_parallelism, + required_microbatches_of_same_size=required_microbatches_of_same_size, verbose=verbose, ) dataloader, deepspeed_io_kwargs = dataloader_for_variable_batch_size( dataset=dataset, - microbatch_sample_ids=microbatch_sample_ids, + microbatch_ids=microbatch_ids, + batch_max_seqlens=batch_max_seqlens, dataloader_rank=dataloader_rank, dataloader_num_replicas=dataloader_num_replicas, dataloader_collate_fn=dataloader_collate_fn, dataloader_num_workers=dataloader_num_workers, dataloader_pin_memory=dataloader_pin_memory, + required_return_of_batch_max_seqlen=required_return_of_batch_max_seqlen, ) lr_scheduler = lr_scheduler_for_variable_batch_size(base_batch_size=base_batch_size, batch_sizes=batch_sizes, - batch_metrics=batch_sum_metrics, lr_scaling_method=lr_scaling_method, optimizer=optimizer, dataloader=dataloader, @@ -352,157 +352,3 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size( **lr_scheduler_kwargs) return dataloader, lr_scheduler, deepspeed_io_kwargs - - -########## Main includes few examples on how to use this module ############### - -if __name__ == "__main__": - - class TestData(torch.utils.data.Dataset): - """ A test dataset with sequences of random length, and their sum as the target""" - - def __init__(self, seq_count, min_seq_len=1, max_seq_len=21, seed=0): - data_random = random.Random(seed) - self.seqs = [torch.ones(data_random.randrange(min_seq_len, max_seq_len)) for _ in range(seq_count)] - - __len__ = lambda self: len(self.seqs) - __getitem__ = lambda self, idx: [self.seqs[idx], self.seqs[idx].sum()] - - # collate_fn merges sequences, padded to the max length, or trimmed/paded to a value - @staticmethod - def collate_fn(batch, max_seq_len=None): - # if max_seq_len in enforces, trim/pad them to the max_len specified - if max_seq_len is not None: - for i, (seq, _) in enumerate(batch): - batch[i][0] = torch.nn.ConstantPad1d((0, max_seq_len - seq.shape[0]), 0)(seq) - seqs, labels = zip(*batch) - padded = torch.nn.utils.rnn.pad_sequence(seqs, batch_first=True, padding_value=0) - labels = torch.tensor(labels) - return padded, labels - - class TestFeedForward(torch.nn.Module): - """ a test feedforward model """ - - def __init__(self): - super(TestFeedForward, self).__init__() - self.fc1 = torch.nn.Linear(max_seq_len, 128) - self.fc2 = torch.nn.Linear(128, 128) - self.fc3 = torch.nn.Linear(128, 128) - self.fc4 = torch.nn.Linear(128, 128) - - def forward(self, x): - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = F.relu(self.fc3(x)) - x = F.relu(self.fc4(x)) - return x.sum() - - def to_layers(self): - return [self.fc1, self.fc2, self.fc3, self.fc4, lambda x: x.sum()] - - dataloader_rank = int(os.environ.get('RANK', 0)) - dataloader_num_replicas = int(os.environ.get('WORLD_SIZE', 1)) - device_id = int(os.environ.get('LOCAL_RANK', 0)) - 
device = f"cuda:{device_id}" - max_seq_len = 15 - max_metric_value_per_batch = 40 - base_batch_size = 8 - base_lr = 1e-3 - gradient_accumulation_steps = base_batch_size // dataloader_num_replicas - pipeline_parallelism = True - order_by_metric_value = True #enable for curriculum - - torch_dist.init_process_group(backend='nccl') - model = TestFeedForward().to(device) - dataset = TestData(seq_count=300, min_seq_len=5, max_seq_len=max_seq_len) - model_ddp = DDP(model, device_ids=[device]) - optimizer = torch.optim.Adam(model_ddp.parameters(), lr=1e-3) - - metric_values = [len(s[0]) for s in dataset] # difficulty = input sequence length - dataloader, lr_scheduler, deepspeed_io_kwargs = get_dataloader_and_lr_scheduler_for_variable_batch_size( - dataset=dataset, - dataset_metric_values=metric_values, - base_batch_size=base_batch_size, - max_metric_value_per_batch=max_metric_value_per_batch, - dataloader_rank=dataloader_rank, - dataloader_num_replicas=dataloader_num_replicas, - pipeline_parallelism=pipeline_parallelism, - lr_scaling_method="linear", - order_by_metric_value=order_by_metric_value, - gradient_accumulation_steps=gradient_accumulation_steps, - dataloader_num_workers=0, - dataloader_collate_fn=lambda b: TestData.collate_fn(b, max_seq_len=max_seq_len), - optimizer=optimizer, - # lr_scheduler_class=torch.optim.lr_scheduler.StepLR, - # lr_scheduler_kwargs=dict(optimizer=optimizer, step_size=1, gamma=0.1), - ) - - # PyTorch example iterating whole dataset in one epoch - for epoch in range(2): - for sample_idx, (inputs, labels) in enumerate(dataloader): - batch_id = sample_idx // gradient_accumulation_steps - batch_id = sample_idx % gradient_accumulation_steps - inputs, labels = inputs.to(device), labels.to(device) - outputs = model_ddp(inputs) - loss = F.mse_loss(outputs, labels) - loss.backward() - if (batch_id + 1) % gradient_accumulation_steps == 0: - if dataloader_rank == 0: - print(f"batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}") - optimizer.step() - optimizer.zero_grad() - lr_scheduler.step() - - torch_dist.destroy_process_group() - - # DeepSpeed example - config = { - "train_batch_size": base_batch_size, - "gradient_accumulation_steps": gradient_accumulation_steps, - "optimizer": { - "type": "Adam", - "params": { - "lr": base_lr - } - }, - } - - engine, optimizer, _, lr_scheduler = deepspeed.initialize(config=config, - model=model, - optimizer=optimizer, - lr_scheduler=lr_scheduler) - # engine.training_dataloader = dataloader #use this or the deepspeed_io() - engine.training_dataloader = engine.deepspeed_io(**deepspeed_io_kwargs) - - lr_scheduler.step(0) # reset LR scheduler - for epoch in range(2): - for sample_idx, (inputs, labels) in enumerate(dataloader): - batch_id = sample_idx // gradient_accumulation_steps - batch_id = sample_idx % gradient_accumulation_steps - inputs, labels = inputs.to(device), labels.to(device) - outputs = engine(inputs) - loss = F.mse_loss(outputs, labels) - engine.backward(loss) - if dataloader_rank == 0: - print( - f"batch {batch_id}, batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}" - ) - engine.step() - - # Deepspeed example for pipeline parallelism - if pipeline_parallelism: - model = PipelineModule(layers=model.to_layers(), num_stages=2) - engine, optimizer, _, lr_scheduler = deepspeed.initialize(config=config, - model=model, - optimizer=optimizer, - lr_scheduler=lr_scheduler) - # engine.training_dataloader = dataloader #use this or the deepspeed_io() - engine.training_dataloader = 
engine.deepspeed_io(**deepspeed_io_kwargs) - - dataloader_it = iter(dataloader) # reset dataloader - lr_scheduler.step(0) # reset LR scheduler - for epoch in range(2): - for batch_id in range(len(dataloader) // gradient_accumulation_steps): - loss = engine.train_batch(data_iter=dataloader_it) - if dataloader_rank == 0: - print(f"batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}") diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_test.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_test.py new file mode 100644 index 000000000000..b4d0ad5a7856 --- /dev/null +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_test.py @@ -0,0 +1,170 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import random +import os +import torch +from torch.nn.parallel import DistributedDataParallel as DDP +from torch import distributed as torch_dist +import torch.nn.functional as F +import deepspeed +from deepspeed.pipe import PipelineModule + +from deepspeed.runtime.data_pipeline.data_sampling.variable_batch_size_and_lr import get_dataloader_and_lr_scheduler_for_variable_batch_size + + +if __name__ == "__main__": + + class TestData(torch.utils.data.Dataset): + """ A test dataset with sequences of random length, and their sum as the target""" + + def __init__(self, seq_count, min_seq_len=1, max_seq_len=21, seed=0): + data_random = random.Random(seed) + self.seqs = [torch.ones(data_random.randrange(min_seq_len, max_seq_len)) for _ in range(seq_count)] + + __len__ = lambda self: len(self.seqs) + __getitem__ = lambda self, idx: [self.seqs[idx], self.seqs[idx].sum()] + + # collate_fn merges sequences and trims/pads them to the max_len specified + @staticmethod + def collate_fn(batch, max_seqlen=None, padding_value=0): + if max_seqlen is not None: + for i, (seq, _) in enumerate(batch): + batch[i][0] = torch.nn.ConstantPad1d((0, max_seqlen - seq.shape[0]), padding_value)(seq) + seqs, labels = zip(*batch) + padded = torch.nn.utils.rnn.pad_sequence(seqs, batch_first=True, padding_value=padding_value) + labels = torch.tensor(labels) + return padded, labels + + + class TestFeedForward(torch.nn.Module): + """ a test feedforward model """ + + def __init__(self): + super(TestFeedForward, self).__init__() + self.fc1 = torch.nn.Linear(max_seq_len, 128) + self.fc2 = torch.nn.Linear(128, 128) + self.fc3 = torch.nn.Linear(128, 128) + self.fc4 = torch.nn.Linear(128, 128) + + def forward(self, x): + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = F.relu(self.fc3(x)) + x = F.relu(self.fc4(x)) + return x.sum() + + def to_layers(self): + return [self.fc1, self.fc2, self.fc3, self.fc4, lambda x: x.sum()] + + dataloader_rank = int(os.environ.get('RANK', 0)) + dataloader_num_replicas = int(os.environ.get('WORLD_SIZE', 1)) + device_id = int(os.environ.get('LOCAL_RANK', 0)) + device = f"cuda:{device_id}" + max_seqlen_per_batch = 40 + base_batch_size = 8 + base_lr = 1e-3 + gradient_accumulation_steps = base_batch_size // dataloader_num_replicas + pipeline_parallelism = True + order_by_seqlen = True #enable for curriculum + + torch_dist.init_process_group(backend='nccl') + model = TestFeedForward().to(device) + dataset = TestData(seq_count=300, min_seq_len=5, max_seq_len=15) + model_ddp = DDP(model, device_ids=[device]) + optimizer = torch.optim.Adam(model_ddp.parameters(), lr=1e-3) + + seqlens = [len(s[0]) for s in dataset] # difficulty = input sequence length + 
+ if pipeline_parallelism: + collate_fn = lambda b, m: TestData.collate_fn(b, m, padding_value=0) + else: + collate_fn = lambda b: TestData.collate_fn(b, padding_value=0) + + dataloader, lr_scheduler, deepspeed_io_kwargs = \ + get_dataloader_and_lr_scheduler_for_variable_batch_size( + dataset=dataset, + dataset_seqlens=seqlens, + base_batch_size=base_batch_size, + max_seqlen_per_batch=max_seqlen_per_batch, + dataloader_rank=dataloader_rank, + dataloader_num_replicas=dataloader_num_replicas, + lr_scaling_method="linear", + order_by_seqlen=order_by_seqlen, + gradient_accumulation_steps=gradient_accumulation_steps, + dataloader_num_workers=0, + dataloader_collate_fn=collate_fn, + optimizer=optimizer, + # lr_scheduler_class=torch.optim.lr_scheduler.StepLR, + # lr_scheduler_kwargs=dict(optimizer=optimizer, step_size=1, gamma=0.1), + required_microbatches_of_same_size=pipeline_parallelism, + required_return_of_batch_max_seqlen=pipeline_parallelism, + ) + + # PyTorch example iterating whole dataset in one epoch + for epoch in range(2): + for sample_idx, (inputs, labels) in enumerate(dataloader): + batch_id = sample_idx // gradient_accumulation_steps + microbatch_id = sample_idx % gradient_accumulation_steps + inputs, labels = inputs.to(device), labels.to(device) + outputs = model_ddp(inputs) + loss = F.mse_loss(outputs, labels) + loss.backward() + if (microbatch_id + 1) % gradient_accumulation_steps == 0: + if dataloader_rank == 0: + print(f"batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}") + optimizer.step() + optimizer.zero_grad() + lr_scheduler.step() + + torch_dist.destroy_process_group() + + # DeepSpeed example + config = { + "train_batch_size": base_batch_size, + "gradient_accumulation_steps": gradient_accumulation_steps, + "optimizer": { + "type": "Adam", + "params": { + "lr": base_lr + } + }, + } + + engine, optimizer, _, lr_scheduler = deepspeed.initialize( + config=config, model=model, optimizer=optimizer, lr_scheduler=lr_scheduler) + # engine.training_dataloader = dataloader #use this or the deepspeed_io() + engine.training_dataloader = engine.deepspeed_io(**deepspeed_io_kwargs) + + lr_scheduler.step(0) # reset LR scheduler + for epoch in range(2): + for sample_idx, (inputs, labels) in enumerate(dataloader): + batch_id = sample_idx // gradient_accumulation_steps + microbatch_id = sample_idx % gradient_accumulation_steps + inputs, labels = inputs.to(device), labels.to(device) + outputs = engine(inputs) + loss = F.mse_loss(outputs, labels) + engine.backward(loss) + if dataloader_rank == 0: + print( + f"batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}" + ) + engine.step() + + # Deepspeed example for pipeline parallelism + if pipeline_parallelism: + model = PipelineModule(layers=model.to_layers(), num_stages=2) + engine, optimizer, _, lr_scheduler = deepspeed.initialize( + config=config, model=model, optimizer=optimizer, lr_scheduler=lr_scheduler) + engine.training_dataloader = engine.deepspeed_io(**deepspeed_io_kwargs) + + dataloader_it = iter(dataloader) # reset dataloader + lr_scheduler.step(0) # reset LR scheduler + for epoch in range(2): + for batch_id in range(len(dataloader) // gradient_accumulation_steps): + engine.reset_activation_shape() # each batch has a diff length + loss = engine.train_batch(data_iter=dataloader_it) + if dataloader_rank == 0: + print(f"batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}") From ed7d8ea2be397d9eac11a3b5d77976fd8d0c6b3b Mon Sep 17 00:00:00 2001 From: 
Bruno Magalhaes Date: Tue, 12 Mar 2024 17:08:55 +0000 Subject: [PATCH 44/64] attention head by hand --- deepspeed/runtime/data_pipeline/config.py | 28 ++++- deepspeed/runtime/data_pipeline/constants.py | 16 +++ .../variable_batch_size_and_lr.py | 29 ++--- .../variable_batch_size_and_lr_test.py | 118 +++++++++++------- 4 files changed, 123 insertions(+), 68 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/config.py b/deepspeed/runtime/data_pipeline/config.py index 623480518925..49f3a5ff614d 100644 --- a/deepspeed/runtime/data_pipeline/config.py +++ b/deepspeed/runtime/data_pipeline/config.py @@ -47,7 +47,7 @@ def get_data_sampling(param_dict): param_dict[DATA_SAMPLING] = {} sub_param_dict = param_dict[DATA_SAMPLING] output[CURRICULUM_LEARNING] = get_curriculum_learning(sub_param_dict) - + # output[DYNAMIC_BATCHING] = get_dynamic_batching(sub_param_dict) return output @@ -87,6 +87,32 @@ def get_curriculum_learning(param_dict): return output +def get_dynamic_batching_params(param_dict): + if DYNAMIC_BATCHING in param_dict.keys(): + dynamic_batching_params = copy.copy(param_dict[DYNAMIC_BATCHING]) + dynamic_batching_params.pop(DYNAMIC_BATCHING_ENABLED) + return dynamic_batching_params + else: + return {} + + +def get_dynamic_batching(param_dict): + output = {} + if DYNAMIC_BATCHING not in param_dict.keys(): + sub_param_dict = param_dict[DYNAMIC_BATCHING] + sub_param_dict[DYNAMIC_BATCHING_ENABLED] = DYNAMIC_BATCHING_ENABLED_DEFAULT + sub_param_dict[DYNAMIC_BATCHING_LR_SCALING] = DYNAMIC_BATCHING_LR_SCALING + sub_param_dict[DYNAMIC_BATCHING_MIN_BATCH_SIZE] = DYNAMIC_BATCHING_MIN_BATCH_SIZE_DEFAULT + sub_param_dict[DYNAMIC_BATCHING_MAX_BATCH_SIZE] = None + sub_param_dict[DYNAMIC_BATCHING_SAMPLES_ORDER] = DYNAMIC_BATCHING_SAMPLES_ORDER_DEFAULT + if sub_param_dict[DYNAMIC_BATCHING_ENABLED]: + assert DYNAMIC_BATCHING_MAX_TOKENS_PER_BATCH in sub_param_dict.keys(), \ + f"Dynamic batching is enabled, {DYNAMIC_BATCHING_MAX_TOKENS_PER_BATCH} must be specified" + for key, val in get_dynamic_batching_params(param_dict).items(): + output[key] = val + return output + + def get_curriculum_learning_enabled(param_dict): if CURRICULUM_LEARNING in param_dict.keys(): return get_scalar_param(param_dict[CURRICULUM_LEARNING], CURRICULUM_LEARNING_ENABLED, diff --git a/deepspeed/runtime/data_pipeline/constants.py b/deepspeed/runtime/data_pipeline/constants.py index 1ade640e38d9..0ba32039e106 100644 --- a/deepspeed/runtime/data_pipeline/constants.py +++ b/deepspeed/runtime/data_pipeline/constants.py @@ -62,6 +62,22 @@ CURRICULUM_LEARNING_DATA_CLUSTER_CURRENT_POSITION = "data_cluster_current_position" CURRICULUM_LEARNING_NP_RNG_STATE = "np_rng_state" +######################################### +# Data efficiency - Dynamic batching and LR scaling +######################################### +DYNAMIC_BATCHING = "dynamic_batching" +DYNAMIC_BATCHING_ENABLED = "enabled" +DYNAMIC_BATCHING_ENABLED_DEFAULT = False +DYNAMIC_BATCHING_LR_SCALING = "lr_scaling" # "linear" / "sqrt" / "none" +DYNAMIC_BATCHING_LR_SCALING_DEFAULT = "linear" +DYNAMIC_BATCHING_MIN_BATCH_SIZE = "min_batch_size" +DYNAMIC_BATCHING_MIN_BATCH_SIZE_DEFAULT = 1 +DYNAMIC_BATCHING_MAX_BATCH_SIZE = "max_batch_size" +DYNAMIC_BATCHING_MAX_BATCH_SIZE_DEFAULT = None +DYNAMIC_BATCHING_SAMPLES_ORDER = "samples_order" # "random" / "order" / "default" +DYNAMIC_BATCHING_SAMPLES_ORDER_DEFAULT = "dataloader" # "random" / "order" / "dataloader" +DYNAMIC_BATCHING_MAX_TOKENS_PER_BATCH = "max_tokens_per_batch" + ######################################### # Curriculum 
Learning legacy implementation ######################################### diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py index c996033d77d3..0abe1161fb44 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py @@ -101,6 +101,7 @@ def is_microbatch_valid(metrics): break if not is_batch_valid: batch_size -= equal_size_multiple #ignore last iteration (not valid) + if batch_size == 0 : break # last batch is not valid (size zero), so we are done batch = metrics[batch_init:batch_init + batch_size] microbatch = [batch[b::equal_size_multiple] for b in range(equal_size_multiple)] batch_init += sum([len(l) for l in microbatch]) @@ -110,24 +111,22 @@ def is_microbatch_valid(metrics): microbatches = microbatches[:len(microbatches) - len(microbatches) % num_microbatches_per_batch] #compute the effective batch size for each microbatch. - batch_sizes, batch_max_seqlens, microbatch_ids = [], [], [] + batch_sizes, microbatch_ids = [], [] for rank in range(0, len(microbatches), num_microbatches_per_batch): batch_id = rank // num_microbatches_per_batch microbatch = microbatches[rank:rank + num_microbatches_per_batch] batch_size = sum([len(mb) for mb in microbatch]) - batch_max_seqlen = max([m[0] for m in microbatch[0]]) - mb_sample_ids = [[m[1] for m in metrics] for metrics in microbatch] + mb_ids = [ [m[1] for m in metrics] for metrics in microbatch] batch_sizes.append(batch_size) - batch_max_seqlens.append(batch_max_seqlen) - microbatch_ids.append( (batch_id, mb_sample_ids) ) + microbatch_ids += mb_ids n_tokens_in_batch = sum([m[0] for m in microbatch[0]]) assert n_tokens_in_batch <= max_tokens_per_batch if verbose: - print(f"Batch size {batch_size} samples, {n_tokens_in_batch} tokens, samples: {mb_sample_ids}") + print(f"Batch id {batch_id}, size {batch_size}, tokens {n_tokens_in_batch} tokens, samples: {mb_ids}") # return the sample ids of each microbatch, and the batch sizes assert len(batch_sizes) == len(microbatch_ids) // num_microbatches_per_batch - return microbatch_ids, batch_sizes, batch_max_seqlens + return microbatch_ids, batch_sizes def scale_lr(base_batch_size, batch_size, base_lr=1, method="linear"): @@ -148,13 +147,11 @@ def scale_lr(base_batch_size, batch_size, base_lr=1, method="linear"): def dataloader_for_variable_batch_size(dataset, microbatch_ids, - batch_max_seqlens, dataloader_rank, dataloader_num_replicas, dataloader_collate_fn=None, dataloader_num_workers=2, - dataloader_pin_memory=False, - required_return_of_batch_max_seqlen=False): + dataloader_pin_memory=False): # equidistantly distribute the microbatches across the replicas in an interleaved fashion. sampler = DistributedSampler( @@ -168,11 +165,8 @@ def dataloader_for_variable_batch_size(dataset, # collate function applies wraps user defined collate function to the variable batch data def collate_fn_wrapper(list_microbatch_ids, dataset, collate_fn=None): assert len(list_microbatch_ids) == 1, "only 1 element should be returned by the sampler." 
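Based only on the DYNAMIC_BATCHING_* constants introduced above, a user-facing configuration fragment would presumably look like the sketch below; the exact nesting is an assumption, since get_dynamic_batching is still commented out in get_data_sampling at this point in the series:

# Hypothetical config fragment for dynamic batching, built only from the
# DYNAMIC_BATCHING_* constants above -- placement and defaults may differ
# in the final version of this PR.
ds_config_fragment = {
    "data_sampling": {
        "dynamic_batching": {
            "enabled": True,
            "lr_scaling": "linear",          # or "sqrt" / "none"
            "min_batch_size": 1,
            "max_batch_size": None,
            "samples_order": "dataloader",   # or "random" / "order"
            "max_tokens_per_batch": 1024,    # required when enabled
        }
    }
}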
- batch_id, microbatch_sample_ids = list_microbatch_ids[0] - batch = [dataset[idx] for idx in microbatch_sample_ids] - if required_return_of_batch_max_seqlen: - max_seqlen = batch_max_seqlens[batch_id] - return collate_fn(batch, max_seqlen) if collate_fn else (batch, max_seqlen) + microbatch_ids = list_microbatch_ids[0] + batch = [dataset[idx] for idx in microbatch_ids] return collate_fn(batch) if collate_fn else batch collate_fn = lambda b: collate_fn_wrapper(b, dataset, dataloader_collate_fn) @@ -313,11 +307,10 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size( lr_scheduler_class=None, lr_scheduler_kwargs={'verbose': False}, required_microbatches_of_same_size=False, - required_return_of_batch_max_seqlen=False, verbose=False, ): - microbatch_ids, batch_sizes, batch_max_seqlens = batch_by_size( + microbatch_ids, batch_sizes = batch_by_size( seqlens=dataset_seqlens, max_tokens_per_batch=max_seqlen_per_batch, sample_ids=sample_ids, @@ -334,13 +327,11 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size( dataloader, deepspeed_io_kwargs = dataloader_for_variable_batch_size( dataset=dataset, microbatch_ids=microbatch_ids, - batch_max_seqlens=batch_max_seqlens, dataloader_rank=dataloader_rank, dataloader_num_replicas=dataloader_num_replicas, dataloader_collate_fn=dataloader_collate_fn, dataloader_num_workers=dataloader_num_workers, dataloader_pin_memory=dataloader_pin_memory, - required_return_of_batch_max_seqlen=required_return_of_batch_max_seqlen, ) lr_scheduler = lr_scheduler_for_variable_batch_size(base_batch_size=base_batch_size, diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_test.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_test.py index b4d0ad5a7856..723b019489cf 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_test.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_test.py @@ -6,6 +6,7 @@ import random import os import torch +import torch.nn as nn from torch.nn.parallel import DistributedDataParallel as DDP from torch import distributed as torch_dist import torch.nn.functional as F @@ -18,46 +19,72 @@ if __name__ == "__main__": class TestData(torch.utils.data.Dataset): - """ A test dataset with sequences of random length, and their sum as the target""" + """ A test dataset with sequences of random length, and the sequence length as the label""" - def __init__(self, seq_count, min_seq_len=1, max_seq_len=21, seed=0): + def __init__(self, seq_count, min_seqlen=1, max_seqlen=21, embed_dim=5, seed=0): data_random = random.Random(seed) - self.seqs = [torch.ones(data_random.randrange(min_seq_len, max_seq_len)) for _ in range(seq_count)] + self.mask_size = max_seqlen # M: size of mask + self.padding_value = 0 + self.embed_dim = embed_dim + self.seqs = [torch.ones(data_random.randrange(min_seqlen, max_seqlen), embed_dim) for _ in range(seq_count)] __len__ = lambda self: len(self.seqs) - __getitem__ = lambda self, idx: [self.seqs[idx], self.seqs[idx].sum()] - - # collate_fn merges sequences and trims/pads them to the max_len specified - @staticmethod - def collate_fn(batch, max_seqlen=None, padding_value=0): - if max_seqlen is not None: - for i, (seq, _) in enumerate(batch): - batch[i][0] = torch.nn.ConstantPad1d((0, max_seqlen - seq.shape[0]), padding_value)(seq) + __getitem__ = lambda self, idx: ( self.seqs[idx], len(self.seqs[idx]) ) + + def collate_fn(self, batch): + """ pad sequences of different lenghts into batch of size BxTxE """ 
seqs, labels = zip(*batch) - padded = torch.nn.utils.rnn.pad_sequence(seqs, batch_first=True, padding_value=padding_value) - labels = torch.tensor(labels) - return padded, labels + seqlens = torch.tensor([ len(s) for s in seqs ]) + seqs = nn.utils.rnn.pad_sequence([s[0] for s in batch], batch_first=True, padding_value=self.padding_value) + labels = torch.tensor([s[1] for s in batch], dtype=float) + return seqs, seqlens, labels - class TestFeedForward(torch.nn.Module): + class SingleHeadAttentionAndFeedForward(nn.Module): """ a test feedforward model """ - def __init__(self): - super(TestFeedForward, self).__init__() - self.fc1 = torch.nn.Linear(max_seq_len, 128) - self.fc2 = torch.nn.Linear(128, 128) - self.fc3 = torch.nn.Linear(128, 128) - self.fc4 = torch.nn.Linear(128, 128) + def __init__(self, max_seqlen, embed_dim): + super(SingleHeadAttentionAndFeedForward, self).__init__() + + self.padding_value = 0 + self.max_seqlen = max_seqlen # M: size of mask + self.attn_head = nn.MultiheadAttention(embed_dim, num_heads=1) + self.fc1 = nn.Linear(embed_dim, 128) + self.fc2 = nn.Linear(128, embed_dim) + + def forward(self, x, attn_mask_seqlens=None): + + # optional: 3D masks for attention, padded to individual input sequence lengths + B, T = len(x), max(attn_mask_seqlens) + if attn_mask_seqlens is not None: + masks = torch.tril(torch.ones((B,T,T), dtype=torch.float32)).to(x[0].device) + for i, seqlen in enumerate(attn_mask_seqlens): + masks[i, seqlen:, :] = masks[i, :, seqlen:] = 0 + + # collates sequences of different lengths into a batch of size BxTxE + x = torch.nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value=self.padding_value) + + # compute q@k / sqrt(d_k) on input of shape BxTxE (where B and T can change) + k, q, v = x, x, x + out = q@k.transpose(-2,-1) # BxTxE @ BxExT --> BxTxT + out = out / (x.shape[-1]**0.5) # √d_k + + if attn_mask_seqlens is not None: # mask, if needed + out = out.masked_fill(masks==0, value=float('-inf')) + + # softmax and multiply by values vector + out = F.softmax(out, dim=-1) # softmax --> BxTxT + out = out@v # BxTxT @ BxTxE --> BxTxE + + # feedforward: needs to converts BxTxE to BxMxE by padding extra tokens + out = F.pad(out, pad=(0, 0, 0, self.max_seqlen-T), value=self.padding_value) + out = F.relu(self.fc1(out)) + out = F.relu(self.fc2(out)) + return torch.tensor(out.sum(-1).sum(-1), requires_grad=True, dtype=float) - def forward(self, x): - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = F.relu(self.fc3(x)) - x = F.relu(self.fc4(x)) - return x.sum() def to_layers(self): - return [self.fc1, self.fc2, self.fc3, self.fc4, lambda x: x.sum()] + return [self.fc1, self.fc2, lambda x: x.sum(-1).sum(-1)] dataloader_rank = int(os.environ.get('RANK', 0)) dataloader_num_replicas = int(os.environ.get('WORLD_SIZE', 1)) @@ -70,19 +97,15 @@ def to_layers(self): pipeline_parallelism = True order_by_seqlen = True #enable for curriculum + max_seqlen = 15 torch_dist.init_process_group(backend='nccl') - model = TestFeedForward().to(device) - dataset = TestData(seq_count=300, min_seq_len=5, max_seq_len=15) + dataset = TestData(seq_count=300, min_seqlen=5, max_seqlen=max_seqlen) + model = SingleHeadAttentionAndFeedForward(max_seqlen, dataset.embed_dim).to(device) model_ddp = DDP(model, device_ids=[device]) optimizer = torch.optim.Adam(model_ddp.parameters(), lr=1e-3) + loss_fn = lambda x, y: F.mse_loss(x, y) - seqlens = [len(s[0]) for s in dataset] # difficulty = input sequence length - - if pipeline_parallelism: - collate_fn = lambda b, m: TestData.collate_fn(b, 
m, padding_value=0) - else: - collate_fn = lambda b: TestData.collate_fn(b, padding_value=0) - + seqlens = [len(s[0]) for s in dataset] dataloader, lr_scheduler, deepspeed_io_kwargs = \ get_dataloader_and_lr_scheduler_for_variable_batch_size( dataset=dataset, @@ -95,22 +118,21 @@ def to_layers(self): order_by_seqlen=order_by_seqlen, gradient_accumulation_steps=gradient_accumulation_steps, dataloader_num_workers=0, - dataloader_collate_fn=collate_fn, + dataloader_collate_fn=dataset.collate_fn, optimizer=optimizer, # lr_scheduler_class=torch.optim.lr_scheduler.StepLR, # lr_scheduler_kwargs=dict(optimizer=optimizer, step_size=1, gamma=0.1), required_microbatches_of_same_size=pipeline_parallelism, - required_return_of_batch_max_seqlen=pipeline_parallelism, ) # PyTorch example iterating whole dataset in one epoch for epoch in range(2): - for sample_idx, (inputs, labels) in enumerate(dataloader): + for sample_idx, (seqs, seqlens, labels) in enumerate(dataloader): batch_id = sample_idx // gradient_accumulation_steps microbatch_id = sample_idx % gradient_accumulation_steps - inputs, labels = inputs.to(device), labels.to(device) - outputs = model_ddp(inputs) - loss = F.mse_loss(outputs, labels) + seqs, labels = seqs.to(device), labels.to(device) + outputs = model_ddp(seqs, seqlens) + loss = loss_fn(outputs, labels) loss.backward() if (microbatch_id + 1) % gradient_accumulation_steps == 0: if dataloader_rank == 0: @@ -140,12 +162,12 @@ def to_layers(self): lr_scheduler.step(0) # reset LR scheduler for epoch in range(2): - for sample_idx, (inputs, labels) in enumerate(dataloader): + for sample_idx, (seqs, seqlens, labels) in enumerate(dataloader): batch_id = sample_idx // gradient_accumulation_steps microbatch_id = sample_idx % gradient_accumulation_steps - inputs, labels = inputs.to(device), labels.to(device) - outputs = engine(inputs) - loss = F.mse_loss(outputs, labels) + seqs, labels = seqs.to(device), labels.to(device) + outputs = engine(seqs, seqlens) + loss = loss_fn(outputs, labels) engine.backward(loss) if dataloader_rank == 0: print( @@ -155,7 +177,7 @@ def to_layers(self): # Deepspeed example for pipeline parallelism if pipeline_parallelism: - model = PipelineModule(layers=model.to_layers(), num_stages=2) + model = PipelineModule(layers=model.to_layers(), num_stages=2, loss_fn=loss_fn) engine, optimizer, _, lr_scheduler = deepspeed.initialize( config=config, model=model, optimizer=optimizer, lr_scheduler=lr_scheduler) engine.training_dataloader = engine.deepspeed_io(**deepspeed_io_kwargs) From b516356d3a5895f1a99ed5105a0378687b012f2a Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Tue, 12 Mar 2024 20:29:34 +0000 Subject: [PATCH 45/64] fixed seq lens computation --- .../variable_batch_size_and_lr_test.py | 78 +++++++++---------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_test.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_test.py index 723b019489cf..0ad3644dbaa5 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_test.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_test.py @@ -34,53 +34,53 @@ def __init__(self, seq_count, min_seqlen=1, max_seqlen=21, embed_dim=5, seed=0): def collate_fn(self, batch): """ pad sequences of different lenghts into batch of size BxTxE """ seqs, labels = zip(*batch) - seqlens = torch.tensor([ len(s) for s in seqs ]) seqs = nn.utils.rnn.pad_sequence([s[0] for s in 
batch], batch_first=True, padding_value=self.padding_value) + seqs = torch.nn.utils.rnn.pad_sequence(seqs, batch_first=True, padding_value=self.padding_value) labels = torch.tensor([s[1] for s in batch], dtype=float) - return seqs, seqlens, labels + return seqs, labels - class SingleHeadAttentionAndFeedForward(nn.Module): - """ a test feedforward model """ - - def __init__(self, max_seqlen, embed_dim): - super(SingleHeadAttentionAndFeedForward, self).__init__() + class AttentionHeadAndFeedForward(nn.Module): + """ A single attention head followed by a feed forward. No embeddings """ + def __init__(self, max_seqlen, embed_dim, device): + super(AttentionHeadAndFeedForward, self).__init__() self.padding_value = 0 self.max_seqlen = max_seqlen # M: size of mask - self.attn_head = nn.MultiheadAttention(embed_dim, num_heads=1) + self.device = device + self.qe = nn.Linear(embed_dim, embed_dim) + self.ke = nn.Linear(embed_dim, embed_dim) + self.ve = nn.Linear(embed_dim, embed_dim) + self.attn_head = nn.MultiheadAttention(embed_dim, num_heads=1, batch_first=True) self.fc1 = nn.Linear(embed_dim, 128) self.fc2 = nn.Linear(128, embed_dim) - def forward(self, x, attn_mask_seqlens=None): + def forward(self, x): + + # compute length of each sequence as first index of padding value, or max length if no padding + B, T, E = x.shape + seqlens = torch.full(size=(B,), fill_value=T, dtype=int, device=x.device) + seq_ids, seq_padding_ids = torch.where(x[:,:,0]==self.padding_value) + seqlens[seq_ids] = seq_padding_ids # optional: 3D masks for attention, padded to individual input sequence lengths - B, T = len(x), max(attn_mask_seqlens) - if attn_mask_seqlens is not None: - masks = torch.tril(torch.ones((B,T,T), dtype=torch.float32)).to(x[0].device) - for i, seqlen in enumerate(attn_mask_seqlens): - masks[i, seqlen:, :] = masks[i, :, seqlen:] = 0 + masks = torch.tril(torch.ones((B,T,T), dtype=bool)).to(self.device) + for i, seqlen in enumerate(seqlens): + masks[i, seqlen:, :] = masks[i, :, seqlen:] = False # collates sequences of different lengths into a batch of size BxTxE x = torch.nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value=self.padding_value) + x = x.to(self.device) - # compute q@k / sqrt(d_k) on input of shape BxTxE (where B and T can change) - k, q, v = x, x, x - out = q@k.transpose(-2,-1) # BxTxE @ BxExT --> BxTxT - out = out / (x.shape[-1]**0.5) # √d_k - - if attn_mask_seqlens is not None: # mask, if needed - out = out.masked_fill(masks==0, value=float('-inf')) - - # softmax and multiply by values vector - out = F.softmax(out, dim=-1) # softmax --> BxTxT - out = out@v # BxTxT @ BxTxE --> BxTxE + # linear projections and attention head + q, k, v = self.qe(x), self.ke(x), self.ve(x) + out, _ = self.attn_head(q, k, v, need_weights=False, attn_mask=masks) - # feedforward: needs to converts BxTxE to BxMxE by padding extra tokens + # feedforward: needs to convert BxTxE to BxMxE by padding extra tokens out = F.pad(out, pad=(0, 0, 0, self.max_seqlen-T), value=self.padding_value) out = F.relu(self.fc1(out)) out = F.relu(self.fc2(out)) - return torch.tensor(out.sum(-1).sum(-1), requires_grad=True, dtype=float) + return torch.tensor(out.nansum(-1).nansum(-1), requires_grad=True) def to_layers(self): @@ -95,15 +95,15 @@ def to_layers(self): base_lr = 1e-3 gradient_accumulation_steps = base_batch_size // dataloader_num_replicas pipeline_parallelism = True - order_by_seqlen = True #enable for curriculum + order_by_seqlen = False #enable for curriculum max_seqlen = 15 
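    # A simplified sketch of what max_seqlen_per_batch buys us: samples are packed into
    # microbatches whose summed sequence lengths stay under a token budget, so short sequences
    # share a microbatch while long ones get fewer companions. `pack_by_token_budget` below is
    # only an illustrative helper (not an API of this file); the real packing is done by
    # batch_by_size(), which additionally handles shuffling, ordering by length, min/max batch
    # sizes and equally-sized microbatches for pipeline parallelism.
    def pack_by_token_budget(seqlens, max_tokens_per_batch):
        """ greedily group sample ids so each group's total token count stays under the budget """
        microbatches, current, current_tokens = [], [], 0
        for sample_id, seqlen in enumerate(seqlens):
            if current and current_tokens + seqlen > max_tokens_per_batch:
                microbatches.append(current)
                current, current_tokens = [], 0
            current.append(sample_id)
            current_tokens += seqlen
        if current:
            microbatches.append(current)
        return microbatches

    # e.g. pack_by_token_budget([5, 7, 14, 6, 9], max_tokens_per_batch=40) -> [[0, 1, 2, 3], [4]]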
torch_dist.init_process_group(backend='nccl') dataset = TestData(seq_count=300, min_seqlen=5, max_seqlen=max_seqlen) - model = SingleHeadAttentionAndFeedForward(max_seqlen, dataset.embed_dim).to(device) + model = AttentionHeadAndFeedForward(max_seqlen, dataset.embed_dim, device).to(device) model_ddp = DDP(model, device_ids=[device]) optimizer = torch.optim.Adam(model_ddp.parameters(), lr=1e-3) - loss_fn = lambda x, y: F.mse_loss(x, y) + loss_fn = lambda x, y: F.mse_loss(x.float(), y.float()) seqlens = [len(s[0]) for s in dataset] dataloader, lr_scheduler, deepspeed_io_kwargs = \ @@ -127,16 +127,16 @@ def to_layers(self): # PyTorch example iterating whole dataset in one epoch for epoch in range(2): - for sample_idx, (seqs, seqlens, labels) in enumerate(dataloader): + for sample_idx, (seqs, labels) in enumerate(dataloader): batch_id = sample_idx // gradient_accumulation_steps microbatch_id = sample_idx % gradient_accumulation_steps seqs, labels = seqs.to(device), labels.to(device) - outputs = model_ddp(seqs, seqlens) + outputs = model_ddp(seqs) loss = loss_fn(outputs, labels) loss.backward() if (microbatch_id + 1) % gradient_accumulation_steps == 0: if dataloader_rank == 0: - print(f"batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}") + print(f"torch batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}") optimizer.step() optimizer.zero_grad() lr_scheduler.step() @@ -157,21 +157,21 @@ def to_layers(self): engine, optimizer, _, lr_scheduler = deepspeed.initialize( config=config, model=model, optimizer=optimizer, lr_scheduler=lr_scheduler) - # engine.training_dataloader = dataloader #use this or the deepspeed_io() + # engine.training_dataloader = dataloader # use this or the deepspeed_io() below engine.training_dataloader = engine.deepspeed_io(**deepspeed_io_kwargs) lr_scheduler.step(0) # reset LR scheduler for epoch in range(2): - for sample_idx, (seqs, seqlens, labels) in enumerate(dataloader): + for sample_idx, (seqs, labels) in enumerate(dataloader): batch_id = sample_idx // gradient_accumulation_steps microbatch_id = sample_idx % gradient_accumulation_steps seqs, labels = seqs.to(device), labels.to(device) - outputs = engine(seqs, seqlens) + outputs = engine(seqs) loss = loss_fn(outputs, labels) engine.backward(loss) if dataloader_rank == 0: print( - f"batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}" + f"deepspeed batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}" ) engine.step() @@ -186,7 +186,7 @@ def to_layers(self): lr_scheduler.step(0) # reset LR scheduler for epoch in range(2): for batch_id in range(len(dataloader) // gradient_accumulation_steps): - engine.reset_activation_shape() # each batch has a diff length + engine.reset_activation_shape() # each batch has a diff BxT dimension loss = engine.train_batch(data_iter=dataloader_it) if dataloader_rank == 0: - print(f"batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}") + print(f"pipeline batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}") From 8b41845d3180658e85b4609bf9b6591498b7ece0 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Tue, 12 Mar 2024 22:58:09 +0000 Subject: [PATCH 46/64] pipeline parallelism for enforced max seq size --- .../variable_batch_size_and_lr.py | 32 ++--- .../variable_batch_size_and_lr_test.py | 130 +++++++----------- 2 files changed, 66 insertions(+), 96 deletions(-) diff --git 
a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py index 0abe1161fb44..bab47d5e7b47 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py @@ -86,26 +86,26 @@ def is_microbatch_valid(metrics): while batch_init < len(metrics): # we iterate over possible effective batch sizes (groups of microbatches of same size) - for batch_size in range(equal_size_multiple, len(metrics), equal_size_multiple): + valid_batch_end = batch_init + for batch_end in range(batch_init+equal_size_multiple, len(metrics), equal_size_multiple): # attempt effective batch - batch = metrics[batch_init:batch_init + batch_size] + batch = metrics[batch_init:batch_end] # pick interleaved samples for each microbatch to help with load balancing # (in the ordered use case), and to replicate what the distributed sampler does. - microbatch = [batch[b::equal_size_multiple] for b in range(equal_size_multiple)] + mbs = [batch[b::equal_size_multiple] for b in range(equal_size_multiple)] # if they are all valid micro-batches, keep them until you find longer mbatches, if any - is_batch_valid = all([is_microbatch_valid(mb) for mb in microbatch]) - if not is_batch_valid: - break + is_batch_valid = all([is_microbatch_valid(mb) for mb in mbs]) + if is_batch_valid: + valid_batch_end = batch_end - if not is_batch_valid: batch_size -= equal_size_multiple #ignore last iteration (not valid) - if batch_size == 0 : break # last batch is not valid (size zero), so we are done - batch = metrics[batch_init:batch_init + batch_size] - microbatch = [batch[b::equal_size_multiple] for b in range(equal_size_multiple)] - batch_init += sum([len(l) for l in microbatch]) - microbatches += microbatch + if batch_init == valid_batch_end: break # last batch is not valid (size zero), so we are done + batch = metrics[batch_init:valid_batch_end] + mbs = [batch[b::equal_size_multiple] for b in range(equal_size_multiple)] + batch_init += sum([len(l) for l in mbs]) + microbatches += mbs # make sure we give the same number of (micro-)batches to each dataloader by trimming dataset microbatches = microbatches[:len(microbatches) - len(microbatches) % num_microbatches_per_batch] @@ -114,12 +114,12 @@ def is_microbatch_valid(metrics): batch_sizes, microbatch_ids = [], [] for rank in range(0, len(microbatches), num_microbatches_per_batch): batch_id = rank // num_microbatches_per_batch - microbatch = microbatches[rank:rank + num_microbatches_per_batch] - batch_size = sum([len(mb) for mb in microbatch]) - mb_ids = [ [m[1] for m in metrics] for metrics in microbatch] + mbs = microbatches[rank:rank + num_microbatches_per_batch] + batch_size = sum([len(mb) for mb in mbs]) + mb_ids = [ [m[1] for m in metrics] for metrics in mbs] batch_sizes.append(batch_size) microbatch_ids += mb_ids - n_tokens_in_batch = sum([m[0] for m in microbatch[0]]) + n_tokens_in_batch = sum([m[0] for m in mbs[0]]) assert n_tokens_in_batch <= max_tokens_per_batch if verbose: print(f"Batch id {batch_id}, size {batch_size}, tokens {n_tokens_in_batch} tokens, samples: {mb_ids}") diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_test.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_test.py index 0ad3644dbaa5..53083dbbd9e4 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_test.py +++ 
b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_test.py @@ -7,10 +7,9 @@ import os import torch import torch.nn as nn -from torch.nn.parallel import DistributedDataParallel as DDP -from torch import distributed as torch_dist import torch.nn.functional as F import deepspeed +import deepspeed.comm as dist from deepspeed.pipe import PipelineModule from deepspeed.runtime.data_pipeline.data_sampling.variable_batch_size_and_lr import get_dataloader_and_lr_scheduler_for_variable_batch_size @@ -21,7 +20,7 @@ class TestData(torch.utils.data.Dataset): """ A test dataset with sequences of random length, and the sequence length as the label""" - def __init__(self, seq_count, min_seqlen=1, max_seqlen=21, embed_dim=5, seed=0): + def __init__(self, seq_count, min_seqlen=1, max_seqlen=20, embed_dim=5, seed=0): data_random = random.Random(seed) self.mask_size = max_seqlen # M: size of mask self.padding_value = 0 @@ -35,7 +34,7 @@ def collate_fn(self, batch): """ pad sequences of different lenghts into batch of size BxTxE """ seqs, labels = zip(*batch) seqs = nn.utils.rnn.pad_sequence([s[0] for s in batch], batch_first=True, padding_value=self.padding_value) - seqs = torch.nn.utils.rnn.pad_sequence(seqs, batch_first=True, padding_value=self.padding_value) + seqs = F.pad(seqs, pad=(0, 0, 0, self.mask_size-seqs.shape[1]), value=self.padding_value) labels = torch.tensor([s[1] for s in batch], dtype=float) return seqs, labels @@ -80,31 +79,44 @@ def forward(self, x): out = F.pad(out, pad=(0, 0, 0, self.max_seqlen-T), value=self.padding_value) out = F.relu(self.fc1(out)) out = F.relu(self.fc2(out)) - return torch.tensor(out.nansum(-1).nansum(-1), requires_grad=True) + return torch.tensor(out.nansum(-1).nansum(-1).data, requires_grad=True) def to_layers(self): return [self.fc1, self.fc2, lambda x: x.sum(-1).sum(-1)] - dataloader_rank = int(os.environ.get('RANK', 0)) - dataloader_num_replicas = int(os.environ.get('WORLD_SIZE', 1)) - device_id = int(os.environ.get('LOCAL_RANK', 0)) - device = f"cuda:{device_id}" + deepspeed.init_distributed() + device = f"cuda:{dist.get_local_rank()}" max_seqlen_per_batch = 40 base_batch_size = 8 base_lr = 1e-3 - gradient_accumulation_steps = base_batch_size // dataloader_num_replicas - pipeline_parallelism = True + pipeline_num_stages = 2 order_by_seqlen = False #enable for curriculum max_seqlen = 15 - torch_dist.init_process_group(backend='nccl') dataset = TestData(seq_count=300, min_seqlen=5, max_seqlen=max_seqlen) model = AttentionHeadAndFeedForward(max_seqlen, dataset.embed_dim, device).to(device) - model_ddp = DDP(model, device_ids=[device]) - optimizer = torch.optim.Adam(model_ddp.parameters(), lr=1e-3) + optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) loss_fn = lambda x, y: F.mse_loss(x.float(), y.float()) + if pipeline_num_stages: + model = PipelineModule(layers=model.to_layers(), num_stages=pipeline_num_stages, loss_fn=loss_fn) + + # DeepSpeed config + config = { + "train_batch_size": base_batch_size, + "train_micro_batch_size_per_gpu": 1, # due to variable batch size + "optimizer": { + "type": "Adam", + "params": { + "lr": base_lr + } + }, + } + + engine, optimizer, _, lr_scheduler = deepspeed.initialize( + config=config, model=model, optimizer=optimizer) + seqlens = [len(s[0]) for s in dataset] dataloader, lr_scheduler, deepspeed_io_kwargs = \ get_dataloader_and_lr_scheduler_for_variable_batch_size( @@ -112,81 +124,39 @@ def to_layers(self): dataset_seqlens=seqlens, base_batch_size=base_batch_size, 
max_seqlen_per_batch=max_seqlen_per_batch, - dataloader_rank=dataloader_rank, - dataloader_num_replicas=dataloader_num_replicas, + dataloader_rank=engine.data_parallel_group.rank(), + dataloader_num_replicas=engine.data_parallel_group.size(), lr_scaling_method="linear", order_by_seqlen=order_by_seqlen, - gradient_accumulation_steps=gradient_accumulation_steps, + gradient_accumulation_steps=engine.gradient_accumulation_steps(), dataloader_num_workers=0, dataloader_collate_fn=dataset.collate_fn, optimizer=optimizer, # lr_scheduler_class=torch.optim.lr_scheduler.StepLR, # lr_scheduler_kwargs=dict(optimizer=optimizer, step_size=1, gamma=0.1), - required_microbatches_of_same_size=pipeline_parallelism, + required_microbatches_of_same_size=pipeline_num_stages>0, ) - - # PyTorch example iterating whole dataset in one epoch - for epoch in range(2): - for sample_idx, (seqs, labels) in enumerate(dataloader): - batch_id = sample_idx // gradient_accumulation_steps - microbatch_id = sample_idx % gradient_accumulation_steps - seqs, labels = seqs.to(device), labels.to(device) - outputs = model_ddp(seqs) - loss = loss_fn(outputs, labels) - loss.backward() - if (microbatch_id + 1) % gradient_accumulation_steps == 0: - if dataloader_rank == 0: - print(f"torch batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}") - optimizer.step() - optimizer.zero_grad() - lr_scheduler.step() - - torch_dist.destroy_process_group() - - # DeepSpeed example - config = { - "train_batch_size": base_batch_size, - "gradient_accumulation_steps": gradient_accumulation_steps, - "optimizer": { - "type": "Adam", - "params": { - "lr": base_lr - } - }, - } - - engine, optimizer, _, lr_scheduler = deepspeed.initialize( - config=config, model=model, optimizer=optimizer, lr_scheduler=lr_scheduler) + # engine.training_dataloader = dataloader # use this or the deepspeed_io() below engine.training_dataloader = engine.deepspeed_io(**deepspeed_io_kwargs) - - lr_scheduler.step(0) # reset LR scheduler - for epoch in range(2): - for sample_idx, (seqs, labels) in enumerate(dataloader): - batch_id = sample_idx // gradient_accumulation_steps - microbatch_id = sample_idx % gradient_accumulation_steps - seqs, labels = seqs.to(device), labels.to(device) - outputs = engine(seqs) - loss = loss_fn(outputs, labels) - engine.backward(loss) - if dataloader_rank == 0: - print( - f"deepspeed batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}" - ) - engine.step() - - # Deepspeed example for pipeline parallelism - if pipeline_parallelism: - model = PipelineModule(layers=model.to_layers(), num_stages=2, loss_fn=loss_fn) - engine, optimizer, _, lr_scheduler = deepspeed.initialize( - config=config, model=model, optimizer=optimizer, lr_scheduler=lr_scheduler) - engine.training_dataloader = engine.deepspeed_io(**deepspeed_io_kwargs) - - dataloader_it = iter(dataloader) # reset dataloader - lr_scheduler.step(0) # reset LR scheduler - for epoch in range(2): - for batch_id in range(len(dataloader) // gradient_accumulation_steps): + engine.client_lr_scheduler = lr_scheduler + engine._configure_lr_scheduler(lr_scheduler) + gradient_acc_steps = engine.gradient_accumulation_steps() + + n_batches_per_rank = len(dataloader)//gradient_acc_steps + for epoch in range(10): + if pipeline_num_stages: + dataloader_it = iter(dataloader) # point dataloader to first batch + lr_scheduler.step(0) # point LR scheduler to first batch + for batch_id in range(n_batches_per_rank): engine.reset_activation_shape() # each batch has a diff 
BxT dimension loss = engine.train_batch(data_iter=dataloader_it) - if dataloader_rank == 0: - print(f"pipeline batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}") + else: + for i, (seqs, labels) in enumerate(dataloader): + seqs, labels = seqs.to(device), labels.to(device) + outputs = engine(seqs) + loss = loss_fn(outputs, labels) + engine.backward(loss) + if engine.data_parallel_group.rank() == 0: + batch_id = i//gradient_acc_steps + print(f"batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}") \ No newline at end of file From ce85b9d2b5bde73b8b7565276ea84d90f0238434 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Wed, 13 Mar 2024 00:21:41 +0000 Subject: [PATCH 47/64] pipeline parallelism --- .../variable_batch_size_and_lr.py | 64 +++++++++++-------- .../variable_batch_size_and_lr_test.py | 56 ++++++++-------- 2 files changed, 69 insertions(+), 51 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py index bab47d5e7b47..aee0d28fda17 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py @@ -7,7 +7,6 @@ import torch from torch.optim.lr_scheduler import LRScheduler from torch.utils.data import DataLoader, DistributedSampler -from torch.nn.parallel import DistributedDataParallel as DDP from deepspeed.utils import logger @@ -87,7 +86,7 @@ def is_microbatch_valid(metrics): # we iterate over possible effective batch sizes (groups of microbatches of same size) valid_batch_end = batch_init - for batch_end in range(batch_init+equal_size_multiple, len(metrics), equal_size_multiple): + for batch_end in range(batch_init + equal_size_multiple, len(metrics), equal_size_multiple): # attempt effective batch batch = metrics[batch_init:batch_end] @@ -101,7 +100,7 @@ def is_microbatch_valid(metrics): if is_batch_valid: valid_batch_end = batch_end - if batch_init == valid_batch_end: break # last batch is not valid (size zero), so we are done + if batch_init == valid_batch_end: break # last batch is not valid (size zero), so we are done batch = metrics[batch_init:valid_batch_end] mbs = [batch[b::equal_size_multiple] for b in range(equal_size_multiple)] batch_init += sum([len(l) for l in mbs]) @@ -111,22 +110,25 @@ def is_microbatch_valid(metrics): microbatches = microbatches[:len(microbatches) - len(microbatches) % num_microbatches_per_batch] #compute the effective batch size for each microbatch. 
- batch_sizes, microbatch_ids = [], [] + batch_sizes, batch_max_seqlens, microbatch_ids = [], [], [] for rank in range(0, len(microbatches), num_microbatches_per_batch): batch_id = rank // num_microbatches_per_batch mbs = microbatches[rank:rank + num_microbatches_per_batch] batch_size = sum([len(mb) for mb in mbs]) - mb_ids = [ [m[1] for m in metrics] for metrics in mbs] + batch_max_seqlen = max([m[0] for metrics in mbs for m in metrics]) + sample_ids = [[m[1] for m in metrics] for metrics in mbs] + batch_and_mb_ids = zip([batch_id] * num_microbatches_per_batch, sample_ids) batch_sizes.append(batch_size) - microbatch_ids += mb_ids + batch_max_seqlens.append(batch_max_seqlen) + microbatch_ids += batch_and_mb_ids n_tokens_in_batch = sum([m[0] for m in mbs[0]]) assert n_tokens_in_batch <= max_tokens_per_batch if verbose: - print(f"Batch id {batch_id}, size {batch_size}, tokens {n_tokens_in_batch} tokens, samples: {mb_ids}") + print(f"Batch id {batch_id}, size {batch_size}, tokens {n_tokens_in_batch} tokens, samples: {sample_ids}") # return the sample ids of each microbatch, and the batch sizes assert len(batch_sizes) == len(microbatch_ids) // num_microbatches_per_batch - return microbatch_ids, batch_sizes + return microbatch_ids, batch_sizes, batch_max_seqlens def scale_lr(base_batch_size, batch_size, base_lr=1, method="linear"): @@ -145,13 +147,18 @@ def scale_lr(base_batch_size, batch_size, base_lr=1, method="linear"): raise ValueError("Unknown scaling method: {}".format(method)) -def dataloader_for_variable_batch_size(dataset, - microbatch_ids, - dataloader_rank, - dataloader_num_replicas, - dataloader_collate_fn=None, - dataloader_num_workers=2, - dataloader_pin_memory=False): +def dataloader_for_variable_batch_size( + dataset, + microbatch_ids, + batch_max_seqlens, + dataloader_rank, + dataloader_num_replicas, + dataloader_collate_fn=None, + dataloader_num_workers=2, + dataloader_pin_memory=False, + dataloader_padding_fn=None, + required_microbatches_of_same_seqlen=False, +): # equidistantly distribute the microbatches across the replicas in an interleaved fashion. sampler = DistributedSampler( @@ -162,20 +169,23 @@ def dataloader_for_variable_batch_size(dataset, drop_last=False, ) - # collate function applies wraps user defined collate function to the variable batch data - def collate_fn_wrapper(list_microbatch_ids, dataset, collate_fn=None): + # collate function wraps user-defined collate function to the variable batch data + def collate_fn_wrapper(list_microbatch_ids): assert len(list_microbatch_ids) == 1, "only 1 element should be returned by the sampler." 
- microbatch_ids = list_microbatch_ids[0] + batch_id, microbatch_ids = list_microbatch_ids[0] batch = [dataset[idx] for idx in microbatch_ids] - return collate_fn(batch) if collate_fn else batch - - collate_fn = lambda b: collate_fn_wrapper(b, dataset, dataloader_collate_fn) + if required_microbatches_of_same_seqlen: + assert dataloader_padding_fn, \ + "padding dataloader_padding_fn must be provided if required_microbatches_of_same_seqlen is True" + pad_len = batch_max_seqlens[batch_id] + batch = [dataloader_padding_fn(b, pad_len) for b in batch] + return dataloader_collate_fn(batch) if dataloader_collate_fn else batch dataloader = DataLoader( dataset=microbatch_ids, sampler=sampler, num_workers=dataloader_num_workers, - collate_fn=collate_fn, + collate_fn=collate_fn_wrapper, pin_memory=dataloader_pin_memory, ) @@ -184,7 +194,7 @@ def collate_fn_wrapper(list_microbatch_ids, dataset, collate_fn=None): batch_size=1, pin_memory=dataloader_pin_memory, data_sampler=sampler, - collate_fn=collate_fn, + collate_fn=dataloader_collate_fn, num_local_io_workers=dataloader_num_workers, ) @@ -302,15 +312,16 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size( dataloader_num_replicas=1, dataloader_num_workers=0, dataloader_collate_fn=None, + dataloader_padding_fn=None, dataloader_pin_memory=False, optimizer=None, lr_scheduler_class=None, lr_scheduler_kwargs={'verbose': False}, required_microbatches_of_same_size=False, + required_microbatches_of_same_seqlen=False, verbose=False, ): - - microbatch_ids, batch_sizes = batch_by_size( + microbatch_ids, batch_sizes, batch_max_seqlens = batch_by_size( seqlens=dataset_seqlens, max_tokens_per_batch=max_seqlen_per_batch, sample_ids=sample_ids, @@ -327,11 +338,14 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size( dataloader, deepspeed_io_kwargs = dataloader_for_variable_batch_size( dataset=dataset, microbatch_ids=microbatch_ids, + batch_max_seqlens=batch_max_seqlens, dataloader_rank=dataloader_rank, dataloader_num_replicas=dataloader_num_replicas, dataloader_collate_fn=dataloader_collate_fn, dataloader_num_workers=dataloader_num_workers, dataloader_pin_memory=dataloader_pin_memory, + dataloader_padding_fn=dataloader_padding_fn, + required_microbatches_of_same_seqlen=required_microbatches_of_same_seqlen, ) lr_scheduler = lr_scheduler_for_variable_batch_size(base_batch_size=base_batch_size, diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_test.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_test.py index 53083dbbd9e4..6169967d3e90 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_test.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_test.py @@ -4,7 +4,6 @@ # DeepSpeed Team import random -import os import torch import torch.nn as nn import torch.nn.functional as F @@ -14,7 +13,6 @@ from deepspeed.runtime.data_pipeline.data_sampling.variable_batch_size_and_lr import get_dataloader_and_lr_scheduler_for_variable_batch_size - if __name__ == "__main__": class TestData(torch.utils.data.Dataset): @@ -22,22 +20,28 @@ class TestData(torch.utils.data.Dataset): def __init__(self, seq_count, min_seqlen=1, max_seqlen=20, embed_dim=5, seed=0): data_random = random.Random(seed) - self.mask_size = max_seqlen # M: size of mask + self.mask_size = max_seqlen # M: size of mask self.padding_value = 0 self.embed_dim = embed_dim - self.seqs = [torch.ones(data_random.randrange(min_seqlen, max_seqlen), embed_dim) for _ in 
range(seq_count)] + self.seqs = [ + torch.ones(data_random.randrange(min_seqlen, max_seqlen), embed_dim) for _ in range(seq_count) + ] __len__ = lambda self: len(self.seqs) - __getitem__ = lambda self, idx: ( self.seqs[idx], len(self.seqs[idx]) ) + __getitem__ = lambda self, idx: (self.seqs[idx], len(self.seqs[idx])) def collate_fn(self, batch): - """ pad sequences of different lenghts into batch of size BxTxE """ + """ collate sequences of different lengths into batch of size BxTxE, where T is max seqlen """ seqs, labels = zip(*batch) - seqs = nn.utils.rnn.pad_sequence([s[0] for s in batch], batch_first=True, padding_value=self.padding_value) - seqs = F.pad(seqs, pad=(0, 0, 0, self.mask_size-seqs.shape[1]), value=self.padding_value) - labels = torch.tensor([s[1] for s in batch], dtype=float) + seqs = nn.utils.rnn.pad_sequence(seqs, batch_first=True, padding_value=self.padding_value) + labels = torch.tensor(labels, dtype=float) return seqs, labels + def padding_fn(self, sample, size): + """ pad sequence `seq` of shape TxE to size T'xE where T' is give by `size` """ + seq, label = sample + seq = F.pad(seq, pad=(0, 0, 0, size - len(seq)), value=self.padding_value) + return seq, label class AttentionHeadAndFeedForward(nn.Module): """ A single attention head followed by a feed forward. No embeddings """ @@ -45,7 +49,7 @@ class AttentionHeadAndFeedForward(nn.Module): def __init__(self, max_seqlen, embed_dim, device): super(AttentionHeadAndFeedForward, self).__init__() self.padding_value = 0 - self.max_seqlen = max_seqlen # M: size of mask + self.max_seqlen = max_seqlen # M: size of mask self.device = device self.qe = nn.Linear(embed_dim, embed_dim) self.ke = nn.Linear(embed_dim, embed_dim) @@ -58,12 +62,12 @@ def forward(self, x): # compute length of each sequence as first index of padding value, or max length if no padding B, T, E = x.shape - seqlens = torch.full(size=(B,), fill_value=T, dtype=int, device=x.device) - seq_ids, seq_padding_ids = torch.where(x[:,:,0]==self.padding_value) + seqlens = torch.full(size=(B, ), fill_value=T, dtype=int, device=x.device) + seq_ids, seq_padding_ids = torch.where(x[:, :, 0] == self.padding_value) seqlens[seq_ids] = seq_padding_ids # optional: 3D masks for attention, padded to individual input sequence lengths - masks = torch.tril(torch.ones((B,T,T), dtype=bool)).to(self.device) + masks = torch.tril(torch.ones((B, T, T), dtype=bool)).to(self.device) for i, seqlen in enumerate(seqlens): masks[i, seqlen:, :] = masks[i, :, seqlen:] = False @@ -76,12 +80,11 @@ def forward(self, x): out, _ = self.attn_head(q, k, v, need_weights=False, attn_mask=masks) # feedforward: needs to convert BxTxE to BxMxE by padding extra tokens - out = F.pad(out, pad=(0, 0, 0, self.max_seqlen-T), value=self.padding_value) + out = F.pad(out, pad=(0, 0, 0, self.max_seqlen - T), value=self.padding_value) out = F.relu(self.fc1(out)) out = F.relu(self.fc2(out)) return torch.tensor(out.nansum(-1).nansum(-1).data, requires_grad=True) - def to_layers(self): return [self.fc1, self.fc2, lambda x: x.sum(-1).sum(-1)] @@ -90,7 +93,7 @@ def to_layers(self): max_seqlen_per_batch = 40 base_batch_size = 8 base_lr = 1e-3 - pipeline_num_stages = 2 + pipeline_num_stages = 0 order_by_seqlen = False #enable for curriculum max_seqlen = 15 @@ -105,7 +108,7 @@ def to_layers(self): # DeepSpeed config config = { "train_batch_size": base_batch_size, - "train_micro_batch_size_per_gpu": 1, # due to variable batch size + "train_micro_batch_size_per_gpu": 1, # due to variable batch size "optimizer": { "type": "Adam", 
"params": { @@ -114,8 +117,7 @@ def to_layers(self): }, } - engine, optimizer, _, lr_scheduler = deepspeed.initialize( - config=config, model=model, optimizer=optimizer) + engine, optimizer, _, lr_scheduler = deepspeed.initialize(config=config, model=model, optimizer=optimizer) seqlens = [len(s[0]) for s in dataset] dataloader, lr_scheduler, deepspeed_io_kwargs = \ @@ -131,25 +133,27 @@ def to_layers(self): gradient_accumulation_steps=engine.gradient_accumulation_steps(), dataloader_num_workers=0, dataloader_collate_fn=dataset.collate_fn, + dataloader_padding_fn=dataset.padding_fn, optimizer=optimizer, # lr_scheduler_class=torch.optim.lr_scheduler.StepLR, # lr_scheduler_kwargs=dict(optimizer=optimizer, step_size=1, gamma=0.1), - required_microbatches_of_same_size=pipeline_num_stages>0, + required_microbatches_of_same_size = pipeline_num_stages>0, + required_microbatches_of_same_seqlen = pipeline_num_stages>0, ) - + # engine.training_dataloader = dataloader # use this or the deepspeed_io() below engine.training_dataloader = engine.deepspeed_io(**deepspeed_io_kwargs) engine.client_lr_scheduler = lr_scheduler engine._configure_lr_scheduler(lr_scheduler) gradient_acc_steps = engine.gradient_accumulation_steps() - n_batches_per_rank = len(dataloader)//gradient_acc_steps + n_batches_per_rank = len(dataloader) // gradient_acc_steps for epoch in range(10): if pipeline_num_stages: dataloader_it = iter(dataloader) # point dataloader to first batch - lr_scheduler.step(0) # point LR scheduler to first batch + lr_scheduler.step(0) # point LR scheduler to first batch for batch_id in range(n_batches_per_rank): - engine.reset_activation_shape() # each batch has a diff BxT dimension + engine.reset_activation_shape() # each batch has a diff BxT dimension loss = engine.train_batch(data_iter=dataloader_it) else: for i, (seqs, labels) in enumerate(dataloader): @@ -158,5 +162,5 @@ def to_layers(self): loss = loss_fn(outputs, labels) engine.backward(loss) if engine.data_parallel_group.rank() == 0: - batch_id = i//gradient_acc_steps - print(f"batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}") \ No newline at end of file + batch_id = i // gradient_acc_steps + print(f"batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}") From 82e3dd28712cdda8c17d88deb45626f39415af21 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Wed, 13 Mar 2024 00:40:55 +0000 Subject: [PATCH 48/64] renamed file --- ..._size_and_lr_test.py => variable_batch_size_and_lr_example.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename deepspeed/runtime/data_pipeline/data_sampling/{variable_batch_size_and_lr_test.py => variable_batch_size_and_lr_example.py} (100%) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_test.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py similarity index 100% rename from deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_test.py rename to deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py From d43f9816271f22bda943215870f7e6f0a9c4911c Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Wed, 13 Mar 2024 00:42:07 +0000 Subject: [PATCH 49/64] renamed --- .../data_sampling/variable_batch_size_and_lr_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py 
b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py index 6169967d3e90..12527751ef54 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py @@ -93,7 +93,7 @@ def to_layers(self): max_seqlen_per_batch = 40 base_batch_size = 8 base_lr = 1e-3 - pipeline_num_stages = 0 + pipeline_num_stages = 2 order_by_seqlen = False #enable for curriculum max_seqlen = 15 From 0ae1dc8cea77af2216af7a9bb26ff83400b08d9b Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Wed, 13 Mar 2024 09:56:22 +0000 Subject: [PATCH 50/64] train_batch_size_per_gpu >1 --- deepspeed/runtime/data_pipeline/config.py | 2 +- deepspeed/runtime/data_pipeline/constants.py | 6 +- .../variable_batch_size_and_lr.py | 59 ++++++++++--------- .../variable_batch_size_and_lr_example.py | 37 ++++++------ 4 files changed, 54 insertions(+), 50 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/config.py b/deepspeed/runtime/data_pipeline/config.py index 49f3a5ff614d..38e076100a2b 100644 --- a/deepspeed/runtime/data_pipeline/config.py +++ b/deepspeed/runtime/data_pipeline/config.py @@ -94,7 +94,7 @@ def get_dynamic_batching_params(param_dict): return dynamic_batching_params else: return {} - + def get_dynamic_batching(param_dict): output = {} diff --git a/deepspeed/runtime/data_pipeline/constants.py b/deepspeed/runtime/data_pipeline/constants.py index 0ba32039e106..6689cd7e5c5e 100644 --- a/deepspeed/runtime/data_pipeline/constants.py +++ b/deepspeed/runtime/data_pipeline/constants.py @@ -68,14 +68,14 @@ DYNAMIC_BATCHING = "dynamic_batching" DYNAMIC_BATCHING_ENABLED = "enabled" DYNAMIC_BATCHING_ENABLED_DEFAULT = False -DYNAMIC_BATCHING_LR_SCALING = "lr_scaling" # "linear" / "sqrt" / "none" +DYNAMIC_BATCHING_LR_SCALING = "lr_scaling" # "linear" / "sqrt" / "none" DYNAMIC_BATCHING_LR_SCALING_DEFAULT = "linear" DYNAMIC_BATCHING_MIN_BATCH_SIZE = "min_batch_size" DYNAMIC_BATCHING_MIN_BATCH_SIZE_DEFAULT = 1 DYNAMIC_BATCHING_MAX_BATCH_SIZE = "max_batch_size" DYNAMIC_BATCHING_MAX_BATCH_SIZE_DEFAULT = None -DYNAMIC_BATCHING_SAMPLES_ORDER = "samples_order" # "random" / "order" / "default" -DYNAMIC_BATCHING_SAMPLES_ORDER_DEFAULT = "dataloader" # "random" / "order" / "dataloader" +DYNAMIC_BATCHING_SAMPLES_ORDER = "samples_order" # "random" / "order" / "default" +DYNAMIC_BATCHING_SAMPLES_ORDER_DEFAULT = "dataloader" # "random" / "order" / "dataloader" DYNAMIC_BATCHING_MAX_TOKENS_PER_BATCH = "max_tokens_per_batch" ######################################### diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py index aee0d28fda17..14782174228c 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py @@ -18,8 +18,7 @@ def batch_by_size( max_batch_size=None, shuffle_seqlens=False, order_by_seqlen=False, - dataloader_num_replicas=1, - gradient_accumulation_steps=1, + effective_batch_size=1, required_microbatches_of_same_size=False, verbose=False, seed=0, @@ -40,7 +39,7 @@ def batch_by_size( - `shuffle_seqlens`: shuffle metric values before packing samples into batches; - `order_by_seqlen`: order samples by ascending metric values before packing into batches; - `dataloader_num_replicas`: number of dataloaders - - `gradient_accumulation_steps`: number of gradient accumulation 
steps; + - `effective_batch_size`: effective batch size; - `required_microbatches_of_same_size`: enable if each mini-batch (in a total of `batch_size_multiple` micro-batches per batch), should have all micro-batches with the same batch size ie the same number of sentences. @@ -78,8 +77,7 @@ def is_microbatch_valid(metrics): # go through all samples and pack then in microbatches of metric sums below the threshold # `required_microbatches_of_same_size` means all minibatches in a batch must be of equal size - num_microbatches_per_batch = dataloader_num_replicas * gradient_accumulation_steps - equal_size_multiple = num_microbatches_per_batch if required_microbatches_of_same_size else 1 + equal_size_multiple = effective_batch_size if required_microbatches_of_same_size else 1 microbatches = [] batch_init = 0 while batch_init < len(metrics): @@ -107,17 +105,17 @@ def is_microbatch_valid(metrics): microbatches += mbs # make sure we give the same number of (micro-)batches to each dataloader by trimming dataset - microbatches = microbatches[:len(microbatches) - len(microbatches) % num_microbatches_per_batch] + microbatches = microbatches[:len(microbatches) - len(microbatches) % effective_batch_size] #compute the effective batch size for each microbatch. batch_sizes, batch_max_seqlens, microbatch_ids = [], [], [] - for rank in range(0, len(microbatches), num_microbatches_per_batch): - batch_id = rank // num_microbatches_per_batch - mbs = microbatches[rank:rank + num_microbatches_per_batch] + for rank in range(0, len(microbatches), effective_batch_size): + batch_id = rank // effective_batch_size + mbs = microbatches[rank:rank + effective_batch_size] batch_size = sum([len(mb) for mb in mbs]) batch_max_seqlen = max([m[0] for metrics in mbs for m in metrics]) sample_ids = [[m[1] for m in metrics] for metrics in mbs] - batch_and_mb_ids = zip([batch_id] * num_microbatches_per_batch, sample_ids) + batch_and_mb_ids = zip([batch_id] * effective_batch_size, sample_ids) batch_sizes.append(batch_size) batch_max_seqlens.append(batch_max_seqlen) microbatch_ids += batch_and_mb_ids @@ -127,7 +125,7 @@ def is_microbatch_valid(metrics): print(f"Batch id {batch_id}, size {batch_size}, tokens {n_tokens_in_batch} tokens, samples: {sample_ids}") # return the sample ids of each microbatch, and the batch sizes - assert len(batch_sizes) == len(microbatch_ids) // num_microbatches_per_batch + assert len(batch_sizes) == len(microbatch_ids) // effective_batch_size return microbatch_ids, batch_sizes, batch_max_seqlens @@ -156,8 +154,8 @@ def dataloader_for_variable_batch_size( dataloader_collate_fn=None, dataloader_num_workers=2, dataloader_pin_memory=False, - dataloader_padding_fn=None, required_microbatches_of_same_seqlen=False, + sample_padding_fn=None, ): # equidistantly distribute the microbatches across the replicas in an interleaved fashion. @@ -171,14 +169,17 @@ def dataloader_for_variable_batch_size( # collate function wraps user-defined collate function to the variable batch data def collate_fn_wrapper(list_microbatch_ids): - assert len(list_microbatch_ids) == 1, "only 1 element should be returned by the sampler." 
- batch_id, microbatch_ids = list_microbatch_ids[0] - batch = [dataset[idx] for idx in microbatch_ids] - if required_microbatches_of_same_seqlen: - assert dataloader_padding_fn, \ - "padding dataloader_padding_fn must be provided if required_microbatches_of_same_seqlen is True" - pad_len = batch_max_seqlens[batch_id] - batch = [dataloader_padding_fn(b, pad_len) for b in batch] + # each batch is a list of sample ids that fill up to the max tokens per batch + # we return the collated batch of all dataset samples of all input batches. + batch = [] + for batch_id, microbatch_ids in list_microbatch_ids: + batch_data = [dataset[idx] for idx in microbatch_ids] + if required_microbatches_of_same_seqlen: + assert sample_padding_fn is not None, \ + "padding dataloader_padding_fn must be provided if required_microbatches_of_same_seqlen is True" + pad_len = batch_max_seqlens[batch_id] + batch_data = [sample_padding_fn(b, pad_len) for b in batch_data] + batch+=batch_data return dataloader_collate_fn(batch) if dataloader_collate_fn else batch dataloader = DataLoader( @@ -299,38 +300,38 @@ def step(self, epoch=None): def get_dataloader_and_lr_scheduler_for_variable_batch_size( dataset, dataset_seqlens, - max_seqlen_per_batch, - base_batch_size, + max_tokens_per_batch, + effective_batch_size, sample_ids=None, lr_scaling_method="linear", min_batch_size=1, max_batch_size=None, shuffle_seqlens=False, order_by_seqlen=False, - gradient_accumulation_steps=1, dataloader_rank=0, dataloader_num_replicas=1, dataloader_num_workers=0, dataloader_collate_fn=None, - dataloader_padding_fn=None, dataloader_pin_memory=False, optimizer=None, lr_scheduler_class=None, lr_scheduler_kwargs={'verbose': False}, required_microbatches_of_same_size=False, required_microbatches_of_same_seqlen=False, + sample_padding_fn=None, verbose=False, ): + + # effective_batch_size = train_micro_batch_size_per_gpu * gradient_accumulation_steps * number of GPUs. 
microbatch_ids, batch_sizes, batch_max_seqlens = batch_by_size( seqlens=dataset_seqlens, - max_tokens_per_batch=max_seqlen_per_batch, + max_tokens_per_batch=max_tokens_per_batch, sample_ids=sample_ids, min_batch_size=min_batch_size, max_batch_size=max_batch_size, shuffle_seqlens=shuffle_seqlens, order_by_seqlen=order_by_seqlen, - dataloader_num_replicas=dataloader_num_replicas, - gradient_accumulation_steps=gradient_accumulation_steps, + effective_batch_size=effective_batch_size, required_microbatches_of_same_size=required_microbatches_of_same_size, verbose=verbose, ) @@ -344,11 +345,11 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size( dataloader_collate_fn=dataloader_collate_fn, dataloader_num_workers=dataloader_num_workers, dataloader_pin_memory=dataloader_pin_memory, - dataloader_padding_fn=dataloader_padding_fn, required_microbatches_of_same_seqlen=required_microbatches_of_same_seqlen, + sample_padding_fn=sample_padding_fn, ) - lr_scheduler = lr_scheduler_for_variable_batch_size(base_batch_size=base_batch_size, + lr_scheduler = lr_scheduler_for_variable_batch_size(base_batch_size=effective_batch_size, batch_sizes=batch_sizes, lr_scaling_method=lr_scaling_method, optimizer=optimizer, diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py index 12527751ef54..8ddb7097502a 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py @@ -44,7 +44,7 @@ def padding_fn(self, sample, size): return seq, label class AttentionHeadAndFeedForward(nn.Module): - """ A single attention head followed by a feed forward. No embeddings """ + """ An attention head with variable-length inputs, followed by a feed forward of fixed input. No embeddings. """ def __init__(self, max_seqlen, embed_dim, device): super(AttentionHeadAndFeedForward, self).__init__() @@ -75,7 +75,7 @@ def forward(self, x): x = torch.nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value=self.padding_value) x = x.to(self.device) - # linear projections and attention head + # linear projections and attention head. 
Attention size BxTxT q, k, v = self.qe(x), self.ke(x), self.ve(x) out, _ = self.attn_head(q, k, v, need_weights=False, attn_mask=masks) @@ -90,16 +90,12 @@ def to_layers(self): deepspeed.init_distributed() device = f"cuda:{dist.get_local_rank()}" - max_seqlen_per_batch = 40 - base_batch_size = 8 - base_lr = 1e-3 + max_tokens_per_batch = 40 pipeline_num_stages = 2 - order_by_seqlen = False #enable for curriculum max_seqlen = 15 dataset = TestData(seq_count=300, min_seqlen=5, max_seqlen=max_seqlen) model = AttentionHeadAndFeedForward(max_seqlen, dataset.embed_dim, device).to(device) - optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) loss_fn = lambda x, y: F.mse_loss(x.float(), y.float()) if pipeline_num_stages: @@ -107,38 +103,45 @@ def to_layers(self): # DeepSpeed config config = { - "train_batch_size": base_batch_size, - "train_micro_batch_size_per_gpu": 1, # due to variable batch size + "train_batch_size": 16, + "train_micro_batch_size_per_gpu": 2, # Note: each microbatch per GPU will fill up to N tokens "optimizer": { "type": "Adam", "params": { - "lr": base_lr + "lr": 1e-3, } }, + # "scheduler": { + # "type": "WarmupLR", + # "params": { + # "warmup_min_lr": 0, + # "warmup_max_lr": 0.001, + # "warmup_num_steps": 1000 + # } + # } } - engine, optimizer, _, lr_scheduler = deepspeed.initialize(config=config, model=model, optimizer=optimizer) + engine, _, _, lr_scheduler = deepspeed.initialize(config=config, model=model) seqlens = [len(s[0]) for s in dataset] dataloader, lr_scheduler, deepspeed_io_kwargs = \ get_dataloader_and_lr_scheduler_for_variable_batch_size( dataset=dataset, dataset_seqlens=seqlens, - base_batch_size=base_batch_size, - max_seqlen_per_batch=max_seqlen_per_batch, + effective_batch_size=engine.train_batch_size(), + max_tokens_per_batch=max_tokens_per_batch, dataloader_rank=engine.data_parallel_group.rank(), dataloader_num_replicas=engine.data_parallel_group.size(), lr_scaling_method="linear", - order_by_seqlen=order_by_seqlen, - gradient_accumulation_steps=engine.gradient_accumulation_steps(), + order_by_seqlen=False, dataloader_num_workers=0, dataloader_collate_fn=dataset.collate_fn, - dataloader_padding_fn=dataset.padding_fn, - optimizer=optimizer, + optimizer=engine.optimizer, # lr_scheduler_class=torch.optim.lr_scheduler.StepLR, # lr_scheduler_kwargs=dict(optimizer=optimizer, step_size=1, gamma=0.1), required_microbatches_of_same_size = pipeline_num_stages>0, required_microbatches_of_same_seqlen = pipeline_num_stages>0, + sample_padding_fn=dataset.padding_fn, ) # engine.training_dataloader = dataloader # use this or the deepspeed_io() below From c50883f7f05b2d0055597d1b6129dd25b8fb7e5a Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Wed, 13 Mar 2024 12:12:18 +0000 Subject: [PATCH 51/64] bug fixes --- .../variable_batch_size_and_lr.py | 187 ++++++++++-------- .../variable_batch_size_and_lr_example.py | 68 +++---- 2 files changed, 132 insertions(+), 123 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py index 14782174228c..4a08774d8014 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py @@ -6,6 +6,7 @@ import random import torch from torch.optim.lr_scheduler import LRScheduler +from torch.optim.optimizer import Optimizer from torch.utils.data import DataLoader, DistributedSampler from deepspeed.utils import logger 
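# The LR rescaling below (VariableBatchSizeLR) delegates to scale_lr(), which the dynamic
# batching constants describe as "linear" / "sqrt" / "none" scaling of the reference LR with
# the batch-size ratio. A minimal sketch of that rule, under the assumption that it mirrors
# scale_lr() (scale_lr_sketch is only an illustration, not part of this module):
def scale_lr_sketch(base_batch_size, batch_size, base_lr=1.0, method="linear"):
    if method == "linear":  # LR grows proportionally to the batch-size ratio
        return base_lr * batch_size / base_batch_size
    if method == "sqrt":  # LR grows with the square root of the ratio
        return base_lr * (batch_size / base_batch_size)**0.5
    return base_lr  # "none": keep the reference LR unchanged

# e.g. a packed batch of 16 samples against a reference batch of 8 with base LR 1e-3 would
# train with 2e-3 under "linear" scaling and ~1.41e-3 under "sqrt" scaling.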
@@ -149,8 +150,9 @@ def dataloader_for_variable_batch_size( dataset, microbatch_ids, batch_max_seqlens, - dataloader_rank, - dataloader_num_replicas, + dataloader_rank=0, + dataloader_batch_size=1, + dataloader_num_replicas=1, dataloader_collate_fn=None, dataloader_num_workers=2, dataloader_pin_memory=False, @@ -178,12 +180,13 @@ def collate_fn_wrapper(list_microbatch_ids): assert sample_padding_fn is not None, \ "padding dataloader_padding_fn must be provided if required_microbatches_of_same_seqlen is True" pad_len = batch_max_seqlens[batch_id] - batch_data = [sample_padding_fn(b, pad_len) for b in batch_data] + batch_data = [sample_padding_fn(sample, pad_len) for sample in batch_data] batch+=batch_data return dataloader_collate_fn(batch) if dataloader_collate_fn else batch dataloader = DataLoader( dataset=microbatch_ids, + batch_size=dataloader_batch_size, sampler=sampler, num_workers=dataloader_num_workers, collate_fn=collate_fn_wrapper, @@ -192,109 +195,125 @@ def collate_fn_wrapper(list_microbatch_ids): deepspeed_io_kwargs = dict( dataset=microbatch_ids, - batch_size=1, + batch_size=dataloader_batch_size, pin_memory=dataloader_pin_memory, data_sampler=sampler, - collate_fn=dataloader_collate_fn, + collate_fn=collate_fn_wrapper, num_local_io_workers=dataloader_num_workers, ) return dataloader, deepspeed_io_kwargs -class StubLRScheduler(LRScheduler): - """ a stub LR scheduler that does not change the LR, keeps it constant """ +class VariableBatchSizeLR(LRScheduler): + """ an LR scheduler that scales the LR of a given scheduler's LR """ + + @property + def verbose(self): + return self.base_lr_scheduler.verbose + + @property + def optimizer(self): + return self.base_lr_scheduler.optimizer + + @property + def base_lrs(self): + return self.base_lr_scheduler.base_lrs + + @property + def last_epoch(self): + return self.base_lr_scheduler.last_epoch + + def __init__(self, lr_scheduler, base_batch_size, batch_sizes, dataloader, lr_scaling_method="linear"): + self.batch_sizes = batch_sizes + self.base_batch_size = base_batch_size + self.lr_scaling_method = lr_scaling_method + self.dataloader = dataloader + self.base_lr_scheduler = lr_scheduler + + def state_dict(self): + return { + 'base_lr_scheduler': self.base_lr_scheduler.state_dict(), + 'base_batch_size': self.base_batch_size, + 'lr_scaling_method': self.lr_scaling_method, + 'batch_sizes': self.batch_sizes, + } + + def load_state_dict(self, state_dict): + self.base_lr_scheduler.load_state_dict(state_dict['base_lr_scheduler']) + self.base_batch_size = state_dict['base_batch_size'] + self.lr_scaling_method = state_dict['lr_scaling_method'] + self.batch_sizes = state_dict['batch_sizes'] + + def get_last_lr(self): + return self.base_lr_scheduler._last_lr + + def get_lr(self): + try: + return self.base_lr_scheduler.get_lr() + except NotImplementedError: + return [group['lr'] for group in self.optimizer.param_groups] + + def step(self, epoch=None): + # call the base scheduler's step method to get LR for next epoch + # Note: optimizer.step precedes lr_scheduler.step(), so the stepping workflow is: + # init: lr_scheduler.step(0) --> set LR for epoch 0 + # epoch 0: optimizer.step(); lr_scheduler.step(1) --> set LR for epoch 1 + # epoch 1: optimizer.step(); lr_scheduler.step(2) --> set LR for epoch 2 + + # reset unscaled LRs (to the original scheduler's one) for the current epoch + # Note: epoch==0: reset LR scheduler; epoch==None: scale LR for next epoch; + unscaled_lrs = self.base_lrs if epoch == 0 else self.get_last_lr() + for group, lr in 
zip(self.optimizer.param_groups, unscaled_lrs): + group['lr'] = lr + + self.base_lr_scheduler.step(epoch) # set unscaled lr, _step_count, last_epoch, _last_lr for new epoch + + # scale the learning rate for next epoch for each parameter group. + batch_size = self.batch_sizes[self.last_epoch % len(self.batch_sizes)] + for group in self.optimizer.param_groups: + group['lr'] = scale_lr(self.base_batch_size, batch_size, group['lr'], self.lr_scaling_method) - def get_lr(self) -> float: - return self.base_lrs + if True: #self.verbose: + print(f"Batch id {self.last_epoch}, unscaled LR: {unscaled_lrs}, scaled LR: {self.get_lr()}") def lr_scheduler_for_variable_batch_size(base_batch_size, batch_sizes, dataloader, - lr_scaling_method='linear', - optimizer=None, - lr_scheduler_class=None, - **lr_scheduler_kwargs): + lr_scheduler_or_optimizer, + lr_scaling_method='linear'): """ returns a class that provides an LR scheduler that scales learning rate at every epoch taking into account the batch size of each epoch. - If learning rate is constant, ie no LR scheduler, then `optimizer` must be provided. - Otherwise, the base `LRScheduler` must be provided as `lr_scheduler_class`. + If learning rate is constant, ie no LR scheduler, then the LR will be taken from the + constant LR values in the optimizer param groups. Otherwise from the scheduler's LR. Arguments: - `base_batch_size`: the batch size that the base LR in the optimizer or scheduler refers to; - `lr_scaling_method`: method to use to scale LR - see `scale_lr()`; + - `lr_scheduler_or_optimizer`: one instance of `LRScheduler` or `Optimizer` to be used as base; - `batch_sizes`: the effective batch size of each batch in the dataloader; - - `optimizer` and `lr_scheduler_class`: the base LR scheduler. It not provided, - will use the constant LRs from the optimizer's param groups instead. If provided, - the initialization of the scheduler will be done with `lr_scheduler_kwargs`. 
Returns the new LRScheduler """ + + class StubLRScheduler(LRScheduler): + """ a stub LR scheduler that does not change the LR, keeps it constant """ - class VariableBatchSizeLR(lr_scheduler_class or StubLRScheduler): - - def __init__(self, optimizer, **lr_scheduler_kwargs): - self.batch_sizes = batch_sizes - self.base_batch_size = base_batch_size - self.lr_scaling_method = lr_scaling_method - self.dataloader = dataloader - self._last_lr = [p['lr'] for p in optimizer.param_groups] - super().__init__(optimizer=optimizer, **lr_scheduler_kwargs) - - def state_dict(self): - return { - 'base': super().state_dict(), - 'base_batch_size': self.base_batch_size, - 'lr_scaling_method': self.lr_scaling_method, - 'batch_sizes': self.batch_sizes, - } - - def load_state_dict(self, state_dict): - super().load_state_dict(state_dict['base']) - self.base_batch_size = state_dict['base_batch_size'] - self.lr_scaling_method = state_dict['lr_scaling_method'] - self.batch_sizes = state_dict['batch_sizes'] - - def get_lr(self): - return [group['lr'] for group in self.optimizer.param_groups] - - def step(self, epoch=None): - # call the base scheduler's step method to get LR for next epoch - # Note: optimizer.step preecceds lr_scheduler.step(), so the stepping workflow is: - # init: lr_scheduler.step(0) --> set LR for epoch 0 - # epoch 0: optimizer.step(); lr_scheduler.step(1) --> set LR for epoch 1 - # epoch 1: optimizer.step(); lr_scheduler.step(2) --> set LR for epoch 2 - - # reset unscaled LRs (to the original scheduler's one) for the current epoch - # Note: epoch==0: reset LR scheduler; epoch==None: scale LR for next epoch; - unscaled_lrs = self.base_lrs if epoch == 0 else self._last_lr - for group, lr in zip(self.optimizer.param_groups, unscaled_lrs): - group['lr'] = lr - - super().step(epoch) # set unscaled lr, _step_count, last_epoch, _last_lr for new epoch + def get_lr(self) -> float: + return self.base_lrs - # scale the learning rate for next epoch for each parameter group. 
- batch_size = self.batch_sizes[self.last_epoch % len(self.batch_sizes)] - for group in self.optimizer.param_groups: - group['lr'] = scale_lr(self.base_batch_size, batch_size, group['lr'], lr_scaling_method) - - if self.verbose: - print(f"Batch id {self.last_epoch}, unscaled LR: {unscaled_lrs}, scaled LR: {self.get_lr()}") - - #### main loop: double check arguments and returns correctly-instantiated LR scheduler - - if lr_scheduler_class is None: - assert optimizer is not None, "optimizer must be provided if lr_scheduler_class is not" + if isinstance(lr_scheduler_or_optimizer, Optimizer): + lr_scheduler = StubLRScheduler(lr_scheduler_or_optimizer) + elif isinstance(lr_scheduler_or_optimizer, LRScheduler): + lr_scheduler = lr_scheduler_or_optimizer else: - assert issubclass(lr_scheduler_class, LRScheduler), "lr_scheduler should be a LRScheduler" - - if optimizer is None: - assert lr_scheduler_class is not None, "lr_scheduler_class must be provided if optimizer is not" - optimizer = lr_scheduler_kwargs['optimizer'] + raise ValueError("Unknown type for lr_scheduler_or_optimizer: {}".format(type(lr_scheduler_or_optimizer))) - return VariableBatchSizeLR(optimizer=optimizer, **lr_scheduler_kwargs) + return VariableBatchSizeLR( + lr_scheduler=lr_scheduler, base_batch_size=base_batch_size, batch_sizes=batch_sizes, + dataloader=dataloader, lr_scaling_method=lr_scaling_method) def get_dataloader_and_lr_scheduler_for_variable_batch_size( @@ -308,21 +327,20 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size( max_batch_size=None, shuffle_seqlens=False, order_by_seqlen=False, + dataloader_batch_size=1, dataloader_rank=0, dataloader_num_replicas=1, dataloader_num_workers=0, dataloader_collate_fn=None, dataloader_pin_memory=False, - optimizer=None, - lr_scheduler_class=None, - lr_scheduler_kwargs={'verbose': False}, + lr_scheduler_or_optimizer=None, required_microbatches_of_same_size=False, required_microbatches_of_same_seqlen=False, sample_padding_fn=None, verbose=False, ): - # effective_batch_size = train_micro_batch_size_per_gpu * gradient_accumulation_steps * number of GPUs. 
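
# Worked example of the effective batch size formula in the comment above (values are
# illustrative): with train_micro_batch_size_per_gpu=2, gradient_accumulation_steps=4 and
# 2 data-parallel dataloaders (one per GPU), each optimizer step sees 2 * 4 * 2 = 16 samples.
train_micro_batch_size_per_gpu, gradient_accumulation_steps, num_dataloaders = 2, 4, 2
effective_batch_size = train_micro_batch_size_per_gpu * gradient_accumulation_steps * num_dataloaders
assert effective_batch_size == 16
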
+ # effective_batch_size = train_micro_batch_size_per_gpu * gradient_accumulation_steps * number of dataloaders microbatch_ids, batch_sizes, batch_max_seqlens = batch_by_size( seqlens=dataset_seqlens, max_tokens_per_batch=max_tokens_per_batch, @@ -342,6 +360,7 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size( batch_max_seqlens=batch_max_seqlens, dataloader_rank=dataloader_rank, dataloader_num_replicas=dataloader_num_replicas, + dataloader_batch_size=dataloader_batch_size, dataloader_collate_fn=dataloader_collate_fn, dataloader_num_workers=dataloader_num_workers, dataloader_pin_memory=dataloader_pin_memory, @@ -352,9 +371,7 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size( lr_scheduler = lr_scheduler_for_variable_batch_size(base_batch_size=effective_batch_size, batch_sizes=batch_sizes, lr_scaling_method=lr_scaling_method, - optimizer=optimizer, - dataloader=dataloader, - lr_scheduler_class=lr_scheduler_class, - **lr_scheduler_kwargs) + lr_scheduler_or_optimizer=lr_scheduler_or_optimizer, + dataloader=dataloader) return dataloader, lr_scheduler, deepspeed_io_kwargs diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py index 8ddb7097502a..92c67d699d5d 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py @@ -38,7 +38,7 @@ def collate_fn(self, batch): return seqs, labels def padding_fn(self, sample, size): - """ pad sequence `seq` of shape TxE to size T'xE where T' is give by `size` """ + """ pad sequence `seq` of shape TxE to size T'xE where T' is given by `size` """ seq, label = sample seq = F.pad(seq, pad=(0, 0, 0, size - len(seq)), value=self.padding_value) return seq, label @@ -49,7 +49,7 @@ class AttentionHeadAndFeedForward(nn.Module): def __init__(self, max_seqlen, embed_dim, device): super(AttentionHeadAndFeedForward, self).__init__() self.padding_value = 0 - self.max_seqlen = max_seqlen # M: size of mask + self.max_seqlen = max_seqlen # M: max possible seqlen, and input size to feedforward self.device = device self.qe = nn.Linear(embed_dim, embed_dim) self.ke = nn.Linear(embed_dim, embed_dim) @@ -67,14 +67,10 @@ def forward(self, x): seqlens[seq_ids] = seq_padding_ids # optional: 3D masks for attention, padded to individual input sequence lengths - masks = torch.tril(torch.ones((B, T, T), dtype=bool)).to(self.device) + masks = torch.tril(torch.ones((B, T, T), dtype=bool)).to(x.device) for i, seqlen in enumerate(seqlens): masks[i, seqlen:, :] = masks[i, :, seqlen:] = False - # collates sequences of different lengths into a batch of size BxTxE - x = torch.nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value=self.padding_value) - x = x.to(self.device) - # linear projections and attention head. 
Attention size BxTxT q, k, v = self.qe(x), self.ke(x), self.ve(x) out, _ = self.attn_head(q, k, v, need_weights=False, attn_mask=masks) @@ -91,10 +87,11 @@ def to_layers(self): deepspeed.init_distributed() device = f"cuda:{dist.get_local_rank()}" max_tokens_per_batch = 40 - pipeline_num_stages = 2 + pipeline_num_stages = 0 max_seqlen = 15 dataset = TestData(seq_count=300, min_seqlen=5, max_seqlen=max_seqlen) + seqlens = [len(s[0]) for s in dataset] model = AttentionHeadAndFeedForward(max_seqlen, dataset.embed_dim, device).to(device) loss_fn = lambda x, y: F.mse_loss(x.float(), y.float()) @@ -111,59 +108,54 @@ def to_layers(self): "lr": 1e-3, } }, - # "scheduler": { - # "type": "WarmupLR", - # "params": { - # "warmup_min_lr": 0, - # "warmup_max_lr": 0.001, - # "warmup_num_steps": 1000 - # } - # } } - engine, _, _, lr_scheduler = deepspeed.initialize(config=config, model=model) - seqlens = [len(s[0]) for s in dataset] + # From deepspeed docs: https://deepspeed.readthedocs.io/en/latest/schedulers.html + # if the scheduler is supposed to execute at any other interval (e.g., training epochs), then the + # user should NOT pass the scheduler to DeepSpeed during initialization and must manage it explicitly. + engine, _, _, _ = deepspeed.initialize(config=config, model=model) dataloader, lr_scheduler, deepspeed_io_kwargs = \ get_dataloader_and_lr_scheduler_for_variable_batch_size( dataset=dataset, dataset_seqlens=seqlens, effective_batch_size=engine.train_batch_size(), max_tokens_per_batch=max_tokens_per_batch, - dataloader_rank=engine.data_parallel_group.rank(), - dataloader_num_replicas=engine.data_parallel_group.size(), lr_scaling_method="linear", order_by_seqlen=False, + dataloader_batch_size=engine.train_micro_batch_size_per_gpu(), + dataloader_rank=engine.data_parallel_group.rank(), + dataloader_num_replicas=engine.data_parallel_group.size(), dataloader_num_workers=0, dataloader_collate_fn=dataset.collate_fn, - optimizer=engine.optimizer, - # lr_scheduler_class=torch.optim.lr_scheduler.StepLR, - # lr_scheduler_kwargs=dict(optimizer=optimizer, step_size=1, gamma=0.1), + lr_scheduler_or_optimizer = engine.optimizer or engine.lr_scheduler, required_microbatches_of_same_size = pipeline_num_stages>0, required_microbatches_of_same_seqlen = pipeline_num_stages>0, sample_padding_fn=dataset.padding_fn, ) - # engine.training_dataloader = dataloader # use this or the deepspeed_io() below + # engine.training_dataloader = dataloader # if you need to use a torch dataloader directly engine.training_dataloader = engine.deepspeed_io(**deepspeed_io_kwargs) - engine.client_lr_scheduler = lr_scheduler - engine._configure_lr_scheduler(lr_scheduler) gradient_acc_steps = engine.gradient_accumulation_steps() + # effective_batch_size = train_micro_batch_size_per_gpu * gradient_accumulation_steps * number of dataloaders + n_batches_per_rank = len(engine.training_dataloader) // (gradient_acc_steps*engine.train_micro_batch_size_per_gpu()) - n_batches_per_rank = len(dataloader) // gradient_acc_steps for epoch in range(10): - if pipeline_num_stages: - dataloader_it = iter(dataloader) # point dataloader to first batch - lr_scheduler.step(0) # point LR scheduler to first batch - for batch_id in range(n_batches_per_rank): + dataloader_it = iter(engine.training_dataloader) # point dataloader to first batch + lr_scheduler.step(0) # point LR scheduler to first batch + for batch_id in range(n_batches_per_rank): + if pipeline_num_stages: engine.reset_activation_shape() # each batch has a diff BxT dimension loss = 
engine.train_batch(data_iter=dataloader_it) - else: - for i, (seqs, labels) in enumerate(dataloader): - seqs, labels = seqs.to(device), labels.to(device) - outputs = engine(seqs) - loss = loss_fn(outputs, labels) - engine.backward(loss) + if engine.data_parallel_group.rank() == 0: + print(f"batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}") + else: + for i in range(gradient_acc_steps): + seqs, labels = next(dataloader_it) + seqs, labels = seqs.to(device), labels.to(device) + outputs = engine(seqs) + loss = loss_fn(outputs, labels) + engine.backward(loss) if engine.data_parallel_group.rank() == 0: - batch_id = i // gradient_acc_steps print(f"batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}") + lr_scheduler.step() From 26095f3025be72ff9d61ccd1e87ba8b9ceea5b12 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Wed, 13 Mar 2024 13:36:49 +0000 Subject: [PATCH 52/64] fixed scheduled step scaling --- .../variable_batch_size_and_lr.py | 34 ++++++++----------- .../variable_batch_size_and_lr_example.py | 31 ++++++++++------- 2 files changed, 33 insertions(+), 32 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py index 4a08774d8014..2a2bf2aa3d44 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py @@ -216,27 +216,23 @@ def verbose(self): def optimizer(self): return self.base_lr_scheduler.optimizer - @property - def base_lrs(self): - return self.base_lr_scheduler.base_lrs - - @property - def last_epoch(self): - return self.base_lr_scheduler.last_epoch - def __init__(self, lr_scheduler, base_batch_size, batch_sizes, dataloader, lr_scaling_method="linear"): self.batch_sizes = batch_sizes self.base_batch_size = base_batch_size self.lr_scaling_method = lr_scaling_method self.dataloader = dataloader self.base_lr_scheduler = lr_scheduler + # the following exist in LRScheduler but not in DeepSpeed's LRScheduler so we create them here + self.base_lrs = self.base_lr_scheduler.get_lr() + self.last_epoch = 0 def state_dict(self): - return { - 'base_lr_scheduler': self.base_lr_scheduler.state_dict(), + return { 'base_lr_scheduler': self.base_lr_scheduler.state_dict() } | { 'base_batch_size': self.base_batch_size, 'lr_scaling_method': self.lr_scaling_method, 'batch_sizes': self.batch_sizes, + 'base_lrs': self.base_lrs, + 'last_epoch': self.last_epoch, } def load_state_dict(self, state_dict): @@ -244,15 +240,14 @@ def load_state_dict(self, state_dict): self.base_batch_size = state_dict['base_batch_size'] self.lr_scaling_method = state_dict['lr_scaling_method'] self.batch_sizes = state_dict['batch_sizes'] + self.base_lrs = state_dict['base_lrs'] + self.last_epoch = state_dict['last_epoch'] def get_last_lr(self): return self.base_lr_scheduler._last_lr def get_lr(self): - try: - return self.base_lr_scheduler.get_lr() - except NotImplementedError: - return [group['lr'] for group in self.optimizer.param_groups] + return [group['lr'] for group in self.base_lr_scheduler.optimizer.param_groups] def step(self, epoch=None): # call the base scheduler's step method to get LR for next epoch @@ -264,18 +259,19 @@ def step(self, epoch=None): # reset unscaled LRs (to the original scheduler's one) for the current epoch # Note: epoch==0: reset LR scheduler; epoch==None: scale LR for next epoch; unscaled_lrs = self.base_lrs if 
epoch == 0 else self.get_last_lr() - for group, lr in zip(self.optimizer.param_groups, unscaled_lrs): + for group, lr in zip(self.base_lr_scheduler.optimizer.param_groups, unscaled_lrs): group['lr'] = lr self.base_lr_scheduler.step(epoch) # set unscaled lr, _step_count, last_epoch, _last_lr for new epoch # scale the learning rate for next epoch for each parameter group. + self.last_epoch = self.last_epoch + 1 if epoch is None else epoch batch_size = self.batch_sizes[self.last_epoch % len(self.batch_sizes)] - for group in self.optimizer.param_groups: + for group in self.base_lr_scheduler.optimizer.param_groups: group['lr'] = scale_lr(self.base_batch_size, batch_size, group['lr'], self.lr_scaling_method) - if True: #self.verbose: - print(f"Batch id {self.last_epoch}, unscaled LR: {unscaled_lrs}, scaled LR: {self.get_lr()}") + if self.verbose: + print(f"Batch id {self.last_epoch}, unscaled LRs {unscaled_lrs}, scaled LRs {self.get_lr()}") def lr_scheduler_for_variable_batch_size(base_batch_size, @@ -306,7 +302,7 @@ def get_lr(self) -> float: if isinstance(lr_scheduler_or_optimizer, Optimizer): lr_scheduler = StubLRScheduler(lr_scheduler_or_optimizer) - elif isinstance(lr_scheduler_or_optimizer, LRScheduler): + elif hasattr(lr_scheduler_or_optimizer, 'optimizer'): #LRScheduler or DeepSpeed 'object' schedulers lr_scheduler = lr_scheduler_or_optimizer else: raise ValueError("Unknown type for lr_scheduler_or_optimizer: {}".format(type(lr_scheduler_or_optimizer))) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py index 92c67d699d5d..897647c61b09 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py @@ -87,7 +87,7 @@ def to_layers(self): deepspeed.init_distributed() device = f"cuda:{dist.get_local_rank()}" max_tokens_per_batch = 40 - pipeline_num_stages = 0 + pipeline_num_stages = 2 max_seqlen = 15 dataset = TestData(seq_count=300, min_seqlen=5, max_seqlen=max_seqlen) @@ -108,12 +108,16 @@ def to_layers(self): "lr": 1e-3, } }, + # "scheduler": { + # "type": "WarmupLR", + # "params": { + # "warmup_min_lr": 0.001, + # "warmup_max_lr": 0.005, + # "warmup_num_steps": 1000 + # } + # } } - - # From deepspeed docs: https://deepspeed.readthedocs.io/en/latest/schedulers.html - # if the scheduler is supposed to execute at any other interval (e.g., training epochs), then the - # user should NOT pass the scheduler to DeepSpeed during initialization and must manage it explicitly. 
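
# A minimal sketch of the control flow implied by the note above: the scheduler is not
# passed to deepspeed.initialize() and is instead stepped manually once per batch.
# `engine`, `dataloader`, `lr_scheduler` and `loss_fn` are assumed to be built as in the
# example script below; gradient accumulation is omitted for brevity.
def train_with_manual_lr_stepping(engine, dataloader, lr_scheduler, loss_fn, num_epochs=1):
    for _ in range(num_epochs):
        lr_scheduler.step(0)  # point the LR scheduler at the first (unscaled) batch
        for seqs, labels in dataloader:
            outputs = engine(seqs)
            loss = loss_fn(outputs, labels)
            engine.backward(loss)
            engine.step()
            lr_scheduler.step()  # re-scale the LR for the next variable-size batch
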
engine, _, _, _ = deepspeed.initialize(config=config, model=model) dataloader, lr_scheduler, deepspeed_io_kwargs = \ get_dataloader_and_lr_scheduler_for_variable_batch_size( @@ -128,7 +132,7 @@ def to_layers(self): dataloader_num_replicas=engine.data_parallel_group.size(), dataloader_num_workers=0, dataloader_collate_fn=dataset.collate_fn, - lr_scheduler_or_optimizer = engine.optimizer or engine.lr_scheduler, + lr_scheduler_or_optimizer = engine.lr_scheduler or engine.optimizer, required_microbatches_of_same_size = pipeline_num_stages>0, required_microbatches_of_same_seqlen = pipeline_num_stages>0, sample_padding_fn=dataset.padding_fn, @@ -136,26 +140,27 @@ def to_layers(self): # engine.training_dataloader = dataloader # if you need to use a torch dataloader directly engine.training_dataloader = engine.deepspeed_io(**deepspeed_io_kwargs) + engine.lr_scheduler = engine.client_lr_scheduler = lr_scheduler gradient_acc_steps = engine.gradient_accumulation_steps() # effective_batch_size = train_micro_batch_size_per_gpu * gradient_accumulation_steps * number of dataloaders n_batches_per_rank = len(engine.training_dataloader) // (gradient_acc_steps*engine.train_micro_batch_size_per_gpu()) for epoch in range(10): - dataloader_it = iter(engine.training_dataloader) # point dataloader to first batch + engine.data_iterator = iter(engine.training_dataloader) # point data iterator to first batch lr_scheduler.step(0) # point LR scheduler to first batch for batch_id in range(n_batches_per_rank): - if pipeline_num_stages: + if pipeline_num_stages>0: engine.reset_activation_shape() # each batch has a diff BxT dimension - loss = engine.train_batch(data_iter=dataloader_it) - if engine.data_parallel_group.rank() == 0: - print(f"batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}") + loss = engine.train_batch() # lr_kwargs={"epoch": batch_id} + assert(engine.training_dataloader is not None) else: for i in range(gradient_acc_steps): - seqs, labels = next(dataloader_it) + seqs, labels = next(engine.data_iterator) seqs, labels = seqs.to(device), labels.to(device) outputs = engine(seqs) loss = loss_fn(outputs, labels) engine.backward(loss) + engine.step() # lr_kwargs={"epoch": batch_id}) + if engine.data_parallel_group.rank() == 0: print(f"batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}") - lr_scheduler.step() From 167f4841cf1b5d2c58ba6142bcde5547fc64ec36 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Wed, 13 Mar 2024 13:50:15 +0000 Subject: [PATCH 53/64] pre-commit hooks --- .../variable_batch_size_and_lr.py | 37 ++++++++++++------- .../variable_batch_size_and_lr_example.py | 33 +++++++++-------- 2 files changed, 41 insertions(+), 29 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py index 2a2bf2aa3d44..1ed816e4a1b5 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py @@ -181,7 +181,7 @@ def collate_fn_wrapper(list_microbatch_ids): "padding dataloader_padding_fn must be provided if required_microbatches_of_same_seqlen is True" pad_len = batch_max_seqlens[batch_id] batch_data = [sample_padding_fn(sample, pad_len) for sample in batch_data] - batch+=batch_data + batch += batch_data return dataloader_collate_fn(batch) if dataloader_collate_fn else batch dataloader = DataLoader( @@ -208,31 +208,37 @@ def 
collate_fn_wrapper(list_microbatch_ids): class VariableBatchSizeLR(LRScheduler): """ an LR scheduler that scales the LR of a given scheduler's LR """ - @property - def verbose(self): - return self.base_lr_scheduler.verbose - @property def optimizer(self): return self.base_lr_scheduler.optimizer - - def __init__(self, lr_scheduler, base_batch_size, batch_sizes, dataloader, lr_scaling_method="linear"): + + def __init__(self, + lr_scheduler, + base_batch_size, + batch_sizes, + dataloader, + lr_scaling_method="linear", + verbose=False): self.batch_sizes = batch_sizes self.base_batch_size = base_batch_size self.lr_scaling_method = lr_scaling_method self.dataloader = dataloader self.base_lr_scheduler = lr_scheduler - # the following exist in LRScheduler but not in DeepSpeed's LRScheduler so we create them here + # the following exist in LRScheduler but not in DeepSpeed's LRScheduler so we redefine them here self.base_lrs = self.base_lr_scheduler.get_lr() self.last_epoch = 0 + self.verbose = verbose def state_dict(self): - return { 'base_lr_scheduler': self.base_lr_scheduler.state_dict() } | { + return { + 'base_lr_scheduler': self.base_lr_scheduler.state_dict() + } | { 'base_batch_size': self.base_batch_size, 'lr_scaling_method': self.lr_scaling_method, 'batch_sizes': self.batch_sizes, 'base_lrs': self.base_lrs, 'last_epoch': self.last_epoch, + 'verbose': self.verbose, } def load_state_dict(self, state_dict): @@ -242,6 +248,7 @@ def load_state_dict(self, state_dict): self.batch_sizes = state_dict['batch_sizes'] self.base_lrs = state_dict['base_lrs'] self.last_epoch = state_dict['last_epoch'] + self.verbose = state_dict['verbose'] def get_last_lr(self): return self.base_lr_scheduler._last_lr @@ -293,7 +300,7 @@ def lr_scheduler_for_variable_batch_size(base_batch_size, Returns the new LRScheduler """ - + class StubLRScheduler(LRScheduler): """ a stub LR scheduler that does not change the LR, keeps it constant """ @@ -302,14 +309,16 @@ def get_lr(self) -> float: if isinstance(lr_scheduler_or_optimizer, Optimizer): lr_scheduler = StubLRScheduler(lr_scheduler_or_optimizer) - elif hasattr(lr_scheduler_or_optimizer, 'optimizer'): #LRScheduler or DeepSpeed 'object' schedulers + elif hasattr(lr_scheduler_or_optimizer, 'optimizer'): #LRScheduler or DeepSpeed 'object' schedulers lr_scheduler = lr_scheduler_or_optimizer else: raise ValueError("Unknown type for lr_scheduler_or_optimizer: {}".format(type(lr_scheduler_or_optimizer))) - return VariableBatchSizeLR( - lr_scheduler=lr_scheduler, base_batch_size=base_batch_size, batch_sizes=batch_sizes, - dataloader=dataloader, lr_scaling_method=lr_scaling_method) + return VariableBatchSizeLR(lr_scheduler=lr_scheduler, + base_batch_size=base_batch_size, + batch_sizes=batch_sizes, + dataloader=dataloader, + lr_scaling_method=lr_scaling_method) def get_dataloader_and_lr_scheduler_for_variable_batch_size( diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py index 897647c61b09..9f4037a40e61 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py @@ -108,14 +108,14 @@ def to_layers(self): "lr": 1e-3, } }, - # "scheduler": { - # "type": "WarmupLR", - # "params": { - # "warmup_min_lr": 0.001, - # "warmup_max_lr": 0.005, - # "warmup_num_steps": 1000 - # } - # } + "scheduler": { + "type": "WarmupLR", + "params": { + 
"warmup_min_lr": 0.001, + "warmup_max_lr": 0.005, + "warmup_num_steps": 1000 + } + } } engine, _, _, _ = deepspeed.initialize(config=config, model=model) @@ -143,16 +143,17 @@ def to_layers(self): engine.lr_scheduler = engine.client_lr_scheduler = lr_scheduler gradient_acc_steps = engine.gradient_accumulation_steps() # effective_batch_size = train_micro_batch_size_per_gpu * gradient_accumulation_steps * number of dataloaders - n_batches_per_rank = len(engine.training_dataloader) // (gradient_acc_steps*engine.train_micro_batch_size_per_gpu()) + n_batches_per_rank = len( + engine.training_dataloader) // (gradient_acc_steps * engine.train_micro_batch_size_per_gpu()) for epoch in range(10): - engine.data_iterator = iter(engine.training_dataloader) # point data iterator to first batch + engine.data_iterator = iter(engine.training_dataloader) # point data iterator to first batch lr_scheduler.step(0) # point LR scheduler to first batch for batch_id in range(n_batches_per_rank): - if pipeline_num_stages>0: + if pipeline_num_stages > 0: engine.reset_activation_shape() # each batch has a diff BxT dimension - loss = engine.train_batch() # lr_kwargs={"epoch": batch_id} - assert(engine.training_dataloader is not None) + loss = engine.train_batch() # lr_kwargs={"epoch": batch_id} + assert (engine.training_dataloader is not None) else: for i in range(gradient_acc_steps): seqs, labels = next(engine.data_iterator) @@ -160,7 +161,9 @@ def to_layers(self): outputs = engine(seqs) loss = loss_fn(outputs, labels) engine.backward(loss) - engine.step() # lr_kwargs={"epoch": batch_id}) + engine.step() # lr_kwargs={"epoch": batch_id}) if engine.data_parallel_group.rank() == 0: - print(f"batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}") + print( + f"batch {batch_id}, dl rank {engine.data_parallel_group.rank()} loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}" + ) From 814f1cb60b866f04dd1201f3c23cbbbd303c7191 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Wed, 13 Mar 2024 13:59:42 +0000 Subject: [PATCH 54/64] pre-commit hooks --- .../data_sampling/variable_batch_size_and_lr.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py index 1ed816e4a1b5..d3c8f4dca712 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py @@ -218,6 +218,7 @@ def __init__(self, batch_sizes, dataloader, lr_scaling_method="linear", + last_epoch=-1, verbose=False): self.batch_sizes = batch_sizes self.base_batch_size = base_batch_size @@ -226,8 +227,9 @@ def __init__(self, self.base_lr_scheduler = lr_scheduler # the following exist in LRScheduler but not in DeepSpeed's LRScheduler so we redefine them here self.base_lrs = self.base_lr_scheduler.get_lr() - self.last_epoch = 0 + self.last_epoch = last_epoch self.verbose = verbose + self.step(0) def state_dict(self): return { @@ -310,6 +312,7 @@ def get_lr(self) -> float: if isinstance(lr_scheduler_or_optimizer, Optimizer): lr_scheduler = StubLRScheduler(lr_scheduler_or_optimizer) elif hasattr(lr_scheduler_or_optimizer, 'optimizer'): #LRScheduler or DeepSpeed 'object' schedulers + assert isinstance(lr_scheduler_or_optimizer.optimizer, Optimizer) lr_scheduler = lr_scheduler_or_optimizer else: raise ValueError("Unknown type for lr_scheduler_or_optimizer: 
{}".format(type(lr_scheduler_or_optimizer))) From 02cb597d744b5ac950806692040f691d7fd6ea29 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Wed, 13 Mar 2024 15:46:11 +0000 Subject: [PATCH 55/64] batching config --- deepspeed/runtime/data_pipeline/config.py | 31 +++++----- deepspeed/runtime/data_pipeline/constants.py | 9 ++- .../variable_batch_size_and_lr.py | 57 ++++++++++++++----- .../variable_batch_size_and_lr_example.py | 41 ++++++------- 4 files changed, 85 insertions(+), 53 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/config.py b/deepspeed/runtime/data_pipeline/config.py index 38e076100a2b..b8730d03b7b2 100644 --- a/deepspeed/runtime/data_pipeline/config.py +++ b/deepspeed/runtime/data_pipeline/config.py @@ -20,7 +20,6 @@ def get_data_efficiency_config(param_dict): sub_param_dict = param_dict[DATA_EFFICIENCY] output[DATA_SAMPLING] = get_data_sampling(sub_param_dict) output[DATA_ROUTING] = get_data_routing(sub_param_dict) - return output @@ -47,7 +46,7 @@ def get_data_sampling(param_dict): param_dict[DATA_SAMPLING] = {} sub_param_dict = param_dict[DATA_SAMPLING] output[CURRICULUM_LEARNING] = get_curriculum_learning(sub_param_dict) - # output[DYNAMIC_BATCHING] = get_dynamic_batching(sub_param_dict) + output[DYNAMIC_BATCHING] = get_dynamic_batching(param_dict.get(DYNAMIC_BATCHING, {})) return output @@ -95,23 +94,19 @@ def get_dynamic_batching_params(param_dict): else: return {} - def get_dynamic_batching(param_dict): - output = {} - if DYNAMIC_BATCHING not in param_dict.keys(): - sub_param_dict = param_dict[DYNAMIC_BATCHING] - sub_param_dict[DYNAMIC_BATCHING_ENABLED] = DYNAMIC_BATCHING_ENABLED_DEFAULT - sub_param_dict[DYNAMIC_BATCHING_LR_SCALING] = DYNAMIC_BATCHING_LR_SCALING - sub_param_dict[DYNAMIC_BATCHING_MIN_BATCH_SIZE] = DYNAMIC_BATCHING_MIN_BATCH_SIZE_DEFAULT - sub_param_dict[DYNAMIC_BATCHING_MAX_BATCH_SIZE] = None - sub_param_dict[DYNAMIC_BATCHING_SAMPLES_ORDER] = DYNAMIC_BATCHING_SAMPLES_ORDER_DEFAULT - if sub_param_dict[DYNAMIC_BATCHING_ENABLED]: - assert DYNAMIC_BATCHING_MAX_TOKENS_PER_BATCH in sub_param_dict.keys(), \ - f"Dynamic batching is enabled, {DYNAMIC_BATCHING_MAX_TOKENS_PER_BATCH} must be specified" - for key, val in get_dynamic_batching_params(param_dict).items(): - output[key] = val - return output - + param_dict[DYNAMIC_BATCHING_ENABLED] = bool(param_dict.get(DYNAMIC_BATCHING_ENABLED, DYNAMIC_BATCHING_ENABLED_DEFAULT)) + param_dict[DYNAMIC_BATCHING_LR_SCALING_METHOD] = str(param_dict.get(DYNAMIC_BATCHING_LR_SCALING_METHOD, DYNAMIC_BATCHING_LR_SCALING_METHOD_DEFAULT)) + param_dict[DYNAMIC_BATCHING_MIN_BATCH_SIZE] = int(param_dict.get(DYNAMIC_BATCHING_MIN_BATCH_SIZE, DYNAMIC_BATCHING_MIN_BATCH_SIZE_DEFAULT)) + param_dict[DYNAMIC_BATCHING_NUM_WORKERS] = int(param_dict.get(DYNAMIC_BATCHING_NUM_WORKERS, DYNAMIC_BATCHING_NUM_WORKERS_DEFAULT)) + param_dict[DYNAMIC_BATCHING_MAX_BATCH_SIZE] = int(param_dict[DYNAMIC_BATCHING_MAX_BATCH_SIZE]) if DYNAMIC_BATCHING_MAX_BATCH_SIZE in param_dict else None + param_dict[DYNAMIC_BATCHING_SAMPLES_ORDER] = str(param_dict.get(DYNAMIC_BATCHING_SAMPLES_ORDER, DYNAMIC_BATCHING_SAMPLES_ORDER_DEFAULT)) + if param_dict[DYNAMIC_BATCHING_ENABLED]: + assert DYNAMIC_BATCHING_MAX_TOKENS_PER_BATCH in param_dict.keys(), f"Dynamic batching is enabled, so {DYNAMIC_BATCHING_MAX_TOKENS_PER_BATCH} must be specified" + param_dict[DYNAMIC_BATCHING_MAX_TOKENS_PER_BATCH] = int(param_dict[DYNAMIC_BATCHING_MAX_TOKENS_PER_BATCH]) + param_dict[DYNAMIC_BATCHING_PIN_MEMORY] = bool(param_dict.get(DYNAMIC_BATCHING_PIN_MEMORY, 
DYNAMIC_BATCHING_PIN_MEMORY_DEFAULT)) + param_dict[DYNAMIC_BATCHING_VERBOSE] = bool(param_dict.get(DYNAMIC_BATCHING_VERBOSE, False)) + return param_dict def get_curriculum_learning_enabled(param_dict): if CURRICULUM_LEARNING in param_dict.keys(): diff --git a/deepspeed/runtime/data_pipeline/constants.py b/deepspeed/runtime/data_pipeline/constants.py index 6689cd7e5c5e..ab2429477eb7 100644 --- a/deepspeed/runtime/data_pipeline/constants.py +++ b/deepspeed/runtime/data_pipeline/constants.py @@ -68,15 +68,20 @@ DYNAMIC_BATCHING = "dynamic_batching" DYNAMIC_BATCHING_ENABLED = "enabled" DYNAMIC_BATCHING_ENABLED_DEFAULT = False -DYNAMIC_BATCHING_LR_SCALING = "lr_scaling" # "linear" / "sqrt" / "none" -DYNAMIC_BATCHING_LR_SCALING_DEFAULT = "linear" +DYNAMIC_BATCHING_LR_SCALING_METHOD = "lr_scaling_method" # "linear" / "sqrt" / "none" +DYNAMIC_BATCHING_LR_SCALING_METHOD_DEFAULT = "linear" DYNAMIC_BATCHING_MIN_BATCH_SIZE = "min_batch_size" DYNAMIC_BATCHING_MIN_BATCH_SIZE_DEFAULT = 1 DYNAMIC_BATCHING_MAX_BATCH_SIZE = "max_batch_size" DYNAMIC_BATCHING_MAX_BATCH_SIZE_DEFAULT = None +DYNAMIC_BATCHING_NUM_WORKERS = "dataloader_num_workers" +DYNAMIC_BATCHING_NUM_WORKERS_DEFAULT = 0 +DYNAMIC_BATCHING_PIN_MEMORY = "dataloader_pin_memory" +DYNAMIC_BATCHING_PIN_MEMORY_DEFAULT = False DYNAMIC_BATCHING_SAMPLES_ORDER = "samples_order" # "random" / "order" / "default" DYNAMIC_BATCHING_SAMPLES_ORDER_DEFAULT = "dataloader" # "random" / "order" / "dataloader" DYNAMIC_BATCHING_MAX_TOKENS_PER_BATCH = "max_tokens_per_batch" +DYNAMIC_BATCHING_VERBOSE = "verbose" ######################################### # Curriculum Learning legacy implementation diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py index d3c8f4dca712..9b8675e641c3 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py @@ -9,6 +9,7 @@ from torch.optim.optimizer import Optimizer from torch.utils.data import DataLoader, DistributedSampler from deepspeed.utils import logger +from deepspeed.runtime.pipe.engine import PipelineEngine def batch_by_size( @@ -17,8 +18,7 @@ def batch_by_size( sample_ids=None, min_batch_size=1, max_batch_size=None, - shuffle_seqlens=False, - order_by_seqlen=False, + samples_order="dataloader", effective_batch_size=1, required_microbatches_of_same_size=False, verbose=False, @@ -37,8 +37,7 @@ def batch_by_size( automatically assigns a sequential order; - `min_batch_size`: smallest allowed size of a batch; - `min_batch_size`: largest allowed size of a batch; - - `shuffle_seqlens`: shuffle metric values before packing samples into batches; - - `order_by_seqlen`: order samples by ascending metric values before packing into batches; + - `samples_order`: order in which to process samples: "dataloader" (default), "random" or (ascending) "order" - `dataloader_num_replicas`: number of dataloaders - `effective_batch_size`: effective batch size; - `required_microbatches_of_same_size`: enable if each mini-batch (in a total of `batch_size_multiple` @@ -51,16 +50,15 @@ def batch_by_size( - `batch_max_seqlens`: the max seqlen across all microbatches in a batch """ - assert not shuffle_seqlens or not order_by_seqlen, \ - "either sort_seqlens or shuffle_seqlens can be True, not both." 
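
# A sketch of the DeepSpeed config block that the constants above describe; the same keys
# are exercised by the example script later in this series. Values are placeholders, and the
# accepted `samples_order` strings evolve across the series ("order"/"default" here, later
# renamed to "seqlen"/"dataloader").
dynamic_batching_config = {
    "data_efficiency": {
        "dynamic_batching": {
            "enabled": True,
            "max_tokens_per_batch": 40,  # required whenever dynamic batching is enabled
            "lr_scaling_method": "linear",  # "linear" / "sqrt" / "none"
            "min_batch_size": 1,
            "max_batch_size": 10,
            "samples_order": "dataloader",
            "dataloader_num_workers": 0,
            "dataloader_pin_memory": False,
            "verbose": False,
        }
    }
}
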
+ assert samples_order in ["random", "order", "default"] sample_ids = sample_ids or list(range(len(seqlens))) metrics = list(zip(seqlens, sample_ids)) - if shuffle_seqlens: + if samples_order=='shuffle': metric_random = random.Random(seed) metric_random.shuffle(metrics) - if order_by_seqlen: + if samples_order=='sort': metrics = sorted(metrics) # go through metrics and warn user and filter samples that alone exceed the max batch threshold @@ -123,7 +121,7 @@ def is_microbatch_valid(metrics): n_tokens_in_batch = sum([m[0] for m in mbs[0]]) assert n_tokens_in_batch <= max_tokens_per_batch if verbose: - print(f"Batch id {batch_id}, size {batch_size}, tokens {n_tokens_in_batch} tokens, samples: {sample_ids}") + logger.info(f"Batch id {batch_id}, size {batch_size}, tokens {n_tokens_in_batch} tokens, samples: {sample_ids}") # return the sample ids of each microbatch, and the batch sizes assert len(batch_sizes) == len(microbatch_ids) // effective_batch_size @@ -280,7 +278,7 @@ def step(self, epoch=None): group['lr'] = scale_lr(self.base_batch_size, batch_size, group['lr'], self.lr_scaling_method) if self.verbose: - print(f"Batch id {self.last_epoch}, unscaled LRs {unscaled_lrs}, scaled LRs {self.get_lr()}") + logger.info(f"Batch id {self.last_epoch}, unscaled LRs {unscaled_lrs}, scaled LRs {self.get_lr()}") def lr_scheduler_for_variable_batch_size(base_batch_size, @@ -324,6 +322,39 @@ def get_lr(self) -> float: lr_scaling_method=lr_scaling_method) +def get_dataloader_and_lr_scheduler_for_variable_batch_size_deepspeed( + dataset, + dataset_seqlens, + engine, + batching_config, + sample_ids=None, + dataloader_collate_fn=None, + sample_padding_fn=None + ): + """ a simplified call to get_dataloader_and_lr_scheduler_for_variable_batch_size on deepspeed runtime""" + return get_dataloader_and_lr_scheduler_for_variable_batch_size( + dataset=dataset, + sample_ids=sample_ids, + dataset_seqlens=dataset_seqlens, + effective_batch_size=engine.train_batch_size(), + max_tokens_per_batch=batching_config["max_tokens_per_batch"], + lr_scaling_method=batching_config["lr_scaling_method"], + samples_order=batching_config["samples_order"], + min_batch_size=batching_config["min_batch_size"], + max_batch_size=batching_config["max_batch_size"], + dataloader_batch_size=engine.train_micro_batch_size_per_gpu(), + dataloader_rank=engine.data_parallel_group.rank(), + dataloader_num_replicas=engine.data_parallel_group.size(), + dataloader_num_workers=batching_config["dataloader_num_workers"], + dataloader_collate_fn=dataloader_collate_fn, + dataloader_pin_memory=batching_config["dataloader_pin_memory"], + sample_padding_fn=sample_padding_fn, + lr_scheduler_or_optimizer = engine.lr_scheduler or engine.optimizer, + required_microbatches_of_same_size = isinstance(engine, PipelineEngine), + required_microbatches_of_same_seqlen = isinstance(engine, PipelineEngine), + verbose=batching_config["verbose"], + ) + def get_dataloader_and_lr_scheduler_for_variable_batch_size( dataset, dataset_seqlens, @@ -333,8 +364,7 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size( lr_scaling_method="linear", min_batch_size=1, max_batch_size=None, - shuffle_seqlens=False, - order_by_seqlen=False, + samples_order="dataloader", dataloader_batch_size=1, dataloader_rank=0, dataloader_num_replicas=1, @@ -355,8 +385,7 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size( sample_ids=sample_ids, min_batch_size=min_batch_size, max_batch_size=max_batch_size, - shuffle_seqlens=shuffle_seqlens, - order_by_seqlen=order_by_seqlen, + 
samples_order=samples_order, effective_batch_size=effective_batch_size, required_microbatches_of_same_size=required_microbatches_of_same_size, verbose=verbose, diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py index 9f4037a40e61..179f65825ca1 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py @@ -11,7 +11,7 @@ import deepspeed.comm as dist from deepspeed.pipe import PipelineModule -from deepspeed.runtime.data_pipeline.data_sampling.variable_batch_size_and_lr import get_dataloader_and_lr_scheduler_for_variable_batch_size +from deepspeed.runtime.data_pipeline.data_sampling.variable_batch_size_and_lr import get_dataloader_and_lr_scheduler_for_variable_batch_size_deepspeed if __name__ == "__main__": @@ -91,7 +91,7 @@ def to_layers(self): max_seqlen = 15 dataset = TestData(seq_count=300, min_seqlen=5, max_seqlen=max_seqlen) - seqlens = [len(s[0]) for s in dataset] + dataset_seqlens = [len(s[0]) for s in dataset] model = AttentionHeadAndFeedForward(max_seqlen, dataset.embed_dim, device).to(device) loss_fn = lambda x, y: F.mse_loss(x.float(), y.float()) @@ -115,29 +115,32 @@ def to_layers(self): "warmup_max_lr": 0.005, "warmup_num_steps": 1000 } - } + }, + "data_efficiency": { + "enabled": True, + "dynamic_batching": { + "enabled": True, + "dataloader_num_workers": 0, + "dataloader_pin_memory": 0, + "lr_scaling_method": "linear", + "min_batch_size": 1, + "max_batch_size": 10, + "samples_order": "dataloader", # "random" / "order" / "default" + "max_tokens_per_batch": 40, + } + }, } engine, _, _, _ = deepspeed.initialize(config=config, model=model) dataloader, lr_scheduler, deepspeed_io_kwargs = \ - get_dataloader_and_lr_scheduler_for_variable_batch_size( + get_dataloader_and_lr_scheduler_for_variable_batch_size_deepspeed( dataset=dataset, - dataset_seqlens=seqlens, - effective_batch_size=engine.train_batch_size(), - max_tokens_per_batch=max_tokens_per_batch, - lr_scaling_method="linear", - order_by_seqlen=False, - dataloader_batch_size=engine.train_micro_batch_size_per_gpu(), - dataloader_rank=engine.data_parallel_group.rank(), - dataloader_num_replicas=engine.data_parallel_group.size(), - dataloader_num_workers=0, + dataset_seqlens=dataset_seqlens, + engine=engine, + batching_config=config["data_efficiency"]["dynamic_batching"], dataloader_collate_fn=dataset.collate_fn, - lr_scheduler_or_optimizer = engine.lr_scheduler or engine.optimizer, - required_microbatches_of_same_size = pipeline_num_stages>0, - required_microbatches_of_same_seqlen = pipeline_num_stages>0, - sample_padding_fn=dataset.padding_fn, - ) - + sample_padding_fn=dataset.padding_fn) + # engine.training_dataloader = dataloader # if you need to use a torch dataloader directly engine.training_dataloader = engine.deepspeed_io(**deepspeed_io_kwargs) engine.lr_scheduler = engine.client_lr_scheduler = lr_scheduler From c23f34c321eeef69476f3daf04e2f83b723f5bc4 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Wed, 13 Mar 2024 17:16:49 +0000 Subject: [PATCH 56/64] final polishing --- .../variable_batch_size_and_lr.py | 59 ++++++++++++------- .../variable_batch_size_and_lr_example.py | 57 +++++++++--------- 2 files changed, 66 insertions(+), 50 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py 
b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py index 9b8675e641c3..9b74874e8d5d 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py @@ -5,6 +5,7 @@ import random import torch +import numpy as np from torch.optim.lr_scheduler import LRScheduler from torch.optim.optimizer import Optimizer from torch.utils.data import DataLoader, DistributedSampler @@ -15,7 +16,7 @@ def batch_by_size( seqlens, max_tokens_per_batch, - sample_ids=None, + dataset_filter_ids=None, min_batch_size=1, max_batch_size=None, samples_order="dataloader", @@ -33,11 +34,11 @@ def batch_by_size( Arguments: - `seqlens`: a list of difficulties (metric values) for every sample in the dataset; - `max_tokens_per_batch`: upper cap in total difficulty in a batch; - - `sample_ids`: user-defined ids of the samples in seqlens. If not provided, - automatically assigns a sequential order; + - `dataset_filter_ids`: user-defined indices of samples in teh dataset that will be used to + batch. Remaining indices to be ignored. Default is `None` for all indices. - `min_batch_size`: smallest allowed size of a batch; - `min_batch_size`: largest allowed size of a batch; - - `samples_order`: order in which to process samples: "dataloader" (default), "random" or (ascending) "order" + - `samples_order`: order in which to process samples: "dataloader" (default), "random" or "seqlen" (ascending) - `dataloader_num_replicas`: number of dataloaders - `effective_batch_size`: effective batch size; - `required_microbatches_of_same_size`: enable if each mini-batch (in a total of `batch_size_multiple` @@ -50,15 +51,17 @@ def batch_by_size( - `batch_max_seqlens`: the max seqlen across all microbatches in a batch """ - assert samples_order in ["random", "order", "default"] - - sample_ids = sample_ids or list(range(len(seqlens))) - metrics = list(zip(seqlens, sample_ids)) + assert samples_order in ["random", "seqlen", "dataloader"] + if dataset_filter_ids is None: + metrics = list(zip(seqlens, range(len(seqlens)))) # use all samples + else: + metrics = list(zip(np.array(seqlens)[dataset_filter_ids], dataset_filter_ids)) + - if samples_order=='shuffle': + if samples_order=='random': metric_random = random.Random(seed) metric_random.shuffle(metrics) - if samples_order=='sort': + if samples_order=='seqlen': metrics = sorted(metrics) # go through metrics and warn user and filter samples that alone exceed the max batch threshold @@ -113,15 +116,15 @@ def is_microbatch_valid(metrics): mbs = microbatches[rank:rank + effective_batch_size] batch_size = sum([len(mb) for mb in mbs]) batch_max_seqlen = max([m[0] for metrics in mbs for m in metrics]) - sample_ids = [[m[1] for m in metrics] for metrics in mbs] - batch_and_mb_ids = zip([batch_id] * effective_batch_size, sample_ids) + dataset_filter_ids = [[m[1] for m in metrics] for metrics in mbs] + batch_and_mb_ids = zip([batch_id] * effective_batch_size, dataset_filter_ids) batch_sizes.append(batch_size) batch_max_seqlens.append(batch_max_seqlen) microbatch_ids += batch_and_mb_ids n_tokens_in_batch = sum([m[0] for m in mbs[0]]) assert n_tokens_in_batch <= max_tokens_per_batch if verbose: - logger.info(f"Batch id {batch_id}, size {batch_size}, tokens {n_tokens_in_batch} tokens, samples: {sample_ids}") + logger.info(f"Batch id {batch_id}, size {batch_size}, tokens {n_tokens_in_batch} tokens, samples: {dataset_filter_ids}") # return the sample ids of each microbatch, and the 
batch sizes assert len(batch_sizes) == len(microbatch_ids) // effective_batch_size @@ -326,15 +329,20 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size_deepspeed( dataset, dataset_seqlens, engine, - batching_config, - sample_ids=None, + dataset_filter_ids=None, dataloader_collate_fn=None, - sample_padding_fn=None + sample_padding_fn=None, + replace_lr_scheduler=True, + replace_dataloader=True ): - """ a simplified call to get_dataloader_and_lr_scheduler_for_variable_batch_size on deepspeed runtime""" - return get_dataloader_and_lr_scheduler_for_variable_batch_size( + """ + a simplified call to get_dataloader_and_lr_scheduler_for_variable_batch_size for the deepspeed runtime. + See `batch_by_size()` for arguments and documentation. + """ + batching_config = engine.config['data_efficiency']['dynamic_batching'] + dataloader, lr_scheduler, deepspeed_io_kwargs = get_dataloader_and_lr_scheduler_for_variable_batch_size( dataset=dataset, - sample_ids=sample_ids, + dataset_filter_ids=dataset_filter_ids, dataset_seqlens=dataset_seqlens, effective_batch_size=engine.train_batch_size(), max_tokens_per_batch=batching_config["max_tokens_per_batch"], @@ -354,13 +362,21 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size_deepspeed( required_microbatches_of_same_seqlen = isinstance(engine, PipelineEngine), verbose=batching_config["verbose"], ) + if replace_lr_scheduler: + engine.lr_scheduler = lr_scheduler + if replace_dataloader: + engine.training_dataloader = dataloader + engine.data_iterator = iter(engine.training_dataloader) + # engine.deepspeed_io(**deepspeed_io_kwargs) + return dataloader, lr_scheduler, deepspeed_io_kwargs + def get_dataloader_and_lr_scheduler_for_variable_batch_size( dataset, dataset_seqlens, max_tokens_per_batch, effective_batch_size, - sample_ids=None, + dataset_filter_ids=None, lr_scaling_method="linear", min_batch_size=1, max_batch_size=None, @@ -377,12 +393,13 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size( sample_padding_fn=None, verbose=False, ): + """ returns a dataloader and LR scheduler for the variable batch size. see `batch_by_size()` for details. """ # effective_batch_size = train_micro_batch_size_per_gpu * gradient_accumulation_steps * number of dataloaders microbatch_ids, batch_sizes, batch_max_seqlens = batch_by_size( seqlens=dataset_seqlens, max_tokens_per_batch=max_tokens_per_batch, - sample_ids=sample_ids, + dataset_filter_ids=dataset_filter_ids, min_batch_size=min_batch_size, max_batch_size=max_batch_size, samples_order=samples_order, diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py index 179f65825ca1..7f9585fb2e76 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py @@ -20,7 +20,6 @@ class TestData(torch.utils.data.Dataset): def __init__(self, seq_count, min_seqlen=1, max_seqlen=20, embed_dim=5, seed=0): data_random = random.Random(seed) - self.mask_size = max_seqlen # M: size of mask self.padding_value = 0 self.embed_dim = embed_dim self.seqs = [ @@ -43,20 +42,24 @@ def padding_fn(self, sample, size): seq = F.pad(seq, pad=(0, 0, 0, size - len(seq)), value=self.padding_value) return seq, label + class AttentionHeadAndFeedForward(nn.Module): - """ An attention head with variable-length inputs, followed by a feed forward of fixed input. No embeddings. 
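
# A small sketch of how the inputs to the helper above can be built: `dataset_seqlens` holds
# one length per sample and `dataset_filter_ids` acts as a curriculum-style filter that keeps
# only the samples whose length falls inside a window (mirroring the example script in this
# series; the helper name and the (sequence, label) sample layout are assumptions).
def build_seqlens_and_filter_ids(dataset, min_seqlen, max_seqlen):
    seqlens = [len(sample[0]) for sample in dataset]  # length of each sample's sequence
    filter_ids = [i for i, seqlen in enumerate(seqlens) if min_seqlen < seqlen < max_seqlen]
    return seqlens, filter_ids
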
""" + """ + A single attention head of batch of shape BxTxE (with variable T) and attention matrix + BxTxT, followed by a feed-forward network of input size BxMxE, where T<0: model = PipelineModule(layers=model.to_layers(), num_stages=pipeline_num_stages, loss_fn=loss_fn) - # DeepSpeed config + # DeepSpeed config includes the dynamic batching config = { "train_batch_size": 16, "train_micro_batch_size_per_gpu": 2, # Note: each microbatch per GPU will fill up to N tokens @@ -125,48 +127,45 @@ def to_layers(self): "lr_scaling_method": "linear", "min_batch_size": 1, "max_batch_size": 10, - "samples_order": "dataloader", # "random" / "order" / "default" + "samples_order": "dataloader", # "random" / "seqlen" / "default" "max_tokens_per_batch": 40, + "verbose": False, } }, } + # initialize deepspeed engine without dataset/dataloader engine, _, _, _ = deepspeed.initialize(config=config, model=model) - dataloader, lr_scheduler, deepspeed_io_kwargs = \ + + # We will simulate a curriculum step, by filtering only a subset of sequences with a given seqlen + dataset_filter_ids = [i for i, seqlen in enumerate(dataset_seqlens) if seqlen>7 and seqlen<14] + dataloader, lr_scheduler, _ = \ get_dataloader_and_lr_scheduler_for_variable_batch_size_deepspeed( dataset=dataset, dataset_seqlens=dataset_seqlens, + dataset_filter_ids=dataset_filter_ids, #remove or None to include the whole dataset engine=engine, - batching_config=config["data_efficiency"]["dynamic_batching"], dataloader_collate_fn=dataset.collate_fn, sample_padding_fn=dataset.padding_fn) - # engine.training_dataloader = dataloader # if you need to use a torch dataloader directly - engine.training_dataloader = engine.deepspeed_io(**deepspeed_io_kwargs) - engine.lr_scheduler = engine.client_lr_scheduler = lr_scheduler gradient_acc_steps = engine.gradient_accumulation_steps() - # effective_batch_size = train_micro_batch_size_per_gpu * gradient_accumulation_steps * number of dataloaders - n_batches_per_rank = len( - engine.training_dataloader) // (gradient_acc_steps * engine.train_micro_batch_size_per_gpu()) + n_batches_per_rank = len(dataloader) // (gradient_acc_steps * engine.train_micro_batch_size_per_gpu()) for epoch in range(10): - engine.data_iterator = iter(engine.training_dataloader) # point data iterator to first batch + data_iter = iter(dataloader) # point data iterator to first batch lr_scheduler.step(0) # point LR scheduler to first batch for batch_id in range(n_batches_per_rank): if pipeline_num_stages > 0: - engine.reset_activation_shape() # each batch has a diff BxT dimension - loss = engine.train_batch() # lr_kwargs={"epoch": batch_id} - assert (engine.training_dataloader is not None) + engine.reset_activation_shape() # reset, as each batch has a diff BxT dimension + loss = engine.train_batch(data_iter=data_iter) # lr_kwargs={"epoch": batch_id} else: for i in range(gradient_acc_steps): - seqs, labels = next(engine.data_iterator) + seqs, labels = next(data_iter) seqs, labels = seqs.to(device), labels.to(device) outputs = engine(seqs) loss = loss_fn(outputs, labels) engine.backward(loss) engine.step() # lr_kwargs={"epoch": batch_id}) - if engine.data_parallel_group.rank() == 0: - print( - f"batch {batch_id}, dl rank {engine.data_parallel_group.rank()} loss {loss.item()}, LRs {lr_scheduler.get_lr()}, epoch {epoch}" - ) + if engine.data_parallel_group.rank(): + print( f"epoch {epoch}, batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}" ) From af8b06898e14e0422069564a9ea61b9894fcb3d4 Mon Sep 17 00:00:00 2001 From: Bruno 
Magalhaes Date: Wed, 13 Mar 2024 18:15:52 +0000 Subject: [PATCH 57/64] pre-commit hooks --- deepspeed/runtime/data_pipeline/config.py | 26 +++++-- .../variable_batch_size_and_lr.py | 73 +++++++++---------- .../variable_batch_size_and_lr_example.py | 9 +-- 3 files changed, 58 insertions(+), 50 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/config.py b/deepspeed/runtime/data_pipeline/config.py index b8730d03b7b2..1acb9df9384d 100644 --- a/deepspeed/runtime/data_pipeline/config.py +++ b/deepspeed/runtime/data_pipeline/config.py @@ -94,20 +94,30 @@ def get_dynamic_batching_params(param_dict): else: return {} + def get_dynamic_batching(param_dict): - param_dict[DYNAMIC_BATCHING_ENABLED] = bool(param_dict.get(DYNAMIC_BATCHING_ENABLED, DYNAMIC_BATCHING_ENABLED_DEFAULT)) - param_dict[DYNAMIC_BATCHING_LR_SCALING_METHOD] = str(param_dict.get(DYNAMIC_BATCHING_LR_SCALING_METHOD, DYNAMIC_BATCHING_LR_SCALING_METHOD_DEFAULT)) - param_dict[DYNAMIC_BATCHING_MIN_BATCH_SIZE] = int(param_dict.get(DYNAMIC_BATCHING_MIN_BATCH_SIZE, DYNAMIC_BATCHING_MIN_BATCH_SIZE_DEFAULT)) - param_dict[DYNAMIC_BATCHING_NUM_WORKERS] = int(param_dict.get(DYNAMIC_BATCHING_NUM_WORKERS, DYNAMIC_BATCHING_NUM_WORKERS_DEFAULT)) - param_dict[DYNAMIC_BATCHING_MAX_BATCH_SIZE] = int(param_dict[DYNAMIC_BATCHING_MAX_BATCH_SIZE]) if DYNAMIC_BATCHING_MAX_BATCH_SIZE in param_dict else None - param_dict[DYNAMIC_BATCHING_SAMPLES_ORDER] = str(param_dict.get(DYNAMIC_BATCHING_SAMPLES_ORDER, DYNAMIC_BATCHING_SAMPLES_ORDER_DEFAULT)) + param_dict[DYNAMIC_BATCHING_ENABLED] = bool( + param_dict.get(DYNAMIC_BATCHING_ENABLED, DYNAMIC_BATCHING_ENABLED_DEFAULT)) + param_dict[DYNAMIC_BATCHING_LR_SCALING_METHOD] = str( + param_dict.get(DYNAMIC_BATCHING_LR_SCALING_METHOD, DYNAMIC_BATCHING_LR_SCALING_METHOD_DEFAULT)) + param_dict[DYNAMIC_BATCHING_MIN_BATCH_SIZE] = int( + param_dict.get(DYNAMIC_BATCHING_MIN_BATCH_SIZE, DYNAMIC_BATCHING_MIN_BATCH_SIZE_DEFAULT)) + param_dict[DYNAMIC_BATCHING_NUM_WORKERS] = int( + param_dict.get(DYNAMIC_BATCHING_NUM_WORKERS, DYNAMIC_BATCHING_NUM_WORKERS_DEFAULT)) + param_dict[DYNAMIC_BATCHING_MAX_BATCH_SIZE] = int( + param_dict[DYNAMIC_BATCHING_MAX_BATCH_SIZE]) if DYNAMIC_BATCHING_MAX_BATCH_SIZE in param_dict else None + param_dict[DYNAMIC_BATCHING_SAMPLES_ORDER] = str( + param_dict.get(DYNAMIC_BATCHING_SAMPLES_ORDER, DYNAMIC_BATCHING_SAMPLES_ORDER_DEFAULT)) if param_dict[DYNAMIC_BATCHING_ENABLED]: - assert DYNAMIC_BATCHING_MAX_TOKENS_PER_BATCH in param_dict.keys(), f"Dynamic batching is enabled, so {DYNAMIC_BATCHING_MAX_TOKENS_PER_BATCH} must be specified" + assert DYNAMIC_BATCHING_MAX_TOKENS_PER_BATCH in param_dict.keys( + ), f"Dynamic batching is enabled, so {DYNAMIC_BATCHING_MAX_TOKENS_PER_BATCH} must be specified" param_dict[DYNAMIC_BATCHING_MAX_TOKENS_PER_BATCH] = int(param_dict[DYNAMIC_BATCHING_MAX_TOKENS_PER_BATCH]) - param_dict[DYNAMIC_BATCHING_PIN_MEMORY] = bool(param_dict.get(DYNAMIC_BATCHING_PIN_MEMORY, DYNAMIC_BATCHING_PIN_MEMORY_DEFAULT)) + param_dict[DYNAMIC_BATCHING_PIN_MEMORY] = bool( + param_dict.get(DYNAMIC_BATCHING_PIN_MEMORY, DYNAMIC_BATCHING_PIN_MEMORY_DEFAULT)) param_dict[DYNAMIC_BATCHING_VERBOSE] = bool(param_dict.get(DYNAMIC_BATCHING_VERBOSE, False)) return param_dict + def get_curriculum_learning_enabled(param_dict): if CURRICULUM_LEARNING in param_dict.keys(): return get_scalar_param(param_dict[CURRICULUM_LEARNING], CURRICULUM_LEARNING_ENABLED, diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py 
b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py index 9b74874e8d5d..6064281c89da 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py @@ -34,7 +34,7 @@ def batch_by_size( Arguments: - `seqlens`: a list of difficulties (metric values) for every sample in the dataset; - `max_tokens_per_batch`: upper cap in total difficulty in a batch; - - `dataset_filter_ids`: user-defined indices of samples in teh dataset that will be used to + - `dataset_filter_ids`: user-defined indices of samples in the dataset that will be used to batch. Remaining indices to be ignored. Default is `None` for all indices. - `min_batch_size`: smallest allowed size of a batch; - `min_batch_size`: largest allowed size of a batch; @@ -53,15 +53,14 @@ def batch_by_size( assert samples_order in ["random", "seqlen", "dataloader"] if dataset_filter_ids is None: - metrics = list(zip(seqlens, range(len(seqlens)))) # use all samples + metrics = list(zip(seqlens, range(len(seqlens)))) # use all samples else: metrics = list(zip(np.array(seqlens)[dataset_filter_ids], dataset_filter_ids)) - - if samples_order=='random': + if samples_order == 'random': metric_random = random.Random(seed) metric_random.shuffle(metrics) - if samples_order=='seqlen': + if samples_order == 'seqlen': metrics = sorted(metrics) # go through metrics and warn user and filter samples that alone exceed the max batch threshold @@ -124,7 +123,9 @@ def is_microbatch_valid(metrics): n_tokens_in_batch = sum([m[0] for m in mbs[0]]) assert n_tokens_in_batch <= max_tokens_per_batch if verbose: - logger.info(f"Batch id {batch_id}, size {batch_size}, tokens {n_tokens_in_batch} tokens, samples: {dataset_filter_ids}") + logger.info( + f"Batch id {batch_id}, size {batch_size}, tokens {n_tokens_in_batch} tokens, samples: {dataset_filter_ids}" + ) # return the sample ids of each microbatch, and the batch sizes assert len(batch_sizes) == len(microbatch_ids) // effective_batch_size @@ -325,43 +326,41 @@ def get_lr(self) -> float: lr_scaling_method=lr_scaling_method) -def get_dataloader_and_lr_scheduler_for_variable_batch_size_deepspeed( - dataset, - dataset_seqlens, - engine, - dataset_filter_ids=None, - dataloader_collate_fn=None, - sample_padding_fn=None, - replace_lr_scheduler=True, - replace_dataloader=True - ): +def get_dataloader_and_lr_scheduler_for_variable_batch_size_deepspeed(dataset, + dataset_seqlens, + engine, + dataset_filter_ids=None, + dataloader_collate_fn=None, + sample_padding_fn=None, + replace_lr_scheduler=True, + replace_dataloader=True): """ a simplified call to get_dataloader_and_lr_scheduler_for_variable_batch_size for the deepspeed runtime. See `batch_by_size()` for arguments and documentation. 
""" batching_config = engine.config['data_efficiency']['dynamic_batching'] dataloader, lr_scheduler, deepspeed_io_kwargs = get_dataloader_and_lr_scheduler_for_variable_batch_size( - dataset=dataset, - dataset_filter_ids=dataset_filter_ids, - dataset_seqlens=dataset_seqlens, - effective_batch_size=engine.train_batch_size(), - max_tokens_per_batch=batching_config["max_tokens_per_batch"], - lr_scaling_method=batching_config["lr_scaling_method"], - samples_order=batching_config["samples_order"], - min_batch_size=batching_config["min_batch_size"], - max_batch_size=batching_config["max_batch_size"], - dataloader_batch_size=engine.train_micro_batch_size_per_gpu(), - dataloader_rank=engine.data_parallel_group.rank(), - dataloader_num_replicas=engine.data_parallel_group.size(), - dataloader_num_workers=batching_config["dataloader_num_workers"], - dataloader_collate_fn=dataloader_collate_fn, - dataloader_pin_memory=batching_config["dataloader_pin_memory"], - sample_padding_fn=sample_padding_fn, - lr_scheduler_or_optimizer = engine.lr_scheduler or engine.optimizer, - required_microbatches_of_same_size = isinstance(engine, PipelineEngine), - required_microbatches_of_same_seqlen = isinstance(engine, PipelineEngine), - verbose=batching_config["verbose"], - ) + dataset=dataset, + dataset_filter_ids=dataset_filter_ids, + dataset_seqlens=dataset_seqlens, + effective_batch_size=engine.train_batch_size(), + max_tokens_per_batch=batching_config["max_tokens_per_batch"], + lr_scaling_method=batching_config["lr_scaling_method"], + samples_order=batching_config["samples_order"], + min_batch_size=batching_config["min_batch_size"], + max_batch_size=batching_config["max_batch_size"], + dataloader_batch_size=engine.train_micro_batch_size_per_gpu(), + dataloader_rank=engine.data_parallel_group.rank(), + dataloader_num_replicas=engine.data_parallel_group.size(), + dataloader_num_workers=batching_config["dataloader_num_workers"], + dataloader_collate_fn=dataloader_collate_fn, + dataloader_pin_memory=batching_config["dataloader_pin_memory"], + sample_padding_fn=sample_padding_fn, + lr_scheduler_or_optimizer=engine.lr_scheduler or engine.optimizer, + required_microbatches_of_same_size=isinstance(engine, PipelineEngine), + required_microbatches_of_same_seqlen=isinstance(engine, PipelineEngine), + verbose=batching_config["verbose"], + ) if replace_lr_scheduler: engine.lr_scheduler = lr_scheduler if replace_dataloader: diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py index 7f9585fb2e76..1c67273fbb15 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py @@ -42,7 +42,6 @@ def padding_fn(self, sample, size): seq = F.pad(seq, pad=(0, 0, 0, size - len(seq)), value=self.padding_value) return seq, label - class AttentionHeadAndFeedForward(nn.Module): """ A single attention head of batch of shape BxTxE (with variable T) and attention matrix @@ -97,7 +96,7 @@ def to_layers(self): model = AttentionHeadAndFeedForward(max_seqlen, dataset.embed_dim).to(device) loss_fn = lambda x, y: F.mse_loss(x.float(), y.float()) - if pipeline_num_stages>0: + if pipeline_num_stages > 0: model = PipelineModule(layers=model.to_layers(), num_stages=pipeline_num_stages, loss_fn=loss_fn) # DeepSpeed config includes the dynamic batching @@ -138,7 +137,7 @@ def to_layers(self): engine, _, _, _ = 
deepspeed.initialize(config=config, model=model) # We will simulate a curriculum step, by filtering only a subset of sequences with a given seqlen - dataset_filter_ids = [i for i, seqlen in enumerate(dataset_seqlens) if seqlen>7 and seqlen<14] + dataset_filter_ids = [i for i, seqlen in enumerate(dataset_seqlens) if seqlen > 7 and seqlen < 14] dataloader, lr_scheduler, _ = \ get_dataloader_and_lr_scheduler_for_variable_batch_size_deepspeed( dataset=dataset, @@ -147,7 +146,7 @@ def to_layers(self): engine=engine, dataloader_collate_fn=dataset.collate_fn, sample_padding_fn=dataset.padding_fn) - + gradient_acc_steps = engine.gradient_accumulation_steps() n_batches_per_rank = len(dataloader) // (gradient_acc_steps * engine.train_micro_batch_size_per_gpu()) @@ -168,4 +167,4 @@ def to_layers(self): engine.step() # lr_kwargs={"epoch": batch_id}) if engine.data_parallel_group.rank(): - print( f"epoch {epoch}, batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}" ) + print(f"epoch {epoch}, batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}") From 1aabceca73670a19d909b1d65d80b867555c1d36 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Wed, 13 Mar 2024 23:59:28 +0000 Subject: [PATCH 58/64] added line for support contact --- .../data_pipeline/data_sampling/variable_batch_size_and_lr.py | 2 ++ .../data_sampling/variable_batch_size_and_lr_example.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py index 6064281c89da..f0f832fd4022 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py @@ -3,6 +3,8 @@ # DeepSpeed Team +# support/questions/maintenance: github user @brunomaga, @bm-synth or @microsoft/DeepSpeed + import random import torch import numpy as np diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py index 1c67273fbb15..54cb6e4dce00 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py @@ -3,6 +3,8 @@ # DeepSpeed Team +# support/questions/maintenance: github user @brunomaga, @bm-synth or @microsoft/DeepSpeed + import random import torch import torch.nn as nn From 4fd6303b6b90a5574404b5f1b88c7a8927a3776b Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Thu, 14 Mar 2024 13:16:29 +0000 Subject: [PATCH 59/64] sample_seqlen_fn --- .../data_sampling/variable_batch_size_and_lr.py | 11 +++++++++-- .../variable_batch_size_and_lr_example.py | 8 +++++++- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py index f0f832fd4022..a805fea77568 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py @@ -329,11 +329,12 @@ def get_lr(self) -> float: def get_dataloader_and_lr_scheduler_for_variable_batch_size_deepspeed(dataset, - dataset_seqlens, engine, + dataset_seqlens=None, dataset_filter_ids=None, dataloader_collate_fn=None, sample_padding_fn=None, + sample_seqlen_fn=None, 
replace_lr_scheduler=True, replace_dataloader=True): """ @@ -358,6 +359,7 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size_deepspeed(dataset, dataloader_collate_fn=dataloader_collate_fn, dataloader_pin_memory=batching_config["dataloader_pin_memory"], sample_padding_fn=sample_padding_fn, + sample_seqlen_fn=sample_seqlen_fn, lr_scheduler_or_optimizer=engine.lr_scheduler or engine.optimizer, required_microbatches_of_same_size=isinstance(engine, PipelineEngine), required_microbatches_of_same_seqlen=isinstance(engine, PipelineEngine), @@ -374,9 +376,9 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size_deepspeed(dataset, def get_dataloader_and_lr_scheduler_for_variable_batch_size( dataset, - dataset_seqlens, max_tokens_per_batch, effective_batch_size, + dataset_seqlens=None, dataset_filter_ids=None, lr_scaling_method="linear", min_batch_size=1, @@ -392,10 +394,15 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size( required_microbatches_of_same_size=False, required_microbatches_of_same_seqlen=False, sample_padding_fn=None, + sample_seqlen_fn=None, verbose=False, ): """ returns a dataloader and LR scheduler for the variable batch size. see `batch_by_size()` for details. """ + if dataset_seqlens is None: + assert sample_seqlen_fn is not None, "sample_seqlen_fn must be provided if dataset_seqlens is None" + dataset_seqlens = [sample_seqlen_fn(dataset[i]) for i in range(len(dataset))] + # effective_batch_size = train_micro_batch_size_per_gpu * gradient_accumulation_steps * number of dataloaders microbatch_ids, batch_sizes, batch_max_seqlens = batch_by_size( seqlens=dataset_seqlens, diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py index 54cb6e4dce00..8ded16b75261 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py @@ -44,6 +44,10 @@ def padding_fn(self, sample, size): seq = F.pad(seq, pad=(0, 0, 0, size - len(seq)), value=self.padding_value) return seq, label + def seqlen_fn(self, sample): + seq, _ = sample + return len(seq) + class AttentionHeadAndFeedForward(nn.Module): """ A single attention head of batch of shape BxTxE (with variable T) and attention matrix @@ -147,7 +151,9 @@ def to_layers(self): dataset_filter_ids=dataset_filter_ids, #remove or None to include the whole dataset engine=engine, dataloader_collate_fn=dataset.collate_fn, - sample_padding_fn=dataset.padding_fn) + sample_padding_fn=dataset.padding_fn, + sample_seqlen_fn=dataset.seqlen_fn, #only used when dataset_seqlens is None + ) gradient_acc_steps = engine.gradient_accumulation_steps() n_batches_per_rank = len(dataloader) // (gradient_acc_steps * engine.train_micro_batch_size_per_gpu()) From c0de7e6690c0fdfa83d4110d26a8f15748d9ec0b Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Thu, 14 Mar 2024 13:22:45 +0000 Subject: [PATCH 60/64] removed sample_seqlen_fn, should be done in parallel somewhere else --- .../data_sampling/variable_batch_size_and_lr.py | 11 ++--------- .../variable_batch_size_and_lr_example.py | 8 +------- 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py index a805fea77568..f0f832fd4022 100644 --- 
a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py @@ -329,12 +329,11 @@ def get_lr(self) -> float: def get_dataloader_and_lr_scheduler_for_variable_batch_size_deepspeed(dataset, + dataset_seqlens, engine, - dataset_seqlens=None, dataset_filter_ids=None, dataloader_collate_fn=None, sample_padding_fn=None, - sample_seqlen_fn=None, replace_lr_scheduler=True, replace_dataloader=True): """ @@ -359,7 +358,6 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size_deepspeed(dataset, dataloader_collate_fn=dataloader_collate_fn, dataloader_pin_memory=batching_config["dataloader_pin_memory"], sample_padding_fn=sample_padding_fn, - sample_seqlen_fn=sample_seqlen_fn, lr_scheduler_or_optimizer=engine.lr_scheduler or engine.optimizer, required_microbatches_of_same_size=isinstance(engine, PipelineEngine), required_microbatches_of_same_seqlen=isinstance(engine, PipelineEngine), @@ -376,9 +374,9 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size_deepspeed(dataset, def get_dataloader_and_lr_scheduler_for_variable_batch_size( dataset, + dataset_seqlens, max_tokens_per_batch, effective_batch_size, - dataset_seqlens=None, dataset_filter_ids=None, lr_scaling_method="linear", min_batch_size=1, @@ -394,15 +392,10 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size( required_microbatches_of_same_size=False, required_microbatches_of_same_seqlen=False, sample_padding_fn=None, - sample_seqlen_fn=None, verbose=False, ): """ returns a dataloader and LR scheduler for the variable batch size. see `batch_by_size()` for details. """ - if dataset_seqlens is None: - assert sample_seqlen_fn is not None, "sample_seqlen_fn must be provided if dataset_seqlens is None" - dataset_seqlens = [sample_seqlen_fn(dataset[i]) for i in range(len(dataset))] - # effective_batch_size = train_micro_batch_size_per_gpu * gradient_accumulation_steps * number of dataloaders microbatch_ids, batch_sizes, batch_max_seqlens = batch_by_size( seqlens=dataset_seqlens, diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py index 8ded16b75261..54cb6e4dce00 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py @@ -44,10 +44,6 @@ def padding_fn(self, sample, size): seq = F.pad(seq, pad=(0, 0, 0, size - len(seq)), value=self.padding_value) return seq, label - def seqlen_fn(self, sample): - seq, _ = sample - return len(seq) - class AttentionHeadAndFeedForward(nn.Module): """ A single attention head of batch of shape BxTxE (with variable T) and attention matrix @@ -151,9 +147,7 @@ def to_layers(self): dataset_filter_ids=dataset_filter_ids, #remove or None to include the whole dataset engine=engine, dataloader_collate_fn=dataset.collate_fn, - sample_padding_fn=dataset.padding_fn, - sample_seqlen_fn=dataset.seqlen_fn, #only used when dataset_seqlens is None - ) + sample_padding_fn=dataset.padding_fn) gradient_acc_steps = engine.gradient_accumulation_steps() n_batches_per_rank = len(dataloader) // (gradient_acc_steps * engine.train_micro_batch_size_per_gpu()) From 427c70d325b42053cdf6da6232f66f3f6018b2fd Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Fri, 15 Mar 2024 16:09:03 +0000 Subject: [PATCH 61/64] minor bug fixes --- deepspeed/runtime/config.py | 1 - 
deepspeed/runtime/data_pipeline/config.py | 46 +++++++------------ deepspeed/runtime/data_pipeline/constants.py | 5 +- .../data_sampling/data_analyzer.py | 2 +- .../variable_batch_size_and_lr.py | 14 ++---- .../variable_batch_size_and_lr_example.py | 4 +- 6 files changed, 25 insertions(+), 47 deletions(-) diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index 975fb1f21501..a105d70de4cb 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -797,7 +797,6 @@ def __init__(self, config: Union[str, dict], mpu=None): def _initialize_params(self, param_dict): self.train_batch_size = get_train_batch_size(param_dict) - #print(f"beginning get_train_batch_size = {get_train_batch_size}") self.train_micro_batch_size_per_gpu = get_train_micro_batch_size_per_gpu(param_dict) self.gradient_accumulation_steps = get_gradient_accumulation_steps(param_dict) self.steps_per_print = get_steps_per_print(param_dict) diff --git a/deepspeed/runtime/data_pipeline/config.py b/deepspeed/runtime/data_pipeline/config.py index 1acb9df9384d..4973cdccb366 100644 --- a/deepspeed/runtime/data_pipeline/config.py +++ b/deepspeed/runtime/data_pipeline/config.py @@ -46,7 +46,7 @@ def get_data_sampling(param_dict): param_dict[DATA_SAMPLING] = {} sub_param_dict = param_dict[DATA_SAMPLING] output[CURRICULUM_LEARNING] = get_curriculum_learning(sub_param_dict) - output[DYNAMIC_BATCHING] = get_dynamic_batching(param_dict.get(DYNAMIC_BATCHING, {})) + output[DYNAMIC_BATCHING] = get_dynamic_batching(sub_param_dict) return output @@ -86,36 +86,24 @@ def get_curriculum_learning(param_dict): return output -def get_dynamic_batching_params(param_dict): - if DYNAMIC_BATCHING in param_dict.keys(): - dynamic_batching_params = copy.copy(param_dict[DYNAMIC_BATCHING]) - dynamic_batching_params.pop(DYNAMIC_BATCHING_ENABLED) - return dynamic_batching_params - else: - return {} - - def get_dynamic_batching(param_dict): - param_dict[DYNAMIC_BATCHING_ENABLED] = bool( - param_dict.get(DYNAMIC_BATCHING_ENABLED, DYNAMIC_BATCHING_ENABLED_DEFAULT)) - param_dict[DYNAMIC_BATCHING_LR_SCALING_METHOD] = str( - param_dict.get(DYNAMIC_BATCHING_LR_SCALING_METHOD, DYNAMIC_BATCHING_LR_SCALING_METHOD_DEFAULT)) - param_dict[DYNAMIC_BATCHING_MIN_BATCH_SIZE] = int( - param_dict.get(DYNAMIC_BATCHING_MIN_BATCH_SIZE, DYNAMIC_BATCHING_MIN_BATCH_SIZE_DEFAULT)) - param_dict[DYNAMIC_BATCHING_NUM_WORKERS] = int( - param_dict.get(DYNAMIC_BATCHING_NUM_WORKERS, DYNAMIC_BATCHING_NUM_WORKERS_DEFAULT)) - param_dict[DYNAMIC_BATCHING_MAX_BATCH_SIZE] = int( - param_dict[DYNAMIC_BATCHING_MAX_BATCH_SIZE]) if DYNAMIC_BATCHING_MAX_BATCH_SIZE in param_dict else None - param_dict[DYNAMIC_BATCHING_SAMPLES_ORDER] = str( - param_dict.get(DYNAMIC_BATCHING_SAMPLES_ORDER, DYNAMIC_BATCHING_SAMPLES_ORDER_DEFAULT)) - if param_dict[DYNAMIC_BATCHING_ENABLED]: - assert DYNAMIC_BATCHING_MAX_TOKENS_PER_BATCH in param_dict.keys( + output = copy.copy(param_dict.get(DYNAMIC_BATCHING, {})) + output[DYNAMIC_BATCHING_ENABLED] = bool(output.get(DYNAMIC_BATCHING_ENABLED, DYNAMIC_BATCHING_ENABLED_DEFAULT)) + output[DYNAMIC_BATCHING_LR_SCALING_METHOD] = str( + output.get(DYNAMIC_BATCHING_LR_SCALING_METHOD, DYNAMIC_BATCHING_LR_SCALING_METHOD_DEFAULT)) + output[DYNAMIC_BATCHING_MIN_BATCH_SIZE] = int( + output.get(DYNAMIC_BATCHING_MIN_BATCH_SIZE, DYNAMIC_BATCHING_MIN_BATCH_SIZE_DEFAULT)) + output[DYNAMIC_BATCHING_MAX_BATCH_SIZE] = int(output[DYNAMIC_BATCHING_MAX_BATCH_SIZE]) \ + if DYNAMIC_BATCHING_MAX_BATCH_SIZE in output.keys() \ + else DYNAMIC_BATCHING_MAX_BATCH_SIZE_DEFAULT 
+ output[DYNAMIC_BATCHING_SAMPLES_ORDER] = str( + output.get(DYNAMIC_BATCHING_SAMPLES_ORDER, DYNAMIC_BATCHING_SAMPLES_ORDER_DEFAULT)) + if output[DYNAMIC_BATCHING_ENABLED]: + assert DYNAMIC_BATCHING_MAX_TOKENS_PER_BATCH in output.keys( ), f"Dynamic batching is enabled, so {DYNAMIC_BATCHING_MAX_TOKENS_PER_BATCH} must be specified" - param_dict[DYNAMIC_BATCHING_MAX_TOKENS_PER_BATCH] = int(param_dict[DYNAMIC_BATCHING_MAX_TOKENS_PER_BATCH]) - param_dict[DYNAMIC_BATCHING_PIN_MEMORY] = bool( - param_dict.get(DYNAMIC_BATCHING_PIN_MEMORY, DYNAMIC_BATCHING_PIN_MEMORY_DEFAULT)) - param_dict[DYNAMIC_BATCHING_VERBOSE] = bool(param_dict.get(DYNAMIC_BATCHING_VERBOSE, False)) - return param_dict + output[DYNAMIC_BATCHING_MAX_TOKENS_PER_BATCH] = int(output[DYNAMIC_BATCHING_MAX_TOKENS_PER_BATCH]) + output[DYNAMIC_BATCHING_VERBOSE] = bool(output.get(DYNAMIC_BATCHING_VERBOSE, False)) + return output def get_curriculum_learning_enabled(param_dict): diff --git a/deepspeed/runtime/data_pipeline/constants.py b/deepspeed/runtime/data_pipeline/constants.py index ab2429477eb7..7634c023d6ad 100644 --- a/deepspeed/runtime/data_pipeline/constants.py +++ b/deepspeed/runtime/data_pipeline/constants.py @@ -68,16 +68,13 @@ DYNAMIC_BATCHING = "dynamic_batching" DYNAMIC_BATCHING_ENABLED = "enabled" DYNAMIC_BATCHING_ENABLED_DEFAULT = False +DYNAMIC_BATCHING_SEQLEN_SAMPLE_TO_METRIC_PATH = "seqlen_sample_to_metric_path" DYNAMIC_BATCHING_LR_SCALING_METHOD = "lr_scaling_method" # "linear" / "sqrt" / "none" DYNAMIC_BATCHING_LR_SCALING_METHOD_DEFAULT = "linear" DYNAMIC_BATCHING_MIN_BATCH_SIZE = "min_batch_size" DYNAMIC_BATCHING_MIN_BATCH_SIZE_DEFAULT = 1 DYNAMIC_BATCHING_MAX_BATCH_SIZE = "max_batch_size" DYNAMIC_BATCHING_MAX_BATCH_SIZE_DEFAULT = None -DYNAMIC_BATCHING_NUM_WORKERS = "dataloader_num_workers" -DYNAMIC_BATCHING_NUM_WORKERS_DEFAULT = 0 -DYNAMIC_BATCHING_PIN_MEMORY = "dataloader_pin_memory" -DYNAMIC_BATCHING_PIN_MEMORY_DEFAULT = False DYNAMIC_BATCHING_SAMPLES_ORDER = "samples_order" # "random" / "order" / "default" DYNAMIC_BATCHING_SAMPLES_ORDER_DEFAULT = "dataloader" # "random" / "order" / "dataloader" DYNAMIC_BATCHING_MAX_TOKENS_PER_BATCH = "max_tokens_per_batch" diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index 7088df223bd8..7d565c0c6049 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -582,7 +582,7 @@ def run_map_reduce(self): metric_to_samples_dict[value.item()] = [] metric_to_samples_dict[value.item()].append(sample.item()) - # index_to_metric and index_to_sample serialize a dicitonary from metric to samples + # index_to_metric and index_to_sample serialize a dictionary from metric to samples # index_to_metric stores a key per row, index_to_sample stores the values per row values = [torch.tensor([x]) for x in metric_to_samples_dict.keys()] samples = [torch.tensor(metric_to_samples_dict[x]) for x in metric_to_samples_dict.keys()] diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py index f0f832fd4022..7e099a277e34 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py @@ -13,6 +13,7 @@ from torch.utils.data import DataLoader, DistributedSampler from deepspeed.utils import logger from 
deepspeed.runtime.pipe.engine import PipelineEngine +from deepspeed.runtime.data_pipeline.constants import DYNAMIC_BATCHING, DYNAMIC_BATCHING_ENABLED def batch_by_size( @@ -333,14 +334,13 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size_deepspeed(dataset, engine, dataset_filter_ids=None, dataloader_collate_fn=None, - sample_padding_fn=None, - replace_lr_scheduler=True, - replace_dataloader=True): + sample_padding_fn=None): """ a simplified call to get_dataloader_and_lr_scheduler_for_variable_batch_size for the deepspeed runtime. See `batch_by_size()` for arguments and documentation. """ - batching_config = engine.config['data_efficiency']['dynamic_batching'] + batching_config = engine._config.data_efficiency_config[DYNAMIC_BATCHING] + assert batching_config[DYNAMIC_BATCHING_ENABLED], "Dynamic batching is not enabled in the config" dataloader, lr_scheduler, deepspeed_io_kwargs = get_dataloader_and_lr_scheduler_for_variable_batch_size( dataset=dataset, dataset_filter_ids=dataset_filter_ids, @@ -363,12 +363,6 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size_deepspeed(dataset, required_microbatches_of_same_seqlen=isinstance(engine, PipelineEngine), verbose=batching_config["verbose"], ) - if replace_lr_scheduler: - engine.lr_scheduler = lr_scheduler - if replace_dataloader: - engine.training_dataloader = dataloader - engine.data_iterator = iter(engine.training_dataloader) - # engine.deepspeed_io(**deepspeed_io_kwargs) return dataloader, lr_scheduler, deepspeed_io_kwargs diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py index 54cb6e4dce00..ed2458a9bbfc 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py @@ -124,7 +124,7 @@ def to_layers(self): "dynamic_batching": { "enabled": True, "dataloader_num_workers": 0, - "dataloader_pin_memory": 0, + "dataloader_pin_memory": False, "lr_scaling_method": "linear", "min_batch_size": 1, "max_batch_size": 10, @@ -168,5 +168,5 @@ def to_layers(self): engine.backward(loss) engine.step() # lr_kwargs={"epoch": batch_id}) - if engine.data_parallel_group.rank(): + if engine.data_parallel_group.rank() == 0: print(f"epoch {epoch}, batch {batch_id}, loss {loss.item()}, LRs {lr_scheduler.get_lr()}") From b072fbbfee6d93a8e2e65744b0a56115d253f880 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Fri, 15 Mar 2024 16:47:58 +0000 Subject: [PATCH 62/64] read seqlens from DataAnalyzer output --- deepspeed/runtime/data_pipeline/config.py | 1 + deepspeed/runtime/data_pipeline/constants.py | 2 + .../variable_batch_size_and_lr.py | 38 ++++++++++----- .../variable_batch_size_and_lr_example.py | 46 ++++++++++++++----- 4 files changed, 64 insertions(+), 23 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/config.py b/deepspeed/runtime/data_pipeline/config.py index 4973cdccb366..fd8b0df1c526 100644 --- a/deepspeed/runtime/data_pipeline/config.py +++ b/deepspeed/runtime/data_pipeline/config.py @@ -42,6 +42,7 @@ def get_data_sampling(param_dict): output[DATA_SAMPLING_ENABLED] = get_data_sampling_enabled(param_dict) output[DATA_SAMPLING_NUM_EPOCHS] = get_data_sampling_num_epochs(param_dict) output[DATA_SAMPLING_NUM_WORKERS] = get_data_sampling_num_workers(param_dict) + output[DATA_SAMPLING_PIN_MEMORY] = bool(output.get(param_dict[DATA_SAMPLING][DATA_SAMPLING_PIN_MEMORY], 
DATA_SAMPLING_PIN_MEMORY_DEFAULT)) if DATA_SAMPLING not in param_dict.keys(): param_dict[DATA_SAMPLING] = {} sub_param_dict = param_dict[DATA_SAMPLING] diff --git a/deepspeed/runtime/data_pipeline/constants.py b/deepspeed/runtime/data_pipeline/constants.py index 7634c023d6ad..98c3361f6852 100644 --- a/deepspeed/runtime/data_pipeline/constants.py +++ b/deepspeed/runtime/data_pipeline/constants.py @@ -22,6 +22,8 @@ DATA_SAMPLING_NUM_EPOCHS_DEFAULT = 1000 DATA_SAMPLING_NUM_WORKERS = "num_workers" DATA_SAMPLING_NUM_WORKERS_DEFAULT = 0 +DATA_SAMPLING_PIN_MEMORY = "pin_memory" +DATA_SAMPLING_PIN_MEMORY_DEFAULT = False ######################################### # Data efficiency - Data Sampling - Curriculum Learning diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py index 7e099a277e34..b1edfeecf159 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py @@ -7,13 +7,15 @@ import random import torch +import os import numpy as np from torch.optim.lr_scheduler import LRScheduler from torch.optim.optimizer import Optimizer from torch.utils.data import DataLoader, DistributedSampler from deepspeed.utils import logger from deepspeed.runtime.pipe.engine import PipelineEngine -from deepspeed.runtime.data_pipeline.constants import DYNAMIC_BATCHING, DYNAMIC_BATCHING_ENABLED +from deepspeed.runtime.data_pipeline.constants import * +from deepspeed.runtime.data_pipeline.data_sampling.indexed_dataset import MMapIndexedDataset def batch_by_size( @@ -330,8 +332,8 @@ def get_lr(self) -> float: def get_dataloader_and_lr_scheduler_for_variable_batch_size_deepspeed(dataset, - dataset_seqlens, engine, + dataset_seqlens=None, dataset_filter_ids=None, dataloader_collate_fn=None, sample_padding_fn=None): @@ -339,29 +341,43 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size_deepspeed(dataset, a simplified call to get_dataloader_and_lr_scheduler_for_variable_batch_size for the deepspeed runtime. See `batch_by_size()` for arguments and documentation. """ - batching_config = engine._config.data_efficiency_config[DYNAMIC_BATCHING] + data_sampling_config = engine._config.data_efficiency_config[DATA_SAMPLING] + batching_config = data_sampling_config[DYNAMIC_BATCHING] assert batching_config[DYNAMIC_BATCHING_ENABLED], "Dynamic batching is not enabled in the config" + + if dataset_seqlens is None: + # In not provided by user, look for the seqlen metric that was output by the Data Analyzer + # (see the mian in deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py for an example) + # TODO this only works when all nodes can access the same file storage + sample_to_seqlen_path = batching_config[DYNAMIC_BATCHING_SEQLEN_SAMPLE_TO_METRIC_PATH] + if not (os.path.exists(f"{sample_to_seqlen_path}.bin") and os.path.exists(f"{sample_to_seqlen_path}.idx")): + msg = (f"Cannot find metric files for sequence length in {sample_to_seqlen_path}.* . 
Run " + "DataAnalyzer with metric_name='seqlen' and metric_value='single_value_per_sample' and pass the" + f" path to the dynamic_batching config as {DYNAMIC_BATCHING_SEQLEN_SAMPLE_TO_METRIC_PATH}") + raise ValueError(msg) + dataset_seqlens = MMapIndexedDataset(sample_to_seqlen_path, skip_warmup=True) + dataloader, lr_scheduler, deepspeed_io_kwargs = get_dataloader_and_lr_scheduler_for_variable_batch_size( dataset=dataset, dataset_filter_ids=dataset_filter_ids, dataset_seqlens=dataset_seqlens, effective_batch_size=engine.train_batch_size(), - max_tokens_per_batch=batching_config["max_tokens_per_batch"], - lr_scaling_method=batching_config["lr_scaling_method"], - samples_order=batching_config["samples_order"], - min_batch_size=batching_config["min_batch_size"], - max_batch_size=batching_config["max_batch_size"], + max_tokens_per_batch=batching_config[DYNAMIC_BATCHING_MAX_TOKENS_PER_BATCH], + lr_scaling_method=batching_config[DYNAMIC_BATCHING_LR_SCALING_METHOD], + samples_order=batching_config[DYNAMIC_BATCHING_SAMPLES_ORDER], + min_batch_size=batching_config[DYNAMIC_BATCHING_MIN_BATCH_SIZE], + max_batch_size=batching_config[DYNAMIC_BATCHING_MAX_BATCH_SIZE], dataloader_batch_size=engine.train_micro_batch_size_per_gpu(), dataloader_rank=engine.data_parallel_group.rank(), dataloader_num_replicas=engine.data_parallel_group.size(), - dataloader_num_workers=batching_config["dataloader_num_workers"], + dataloader_num_workers=data_sampling_config[DATA_SAMPLING_NUM_WORKERS], dataloader_collate_fn=dataloader_collate_fn, - dataloader_pin_memory=batching_config["dataloader_pin_memory"], + dataloader_pin_memory=data_sampling_config[DATA_SAMPLING_PIN_MEMORY], sample_padding_fn=sample_padding_fn, lr_scheduler_or_optimizer=engine.lr_scheduler or engine.optimizer, required_microbatches_of_same_size=isinstance(engine, PipelineEngine), required_microbatches_of_same_seqlen=isinstance(engine, PipelineEngine), - verbose=batching_config["verbose"], + verbose=batching_config[DYNAMIC_BATCHING_VERBOSE], ) return dataloader, lr_scheduler, deepspeed_io_kwargs diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py index ed2458a9bbfc..ee63c04493be 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py @@ -94,7 +94,6 @@ def to_layers(self): max_seqlen = 15 dataset = TestData(seq_count=300, min_seqlen=5, max_seqlen=max_seqlen) - dataset_seqlens = [len(s[0]) for s in dataset] model = AttentionHeadAndFeedForward(max_seqlen, dataset.embed_dim).to(device) loss_fn = lambda x, y: F.mse_loss(x.float(), y.float()) @@ -121,17 +120,39 @@ def to_layers(self): }, "data_efficiency": { "enabled": True, - "dynamic_batching": { + "seed": 42, + "data_sampling": { "enabled": True, - "dataloader_num_workers": 0, - "dataloader_pin_memory": False, - "lr_scaling_method": "linear", - "min_batch_size": 1, - "max_batch_size": 10, - "samples_order": "dataloader", # "random" / "seqlen" / "default" - "max_tokens_per_batch": 40, - "verbose": False, - } + "num_epochs": 1, + "num_workers": 0, + "pin_memory": False, + # "curriculum_metrics": { + # "seqlen": { + # "index_to_sample_path": "./ds_curriculum_output/seqlen/seqlen_index_to_sample_percentile_merged", + # "index_to_metric_path": "./ds_curriculum_output/seqlen/seqlen_index_to_metric", + # "difficulty_type": "percentile", + # 
"clustering_type": "schedule_based", + # "max_difficulty": 100, + # "min_difficulty": 1, + # "schedule_type": "fixed_root", + # "schedule_config": { + # "total_curriculum_step": 110000, + # "difficulty_step": 1, #multiple of 8 to support FP16? + # "root_degree": 2 + # } + # }, + # }, + "dynamic_batching": { + "enabled": True, + "seqlen_sample_to_metric_path": "./ds_curriculum_output/seqlen/seqlen_sample_to_metric", + "lr_scaling_method": "linear", + "min_batch_size": 1, + "max_batch_size": 10, + "samples_order": "dataloader", # "random" / "seqlen" / "default" + "max_tokens_per_batch": 40, + "verbose": False, + }, + }, }, } @@ -139,11 +160,12 @@ def to_layers(self): engine, _, _, _ = deepspeed.initialize(config=config, model=model) # We will simulate a curriculum step, by filtering only a subset of sequences with a given seqlen + dataset_seqlens = [len(s[0]) for s in dataset] dataset_filter_ids = [i for i, seqlen in enumerate(dataset_seqlens) if seqlen > 7 and seqlen < 14] dataloader, lr_scheduler, _ = \ get_dataloader_and_lr_scheduler_for_variable_batch_size_deepspeed( dataset=dataset, - dataset_seqlens=dataset_seqlens, + dataset_seqlens=dataset_seqlens, #if not provided, will look for the output of DataAnalyzer dataset_filter_ids=dataset_filter_ids, #remove or None to include the whole dataset engine=engine, dataloader_collate_fn=dataset.collate_fn, From d121be591548d0f34b3c6a57b678e44d08d638ad Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Fri, 15 Mar 2024 18:30:07 +0000 Subject: [PATCH 63/64] flatten --- .../data_pipeline/data_sampling/variable_batch_size_and_lr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py index b1edfeecf159..ead2ceee4e9b 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py @@ -356,6 +356,7 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size_deepspeed(dataset, f" path to the dynamic_batching config as {DYNAMIC_BATCHING_SEQLEN_SAMPLE_TO_METRIC_PATH}") raise ValueError(msg) dataset_seqlens = MMapIndexedDataset(sample_to_seqlen_path, skip_warmup=True) + dataset_seqlens = torch.tensor(list(dataset_seqlens), dtype=torch.int64).flatten() # from Nx1 to N dataloader, lr_scheduler, deepspeed_io_kwargs = get_dataloader_and_lr_scheduler_for_variable_batch_size( dataset=dataset, From b32185c695800d380c6b4670a37decfa7fe75b71 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Sat, 16 Mar 2024 23:56:58 +0000 Subject: [PATCH 64/64] use DistributedDataAanalyzer to generate seqlen metric files if missing --- deepspeed/runtime/data_pipeline/config.py | 3 +- .../variable_batch_size_and_lr.py | 47 +++++++++++++++---- .../variable_batch_size_and_lr_example.py | 20 ++++++-- 3 files changed, 54 insertions(+), 16 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/config.py b/deepspeed/runtime/data_pipeline/config.py index fd8b0df1c526..d921ea732d32 100644 --- a/deepspeed/runtime/data_pipeline/config.py +++ b/deepspeed/runtime/data_pipeline/config.py @@ -42,7 +42,8 @@ def get_data_sampling(param_dict): output[DATA_SAMPLING_ENABLED] = get_data_sampling_enabled(param_dict) output[DATA_SAMPLING_NUM_EPOCHS] = get_data_sampling_num_epochs(param_dict) output[DATA_SAMPLING_NUM_WORKERS] = get_data_sampling_num_workers(param_dict) - output[DATA_SAMPLING_PIN_MEMORY] = 
bool(output.get(param_dict[DATA_SAMPLING][DATA_SAMPLING_PIN_MEMORY], DATA_SAMPLING_PIN_MEMORY_DEFAULT)) + output[DATA_SAMPLING_PIN_MEMORY] = bool( + output.get(param_dict[DATA_SAMPLING][DATA_SAMPLING_PIN_MEMORY], DATA_SAMPLING_PIN_MEMORY_DEFAULT)) if DATA_SAMPLING not in param_dict.keys(): param_dict[DATA_SAMPLING] = {} sub_param_dict = param_dict[DATA_SAMPLING] diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py index ead2ceee4e9b..51611d3ae41c 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py @@ -16,6 +16,8 @@ from deepspeed.runtime.pipe.engine import PipelineEngine from deepspeed.runtime.data_pipeline.constants import * from deepspeed.runtime.data_pipeline.data_sampling.indexed_dataset import MMapIndexedDataset +from deepspeed.runtime.data_pipeline.data_sampling.data_analyzer import DistributedDataAnalyzer +import pathlib def batch_by_size( @@ -336,27 +338,52 @@ def get_dataloader_and_lr_scheduler_for_variable_batch_size_deepspeed(dataset, dataset_seqlens=None, dataset_filter_ids=None, dataloader_collate_fn=None, - sample_padding_fn=None): + sample_padding_fn=None, + batch_seqlens_fn=None): """ a simplified call to get_dataloader_and_lr_scheduler_for_variable_batch_size for the deepspeed runtime. - See `batch_by_size()` for arguments and documentation. + Needs the seqlens of every sample. It will try three alternatives: + - if `dataset_seqlens` is provided by user, use that. + - otherwise, looks for the seqlen metric path (in the config) that contains the output of the Data Analyzer + - otherwise, use the user-provided function `batch_seqlens_fn` and call Data Analyzer to output seqlen metric + See `batch_by_size()` for arguments and more documentation. """ data_sampling_config = engine._config.data_efficiency_config[DATA_SAMPLING] batching_config = data_sampling_config[DYNAMIC_BATCHING] assert batching_config[DYNAMIC_BATCHING_ENABLED], "Dynamic batching is not enabled in the config" if dataset_seqlens is None: - # In not provided by user, look for the seqlen metric that was output by the Data Analyzer - # (see the mian in deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py for an example) - # TODO this only works when all nodes can access the same file storage + # If seqlens are not provided by the user, look for the seqlen metric that was output by the Data Analyzer + # (see the main in deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py for an example) sample_to_seqlen_path = batching_config[DYNAMIC_BATCHING_SEQLEN_SAMPLE_TO_METRIC_PATH] if not (os.path.exists(f"{sample_to_seqlen_path}.bin") and os.path.exists(f"{sample_to_seqlen_path}.idx")): - msg = (f"Cannot find metric files for sequence length in {sample_to_seqlen_path}.* . Run " - "DataAnalyzer with metric_name='seqlen' and metric_value='single_value_per_sample' and pass the" - f" path to the dynamic_batching config as {DYNAMIC_BATCHING_SEQLEN_SAMPLE_TO_METRIC_PATH}") - raise ValueError(msg) + # if the metric files are not found, we run the DataAnalyzer to write the metric files + msg = f"Cannot find metric files for sequence length in {sample_to_seqlen_path}.idx or *.bin." + msg += " We will run data analyzer to generate them..."
+ logger.warning(msg) + + if batch_seqlens_fn is None: + raise ValueError("sample_seqlen_fn must be provided if dataset_seqlens is not provided") + + DistributedDataAnalyzer( + dataset=dataset, + metric_functions=[batch_seqlens_fn], + collate_fn=dataloader_collate_fn, + batch_size=2**10, # batch size for map-reduce, not training + num_workers=engine.world_size, + worker_id=engine.global_rank, + save_path=pathlib.Path(f"{sample_to_seqlen_path}.bin").parent.parent, + metric_types=['single_value_per_sample'], + metric_names=["seqlen"], + device=engine.device, + ).run_map_reduce() + dataset_seqlens = MMapIndexedDataset(sample_to_seqlen_path, skip_warmup=True) - dataset_seqlens = torch.tensor(list(dataset_seqlens), dtype=torch.int64).flatten() # from Nx1 to N + assert len(dataset_seqlens) == len(dataset), "Seqlens size does not match the input dataset size" + + # TODO we are copying all seqlens into memory, we should adapt the code to use an iterative streamer + # and use the other files output by DataAnalyzer that returns an ordered dictionary of seqlen to sample ids + dataset_seqlens = np.array(list(dataset_seqlens), dtype=np.int64).flatten() # from Nx1 to N dataloader, lr_scheduler, deepspeed_io_kwargs = get_dataloader_and_lr_scheduler_for_variable_batch_size( dataset=dataset, diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py index ee63c04493be..e4bd5b09bb1f 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr_example.py @@ -31,19 +31,28 @@ def __init__(self, seq_count, min_seqlen=1, max_seqlen=20, embed_dim=5, seed=0): __len__ = lambda self: len(self.seqs) __getitem__ = lambda self, idx: (self.seqs[idx], len(self.seqs[idx])) - def collate_fn(self, batch): + def batch_collate_fn(self, batch): """ collate sequences of different lengths into batch of size BxTxE, where T is max seqlen """ seqs, labels = zip(*batch) seqs = nn.utils.rnn.pad_sequence(seqs, batch_first=True, padding_value=self.padding_value) labels = torch.tensor(labels, dtype=float) return seqs, labels - def padding_fn(self, sample, size): + def sample_padding_fn(self, sample, size): """ pad sequence `seq` of shape TxE to size T'xE where T' is given by `size` """ seq, label = sample seq = F.pad(seq, pad=(0, 0, 0, size - len(seq)), value=self.padding_value) return seq, label + def batch_seqlens_fn(self, batch): + """ given a batch, return the size of every sequence in the batch """ + seqlens = [] + seqs, _ = batch + for seq in seqs: + pad_indices = (seq[:, 0] == self.padding_value).nonzero(as_tuple=True)[0] + seqlens.append(len(seq) if len(pad_indices) == 0 else pad_indices[0].item()) + return torch.tensor(seqlens, dtype=torch.int64) + class AttentionHeadAndFeedForward(nn.Module): """ A single attention head of batch of shape BxTxE (with variable T) and attention matrix @@ -165,11 +174,12 @@ def to_layers(self): dataloader, lr_scheduler, _ = \ get_dataloader_and_lr_scheduler_for_variable_batch_size_deepspeed( dataset=dataset, - dataset_seqlens=dataset_seqlens, #if not provided, will look for the output of DataAnalyzer + # dataset_seqlens=dataset_seqlens, #if None, output metrics with DataAnalyzer and open them dataset_filter_ids=dataset_filter_ids, #remove or None to include the whole dataset engine=engine, - dataloader_collate_fn=dataset.collate_fn, - 
sample_padding_fn=dataset.padding_fn) + dataloader_collate_fn=dataset.batch_collate_fn, + sample_padding_fn=dataset.sample_padding_fn, + batch_seqlens_fn=dataset.batch_seqlens_fn,) gradient_acc_steps = engine.gradient_accumulation_steps() n_batches_per_rank = len(dataloader) // (gradient_acc_steps * engine.train_micro_batch_size_per_gpu())