diff --git a/llava/model/llava_arch.py b/llava/model/llava_arch.py
index d71650eac..678e27471 100644
--- a/llava/model/llava_arch.py
+++ b/llava/model/llava_arch.py
@@ -257,7 +257,13 @@ def prepare_inputs_labels_for_multimodal(
                 cur_new_input_embeds.append(cur_input_embeds_no_im[i])
                 cur_new_labels.append(cur_labels_noim[i])
                 if i < num_images:
-                    cur_image_features = image_features[cur_image_idx]
+                    try:
+                        cur_image_features = image_features[cur_image_idx]
+                    except Exception as e:
+                        print(f'Tracking input_ids START.....{e}')
+                        print(input_ids)
+                        print('Tracking input_ids END.....')
+                        raise  # re-raise: otherwise cur_image_features is undefined (or stale) below
                     cur_image_idx += 1
                     cur_new_input_embeds.append(cur_image_features)
                     cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=cur_labels.device, dtype=cur_labels.dtype))
diff --git a/llava/train/train.py b/llava/train/train.py
index d8442f265..1a3bf122a 100644
--- a/llava/train/train.py
+++ b/llava/train/train.py
@@ -750,6 +750,9 @@ def expand2square(pil_img, background_color):
             # image does not exist in the data, but the model is multimodal
             crop_size = self.data_args.image_processor.crop_size
             data_dict['image'] = torch.zeros(3, crop_size['height'], crop_size['width'])
+            print(sources)
+            print(data_dict)
+            print('Tracking source and data_dict above.....')
         return data_dict
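
For context: the error the first hunk is tracking is typically an IndexError that fires when the number of `<image>` placeholder tokens in `input_ids` does not match the number of entries in `image_features`, so `cur_image_idx` runs past the end of the list. Below is a minimal fail-fast sketch of that check as a standalone helper. The helper name and call site are hypothetical (not part of this patch), and `IMAGE_TOKEN_INDEX = -200` mirrors the placeholder id defined in `llava/constants.py`:

```python
import torch

# Placeholder token id used by LLaVA for <image> (see llava/constants.py).
IMAGE_TOKEN_INDEX = -200

def check_image_token_alignment(input_ids: torch.Tensor, image_features) -> None:
    """Raise a readable error instead of an opaque IndexError mid-splice."""
    num_image_tokens = int((input_ids == IMAGE_TOKEN_INDEX).sum())
    if num_image_tokens != len(image_features):
        raise ValueError(
            f"{num_image_tokens} <image> token(s) in input_ids but "
            f"{len(image_features)} image feature tensor(s)"
        )

# Example: two <image> tokens but only one feature tensor triggers the error.
ids = torch.tensor([1, IMAGE_TOKEN_INDEX, 5, IMAGE_TOKEN_INDEX, 2])
feats = [torch.zeros(576, 4096)]  # illustrative shape: 576 patches x 4096-dim
try:
    check_image_token_alignment(ids, feats)
except ValueError as err:
    print(err)  # -> 2 <image> token(s) in input_ids but 1 image feature tensor(s)
```

Running a check like this at the start of `prepare_inputs_labels_for_multimodal` would surface the mismatch before the splice loop, so the `print` instrumentation above can be removed once the offending sample is identified.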