From cf5d7314ff73e688fa608c294b7ea13f654f87f7 Mon Sep 17 00:00:00 2001
From: Jameson Merkow
Date: Thu, 17 Oct 2024 15:04:56 +0000
Subject: [PATCH] move csvs to storage

---
 .../medimageinsight/adapter-training.ipynb    | 36 +++++++++++++------
 .../exam-parameter-detection.ipynb            |  4 +--
 .../zero-shot-classification.ipynb            | 20 ++++++++---
 3 files changed, 43 insertions(+), 17 deletions(-)

diff --git a/sdk/python/foundation-models/healthcare-ai/medimageinsight/adapter-training.ipynb b/sdk/python/foundation-models/healthcare-ai/medimageinsight/adapter-training.ipynb
index 5cf14966b2..050b7b2084 100644
--- a/sdk/python/foundation-models/healthcare-ai/medimageinsight/adapter-training.ipynb
+++ b/sdk/python/foundation-models/healthcare-ai/medimageinsight/adapter-training.ipynb
@@ -127,7 +127,7 @@
    "- **Computing Image Embeddings:** Our source images are in DICOM format, as typical for radiological images. During inference, we extract the image pixel bytes and convert them to 8-bit monochromatic bitmap for compatibility with MedImageInsight. We then compute image embeddings for each image using the MedImageInsight model.\n",
    "\n",
    "**Diagnostic Labels**\n",
-   "- We provide the diagnostic information (i.e., Support Devices, Pleural Effusion) as categorical label (i.e. 0, 1, 2) to train an adaptor and enable the MedImageInsight embeddings to adapt on variable downstream tasks. The diagnostic labels are provided in the data split for training and testing: `./classification_demo/data_input/adaptor_tutorial_train_split.csv` & `./classification_demo/data_input/adaptor_tutorial_test_split.csv`."
+   "- We provide the diagnostic information (i.e., Support Devices, Pleural Effusion) as a categorical label (i.e., 0, 1, 2) to train an adaptor and enable the MedImageInsight embeddings to adapt to various downstream tasks. The diagnostic labels are provided in the data splits for training and testing: `/adaptor_tutorial_train_split.csv` & `/adaptor_tutorial_test_split.csv`."
   ]
  },
 {
@@ -147,6 +147,19 @@
    "        Exception(\n",
    "            f\"Please download dicom files as described in this notebook into the input_folder: {input_folder}\"\n",
    "        )\n",
+   "    )\n",
+   "\n",
+   "## Folder containing the label CSVs and data splits\n",
+   "csv_folder = (\n",
+   "    \"/home/azureuser/data/healthcare-ai/medimageinsight/classification_demo/data_input/\"\n",
+   ")\n",
+   "\n",
+   "## Check if the CSV folder exists\n",
+   "if not os.path.exists(csv_folder):\n",
+   "    raise (\n",
+   "        Exception(\n",
+   "            f\"Please download csv files as described in this notebook into the csv_folder: {csv_folder}\"\n",
+   "        )\n",
    "    )"
   ]
  },
@@ -160,8 +173,8 @@
    "- **2.1. Compute Embeddings**\n",
    "\n",
    "    First, we split our dataset into training and testing sets. The splits are specified in the CSV files:\n",
-   "    - Training set: `./classification_demo/data_input/adaptor_tutorial_train_split.csv`\n",
-   "    - Testing set: `./classification_demo/data_input/adaptor_tutorial_test_split.csv`\n",
+   "    - Training set: `/adaptor_tutorial_train_split.csv`\n",
+   "    - Testing set: `/adaptor_tutorial_test_split.csv`\n",
    "\n",
    "    After splitting the data manually, we compute the image embeddings for all imaging samples in both sets using the MedImageInsight model. These embeddings are high-dimensional vectors that capture the semantic meaning of the visual content of the images. They serve as compact and informative representations for various downstream tasks.\n",
    "\n",
@@ -293,8 +306,8 @@
    ")\n",
    "\n",
    "# Load the training and validation CSV files\n",
-   "train_csv_path = \"./classification_demo/data_input/adaptor_tutorial_train_split.csv\"\n",
-   "val_csv_path = \"./classification_demo/data_input/adaptor_tutorial_test_split.csv\"\n",
+   "train_csv_path = f\"{csv_folder}/adaptor_tutorial_train_split.csv\"\n",
+   "val_csv_path = f\"{csv_folder}/adaptor_tutorial_test_split.csv\"\n",
    "\n",
    "# Read the CSV files into DataFrames\n",
    "train_df = pd.read_csv(train_csv_path)\n",
@@ -2058,7 +2071,7 @@
    "    First, we prepare the test dataset and perform inference:\n",
    "\n",
    "    - **Load the Test Dataset:** We read the test CSV file, which contains the test data split:\n",
-   "        - **Test set:** `./classification_demo/data_input/adaptor_tutorial_test_split.csv`\n",
+   "        - **Test set:** `/adaptor_tutorial_test_split.csv`\n",
    "    - **Load Test Image Embeddings:** We use the image embeddings computed earlier and create a DataFrame `df_features` containing image names and their corresponding embeddings\n",
    "    - **Load the Trained Adapter Model:** We initialize the adapter model architecture and load the trained weights from the saved model file: \n",
    "        - **Model path:** `./medimageinsight_tutorial_output/adaptor_model/best_metric_model.pth`\n",
@@ -2113,7 +2126,9 @@
    "model_path = os.path.join(\n",
    "    output_dir, \"best_metric_model.pth\"\n",
    ")  # Path to your saved model\n",
-   "test_csv_path = \"./classification_demo/data_input/adaptor_tutorial_test_split.csv\"  # Path to your test CSV file\n",
+   "test_csv_path = (\n",
+   "    f\"{csv_folder}/adaptor_tutorial_test_split.csv\"  # Path to your test CSV file\n",
+   ")\n",
    "\n",
    "# Load the features DataFrame (same as during training)\n",
    "df_features = pd.DataFrame(\n",
@@ -2263,14 +2278,13 @@
    "    }\n",
    ")\n",
    "df.to_csv(\n",
-   "    \"./classification_demo/data_input/adaptor_finetuning_classification_results.csv\",\n",
+   "    \"./adaptor_finetuning_classification_results.csv\",\n",
    "    index=False,\n",
    ")\n",
    "\n",
    "## Load Zero-Shot Classification Results\n",
-   "df_zero_shot = pd.read_csv(\n",
-   "    \"./classification_demo/data_input/zero_shot_classification_results.csv\"\n",
-   ")\n",
+   "# You can also use your own results if you have done the zero-shot classification notebook!\n",
+   "df_zero_shot = pd.read_csv(f\"{csv_folder}/zero_shot_classification_results.csv\")\n",
    "\n",
    "# Select four images to visualize (Two correct examples and two failed examples)\n",
    "selected_images = [\n",
diff --git a/sdk/python/foundation-models/healthcare-ai/medimageinsight/exam-parameter-demo/exam-parameter-detection.ipynb b/sdk/python/foundation-models/healthcare-ai/medimageinsight/exam-parameter-demo/exam-parameter-detection.ipynb
index d177765855..92b723ef31 100644
--- a/sdk/python/foundation-models/healthcare-ai/medimageinsight/exam-parameter-demo/exam-parameter-detection.ipynb
+++ b/sdk/python/foundation-models/healthcare-ai/medimageinsight/exam-parameter-demo/exam-parameter-detection.ipynb
@@ -42,14 +42,14 @@
    "    - Install the required libraries. For convenience we provide the Conda environment definition file: [`examparamdetection_conda_environment.yml`](./examparamdetection_conda_environment.yml).\n",
    "\n",
    "3. **Download the sample data**: \n",
-   "    - Use the following command to download the dataset with samples into your working folder. Once you download, make sure the files are in the `data` directory located in the same directory as this notebook so that all paths in this sample work out of the box. \n",
+   "    - Use the following commands to download the dataset with samples into your working folder. Once you download, make sure the files are in the `data` directory located in the same directory as this notebook so that all paths in this sample work out of the box. \n",
    "\n",
    "    `azcopy copy --recursive https://azuremlexampledata.blob.core.windows.net/data/healthcare-ai/medimageinsight-examparameter/* ./data`\n",
    "\n",
    "\n",
    "4. **Data Assets**: \n",
    "    We are not providing the original DICOM files, but rather are providing the following:\n",
-   "    - DICOM tags extracted into a CSV file named `data/mri_sample_features-csm.csv`. Each row in this file represents a single MRI series.\n",
+   "    - DICOM tags extracted into a CSV file named `data/mri_sample_features-sm.csv`. Each row in this file represents a single MRI series.\n",
    "    - Embedding vectors and some slices used for visualization are provided in the above sample data. In this data you will find: \n",
    "        - Embedding vectors serialized as .pkl files in the: `data/feature_vectors` directory\n",
    "        - .png files of several slices from the original MRI dataset that we use for visualization in the `data/pngs` directory\n",
diff --git a/sdk/python/foundation-models/healthcare-ai/medimageinsight/zero-shot-classification.ipynb b/sdk/python/foundation-models/healthcare-ai/medimageinsight/zero-shot-classification.ipynb
index 7c1d002164..f78112cba7 100644
--- a/sdk/python/foundation-models/healthcare-ai/medimageinsight/zero-shot-classification.ipynb
+++ b/sdk/python/foundation-models/healthcare-ai/medimageinsight/zero-shot-classification.ipynb
@@ -146,6 +146,20 @@
    "        )\n",
    "    )\n",
    "\n",
+   "## Folder containing the label CSVs and data splits\n",
+   "csv_folder = (\n",
+   "    \"/home/azureuser/data/healthcare-ai/medimageinsight/classification_demo/data_input/\"\n",
+   ")\n",
+   "\n",
+   "## Check if the CSV folder exists\n",
+   "if not os.path.exists(csv_folder):\n",
+   "    raise (\n",
+   "        Exception(\n",
+   "            f\"Please download csv files as described in this notebook into the csv_folder: {csv_folder}\"\n",
+   "        )\n",
+   "    )\n",
+   "\n",
+   "\n",
    "## For Text\n",
    "## You can add your own text or use the below text for inference\n",
    "chest_pathology_text = [\n",
@@ -365,7 +379,7 @@
   ],
   "source": [
    "## Load Ground Truth Labels\n",
-   "findings_csv = os.path.join(\"./classification_demo/data_input/dcm_sample_label.csv\")\n",
+   "findings_csv = os.path.join(f\"{csv_folder}/dcm_sample_label.csv\")\n",
    "findings_df = pd.read_csv(findings_csv)\n",
    "\n",
    "# Generate dictionary to organize all probabilities corresponding to name\n",
@@ -480,9 +494,7 @@
    "        \"ground_truth_label\": [gt_dict[name][\"label_category\"] for name in name_list],\n",
    "    }\n",
    ")\n",
-   "df.to_csv(\n",
-   "    \"./classification_demo/data_input/zero_shot_classification_results.csv\", index=False\n",
-   ")\n",
+   "df.to_csv(\"./zero_shot_classification_results.csv\", index=False)\n",
    "\n",
    "# Select four images to visualize (Two correct examples and two failed examples)\n",
    "selected_images = [\n",
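
Note for reviewers trying this change locally: below is a minimal sketch (not part of the patch) of the CSV-loading pattern the three notebooks now share, assuming the files have already been downloaded to the `csv_folder` path used above. The `expected` filename list and the use of `FileNotFoundError` are illustrative choices of this sketch, not code from the notebooks.

import os

import pandas as pd

# Folder that now holds the label CSVs and data splits (same path the notebooks check);
# adjust if your azcopy destination differs.
csv_folder = "/home/azureuser/data/healthcare-ai/medimageinsight/classification_demo/data_input/"

# Filenames referenced by the patched notebooks; this list is illustrative, not exhaustive.
expected = [
    "adaptor_tutorial_train_split.csv",
    "adaptor_tutorial_test_split.csv",
    "dcm_sample_label.csv",
    "zero_shot_classification_results.csv",
]

# Fail fast with a download hint before touching any CSV, mirroring the notebooks' checks.
missing = [name for name in expected if not os.path.exists(os.path.join(csv_folder, name))]
if missing:
    raise FileNotFoundError(
        f"Please download the csv files into csv_folder ({csv_folder}); missing: {missing}"
    )

# Load the train/test splits the same way the adapter-training notebook now does.
train_df = pd.read_csv(f"{csv_folder}/adaptor_tutorial_train_split.csv")
val_df = pd.read_csv(f"{csv_folder}/adaptor_tutorial_test_split.csv")
print(f"{len(train_df)} training rows, {len(val_df)} test rows")

Keeping the existence check separate from the reads mirrors the notebooks' own pattern: a clear download instruction is raised before any per-file read can fail with a less helpful error.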