diff --git a/playbook/stories/fine_tune_img.ipynb b/playbook/stories/fine_tune_img.ipynb new file mode 100644 index 00000000..889106d9 --- /dev/null +++ b/playbook/stories/fine_tune_img.ipynb @@ -0,0 +1,116 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Easily fine-tune a ViT with images from Bing search" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use the sliceguard library and Spotlight to fine-tune a ViT model for image classification and detect problematic clusters in a dataset created from Bing search in a few lines of code." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The Imports\n", + "from renumics import spotlight\n", + "from sliceguard.data import create_imagedataset_from_bing\n", + "from sliceguard.models.huggingface import (\n", + " finetune_image_classifier,\n", + " generate_image_pred_probs_embeddings,\n", + ")\n", + "from sliceguard.embeddings import generate_image_embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create an Image Dataset from Bing\n", + "class_names = [\n", + " \"Blue Tang\",\n", + " \"Clownfish\",\n", + " \"Spotted Eagle Ray\",\n", + " \"Longnose Butterfly Fish\",\n", + " \"Moorish Idol\",\n", + " \"Royal Gramma Fish\",\n", + "]\n", + "df = create_imagedataset_from_bing(\n", + " class_names, 25, \"data\", test_split=0.2, license=\"Free to share and use\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Fine-tune a ViT Model with the data (in 1-2 minutes on a GPU)\n", + "finetune_image_classifier(\n", + " df[df[\"split\"] == \"train\"],\n", + " model_name=\"google/vit-base-patch16-224-in21k\",\n", + " output_model_folder=\"./model_folder\",\n", + " epochs=15,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Enrich the DataFrame with Predictions, Probabilities and Embeddings\n", + "df[\"prediction\"], df[\"probs\"], df[\"embeddings\"] = generate_image_pred_probs_embeddings(\n", + " df[\"image\"].values, model_name=\"./model_folder\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check the result and detect problematic clusters\n", + "spotlight.show(\n", + " df, layout=\"https://spotlight.renumics.com/resources/image_classification_v1.0.json\"\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/playbook/stories/navigating_data_issues.ipynb b/playbook/stories/navigating_data_issues.ipynb index 4460adb7..a090c158 100644 --- a/playbook/stories/navigating_data_issues.ipynb +++ b/playbook/stories/navigating_data_issues.ipynb @@ -5,7 +5,7 @@ "metadata": {}, "source": [ "## Navigating Data Issues\n", - "This notebook is part of an article not published, yet. URL tbd" + "This notebook is part of an [article at medium.](https://medium.com/@markus.stoll/navigate-data-issues-d05ae8c45841)\n" ] }, {