diff --git a/examples/notebooks/Tutorial-MultiModal-DeepDive.ipynb b/examples/notebooks/Tutorial-MultiModal-DeepDive.ipynb deleted file mode 100644 index a3aa5e1..0000000 --- a/examples/notebooks/Tutorial-MultiModal-DeepDive.ipynb +++ /dev/null @@ -1,388 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "26705673-b9e8-4d6b-b5b6-a1cf47d1df4d", - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - } - }, - "source": [ - "# TextGrad Tutorials: MultiModal Optimization\n", - "\n", - "![TextGrad](https://github.com/vinid/data/blob/master/logo_full.png?raw=true)\n", - "\n", - "An autograd engine -- for textual gradients!\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/zou-group/TextGrad/blob/main/examples/notebooks/Tutorial-MultiModal-DeepDive.ipynb)\n", - "[![GitHub license](https://img.shields.io/badge/License-MIT-blue.svg)](https://lbesson.mit-license.org/)\n", - "[![Arxiv](https://img.shields.io/badge/arXiv-2406.07496-B31B1B.svg)](https://arxiv.org/abs/2406.07496)\n", - "[![Documentation Status](https://readthedocs.org/projects/textgrad/badge/?version=latest)](https://textgrad.readthedocs.io/en/latest/?badge=latest)\n", - "[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/textgrad)](https://pypi.org/project/textgrad/)\n", - "[![PyPI](https://img.shields.io/pypi/v/textgrad)](https://pypi.org/project/textgrad/)\n", - "\n", - "**Objectives for this tutorial:**\n", - "\n", - "* Explore more multimodal use cases in TextGrad, using a dataset from the literature.\n", - "\n", - "**Requirements:**\n", - "\n", - "* You need an OpenAI API key to run this tutorial. It should be set as the environment variable OPENAI_API_KEY.\n" - ] - }, - { - "cell_type": "markdown", - "id": "f10aa9d1-8482-4db7-97af-fa68782e5a4a", - "metadata": {}, - "source": [ - "## Image Support in TextGrad\n", - "\n", - "We currently support PNG and JPEG images. 
We have a few examples below to show how to use images in TextGrad. If your image is in a different format you should convert it. Here is an example function that \n", - "does that for you. \n", - "\n", - "The way we support images is through the byte format. This is then converted to a Base64 string and sent to the OpenAI/Anthropic API." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "db295b99-e94d-44a9-b904-b1aa9cbb7888", - "metadata": {}, - "outputs": [], - "source": [ - "# Some utils to read images\n", - "\n", - "import io\n", - "from PIL import Image\n", - "\n", - "# \n", - "def encode_image(image):\n", - "    \"\"\"Return the JPEG-encoded bytes of a PIL image.\n", - "\n", - "    JPEG has no alpha channel or palette, so images in the 'RGBA', 'LA',\n", - "    'PA' and 'P' modes are first flattened onto a white RGB background.\n", - "    Images in any other mode are handed to Pillow's JPEG writer unchanged.\n", - "\n", - "    Args:\n", - "        image: a PIL.Image.Image instance.\n", - "\n", - "    Returns:\n", - "        bytes: the JPEG-encoded image data.\n", - "    \"\"\"\n", - "    if image.mode in ('RGBA', 'LA', 'PA', 'P'):\n", - "        # Normalize to RGBA so the alpha band (or palette transparency) can serve as the paste mask.\n", - "        rgba_image = image.convert('RGBA')\n", - "        # Transparent pixels are composited over white.\n", - "        background = Image.new('RGB', rgba_image.size, (255, 255, 255))\n", - "        background.paste(rgba_image, (0, 0), rgba_image)\n", - "        image = background\n", - "\n", - "    # Serialize into an in-memory buffer and return its contents.\n", - "    buffered = io.BytesIO()\n", - "    image.save(buffered, format=\"JPEG\")\n", - "    return buffered.getvalue()" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "24cd0a8e-6b32-4cea-9e0a-3d95260eea49", - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-07T15:52:16.587781196Z", - "start_time": "2024-07-07T15:52:16.174639147Z" - } - }, - "outputs": [], - "source": [ - "import textgrad as tg\n", - "\n", - "# differently from the past tutorials, we now need a multimodal LLM call instead of a standard one!\n", - "from textgrad.autograd import MultimodalLLMCall\n", - "from textgrad.loss import ImageQALoss\n", - "from datasets import load_dataset\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "89c990a4-4784-4c25-9374-c76552d7f974", - "metadata": {}, - "outputs": [ - { - 
"data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from dotenv import load_dotenv\n", - "load_dotenv(\".env\", override=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "403c37ff-618c-4b64-a7fd-bf1f514c79b5", - "metadata": {}, - "outputs": [], - "source": [ - "tg.set_backward_engine(\"gpt-4o\", override=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "d7e06dda-e86f-4ff1-acb6-625934bb54f5", - "metadata": {}, - "outputs": [], - "source": [ - "ds = load_dataset(\"derek-thomas/ScienceQA\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9d226b34-167c-4a31-8649-a9e4a026d257", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 62, - "id": "cb10a125-ab04-40c1-9875-9876a3d7cc11", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Which solution has a higher concentration of blue particles?\n", - "\n", - "-neither; their concentrations are the same\n", - "-Solution B\n", - "-Solution A\n" - ] - } - ], - "source": [ - "target_image = ds[\"train\"][10][\"image\"]\n", - "target_question = ds[\"train\"][\"question\"][10]\n", - "target_options = ds[\"train\"][\"choices\"][10]\n", - "target_options = \"\\n-\".join(target_options)\n", - "target_correct_answer = ds[\"train\"][\"answer\"][10]\n", - "\n", - "question_for_model = f\"{target_question}\\n\\n-{target_options}\"\n", - "print(question_for_model)" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "id": "839654b8-91da-43f7-ba61-b9bff9799d07", - "metadata": {}, - "outputs": [ - { - "data": { - "image/jpeg": "", - "image/png": "", - "text/plain": [ - "" - ] - }, - "execution_count": 63, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "target_image" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - 
"id": "122e4cf1-cd3f-4818-aefa-a706d7c9ea83", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7595d976-ed94-4499-9735-02dbb20d3291", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 69, - "id": "e00c0fef-d1d7-4b2f-8ea9-5547124cc775", - "metadata": {}, - "outputs": [], - "source": [ - "target_image = encode_image(target_image)\n", - "\n", - "image_variable = tg.Variable(target_image, role_description=\"image to answer a question about\", requires_grad=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "id": "4b8dbf50-fb97-4432-b544-0092ffd1b187", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Variable(value=Solution B has a higher concentration of blue particles. Both solutions have the same solvent volume (35 mL), but Solution B contains more blue particles than Solution A., role=response from the language model, grads=set())" - ] - }, - "execution_count": 70, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "question_variable = tg.Variable(question_for_model, role_description=\"question to answer\", requires_grad=False)\n", - "response = MultimodalLLMCall(\"gpt-4o\")([image_variable, question_variable])\n", - "response" - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "id": "d875d5b5-331e-4870-99aa-5eb33667d7da", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Variable(value=The existing answer correctly identifies that Solution B has a higher concentration of blue particles. The reasoning provided is accurate: both solutions have the same solvent volume (35 mL), but Solution B contains more blue particles than Solution A. This indicates a higher concentration of blue particles in Solution B. 
The answer accurately understands the image and provides appropriate knowledge and reasoning logic to address the question., role=evaluation of the response from the language model, grads=set())" - ] - }, - "execution_count": 71, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "loss_fn = ImageQALoss(\n", - " evaluation_instruction=\"Please evaluate the existing answer to the visual scientific problem without solving it yourself. Verify that the answer accurately understands the image, provides appropriate knowledge and reasoning logic to address the question.\",\n", - " engine=\"gpt-4o\"\n", - ")\n", - "loss = loss_fn(question=question_variable, image=image_variable, response=response)\n", - "loss" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fb26a96c-a229-4a99-8c31-49de8aabdf40", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e3c5b32e-92fd-40a8-a35b-3d78fcfc313a", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d8a96941-17b5-49b0-9c3b-e5d9fd6bf229", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "85befc59-bb38-463a-a70c-9e005e447689", - "metadata": {}, - "source": [ - "### Direct PNG" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "34af1d5d-ed95-4676-83d9-9ae27de7d221", - "metadata": {}, - "outputs": [], - "source": [ - "import httpx\n", - "\n", - "image_url = \"https://d2bzx2vuetkzse.cloudfront.net/fit-in/0x450/images_without_background/45ca6024-4bf0-43b8-9a3a-b4a44ecac0bf.png\"\n", - "image_response = httpx.get(image_url)\n", - "# Fail loudly on a 4xx/5xx reply instead of passing an error page to the model as image bytes.\n", - "image_response.raise_for_status()\n", - "image_data = image_response.content" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "c65f1f1b-36b0-47ed-8b28-61fa9269148d", - "metadata": {}, - "outputs": [], - "source": [ - "image_variable = tg.Variable(image_data, role_description=\"image to answer a question about\", requires_grad=False)" 
- ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "ca6a613f-ce72-4e58-b009-49e41af1763a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Variable(value=The image shows a small, brown rodent that appears to be a capybara. Capybaras are the largest rodents in the world and are native to South America. They have a distinctive appearance with a large, barrel-shaped body, short legs, and a blunt snout. This particular capybara is sitting and facing to the right., role=response from the language model, grads=set())" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Ask an open-ended question about the downloaded PNG.\n", - "png_question = \"What do you see in this image?\"\n", - "question_variable = tg.Variable(png_question, role_description=\"question\", requires_grad=False)\n", - "\n", - "multimodal_llm = MultimodalLLMCall(\"gpt-4o\")\n", - "response = multimodal_llm([image_variable, question_variable])\n", - "response" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d3218180-eb0f-47d9-a3b7-309f1e994144", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}