From f44350aab5e44f1689bfea346092ff95d7748a75 Mon Sep 17 00:00:00 2001 From: vinid Date: Sun, 7 Jul 2024 12:12:29 -0400 Subject: [PATCH] support for png and jpgs without having to swap encodings --- README.md | 18 +- .../notebooks/Local-Model-With-LMStudio.ipynb | 6 +- .../Tutorial-MultiModal-DeepDive.ipynb | 388 ++++++++++++++++++ ...Vision.ipynb => Tutorial-MultiModal.ipynb} | 151 +++++-- ...itives.ipynb => Tutorial-Primitives.ipynb} | 90 +++- ...ynb => Tutorial-Prompt-Optimization.ipynb} | 0 requirements.txt | 3 +- setup.py | 4 +- textgrad/autograd/multimodal_ops.py | 22 + textgrad/engine/__init__.py | 20 +- textgrad/engine/anthropic.py | 5 +- textgrad/engine/openai.py | 4 +- 12 files changed, 638 insertions(+), 73 deletions(-) create mode 100644 examples/notebooks/Tutorial-MultiModal-DeepDive.ipynb rename examples/notebooks/{TextGrad-Vision.ipynb => Tutorial-MultiModal.ipynb} (99%) rename examples/notebooks/{Primitives.ipynb => Tutorial-Primitives.ipynb} (87%) rename examples/notebooks/{Prompt-Optimization.ipynb => Tutorial-Prompt-Optimization.ipynb} (100%) diff --git a/README.md b/README.md index 9f4251a..1c832a9 100644 --- a/README.md +++ b/README.md @@ -103,17 +103,19 @@ We have many more examples around how TextGrad can optimize all kinds of variabl ### Tutorials -We have prepared a couple of tutorials to get you started with TextGrad. -You can run them directly in Google Colab by clicking on the links below. +We have prepared a couple of tutorials to get you started with TextGrad. The order of this +tutorial is what we would recommend to follow for a beginner. You can run them directly in Google Colab by clicking on the links below (but +you need an OpenAI/Anthropic key to run the LLMs).
-| Example | Colab Link | -|-------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Introduction to TextGrad Primitives | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/zou-group/TextGrad/blob/main/examples/notebooks/Primitives.ipynb) | -| Optimizing a Code Snippet and Define a New Loss | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/zou-group/textgrad/blob/main/examples/notebooks/Tutorial-Test-Time-Loss-for-Code.ipynb) | -| Prompt Optimization | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/zou-group/TextGrad/blob/main/examples/notebooks/Prompt-Optimization.ipynb) | -| Solution Optimization | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/zou-group/TextGrad/blob/main/examples/notebooks/Tutorial-Solution-Optimization.ipynb) | +| Tutorial | Difficulty | Colab Link | +|----------------------------------------------------|-----------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| 1. Introduction to TextGrad Primitives | ![](https://img.shields.io/badge/Level-Beginner-green.svg) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/zou-group/TextGrad/blob/main/examples/notebooks/Tutorial-Primitives.ipynb) | +| 2. 
Solution Optimization | ![](https://img.shields.io/badge/Level-Beginner-green.svg) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/zou-group/TextGrad/blob/main/examples/notebooks/Tutorial-Solution-Optimization.ipynb) | +| 3. Optimizing a Code Snippet and Define a New Loss | ![](https://img.shields.io/badge/Level-Beginner-green.svg) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/zou-group/textgrad/blob/main/examples/notebooks/Tutorial-Test-Time-Loss-for-Code.ipynb) | +| 4. Prompt Optimization | ![](https://img.shields.io/badge/Level-Intermediate-yellow.svg) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/zou-group/TextGrad/blob/main/examples/notebooks/Tutorial-Prompt-Optimization.ipynb) | +| 5. MultiModal Optimization | ![](https://img.shields.io/badge/Level-Beginner-green.svg) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/zou-group/TextGrad/blob/main/examples/notebooks/Tutorial-MultiModal.ipynb) |
diff --git a/examples/notebooks/Local-Model-With-LMStudio.ipynb b/examples/notebooks/Local-Model-With-LMStudio.ipynb index 977f148..0fdca0d 100644 --- a/examples/notebooks/Local-Model-With-LMStudio.ipynb +++ b/examples/notebooks/Local-Model-With-LMStudio.ipynb @@ -182,7 +182,7 @@ ], "metadata": { "kernelspec": { - "display_name": "textgrad", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -196,9 +196,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.9" + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/examples/notebooks/Tutorial-MultiModal-DeepDive.ipynb b/examples/notebooks/Tutorial-MultiModal-DeepDive.ipynb new file mode 100644 index 0000000..a3aa5e1 --- /dev/null +++ b/examples/notebooks/Tutorial-MultiModal-DeepDive.ipynb @@ -0,0 +1,388 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "26705673-b9e8-4d6b-b5b6-a1cf47d1df4d", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "source": [ + "# TextGrad Tutorials: MultiModal Optimization\n", + "\n", + "![TextGrad](https://github.com/vinid/data/blob/master/logo_full.png?raw=true)\n", + "\n", + "An autograd engine -- for textual gradients!\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/zou-group/TextGrad/blob/main/examples/notebooks/Tutorial-MultiModal-DeepDive.ipynb)\n", + "[![GitHub license](https://img.shields.io/badge/License-MIT-blue.svg)](https://lbesson.mit-license.org/)\n", + "[![Arxiv](https://img.shields.io/badge/arXiv-2406.07496-B31B1B.svg)](https://arxiv.org/abs/2406.07496)\n", + "[![Documentation Status](https://readthedocs.org/projects/textgrad/badge/?version=latest)](https://textgrad.readthedocs.io/en/latest/?badge=latest)\n", + "[![PyPI - Python 
Version](https://img.shields.io/pypi/pyversions/textgrad)](https://pypi.org/project/textgrad/)\n", + "[![PyPI](https://img.shields.io/pypi/v/textgrad)](https://pypi.org/project/textgrad/)\n", + "\n", + "**Objectives for this tutorial:**\n", + "\n", + "* Explore some more MultiModal cases in TextGrad, using a dataset from the literature.\n", + "\n", + "**Requirements:**\n", + "\n", + "* You need to have an OpenAI API key to run this tutorial. This should be set as an environment variable as OPENAI_API_KEY.\n" + ] + }, + { + "cell_type": "markdown", + "id": "f10aa9d1-8482-4db7-97af-fa68782e5a4a", + "metadata": {}, + "source": [ + "## Image Support in TextGrad\n", + "\n", + "We currently support PNG and JPEG images. We have a few examples below to show how to use images in TextGrad. If your image is in a different format you should convert it. Here is an example function that \n", + "does that for you. \n", + "\n", + "The way we support images is through the byte format. This is then converted to a Base64 string and sent to the OpenAI/Anthropic API." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "db295b99-e94d-44a9-b904-b1aa9cbb7888", + "metadata": {}, + "outputs": [], + "source": [ + "# Some utils to read images\n", + "\n", + "import io\n", + "from PIL import Image\n", + "\n", + "# \n", + "def encode_image(image):\n", + " # Convert RGBA to RGB if necessary\n", + " if image.mode == 'RGBA':\n", + " # Create a new image with a white background\n", + " background = Image.new('RGB', image.size, (255, 255, 255))\n", + " # Paste the image on the background.\n", + " background.paste(image, (0, 0), image)\n", + " image = background\n", + "\n", + " # Create a BytesIO object\n", + " buffered = io.BytesIO()\n", + "\n", + " # Save your image object to this BytesIO object (in JPEG format)\n", + " image.save(buffered, format=\"JPEG\")\n", + "\n", + " # Get the byte data from the BytesIO object\n", + " image_byte_data = buffered.getvalue()\n", + " return image_byte_data" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "24cd0a8e-6b32-4cea-9e0a-3d95260eea49", + "metadata": { + "ExecuteTime": { + "end_time": "2024-07-07T15:52:16.587781196Z", + "start_time": "2024-07-07T15:52:16.174639147Z" + } + }, + "outputs": [], + "source": [ + "import textgrad as tg\n", + "\n", + "# differently from the past tutorials, we now need a multimodal LLM call instead of a standard one!\n", + "from textgrad.autograd import MultimodalLLMCall\n", + "from textgrad.loss import ImageQALoss\n", + "from datasets import load_dataset\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "89c990a4-4784-4c25-9374-c76552d7f974", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from dotenv import load_dotenv\n", + "load_dotenv(\".env\", override=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "403c37ff-618c-4b64-a7fd-bf1f514c79b5", + 
"metadata": {}, + "outputs": [], + "source": [ + "tg.set_backward_engine(\"gpt-4o\", override=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "d7e06dda-e86f-4ff1-acb6-625934bb54f5", + "metadata": {}, + "outputs": [], + "source": [ + "ds = load_dataset(\"derek-thomas/ScienceQA\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d226b34-167c-4a31-8649-a9e4a026d257", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "cb10a125-ab04-40c1-9875-9876a3d7cc11", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Which solution has a higher concentration of blue particles?\n", + "\n", + "-neither; their concentrations are the same\n", + "-Solution B\n", + "-Solution A\n" + ] + } + ], + "source": [ + "target_image = ds[\"train\"][10][\"image\"]\n", + "target_question = ds[\"train\"][\"question\"][10]\n", + "target_options = ds[\"train\"][\"choices\"][10]\n", + "target_options = \"\\n-\".join(target_options)\n", + "target_correct_answer = ds[\"train\"][\"answer\"][10]\n", + "\n", + "question_for_model = f\"{target_question}\\n\\n-{target_options}\"\n", + "print(question_for_model)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "839654b8-91da-43f7-ba61-b9bff9799d07", + "metadata": {}, + "outputs": [ + { + "data": { + "image/jpeg": "", + "image/png": "", + "text/plain": [ + "" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_image" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "122e4cf1-cd3f-4818-aefa-a706d7c9ea83", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7595d976-ed94-4499-9735-02dbb20d3291", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": 
"e00c0fef-d1d7-4b2f-8ea9-5547124cc775", + "metadata": {}, + "outputs": [], + "source": [ + "target_image = encode_image(target_image)\n", + "\n", + "image_variable = tg.Variable(target_image, role_description=\"image to answer a question about\", requires_grad=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "4b8dbf50-fb97-4432-b544-0092ffd1b187", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Variable(value=Solution B has a higher concentration of blue particles. Both solutions have the same solvent volume (35 mL), but Solution B contains more blue particles than Solution A., role=response from the language model, grads=set())" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "question_variable = tg.Variable(question_for_model, role_description=\"question to answer\", requires_grad=False)\n", + "response = MultimodalLLMCall(\"gpt-4o\")([image_variable, question_variable])\n", + "response" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "d875d5b5-331e-4870-99aa-5eb33667d7da", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Variable(value=The existing answer correctly identifies that Solution B has a higher concentration of blue particles. The reasoning provided is accurate: both solutions have the same solvent volume (35 mL), but Solution B contains more blue particles than Solution A. This indicates a higher concentration of blue particles in Solution B. The answer accurately understands the image and provides appropriate knowledge and reasoning logic to address the question., role=evaluation of the response from the language model, grads=set())" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loss_fn = ImageQALoss(\n", + " evaluation_instruction=\"Please evaluate the existing answer to the visual scientific problem without solving it yourself. 
Verify that the answer accurately understands the image, provides appropriate knowledge and reasoning logic to address the question.\",\n", + " engine=\"gpt-4o\"\n", + ")\n", + "loss = loss_fn(question=question_variable, image=image_variable, response=response)\n", + "loss" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb26a96c-a229-4a99-8c31-49de8aabdf40", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e3c5b32e-92fd-40a8-a35b-3d78fcfc313a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8a96941-17b5-49b0-9c3b-e5d9fd6bf229", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "85befc59-bb38-463a-a70c-9e005e447689", + "metadata": {}, + "source": [ + "### Direct PNG" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "34af1d5d-ed95-4676-83d9-9ae27de7d221", + "metadata": {}, + "outputs": [], + "source": [ + "import httpx\n", + "\n", + "image_url = \"https://d2bzx2vuetkzse.cloudfront.net/fit-in/0x450/images_without_background/45ca6024-4bf0-43b8-9a3a-b4a44ecac0bf.png\"\n", + "image_data = httpx.get(image_url).content" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "c65f1f1b-36b0-47ed-8b28-61fa9269148d", + "metadata": {}, + "outputs": [], + "source": [ + "image_variable = tg.Variable(image_data, role_description=\"image to answer a question about\", requires_grad=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "ca6a613f-ce72-4e58-b009-49e41af1763a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Variable(value=The image shows a small, brown rodent that appears to be a capybara. Capybaras are the largest rodents in the world and are native to South America. They have a distinctive appearance with a large, barrel-shaped body, short legs, and a blunt snout. 
This particular capybara is sitting and facing to the right., role=response from the language model, grads=set())" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "question_variable = tg.Variable(\"What do you see in this image?\", role_description=\"question\", requires_grad=False)\n", + "response = MultimodalLLMCall(\"gpt-4o\")([image_variable, question_variable])\n", + "response" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3218180-eb0f-47d9-a3b7-309f1e994144", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/notebooks/TextGrad-Vision.ipynb b/examples/notebooks/Tutorial-MultiModal.ipynb similarity index 99% rename from examples/notebooks/TextGrad-Vision.ipynb rename to examples/notebooks/Tutorial-MultiModal.ipynb index 8f8fa62..c011294 100644 --- a/examples/notebooks/TextGrad-Vision.ipynb +++ b/examples/notebooks/Tutorial-MultiModal.ipynb @@ -1,20 +1,35 @@ { "cells": [ { - "cell_type": "code", - "execution_count": 1, - "id": "6e5bfa16-5124-452c-bc56-3427e453751a", + "cell_type": "markdown", + "id": "0023a2ae-72fe-490b-b715-4dddb2539c38", "metadata": {}, - "outputs": [], "source": [ - "%load_ext autoreload\n", + "# TextGrad Tutorials: MultiModal Optimization\n", + "\n", + "![TextGrad](https://github.com/vinid/data/blob/master/logo_full.png?raw=true)\n", + "\n", + "An autograd engine -- for textual gradients!\n", + "\n", + "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/zou-group/TextGrad/blob/main/examples/notebooks/Tutorial-MultiModal.ipynb)\n", + "[![GitHub license](https://img.shields.io/badge/License-MIT-blue.svg)](https://lbesson.mit-license.org/)\n", + "[![Arxiv](https://img.shields.io/badge/arXiv-2406.07496-B31B1B.svg)](https://arxiv.org/abs/2406.07496)\n", + "[![Documentation Status](https://readthedocs.org/projects/textgrad/badge/?version=latest)](https://textgrad.readthedocs.io/en/latest/?badge=latest)\n", + "[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/textgrad)](https://pypi.org/project/textgrad/)\n", + "[![PyPI](https://img.shields.io/pypi/v/textgrad)](https://pypi.org/project/textgrad/)\n", + "\n", + "**Objectives for this tutorial:**\n", + "\n", + "* Introduce you to multimodal optimization with TextGrad\n", "\n", - "%autoreload 2" + "**Requirements:**\n", + "\n", + "* You need to have an OpenAI API key to run this tutorial. 
This should be set as an environment variable as OPENAI_API_KEY.\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "8dd3140c-45d0-478e-b184-ec5faed66964", "metadata": {}, "outputs": [], @@ -48,26 +63,48 @@ }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 2, "id": "3ca4c62b-2d83-412d-b410-0ed5272a6f06", "metadata": {}, "outputs": [], "source": [ "import textgrad as tg\n", + "\n", + "# differently from the past tutorials, we now need a multimodal LLM call instead of a standard one!\n", "from textgrad.autograd import MultimodalLLMCall\n", "from textgrad.loss import ImageQALoss" ] }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 6, + "id": "2b06474c-491d-48ff-aef1-62cb0e525473", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from dotenv import load_dotenv\n", + "load_dotenv(\".env\", override=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, "id": "d1986a01-3afd-46a8-a99e-e977b1141768", "metadata": {}, "outputs": [], "source": [ - "from dotenv import load_dotenv\n", - "load_dotenv(override=True)\n", - "tg.set_backward_engine(\"claude-3-haiku-20240307\")" + "tg.set_backward_engine(\"gpt-4o\")" ] }, { @@ -78,29 +115,71 @@ "# Simply answering questions about images" ] }, + { + "cell_type": "markdown", + "id": "efa853c2-2703-4304-a9c5-a3bde675b532", + "metadata": {}, + "source": [ + "We now download an image from the web." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "950b3502-97ce-4581-87c7-4c47421beafc", + "metadata": {}, + "outputs": [], + "source": [ + "import httpx\n", + "\n", + "image_url = \"https://upload.wikimedia.org/wikipedia/commons/a/a7/Camponotus_flavomarginatus_ant.jpg\"\n", + "image_data = httpx.get(image_url).content" + ] + }, + { + "cell_type": "markdown", + "id": "1ad925fa-c1e6-482d-af4c-df6f8dcb2c2f", + "metadata": {}, + "source": [ + "As usual, in TextGrad we now have to transform our object of interest into a Variable object. In the previous tutorials, we did this with text data; now we are going to do it with images." + ] + }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 11, "id": "e0629de4-9fcf-4df4-9316-cc86455929e6", "metadata": {}, + "outputs": [], + "source": [ + "image_variable = tg.Variable(image_data, role_description=\"image to answer a question about\", requires_grad=False)" + ] + }, + { + "cell_type": "markdown", + "id": "1fcbfaaf-8aa4-4bfa-82af-bf5b7aef0f94", + "metadata": {}, + "source": [ + "Let's now ask a question!" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "1d940cf3-a461-43f4-bc4a-6103589b159e", + "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Variable(value=This image shows a close-up of an ant. The ant appears to be black and is standing on a surface, possibly the ground. The image is highly detailed, showing the ant's body segments, legs, and antennae. The background is blurred, which helps to focus attention on the ant., role=response from the language model, grads=set())" + "Variable(value=This image shows a close-up of an ant. The ant appears to be black and is standing on a surface, possibly a ground or a floor. The image is highly detailed, showing the ant's body segments, legs, and antennae. 
The background is blurred, which helps to focus attention on the ant., role=response from the language model, grads=set())" ] }, - "execution_count": 89, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import httpx\n", - "\n", - "image_url = \"https://upload.wikimedia.org/wikipedia/commons/a/a7/Camponotus_flavomarginatus_ant.jpg\"\n", - "image_data = httpx.get(image_url).content\n", - "image_variable = tg.Variable(image_data, role_description=\"image to answer a question about\", requires_grad=False)\n", "question_variable = tg.Variable(\"What do you see in this image?\", role_description=\"question\", requires_grad=False)\n", "response = MultimodalLLMCall(\"gpt-4o\")([image_variable, question_variable])\n", "response" @@ -131,39 +210,37 @@ }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 15, "id": "29affc0a-cedc-40fd-bec4-6bf5178409cf", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Variable(value=This answer, while providing some accurate information, falls short of being a complete and good response for the image. Let's critically evaluate it:\n", - "\n", - "1. Incompleteness: The answer fails to mention several key details visible in the image. For instance, it doesn't describe the ant's posture (reared up on its hind legs), the texture of its exoskeleton, or the fine hairs visible on its body.\n", + "Variable(value=The provided answer is quite detailed and covers many aspects of the image. However, there are a few points that could be improved or clarified:\n", "\n", - "2. Lack of precision: The description of the ant as \"black\" is imprecise. The ant appears to have a dark, metallic sheen that could be better described as gunmetal or dark gray.\n", + "1. **Species Identification**: The answer mentions \"likely a species of black ant,\" which is a bit vague. 
While it's understandable that the exact species might not be identifiable, it could be better to simply state that it is a black ant without speculating on the species.\n", "\n", - "3. Missing context: The answer doesn't comment on the exceptional quality of the macro photography, which is a significant aspect of this image.\n", + "2. **Surface Description**: The answer states the ant is on a \"textured surface, possibly concrete or soil.\" This is a reasonable guess, but it could be more concise by just mentioning a textured surface without speculating on the material.\n", "\n", - "4. Overlooked details: The response fails to mention the ant's mandibles, which are clearly visible and an important feature of the image.\n", + "3. **Ant's Posture**: The description of the ant's posture as \"alert or defensive\" is speculative. While the ant's posture is indeed notable, it might be better to describe it without attributing a specific behavior unless it is clearly evident.\n", "\n", - "5. Lack of depth: There's no attempt to describe the ant's behavior or posture, which appears to be in an alert or defensive stance.\n", + "4. **Background Description**: The explanation of the background being blurred due to a shallow depth of field is accurate and well-explained.\n", "\n", - "6. Vague background description: While the answer mentions a blurred background, it doesn't describe the colors visible (reddish and green tones), which contribute to the overall composition.\n", + "5. **Detail Description**: The mention of the texture of the ant's body, the shine on its exoskeleton, and the fine hairs on its legs is excellent and adds to the vividness of the description.\n", "\n", - "7. 
Surface description: The answer is uncertain about the surface the ant is on, when it's clearly a textured, light-colore, role=evaluation of the response from the language model, grads=set())" + "Overall, the answer is comprehensive and well-articulated but could benefit from slightly less speculation and more straightforward descriptions., role=evaluation of the response from the language model, grads=set())" ] }, - "execution_count": 90, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "loss_fn = ImageQALoss(\n", - " evaluation_instruction=\"Does this seem like a complete and good answer for the image? Criticize.\",\n", - " engine=\"claude-3-5-sonnet-20240620\"\n", + " evaluation_instruction=\"Does this seem like a complete and good answer for the image? Criticize. Do not provide a new answer.\",\n", + " engine=\"gpt-4o\"\n", ")\n", "loss = loss_fn(question=question_variable, image=image_variable, response=response)\n", "loss" @@ -171,7 +248,7 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 16, "id": "38c2d4ff-1458-459d-8915-3d1a254564fb", "metadata": {}, "outputs": [ @@ -179,9 +256,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "This image showcases an exceptional close-up view of a metallic, gunmetal-gray ant standing alert on a textured, light-colored surface. The ant's exoskeleton has a striking, almost iridescent sheen, and its body is covered in fine hairs that are clearly visible. The ant's prominent mandibles are positioned in a defensive stance, with the creature reared up on its hind legs, conveying a sense of vigilance and aggression.\n", - "\n", - "The exceptional macro-level detail and focus of the photography isolates the ant and draws the viewer's attention to its intricate features. The blurred, reddish and green-toned background further emphasizes the ant, creating a dramatic, almost cinematic quality to the image. 
This close-up perspective provides a rare glimpse into the biology and behavior of this small but fascinating creature, revealing insights into its role within the broader natural world.\n" + "This image shows a close-up of a black ant, captured using macro photography. The ant is standing on a textured surface. The image is highly detailed, showcasing the ant's body segments, legs, and antennae with great clarity. The ant's head is raised, and its antennae are extended. The background is blurred, employing a shallow depth of field to focus attention on the ant and highlight its intricate details. The texture of the ant's body segments, the shine on its exoskeleton, and the fine hairs on its legs are all clearly visible, adding to the vividness of the image.\n" ] } ], @@ -225,7 +300,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.6" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/examples/notebooks/Primitives.ipynb b/examples/notebooks/Tutorial-Primitives.ipynb similarity index 87% rename from examples/notebooks/Primitives.ipynb rename to examples/notebooks/Tutorial-Primitives.ipynb index 46b6c6f..6ffb8f5 100644 --- a/examples/notebooks/Primitives.ipynb +++ b/examples/notebooks/Tutorial-Primitives.ipynb @@ -64,7 +64,10 @@ "cell_type": "markdown", "id": "8887fbed36c7daf2", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "## Introduction: Variable\n", @@ -89,7 +92,10 @@ "end_time": "2024-06-11T15:43:17.669096228Z", "start_time": "2024-06-11T15:43:17.665325560Z" }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [], "source": [ @@ -105,7 +111,10 @@ "end_time": "2024-06-11T15:43:18.184004948Z", "start_time": "2024-06-11T15:43:18.178187640Z" }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [ { @@ -127,7 +136,10 @@ "cell_type": 
"markdown", "id": "63f6a6921a1cce6a", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "## Introduction: Engine\n", @@ -144,7 +156,10 @@ "end_time": "2024-06-11T15:44:32.606319032Z", "start_time": "2024-06-11T15:44:32.561460448Z" }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [], "source": [ @@ -155,7 +170,10 @@ "cell_type": "markdown", "id": "33c7d6eaa115cd6a", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "This object behaves like you would expect an LLM to behave: You can sample generation from the engine using the `generate` method. " @@ -170,7 +188,10 @@ "end_time": "2024-06-11T17:29:41.108552705Z", "start_time": "2024-06-11T17:29:40.294256814Z" }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [ { @@ -192,7 +213,10 @@ "cell_type": "markdown", "id": "b627edc07c0d3737", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "## Introduction: Loss\n", @@ -209,7 +233,10 @@ "end_time": "2024-06-11T15:44:32.894722136Z", "start_time": "2024-06-11T15:44:32.890708561Z" }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [], "source": [ @@ -221,7 +248,10 @@ "cell_type": "markdown", "id": "ff137c99e0659dcc", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [] }, @@ -229,7 +259,10 @@ "cell_type": "markdown", "id": "6f05ec2bf907b3ba", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "## Introduction: Optimizer\n", @@ -248,7 +281,10 @@ "end_time": "2024-06-11T15:44:33.741130951Z", "start_time": "2024-06-11T15:44:33.734977769Z" }, - "collapsed": false + "collapsed": false, + 
"jupyter": { + "outputs_hidden": false + } }, "outputs": [], "source": [ @@ -259,7 +295,10 @@ "cell_type": "markdown", "id": "d26883eb74ce0d01", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "## Putting it all together\n", @@ -276,7 +315,10 @@ "end_time": "2024-06-11T15:44:41.730132530Z", "start_time": "2024-06-11T15:44:34.997777872Z" }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [], "source": [ @@ -294,7 +336,10 @@ "end_time": "2024-06-11T15:44:41.738985151Z", "start_time": "2024-06-11T15:44:41.731989729Z" }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [ { @@ -316,7 +361,10 @@ "cell_type": "markdown", "id": "6a8aab93b80fb82c", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "While here it is not going to be useful, we can also do multiple optimization steps in a loop! Do not forget to reset the gradients after each step!" 
@@ -330,7 +378,10 @@ "ExecuteTime": { "start_time": "2024-06-11T15:44:30.989940227Z" }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [], "source": [ @@ -342,7 +393,10 @@ "execution_count": null, "id": "a3a84aad4cd58737", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [], "source": [] diff --git a/examples/notebooks/Prompt-Optimization.ipynb b/examples/notebooks/Tutorial-Prompt-Optimization.ipynb similarity index 100% rename from examples/notebooks/Prompt-Optimization.ipynb rename to examples/notebooks/Tutorial-Prompt-Optimization.ipynb diff --git a/requirements.txt b/requirements.txt index 82b52e6..e7d7d33 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,5 @@ platformdirs>=3.11.0 datasets>=2.14.6 diskcache>=5.6.3 graphviz>=0.20.3 -gdown>=5.2.0 \ No newline at end of file +gdown>=5.2.0 +pillow \ No newline at end of file diff --git a/setup.py b/setup.py index 9606119..0fad726 100644 --- a/setup.py +++ b/setup.py @@ -8,9 +8,9 @@ setup( name="textgrad", - version="0.1.3", + version="0.1.4", description="", - python_requires=">=3.8", + python_requires=">=3.9", classifiers=[ "Development Status :: 2 - Pre-Alpha", "Intended Audience :: Developers", diff --git a/textgrad/autograd/multimodal_ops.py b/textgrad/autograd/multimodal_ops.py index 6cd68a6..f6f2f35 100644 --- a/textgrad/autograd/multimodal_ops.py +++ b/textgrad/autograd/multimodal_ops.py @@ -19,6 +19,14 @@ class MultimodalLLMCall(Function): + """The MultiModalLM call function. This function will call the LLM with the input (image) and return the response, + also register the grad_fn for backpropagation. + + :param engine: engine to use for the LLM call + :type engine: EngineLM + :param system_prompt: system prompt to use for the LLM call, default depends on the engine. 
+ :type system_prompt: Variable, optional + """ def __init__(self, engine: Union[str, EngineLM], system_prompt: Variable = None): @@ -34,6 +42,20 @@ def __init__(self, def forward(self, inputs: List[Variable], response_role_description: str = VARIABLE_OUTPUT_DEFAULT_ROLE) -> Variable: + """ + Forward pass for the multimodal LLM call function. + + :param inputs: list of input variables to the multimodal LLM call. One is an image and the second one is text + :type inputs: List[Variable] + :param response_role_description: role description for the response variable + :type response_role_description: str, optional + + >>> from textgrad import Variable, get_engine + >>> from textgrad.autograd import MultimodalLLMCall + >>> target_image = "A byte representation of the image" + >>> question_variable = Variable("What do you see here?", role_description="question to answer", requires_grad=False) + >>> response = MultimodalLLMCall("gpt-4o")([target_image, question_variable]) + """ # First ensure that all keys are present in the fields # Assert that all variables are either strings or bytes diff --git a/textgrad/engine/__init__.py b/textgrad/engine/__init__.py index 2eebcaf..b07aff5 100644 --- a/textgrad/engine/__init__.py +++ b/textgrad/engine/__init__.py @@ -49,4 +49,22 @@ def get_engine(engine_name: str, **kwargs) -> EngineLM: from .cohere import ChatCohere return ChatCohere(model_string=engine_name, **kwargs) else: - raise ValueError(f"Engine {engine_name} not supported") \ No newline at end of file + raise ValueError(f"Engine {engine_name} not supported") + + +def is_jpeg(data): + jpeg_signature = b'\xFF\xD8\xFF' + return data.startswith(jpeg_signature) + +def is_png(data): + png_signature = b'\x89\x50\x4E\x47\x0D\x0A\x1A\x0A' + return data.startswith(png_signature) + + +def get_image_type_from_bytes(data): + if is_jpeg(data): + return "jpeg" + elif is_png(data): + return "png" + else: + raise ValueError("Image type not supported, only jpeg and png supported.") \ No 
newline at end of file diff --git a/textgrad/engine/anthropic.py b/textgrad/engine/anthropic.py index b2a8f6d..2e90d8b 100644 --- a/textgrad/engine/anthropic.py +++ b/textgrad/engine/anthropic.py @@ -14,6 +14,7 @@ import json from typing import List, Union from .base import EngineLM, CachedEngine +from textgrad.engine import get_image_type_from_bytes class ChatAnthropic(EngineLM, CachedEngine): SYSTEM_PROMPT = "You are a helpful, creative, and smart assistant." @@ -83,7 +84,9 @@ def _format_content(self, content: List[Union[str, bytes]]) -> List[dict]: formatted_content = [] for item in content: if isinstance(item, bytes): - image_media_type = "image/jpeg" + image_type = get_image_type_from_bytes(item) + + image_media_type = f"image/{image_type}" base64_image = base64.b64encode(item).decode('utf-8') formatted_content.append( { "type": "image", diff --git a/textgrad/engine/openai.py b/textgrad/engine/openai.py index 0e755a9..f0ab2f3 100644 --- a/textgrad/engine/openai.py +++ b/textgrad/engine/openai.py @@ -13,6 +13,7 @@ wait_random_exponential, ) from typing import List, Union +from textgrad.engine import get_image_type_from_bytes from .base import EngineLM, CachedEngine @@ -92,11 +93,12 @@ def _format_content(self, content: List[Union[str, bytes]]) -> List[dict]: formatted_content = [] for item in content: if isinstance(item, bytes): + image_type = get_image_type_from_bytes(item) base64_image = base64.b64encode(item).decode('utf-8') formatted_content.append({ "type": "image_url", "image_url": { - "url": f"data:image/jpeg;base64,{base64_image}" + "url": f"data:image/{image_type};base64,{base64_image}" } }) elif isinstance(item, str):