From 7d090bf4ac4cc76866e4c1dceb9605158b527105 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9F=A9=E5=AE=8F=E7=82=9C?=
 <hanhongwei@hanhongweideMacBook-Pro.local>
Date: Wed, 29 Mar 2023 17:28:00 +0800
Subject: [PATCH 1/6] add gitignore; load .env and read server host from HOST_NAME

---
 .gitignore        | 2 ++
 visual_chatgpt.py | 5 ++++-
 2 files changed, 6 insertions(+), 1 deletion(-)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..13d4ac58
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.env
+.idea/
\ No newline at end of file
diff --git a/visual_chatgpt.py b/visual_chatgpt.py
index e9614894..01190ab0 100644
--- a/visual_chatgpt.py
+++ b/visual_chatgpt.py
@@ -1,4 +1,6 @@
 import os
+from dotenv import load_dotenv
+load_dotenv()
 import gradio as gr
 import random
 import torch
@@ -25,6 +27,7 @@
 from langchain.chains.conversation.memory import ConversationBufferMemory
 from langchain.llms.openai import OpenAI
 
+
 VISUAL_CHATGPT_PREFIX = """Visual ChatGPT is designed to be able to assist with a wide range of text and visual related tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. Visual ChatGPT is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.
 
 Visual ChatGPT is able to process and understand large amounts of text and images. As a language model, Visual ChatGPT can not directly read images, but it has a list of tools to finish different visual tasks. Each image will have a file name formed as "image/xxx.png", and Visual ChatGPT can invoke different tools to indirectly understand pictures. When talking about images, Visual ChatGPT is very strict to the file name and will never fabricate nonexistent files. When using tools to generate new image files, Visual ChatGPT is also known that the image may not be the same as the user's demand, and will use other visual question answering tools or description tools to observe the real image. Visual ChatGPT is able to use tools in a sequence, and is loyal to the tool observation outputs rather than faking the image content and image file name. It will remember to provide the file name from the last tool observation, if a new image is generated.
@@ -1066,4 +1069,4 @@ def run_image(self, image, state, txt):
         clear.click(bot.memory.clear)
         clear.click(lambda: [], None, chatbot)
         clear.click(lambda: [], None, state)
-        demo.launch(server_name="0.0.0.0", server_port=1015)
+        demo.launch(server_name=os.getenv("HOST_NAME"), server_port=1015)

From ec00912bf08d2e16b757b89d9c7750f1483cc22e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9F=A9=E5=AE=8F=E7=82=9C?=
 <hanhongwei@hanhongweideMacBook-Pro.local>
Date: Wed, 29 Mar 2023 19:30:41 +0800
Subject: [PATCH 2/6] support Chinese

---
 visual_chatgpt.py | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/visual_chatgpt.py b/visual_chatgpt.py
index 01190ab0..b07e6d56 100644
--- a/visual_chatgpt.py
+++ b/visual_chatgpt.py
@@ -27,7 +27,6 @@
 from langchain.chains.conversation.memory import ConversationBufferMemory
 from langchain.llms.openai import OpenAI
 
-
 VISUAL_CHATGPT_PREFIX = """Visual ChatGPT is designed to be able to assist with a wide range of text and visual related tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. Visual ChatGPT is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.
 
 Visual ChatGPT is able to process and understand large amounts of text and images. As a language model, Visual ChatGPT can not directly read images, but it has a list of tools to finish different visual tasks. Each image will have a file name formed as "image/xxx.png", and Visual ChatGPT can invoke different tools to indirectly understand pictures. When talking about images, Visual ChatGPT is very strict to the file name and will never fabricate nonexistent files. When using tools to generate new image files, Visual ChatGPT is also known that the image may not be the same as the user's demand, and will use other visual question answering tools or description tools to observe the real image. Visual ChatGPT is able to use tools in a sequence, and is loyal to the tool observation outputs rather than faking the image content and image file name. It will remember to provide the file name from the last tool observation, if a new image is generated.
@@ -60,7 +59,8 @@
 """
 
 VISUAL_CHATGPT_SUFFIX = """You are very strict to the filename correctness and will never fake a file name if it does not exist.
-You will remember to provide the image file name loyally if it's provided in the last tool observation.
+You will remember to provide the image file name loyally if it's provided in the last tool observation. 
+When using tools, for all the input except the filename (e.g. object, text, user description, and question), you can only describe them in English even if the Human may ask you in other languages (like Chinese). 
 
 Begin!
 
@@ -226,7 +226,7 @@ def __init__(self, device):
              description="useful when you want to remove and object or something from the photo "
                          "from its description or location. "
                          "The input to this tool should be a comma separated string of two, "
-                         "representing the image_path and the object need to be removed. ")
+                         "representing the image_path and the object (in English) need to be removed. ")
     def inference_remove(self, inputs):
         image_path, to_be_removed_txt = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         return self.inference_replace(f"{image_path},{to_be_removed_txt},background")
@@ -235,7 +235,7 @@ def inference_remove(self, inputs):
              description="useful when you want to replace an object from the object description or "
                          "location with another object from its description. "
                          "The input to this tool should be a comma separated string of three, "
-                         "representing the image_path, the object to be replaced, the object to be replaced with ")
+                         "representing the image_path, the object to be replaced (in English), the object to be replaced with (in English) ")
     def inference_replace(self, inputs):
         image_path, to_be_replaced_txt, replace_with_txt = inputs.split(",")
         original_image = Image.open(image_path)
@@ -266,7 +266,7 @@ def __init__(self, device):
              description="useful when you want to the style of the image to be like the text. "
                          "like: make it look like a painting. or make it like a robot. "
                          "The input to this tool should be a comma separated string of two, "
-                         "representing the image_path and the text. ")
+                         "representing the image_path and the text (in English). ")
     def inference(self, inputs):
         """Change style of image."""
         print("===>Starting InstructPix2Pix Inference")
@@ -295,7 +295,7 @@ def __init__(self, device):
     @prompts(name="Generate Image From User Input Text",
              description="useful when you want to generate an image from a user input text and save it to a file. "
                          "like: generate an image of an object or something, or generate an image that includes some objects. "
-                         "The input to this tool should be a string, representing the text used to generate image. ")
+                         "The input to this tool should be a string, representing the text (in English) used to generate image. ")
     def inference(self, text):
         image_filename = os.path.join('image', f"{str(uuid.uuid4())[:8]}.png")
         prompt = text + ', ' + self.a_prompt
@@ -371,7 +371,7 @@ def __init__(self, device):
                          " like: generate a real image of a object or something from this canny image,"
                          " or generate a new real image of a object or something from this edge image. "
                          "The input to this tool should be a comma separated string of two, "
-                         "representing the image_path and the user description. ")
+                         "representing the image_path and the user description (in English). ")
     def inference(self, inputs):
         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
@@ -429,7 +429,7 @@ def __init__(self, device):
                          "like: generate a real image of a object or something from this straight line image, "
                          "or generate a new real image of a object or something from this straight lines. "
                          "The input to this tool should be a comma separated string of two, "
-                         "representing the image_path and the user description. ")
+                         "representing the image_path and the user description (in English). ")
     def inference(self, inputs):
         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
@@ -487,7 +487,7 @@ def __init__(self, device):
                          "like: generate a real image of a object or something from this soft hed boundary image, "
                          "or generate a new real image of a object or something from this hed boundary. "
                          "The input to this tool should be a comma separated string of two, "
-                         "representing the image_path and the user description")
+                         "representing the image_path and the user description (in English)")
     def inference(self, inputs):
         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
@@ -543,7 +543,7 @@ def __init__(self, device):
              description="useful when you want to generate a new real image from both the user description and "
                          "a scribble image or a sketch image. "
                          "The input to this tool should be a comma separated string of two, "
-                         "representing the image_path and the user description")
+                         "representing the image_path and the user description (in English)")
     def inference(self, inputs):
         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
@@ -601,7 +601,7 @@ def __init__(self, device):
                          "like: generate a real image of a human from this human pose image, "
                          "or generate a new real image of a human from this pose. "
                          "The input to this tool should be a comma separated string of two, "
-                         "representing the image_path and the user description")
+                         "representing the image_path and the user description (in English)")
     def inference(self, inputs):
         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
@@ -705,7 +705,7 @@ def __init__(self, device):
                          "like: generate a real image of a object or something from this segmentation image, "
                          "or generate a new real image of a object or something from these segmentations. "
                          "The input to this tool should be a comma separated string of two, "
-                         "representing the image_path and the user description")
+                         "representing the image_path and the user description (in English)")
     def inference(self, inputs):
         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
@@ -764,7 +764,7 @@ def __init__(self, device):
                          "like: generate a real image of a object or something from this depth image, "
                          "or generate a new real image of a object or something from the depth map. "
                          "The input to this tool should be a comma separated string of two, "
-                         "representing the image_path and the user description")
+                         "representing the image_path and the user description (in English)")
     def inference(self, inputs):
         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
@@ -835,7 +835,7 @@ def __init__(self, device):
                          "like: generate a real image of a object or something from this normal map, "
                          "or generate a new real image of a object or something from the normal map. "
                          "The input to this tool should be a comma separated string of two, "
-                         "representing the image_path and the user description")
+                         "representing the image_path and the user description (in English)")
     def inference(self, inputs):
         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
@@ -863,7 +863,7 @@ def __init__(self, device):
     @prompts(name="Answer Question About The Image",
              description="useful when you need an answer for a question based on an image. "
                          "like: what is the background color of the last image, how many cats in this figure, what is in this figure. "
-                         "The input to this tool should be a comma separated string of two, representing the image_path and the question")
+                         "The input to this tool should be a comma separated string of two, representing the image_path and the question (in English)")
     def inference(self, inputs):
         image_path, question = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         raw_image = Image.open(image_path).convert('RGB')

From 2f51440575806f46e33b2dfb4f2f9d6b4a021879 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9F=A9=E5=AE=8F=E7=82=9C?=
 <hanhongwei@hanhongweideMacBook-Pro.local>
Date: Wed, 29 Mar 2023 20:27:35 +0800
Subject: [PATCH 3/6] add run.sh launcher; call gr.close_all() at module load

---
 run.sh            | 7 +++++++
 visual_chatgpt.py | 1 +
 2 files changed, 8 insertions(+)
 create mode 100644 run.sh

diff --git a/run.sh b/run.sh
new file mode 100644
index 00000000..3358bcd4
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,7 @@
+python visual_chatgpt.py --load "ImageCaptioning_cuda:0,ImageEditing_cuda:4,
+    Text2Image_cuda:1,Image2Canny_cpu,CannyText2Image_cuda:0,
+    Image2Depth_cpu,DepthText2Image_cuda:5,VisualQuestionAnswering_cuda:2,
+    InstructPix2Pix_cuda:2,Image2Scribble_cpu,ScribbleText2Image_cuda:2,
+    Image2Seg_cpu,SegText2Image_cuda:6,Image2Pose_cpu,PoseText2Image_cuda:6,
+    Image2Hed_cpu,HedText2Image_cuda:3,Image2Normal_cpu,
+    NormalText2Image_cuda:3,Image2Line_cpu,LineText2Image_cuda:7"
\ No newline at end of file
diff --git a/visual_chatgpt.py b/visual_chatgpt.py
index b07e6d56..c163d13b 100644
--- a/visual_chatgpt.py
+++ b/visual_chatgpt.py
@@ -2,6 +2,7 @@
 from dotenv import load_dotenv
 load_dotenv()
 import gradio as gr
+gr.close_all()
 import random
 import torch
 import cv2

From 60e537f065b88d5270ce28264301ea6b011aa919 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9F=A9=E5=AE=8F=E7=82=9C?=
 <hanhongwei@hanhongweideMacBook-Pro.local>
Date: Wed, 29 Mar 2023 20:30:36 +0800
Subject: [PATCH 4/6] read server port from HOST_PORT env var

---
 visual_chatgpt.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/visual_chatgpt.py b/visual_chatgpt.py
index c163d13b..faf2850b 100644
--- a/visual_chatgpt.py
+++ b/visual_chatgpt.py
@@ -1070,4 +1070,4 @@ def run_image(self, image, state, txt):
         clear.click(bot.memory.clear)
         clear.click(lambda: [], None, chatbot)
         clear.click(lambda: [], None, state)
-        demo.launch(server_name=os.getenv("HOST_NAME"), server_port=1015)
+        demo.launch(server_name=os.getenv("HOST_NAME"), server_port=os.getenv("HOST_PORT"))

From b484de8b093ba4f75fab3e4cf1e983f0f8e672df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9F=A9=E5=AE=8F=E7=82=9C?=
 <hanhongwei@hanhongweideMacBook-Pro.local>
Date: Wed, 29 Mar 2023 20:31:53 +0800
Subject: [PATCH 5/6] cast HOST_PORT to int before passing to demo.launch

---
 visual_chatgpt.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/visual_chatgpt.py b/visual_chatgpt.py
index faf2850b..d6f01542 100644
--- a/visual_chatgpt.py
+++ b/visual_chatgpt.py
@@ -1070,4 +1070,4 @@ def run_image(self, image, state, txt):
         clear.click(bot.memory.clear)
         clear.click(lambda: [], None, chatbot)
         clear.click(lambda: [], None, state)
-        demo.launch(server_name=os.getenv("HOST_NAME"), server_port=os.getenv("HOST_PORT"))
+        demo.launch(server_name=os.getenv("HOST_NAME"), server_port=int(os.getenv("HOST_PORT")))

From 03c103f1af8b9895f27f4770d7c7e2b8c2b38201 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9F=A9=E5=AE=8F=E7=82=9C?=
 <hanhongwei@hanhongweideMacBook-Pro.local>
Date: Fri, 31 Mar 2023 16:34:30 +0800
Subject: [PATCH 6/6] update prompt: require Action Input in English

---
 visual_chatgpt.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/visual_chatgpt.py b/visual_chatgpt.py
index d6f01542..5e59a28d 100644
--- a/visual_chatgpt.py
+++ b/visual_chatgpt.py
@@ -47,7 +47,7 @@
 ```
 Thought: Do I need to use a tool? Yes
 Action: the action to take, should be one of [{tool_names}]
-Action Input: the input to the action
+Action Input: the input to the action (in English)
 Observation: the result of the action
 ```