Commit
Data Scientist Assistant POC based on:
- Google Vertex AI gemini-1.5-flash-001 model
- Dask for distributed execution
- Coiled for cloud-based execution on Google Cloud
- Streamlit for web UI and visualisation
- pandas for reading CSV data files
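As a rough sketch of how these pieces fit together (the file name, cluster size, and aggregation below are illustrative assumptions, not code from this commit):

```python
# Hypothetical wiring of the stack described above: pandas reads the CSV,
# Coiled starts a Dask cluster on Google Cloud, and work is submitted to it.
import coiled
import pandas as pd
from dask.distributed import Client

df = pd.read_csv("fruits.csv")          # placeholder sample dataset

cluster = coiled.Cluster(n_workers=1)   # cloud-backed Dask cluster
client = Client(cluster)                # Dask client connected to it

# Run a simple aggregation on a worker and fetch the result back.
future = client.submit(lambda data: data.groupby("person")["likes"].sum(), df)
print(future.result())
```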
@@ -0,0 +1,62 @@
```python
# Copyright (c) 2024. Sergii Nechuiviter
import requests


class APIException(Exception):
    """Raised when the backend call fails (defined here so the handler below works)."""


def print_help():
    print("You can type your message and press Enter to send it.")
    print("")
    print("If you start your message with the '`' character, you can use some commands.")
    print("You can type '`h' and press Enter to see the help.")
    print("You can type '`q' and press Enter to quit the application.")
    print("You can try to enter '`' and your command in natural language for other system commands.")
    print("")


def process_user_message(message: str):
    return (f'You said: {message}\n'
            f'  Ok...')


def process_command(message: str):
    return (f'Command: {message}\n'
            f'  Command is not supported yet.')


if __name__ == "__main__":
    # Print invitation message
    print("Hello, this is a simple chat application.")
    print("")
    print_help()

    # Main processing loop
    while True:
        # Get user input
        user_input = input("You: ")

        # Check if user wants to quit
        if user_input == '`q':
            break

        # Check if user wants to see help
        if user_input == "`h":
            print_help()
            continue

        try:
            # Check if user wants to send a command
            if user_input.startswith("`"):
                # Send the command to the server
                response = process_command(user_input)
            else:
                # Send the message to the server
                response = process_user_message(user_input)
        except APIException as e:
            print(e)
            continue  # skip printing a response that was never produced

        print(response)

    # Print a final message
    print("Thank you for using the chat application.")
    print("Goodbye!")
```
@@ -0,0 +1,3 @@
```
.streamlit/secrets.toml
env/
env12/
```
@@ -0,0 +1,2 @@
```toml
# here is an example file with secrets
# the real file must not be committed to git
```
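The Streamlit app further down reads `st.secrets["gcs_connections"]` and passes it to `Credentials.from_service_account_info`, so the real (uncommitted) `.streamlit/secrets.toml` presumably holds a service-account section roughly like the sketch below; every value here is a placeholder, not part of this commit:

```toml
# Hypothetical layout of .streamlit/secrets.toml -- placeholders only.
[gcs_connections]
type = "service_account"
project_id = "your-project-id"
private_key_id = "..."
private_key = "-----BEGIN PRIVATE KEY-----\n...\n-----END PRIVATE KEY-----\n"
client_email = "service-account@your-project-id.iam.gserviceaccount.com"
client_id = "..."
token_uri = "https://oauth2.googleapis.com/token"
```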
@@ -0,0 +1,68 @@
For testing the manager, the following questions are used:

<q>How many unique fruits are there in the dataset?</q>
<q>How many fruits does each person like?</q>
<q>Make a chart which shows how many fruits each person likes.</q>
<q>How many different kinds of fruit does each person like?</q>

### Options from Vertex AI itself
<q>What is the average number of fruits liked?</q>
<q>Who likes the most fruits?</q>
<q>Show the distribution of fruits liked by people.</q>

## Example output for the last question:

### Attention! The response includes the \```python fence at the beginning! (A sketch for stripping it follows the example below.)
```python
import pandas as pd
from bokeh.plotting import figure
from bokeh.io import show


def run(data):
    """
    This function analyzes the provided data and returns a descriptive summary.
    Args:
        data (dict): A dictionary containing the data to be analyzed.
            It should have the same keys as <locals> and corresponding values.
    Returns:
        dict: A dictionary containing the results of the analysis.
            'results' key contains the descriptive summary of the dataset.
            'chart' key contains the plot of the distribution of fruit likes per person.
    """

    df = data['df_table']  # Access the DataFrame from the data dictionary

    # Group by person and count likes
    person_likes = df.groupby('person')['likes'].sum()

    # Create a Bokeh figure
    p = figure(title="Distribution of Fruit Likes per Person",
               x_axis_label="Person", y_axis_label="Number of Likes")

    # Add a bar chart to the figure
    p.vbar(x=person_likes.index, top=person_likes.values, width=0.5)

    # Show the chart
    show(p)

    # Return the results and chart
    return {
        'results': None,  # No specific results in this case
        'chart': {
            'chart_type': 'vbar',
            'kwargs': {
                'x': person_likes.index,
                'top': person_likes.values,
                'width': 0.5,
                'title': "Distribution of Fruit Likes per Person",
                'x_axis_label': "Person",
                'y_axis_label': "Number of Likes"
            }
        }
    }
```
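Because the response arrives wrapped in that Markdown fence, it has to be stripped before the generated code can be executed. A minimal sketch of one way to do it (the helper name and exact handling are assumptions, not part of the commit):

```python
def extract_code(response_text: str) -> str:
    """Strip the leading and trailing Markdown code fences from a model response."""
    lines = response_text.strip().splitlines()
    if lines and lines[0].startswith("```"):
        lines = lines[1:]   # drop the opening fence line, e.g. "```python"
    if lines and lines[-1].strip() == "```":
        lines = lines[:-1]  # drop the closing fence line
    return "\n".join(lines)
```

The remaining text is then plain Python that defines `run(data)` and can be executed locally or shipped to a Dask worker.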
@@ -0,0 +1,28 @@
# Step 5. Put all components together

Now I have code for:
- running the Streamlit app
- uploading a sample dataset
- a basic chat-like Streamlit application
- generating code in response to the user request
- means for remote code execution
- preformatted params to draw charts with Bokeh
  (Bokeh has existed for more than 10 years, while Streamlit is quite young;
  this is probably why Gemini can generate Bokeh code but not Streamlit chart code)

It is now time to put all the components together.

Example requests:
- Draw the number of likes for people and fruits

Environment creation:

```
conda create -p ./env12 -c conda-forge coiled python=3.12 "numpy<2.0.0" streamlit google-cloud-aiplatform dask
```
Known issues:
- Bokeh is not needed on the workers, but it is still provided.
- StreamlitAPIException: Streamlit only supports Bokeh version 2.4.3, but you have version 3.5.0 installed. Please run `pip install --force-reinstall --no-deps bokeh==2.4.3` to install the correct version.
- Starting the Coiled cluster takes 1-2 minutes, but this would be the same for any cluster started with any tool.
- It is hard to get good code out of code generation; it usually works, but the return value is not exactly what was requested.
  This complicates chart generation on the client side (see the rendering sketch below).
- The cluster disconnects and is killed after a set period of time; no reconnection is implemented.
- TODO: some way to clear the whole interface is needed.
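As a concrete illustration of that client-side rendering, here is a minimal sketch of turning the `chart_type`/`kwargs` dict returned by the generated `run(data)` function into a Streamlit chart. `render_chart` is a hypothetical helper, not code from this commit, and `st.bokeh_chart` needs the Bokeh 2.4.3 pin mentioned above:

```python
import streamlit as st
from bokeh.plotting import figure


def render_chart(chart_spec: dict) -> None:
    """Render a spec like {'chart_type': 'vbar', 'kwargs': {...}} with Bokeh in Streamlit."""
    kwargs = dict(chart_spec["kwargs"])

    # Figure-level options go to figure(), not to the glyph call.
    p = figure(
        title=kwargs.pop("title", None),
        x_axis_label=kwargs.pop("x_axis_label", None),
        y_axis_label=kwargs.pop("y_axis_label", None),
        x_range=[str(x) for x in kwargs["x"]],  # categorical axis for person names
    )
    kwargs["x"] = [str(x) for x in kwargs["x"]]

    if chart_spec["chart_type"] == "vbar":
        p.vbar(**kwargs)

    st.bokeh_chart(p)  # requires bokeh==2.4.3 with current Streamlit
```

Splitting the figure-level options (title, axis labels) from the glyph arguments is the design choice here: the model returns them in one flat dict, but Bokeh expects them in different places.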
@@ -0,0 +1,100 @@
```python
import streamlit as st

import vertexai
from vertexai.generative_models import GenerativeModel
import vertexai.preview.generative_models as generative_models
from google.oauth2.service_account import Credentials

from datetime import datetime


st.title("Stub of DS chat with Google Vertex AI")

########################################################################
# init chat model

chat = None

if "vertex_ai_model" not in st.session_state or chat is None:
    st.session_state["vertex_ai_model"] = "gemini-1.5-flash-001"

    credentials = Credentials.from_service_account_info(st.secrets["gcs_connections"])
    vertexai.init(
        project="pivotal-cable-428219-c5",
        location="us-central1",
        credentials=credentials,
    )
    model = GenerativeModel(
        st.session_state[
            "vertex_ai_model"
        ],  # by default it will be "gemini-1.5-flash-001"
        system_instruction=[
            "You are an expert python engineer with data scientist background."
            "If the user's question needs code generation, respond with: <CODE>"
            "If the user's question needs drawing a chart, respond with: <CHART>"
        ],
    )
    chat = model.start_chat()

generation_config = {
    "max_output_tokens": 8192,
    "temperature": 1,
    "top_p": 0.95,
}

safety_settings = {
    generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
    generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
    generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
    generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
}

#####################################


if "messages" not in st.session_state:
    st.session_state.messages = []

for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Streaming is not working with Streamlit: exceptions are raised inside
# `vertexai.generative_models.GenerativeModel`.
USE_STREAMING = False

if prompt := st.chat_input("What is up?"):
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    with st.chat_message("assistant"):
        if USE_STREAMING:
            api_response_stream = chat.send_message(
                [prompt],
                generation_config=generation_config,
                safety_settings=safety_settings,
                stream=True,
            )

            def stream_data():
                for api_response in api_response_stream:
                    chunk = api_response.candidates[0].content.parts[0]._raw_part.text
                    print(f"{datetime.now()}: {chunk}")
                    yield chunk

            response = st.write_stream(stream_data)
        else:
            with st.spinner(
                "Wait for the whole response (streaming not working with Streamlit)..."
            ):
                api_response = chat.send_message(
                    [prompt],
                    generation_config=generation_config,
                    safety_settings=safety_settings,
                    stream=False,
                )
                response = api_response.candidates[0].content.parts[0]._raw_part.text
            st.write(response)

    print(("response:", response))
    st.session_state.messages.append({"role": "assistant", "content": response})
```