diff --git a/backend/database/sql_executor.py b/backend/database/sql_executor.py
index 558cfc5..5c34d8f 100644
--- a/backend/database/sql_executor.py
+++ b/backend/database/sql_executor.py
@@ -23,13 +23,13 @@ def append_df_to_table(self, df: pd.DataFrame, table_name: str):
raise
def create_table_for_data_profile(
- self, org_id: int, table_name: str, column_names_and_types: dict
+ self, org_id: int, table_name: str, column_metadata: dict
):
"""Creates a table for a data profile."""
try:
create_query = (
self.sql_string_manager.generate_create_query_for_data_profile_table(
- table_name, column_names_and_types
+ table_name, column_metadata
)
)
self.session.execute(text(create_query))
diff --git a/backend/database/table_manager.py b/backend/database/table_manager.py
index 9d846c6..4b860aa 100644
--- a/backend/database/table_manager.py
+++ b/backend/database/table_manager.py
@@ -57,14 +57,12 @@ def create_table_for_data_profile(
org_id: int,
table_name: str,
table_alias: str,
- column_names_and_types: dict,
+ column_metadata: dict,
):
"""Creates a table for a data profile."""
try:
executor = SQLExecutor(self.session)
- executor.create_table_for_data_profile(
- org_id, table_name, column_names_and_types
- )
+ executor.create_table_for_data_profile(org_id, table_name, column_metadata)
self._map_table_to_org(org_id, table_name, table_alias)
except Exception as e:
print(f"An error occurred: {e}")
diff --git a/backend/llms/gpt.py b/backend/llms/gpt.py
index 27f7445..00e726d 100644
--- a/backend/llms/gpt.py
+++ b/backend/llms/gpt.py
@@ -368,12 +368,12 @@ async def generate_chart_config(
return parsed_config
- async def generate_suggested_column_types(self, column_names: list, data: dict):
+ async def generate_suggested_column_metadata(self, column_names: list, data: dict):
"""Generate suggested column types for the given data."""
- self._add_system_message(assistant_type="column_type_suggestion")
+ self._add_system_message(assistant_type="column_metadata_suggestion")
self._set_response_format(is_json=True)
- prompt = self.prompt_manager.create_column_type_suggestion_prompt(
+ prompt = self.prompt_manager.create_column_metadata_suggestion_prompt(
column_names, data
)
diff --git a/backend/llms/prompt_manager.py b/backend/llms/prompt_manager.py
index ef202a2..c138002 100644
--- a/backend/llms/prompt_manager.py
+++ b/backend/llms/prompt_manager.py
@@ -123,10 +123,10 @@ def jpg_data_extraction_prompt(self, instructions: str):
"""
return prompt
- def create_column_type_suggestion_prompt(self, column_names, data):
+ def create_column_metadata_suggestion_prompt(self, column_names, data):
prompt = f"""
- Based on the following data, suggest the data types for each column in the table.
- The available column types are: text, integer, money, date, boolean
+ Based on the following data, suggest the data types for each column in the table and indicate which column should be a primary key.
+ The available data types are: text, integer, money, date, boolean.
Column names:
{column_names}
@@ -134,6 +134,17 @@ def create_column_type_suggestion_prompt(self, column_names, data):
Data:
{data}
- Return a JSON with the column names as keys and the suggested data types as values.
+ Return a JSON object where each key is a column name.
+ For each key, provide an object specifying 'data_type' and 'primary_key' status (a boolean indicating whether the column is a primary key).
+
+ Example output:
+ {{
+ "client_name": {{ "data_type": "text", "primary_key": false }},
+ "net_amount": {{ "data_type": "money", "primary_key": true }},
+ "gross_amount": {{ "data_type": "money", "primary_key": false }},
+ "date": {{ "data_type": "date", "primary_key": false }}
+ }}
+
+ If no column appears to be a primary key, set the 'primary_key' value to false for all columns.
"""
return prompt
diff --git a/backend/llms/system_message_manager.py b/backend/llms/system_message_manager.py
index 4a1f83b..7e56f18 100644
--- a/backend/llms/system_message_manager.py
+++ b/backend/llms/system_message_manager.py
@@ -4,9 +4,9 @@ def __init__(self):
"analytics_chat": """
You are an analytics assistant.
You will be generating SQL queries, and providing useful information for reports and analytics based on the given prompt.""",
- "column_type_suggestion": """
- You are a column type suggestion assistant.
- You will be suggesting column data types based on the given prompt.
+ "column_metadata_suggestion": """
+ You are a column metadata suggestion assistant.
+ You will be suggesting column data types and primary keys based on the given prompt.
""",
"sql_code": """
You are a PostgreSQL SQL statement assistant.
diff --git a/backend/models/data_profile.py b/backend/models/data_profile.py
index 0e8fdd0..0244395 100644
--- a/backend/models/data_profile.py
+++ b/backend/models/data_profile.py
@@ -1,3 +1,5 @@
+from typing import Dict, Union
+
from pydantic import BaseModel
from sqlalchemy import Column, ForeignKey, Integer, String, UniqueConstraint
@@ -45,9 +47,20 @@ def to_dict(self):
class DataProfileCreateRequest(BaseModel):
+ """
+ DataProfileCreateRequest Model
+ ------------------------------
+ This class represents the request body for creating a new data profile.
+ Attributes:
+ - name: The name of the data profile.
+ - extract_instructions: The instructions for extracting data from the file.
+ - column_metadata: A dictionary where each key is a column name and each value is another dictionary specifying the attributes of the column.
+ The inner dictionary includes 'data_type' and 'primary_key' fields.
+ """
+
name: str
extract_instructions: str
- column_names_and_types: dict
+ column_metadata: Dict[str, Dict[str, Union[str, bool]]]
class DataProfileCreateResponse(BaseModel):
diff --git a/backend/routes/data_profile_routes.py b/backend/routes/data_profile_routes.py
index 6c3331e..fa0caff 100644
--- a/backend/routes/data_profile_routes.py
+++ b/backend/routes/data_profile_routes.py
@@ -45,7 +45,25 @@ async def get_data_profiles_by_org_id(current_user: User = Depends(get_current_u
async def save_data_profile(
request: DataProfileCreateRequest, current_user: User = Depends(get_current_user)
) -> DataProfileCreateResponse:
- """Save a new data profile to the database"""
+ """
+ Creates a new data profile and saves it to the database.
+
+ This function first validates the name of the data profile, ensuring it is not longer than 50 characters and only contains valid characters for a table name.
+ It then checks if a data profile with the same name already exists for the current user's organization.
+
+ If the validation passes and no duplicate data profile exists, it creates a new table for the data profile using the provided column metadata.
+ It then creates a new data profile with the provided name, extract instructions, and the current user's organization id, and saves it to the database.
+
+ Args:
+ request (DataProfileCreateRequest): The data profile creation request containing the name, extract instructions, and column metadata for the new data profile.
+ current_user (User, optional): The current user. Defaults to the result of `get_current_user()`.
+
+ Raises:
+ HTTPException: If the data profile name is invalid or a data profile with the same name already exists for the current user's organization.
+
+ Returns:
+ DataProfileCreateResponse: The created data profile.
+ """
if len(request.name) > 50:
raise HTTPException(
status_code=400, detail="Data Profile name cannot be longer than 50 chars"
@@ -73,7 +91,7 @@ async def save_data_profile(
org_id=current_user.organization_id,
table_name=table_name,
table_alias=request.name,
- column_names_and_types=request.column_names_and_types,
+ column_metadata=request.column_metadata,
)
# Create the data profile
@@ -160,14 +178,14 @@ async def preview_data_profile(
return extracted_data
-@data_profile_router.post("/data-profiles/preview/column-types/")
-async def generate_suggested_column_types(
+@data_profile_router.post("/data-profiles/preview/column-metadata/")
+async def generate_suggested_column_metadata(
request: SuggestedColumnTypesRequest, current_user: User = Depends(get_current_user)
):
gpt = GPTLLM(chat_id=1, user=current_user)
if request.data:
column_names = list(request.data[0].keys())
- suggested_column_types = await gpt.generate_suggested_column_types(
+ suggested_column_types = await gpt.generate_suggested_column_metadata(
column_names, request.data
)
@@ -248,6 +266,7 @@ async def save_extracted_data(
files: List[UploadFile] = File(...),
current_user: User = Depends(get_current_user),
):
+ """Save the extracted data to the database using the data profile. Save the original files to DigitalOcean Spaces."""
# Get the organization name
with DatabaseManager() as session:
org_manager = OrganizationManager(session)
diff --git a/backend/utils/sql_string_manager.py b/backend/utils/sql_string_manager.py
index a7f1ea2..cd16e48 100644
--- a/backend/utils/sql_string_manager.py
+++ b/backend/utils/sql_string_manager.py
@@ -52,24 +52,30 @@ def map_to_postgres_type(self, column_type: str) -> str:
return type_mapping.get(column_type, "TEXT")
def generate_create_query_for_data_profile_table(
- self, table_name: str, column_names_and_types: dict
+ self, table_name: str, column_metadata: dict
) -> str:
"""
Generates a CREATE TABLE query for a data profile table.
Parameters:
table_name (str): The name of the table.
- column_names_and_types (dict): A dictionary of column names and types.
+ column_metadata (dict): A dictionary of column names, types, and primary key information.
Returns:
str: The CREATE TABLE query.
"""
# Generate the CREATE TABLE query
create_query = f"CREATE TABLE {table_name} ("
- for column_name, column_type in column_names_and_types.items():
- postgres_type = self.map_to_postgres_type(column_type)
+ primary_key = None
+ for column_name, column_info in column_metadata.items():
+ postgres_type = self.map_to_postgres_type(column_info["data_type"])
create_query += f"{column_name} {postgres_type}, "
- create_query = create_query[:-2] + ");"
+ if column_info.get("primary_key"):
+ primary_key = column_name
+ create_query = create_query[:-2]
+ if primary_key:
+ create_query += f", PRIMARY KEY ({primary_key})"
+ create_query += ");"
return create_query
diff --git a/frontend/src/api/dataProfilesRequests.jsx b/frontend/src/api/dataProfilesRequests.jsx
index e659d15..d0c1fe6 100644
--- a/frontend/src/api/dataProfilesRequests.jsx
+++ b/frontend/src/api/dataProfilesRequests.jsx
@@ -19,8 +19,8 @@ export const getAvailableColumnTypes = () => {
return axios.get(`${API_URL}data-profiles/column-types/`);
};
-export const getSuggestedColumnTypes = (previewData) => {
- return axios.post(`${API_URL}data-profiles/preview/column-types/`, {
+export const getSuggestedColumnMetadata = (previewData) => {
+ return axios.post(`${API_URL}data-profiles/preview/column-metadata/`, {
data: previewData,
});
};
diff --git a/frontend/src/components/tabs/TabPanel.jsx b/frontend/src/components/tabs/TabPanel.jsx
new file mode 100644
index 0000000..a7e4047
--- /dev/null
+++ b/frontend/src/components/tabs/TabPanel.jsx
@@ -0,0 +1,24 @@
+import Box from "@mui/material/Box";
+import Typography from "@mui/material/Typography";
+
+function TabPanel(props) {
+ const { children, value, index, ...other } = props;
+
+ return (
+