Merge pull request #264 from DocShow-AI/primary-key
Primary key
liberty-rising authored Feb 12, 2024
2 parents 23f5cd7 + 009b3ad commit 83eebd9
Showing 21 changed files with 350 additions and 569 deletions.
4 changes: 2 additions & 2 deletions backend/database/sql_executor.py
@@ -23,13 +23,13 @@ def append_df_to_table(self, df: pd.DataFrame, table_name: str):
raise

def create_table_for_data_profile(
self, org_id: int, table_name: str, column_names_and_types: dict
self, org_id: int, table_name: str, column_metadata: dict
):
"""Creates a table for a data profile."""
try:
create_query = (
self.sql_string_manager.generate_create_query_for_data_profile_table(
table_name, column_names_and_types
table_name, column_metadata
)
)
self.session.execute(text(create_query))
6 changes: 2 additions & 4 deletions backend/database/table_manager.py
@@ -57,14 +57,12 @@ def create_table_for_data_profile(
org_id: int,
table_name: str,
table_alias: str,
column_names_and_types: dict,
column_metadata: dict,
):
"""Creates a table for a data profile."""
try:
executor = SQLExecutor(self.session)
executor.create_table_for_data_profile(
org_id, table_name, column_names_and_types
)
executor.create_table_for_data_profile(org_id, table_name, column_metadata)
self._map_table_to_org(org_id, table_name, table_alias)
except Exception as e:
print(f"An error occurred: {e}")
6 changes: 3 additions & 3 deletions backend/llms/gpt.py
@@ -368,12 +368,12 @@ async def generate_chart_config(

return parsed_config

async def generate_suggested_column_types(self, column_names: list, data: dict):
async def generate_suggested_column_metadata(self, column_names: list, data: dict):
"""Generate suggested column types for the given data."""
self._add_system_message(assistant_type="column_type_suggestion")
self._add_system_message(assistant_type="column_metadata_suggestion")
self._set_response_format(is_json=True)

prompt = self.prompt_manager.create_column_type_suggestion_prompt(
prompt = self.prompt_manager.create_column_metadata_suggestion_prompt(
column_names, data
)

19 changes: 15 additions & 4 deletions backend/llms/prompt_manager.py
@@ -123,17 +123,28 @@ def jpg_data_extraction_prompt(self, instructions: str):
"""
return prompt

def create_column_type_suggestion_prompt(self, column_names, data):
def create_column_metadata_suggestion_prompt(self, column_names, data):
prompt = f"""
Based on the following data, suggest the data types for each column in the table.
The available column types are: text, integer, money, date, boolean
Based on the following data, suggest the data types for each column in the table and indicate which column should be a primary key.
The available data types are: text, integer, money, date, boolean.
Column names:
{column_names}
Data:
{data}
Return a JSON with the column names as keys and the suggested data types as values.
Return a JSON object where each key is a column name.
For each key, provide an object specifying 'data_type' and 'primary_key' status (a boolean indicating whether the column is a primary key).
Example output:
{{
"client_name": {{ "data_type": "text", "primary_key": false }},
"net_amount": {{ "data_type": "money", "primary_key": true }},
"gross_amount": {{ "data_type": "money", "primary_key": false }},
"date": {{ "data_type": "date", "primary_key": false }}
}}
If no column appears to be a suitable primary key, set the 'primary_key' value to false for all columns.
"""
return prompt
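
For context, a minimal sketch of how a reply that follows the example format above could be parsed into the column metadata mapping used by the rest of this change; the reply string here is hypothetical, not a captured model response.

import json

# Hypothetical model reply following the example format in the prompt above.
reply = '{"client_name": {"data_type": "text", "primary_key": false}, "invoice_id": {"data_type": "integer", "primary_key": true}}'

column_metadata = json.loads(reply)
for column_name, info in column_metadata.items():
    print(column_name, info["data_type"], info["primary_key"])
# client_name text False
# invoice_id integer True
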
6 changes: 3 additions & 3 deletions backend/llms/system_message_manager.py
@@ -4,9 +4,9 @@ def __init__(self):
"analytics_chat": """
You are an analytics assistant.
You will be generating SQL queries, and providing useful information for reports and analytics based on the given prompt.""",
"column_type_suggestion": """
You are a column type suggestion assistant.
You will be suggesting column data types based on the given prompt.
"column_metadata_suggestion": """
You are a column metadata suggestion assistant.
You will be suggesting column data types and primary keys based on the given prompt.
""",
"sql_code": """
You are a PostgreSQL SQL statement assistant.
15 changes: 14 additions & 1 deletion backend/models/data_profile.py
@@ -1,3 +1,5 @@
from typing import Dict, Union

from pydantic import BaseModel
from sqlalchemy import Column, ForeignKey, Integer, String, UniqueConstraint

@@ -45,9 +47,20 @@ def to_dict(self):


class DataProfileCreateRequest(BaseModel):
"""
DataProfileCreateRequest Model
------------------------------
This class represents the request body for creating a new data profile.
Attributes:
- name: The name of the data profile.
- extract_instructions: The instructions for extracting data from the file.
- column_metadata: A dictionary where each key is a column name and each value is another dictionary specifying the attributes of the column.
The inner dictionary includes 'data_type' and 'primary_key' fields.
"""

name: str
extract_instructions: str
column_names_and_types: dict
column_metadata: Dict[str, Dict[str, Union[str, bool]]]


class DataProfileCreateResponse(BaseModel):
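
To illustrate the new request shape, a minimal sketch that mirrors the DataProfileCreateRequest model above and validates an example payload; the field values are illustrative, not taken from the repository.

from typing import Dict, Union

from pydantic import BaseModel


class DataProfileCreateRequest(BaseModel):
    name: str
    extract_instructions: str
    column_metadata: Dict[str, Dict[str, Union[str, bool]]]


# Illustrative payload: one primary-key column plus two regular columns.
request = DataProfileCreateRequest(
    name="invoices",
    extract_instructions="Extract the invoice header fields from each file.",
    column_metadata={
        "invoice_id": {"data_type": "integer", "primary_key": True},
        "client_name": {"data_type": "text", "primary_key": False},
        "net_amount": {"data_type": "money", "primary_key": False},
    },
)
print(request.column_metadata["invoice_id"]["primary_key"])  # True
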
29 changes: 24 additions & 5 deletions backend/routes/data_profile_routes.py
@@ -45,7 +45,25 @@ async def get_data_profiles_by_org_id(current_user: User = Depends(get_current_u
async def save_data_profile(
request: DataProfileCreateRequest, current_user: User = Depends(get_current_user)
) -> DataProfileCreateResponse:
"""Save a new data profile to the database"""
"""
Creates a new data profile and saves it to the database.
This function first validates the name of the data profile, ensuring it is not longer than 50 characters and only contains valid characters for a table name.
It then checks if a data profile with the same name already exists for the current user's organization.
If the validation passes and no duplicate data profile exists, it creates a new table for the data profile using the provided column metadata.
Finally, it creates the data profile with the provided name, extract instructions, and the current user's organization id, and saves it to the database.
Args:
request (DataProfileCreateRequest): The data profile creation request containing the name, extract instructions, and column metadata for the new data profile.
current_user (User, optional): The current user. Defaults to the result of `get_current_user()`.
Raises:
HTTPException: If the data profile name is invalid or a data profile with the same name already exists for the current user's organization.
Returns:
DataProfileCreateResponse: The created data profile.
"""
if len(request.name) > 50:
raise HTTPException(
status_code=400, detail="Data Profile name cannot be longer than 50 chars"
@@ -73,7 +91,7 @@ async def save_data_profile(
org_id=current_user.organization_id,
table_name=table_name,
table_alias=request.name,
column_names_and_types=request.column_names_and_types,
column_metadata=request.column_metadata,
)

# Create the data profile
@@ -160,14 +178,14 @@ async def preview_data_profile(
return extracted_data


@data_profile_router.post("/data-profiles/preview/column-types/")
async def generate_suggested_column_types(
@data_profile_router.post("/data-profiles/preview/column-metadata/")
async def generate_suggested_column_metadata(
request: SuggestedColumnTypesRequest, current_user: User = Depends(get_current_user)
):
gpt = GPTLLM(chat_id=1, user=current_user)
if request.data:
column_names = list(request.data[0].keys())
suggested_column_types = await gpt.generate_suggested_column_types(
suggested_column_types = await gpt.generate_suggested_column_metadata(
column_names, request.data
)

@@ -248,6 +266,7 @@ async def save_extracted_data(
files: List[UploadFile] = File(...),
current_user: User = Depends(get_current_user),
):
"""Save the extracted data to the database using the data profile. Save the original files to DigitalOcean Spaces."""
# Get the organization name
with DatabaseManager() as session:
org_manager = OrganizationManager(session)
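
A hedged sketch of calling the renamed preview endpoint above; the base URL and authentication handling are assumptions about the deployment, and the commented response shape follows the prompt's example format rather than a captured response.

import requests

preview_rows = [
    {"client_name": "Acme Corp", "net_amount": "120.00", "date": "2024-02-01"},
]

# Hypothetical call; auth (cookie or bearer token) depends on how get_current_user is wired.
response = requests.post(
    "http://localhost:8000/data-profiles/preview/column-metadata/",
    json={"data": preview_rows},
)
print(response.json())
# e.g. {"client_name": {"data_type": "text", "primary_key": false}, ...}
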
16 changes: 11 additions & 5 deletions backend/utils/sql_string_manager.py
@@ -52,24 +52,30 @@ def map_to_postgres_type(self, column_type: str) -> str:
return type_mapping.get(column_type, "TEXT")

def generate_create_query_for_data_profile_table(
self, table_name: str, column_names_and_types: dict
self, table_name: str, column_metadata: dict
) -> str:
"""
Generates a CREATE TABLE query for a data profile table.
Parameters:
table_name (str): The name of the table.
column_names_and_types (dict): A dictionary of column names and types.
column_metadata (dict): A dictionary of column names, types, and primary key information.
Returns:
str: The CREATE TABLE query.
"""
# Generate the CREATE TABLE query
create_query = f"CREATE TABLE {table_name} ("
for column_name, column_type in column_names_and_types.items():
postgres_type = self.map_to_postgres_type(column_type)
primary_key = None
for column_name, column_info in column_metadata.items():
postgres_type = self.map_to_postgres_type(column_info["data_type"])
create_query += f"{column_name} {postgres_type}, "
create_query = create_query[:-2] + ");"
if column_info.get("primary_key"):
primary_key = column_name
create_query = create_query[:-2]
if primary_key:
create_query += f", PRIMARY KEY ({primary_key})"
create_query += ");"

return create_query

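
To show what the new builder produces, a standalone sketch of the same logic; the type mapping here is a simplified stand-in for map_to_postgres_type, not the repository's actual mapping.

def generate_create_query(table_name: str, column_metadata: dict) -> str:
    """Simplified re-implementation of the query builder above, for illustration only."""
    type_mapping = {"text": "TEXT", "integer": "INTEGER", "money": "NUMERIC(10, 2)",
                    "date": "DATE", "boolean": "BOOLEAN"}  # assumed mapping
    create_query = f"CREATE TABLE {table_name} ("
    primary_key = None
    for column_name, column_info in column_metadata.items():
        postgres_type = type_mapping.get(column_info["data_type"], "TEXT")
        create_query += f"{column_name} {postgres_type}, "
        if column_info.get("primary_key"):
            primary_key = column_name
    create_query = create_query[:-2]  # drop the trailing ", "
    if primary_key:
        create_query += f", PRIMARY KEY ({primary_key})"
    return create_query + ");"


print(generate_create_query("org1_invoices", {
    "invoice_id": {"data_type": "integer", "primary_key": True},
    "client_name": {"data_type": "text", "primary_key": False},
}))
# CREATE TABLE org1_invoices (invoice_id INTEGER, client_name TEXT, PRIMARY KEY (invoice_id));
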
4 changes: 2 additions & 2 deletions frontend/src/api/dataProfilesRequests.jsx
@@ -19,8 +19,8 @@ export const getAvailableColumnTypes = () => {
return axios.get(`${API_URL}data-profiles/column-types/`);
};

export const getSuggestedColumnTypes = (previewData) => {
return axios.post(`${API_URL}data-profiles/preview/column-types/`, {
export const getSuggestedColumnMetadata = (previewData) => {
return axios.post(`${API_URL}data-profiles/preview/column-metadata/`, {
data: previewData,
});
};
24 changes: 24 additions & 0 deletions frontend/src/components/tabs/TabPanel.jsx
@@ -0,0 +1,24 @@
import Box from "@mui/material/Box";
import Typography from "@mui/material/Typography";

function TabPanel(props) {
const { children, value, index, ...other } = props;

return (
<div
role="tabpanel"
hidden={value !== index}
id={`simple-tabpanel-${index}`}
aria-labelledby={`simple-tab-${index}`}
{...other}
>
{value === index && (
<Box sx={{ p: 3 }}>
<Typography component="div">{children}</Typography>
</Box>
)}
</div>
);
}

export default TabPanel;
59 changes: 0 additions & 59 deletions frontend/src/pages/data-profiling/CreateDataProfile.jsx

This file was deleted.

