Skip to content

Commit

Permalink
feat: add IsValidSQL() (#1372)
Browse files Browse the repository at this point in the history
* implement query validation logic

* implement descriptor

* implement test

* fix typo at llm extras

* Add sqlvalidator to requirements file

* Moved sqlvalidator import into the is_valid_sql function

* add ignore_missing_imports for sqlvalidator in setup.cfg

* docs: add IsValidSQL()

---------

Co-authored-by: Your Name <[email protected]>
Co-authored-by: Sifr'un <[email protected]>
Co-authored-by: Emeli Dral <[email protected]>
  • Loading branch information
4 people authored Dec 4, 2024
1 parent a1318d1 commit 7c8fdfb
Show file tree
Hide file tree
Showing 10 changed files with 97 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/book/reference/all-metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,7 @@ Check for regular expression matches.
| **JSONMatch()** <ul><li>Compares two columns of a dataframe and checks whether the two objects in each row of the dataframe are matching JSONs or not. </li><li>Returns True/False for every input. </li></ul> Example use:<br> `JSONMatch(with_column="column_2")`| **Required:** <br> `with_column : str` <br><br>**Optional:**<ul><li>`display_name`</li></ul> |
| **ContainsLink()** <ul><li>Checks if the text contains at least one valid URL. </li><li>Returns True/False for each row. </li></ul> | **Required:** n/a <br>**Optional:**<ul><li>`display_name`</li></ul> |
| **IsValidPython()** <ul><li>Checks if the text is valid Python code without syntax errors.</li><li>Returns True/False for every input. </li></ul>| **Required:** <br>n/a<br><br>**Optional:**<ul><li>`display_name`</li></ul> |
| **IsValidSQL()** <ul><li>Checks if the text in a specified column is a valid SQL query without executing the query.</li><li>Returns True/False for every input. </li></ul>| **Required:** <br>n/a<br><br>**Optional:**<ul><li>`display_name`</li></ul> |

## Descriptors: Text stats

Expand Down
1 change: 1 addition & 0 deletions requirements.min.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,5 @@ openai==1.16.2
evaluate==0.4.1
transformers[torch]==4.39.3
sentence-transformers==2.7.0
sqlvalidator==0.0.20
chromadb==0.4.0
3 changes: 3 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,9 @@ ignore_missing_imports = True
[mypy-litellm.*]
ignore_missing_imports = True

[mypy-sqlvalidator.*]
ignore_missing_imports = True

[mypy-chromadb.*]
ignore_missing_imports = True

Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@
"evaluate>=0.4.1",
"transformers[torch]>=4.39.3",
"sentence-transformers>=2.7.0",
"sqlvalidator>=0.0.20",
"chromadb>=0.4.0",
],
"spark": ["pyspark>=3.4.0"],
Expand Down
2 changes: 2 additions & 0 deletions src/evidently/descriptors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from .hf_descriptor import HuggingFaceToxicityModel
from .is_valid_json_descriptor import IsValidJSON
from .is_valid_python_descriptor import IsValidPython
from .is_valid_sql_descriptor import IsValidSQL
from .json_match_descriptor import JSONMatch
from .json_schema_match_descriptor import JSONSchemaMatch
from .llm_judges import BiasLLMEval
Expand Down Expand Up @@ -74,6 +75,7 @@
"WordMatch",
"WordNoMatch",
"IsValidJSON",
"IsValidSQL",
"JSONSchemaMatch",
"IsValidPython",
"_registry",
Expand Down
3 changes: 3 additions & 0 deletions src/evidently/descriptors/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,9 @@
"evidently.descriptors.custom_descriptor.CustomPairColumnEval",
"evidently:descriptor:CustomPairColumnEval",
)
# Serialization alias for the IsValidSQL descriptor; the alias string is the
# same one declared as ``Config.type_alias`` on the descriptor class.
register_type_alias(
    FeatureDescriptor,
    "evidently.descriptors.is_valid_sql_descriptor.IsValidSQL",
    "evidently:descriptor:IsValidSQL",
)
register_type_alias(
FeatureDescriptor,
"evidently.descriptors.json_match_descriptor.JSONMatch",
Expand Down
11 changes: 11 additions & 0 deletions src/evidently/descriptors/is_valid_sql_descriptor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from evidently.features import is_valid_sql_feature
from evidently.features.generated_features import FeatureDescriptor
from evidently.features.generated_features import GeneratedFeature


class IsValidSQL(FeatureDescriptor):
    """Descriptor that produces the ``IsValidSQL`` generated feature for a text column."""

    class Config:
        type_alias = "evidently:descriptor:IsValidSQL"

    def feature(self, column_name: str) -> GeneratedFeature:
        """Create the SQL-validity feature for ``column_name``.

        The descriptor's own ``display_name`` is forwarded to the feature.
        """
        sql_feature = is_valid_sql_feature.IsValidSQL(column_name, self.display_name)
        return sql_feature
3 changes: 3 additions & 0 deletions src/evidently/features/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,9 @@
register_type_alias(
GeneratedFeatures, "evidently.features.contains_link_feature.ContainsLink", "evidently:feature:ContainsLink"
)
# Serialization alias for the IsValidSQL generated feature; the alias string is
# the same one declared as ``Config.type_alias`` on the feature class.
register_type_alias(
    GeneratedFeatures, "evidently.features.is_valid_sql_feature.IsValidSQL", "evidently:feature:IsValidSQL"
)
register_type_alias(
GeneratedFeatures, "evidently.features.exact_match_feature.ExactMatchFeature", "evidently:feature:ExactMatchFeature"
)
Expand Down
43 changes: 43 additions & 0 deletions src/evidently/features/is_valid_sql_feature.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from typing import Any
from typing import ClassVar
from typing import Optional

from evidently import ColumnType
from evidently.features.generated_features import ApplyColumnGeneratedFeature


class IsValidSQL(ApplyColumnGeneratedFeature):
    """Generated feature telling whether each value of a column is valid SQL.

    Every value is mapped to ``True``/``False``. Non-string values (including
    ``None``) are always ``False``. A string may hold several statements
    separated by ``;``; it is ``True`` only if none of them is rejected.
    The queries are never executed.
    """

    class Config:
        type_alias = "evidently:feature:IsValidSQL"

    __feature_type__: ClassVar = ColumnType.Categorical
    display_name_template: ClassVar = "SQL Validity Check for {column_name}"
    column_name: str

    def __init__(self, column_name: str, display_name: Optional[str] = None):
        self.column_name = column_name
        self.display_name = display_name
        super().__init__()

    def apply(self, value: Any):
        """Return the validity flag for a single cell value."""
        # ``None`` is not a ``str`` either, so one isinstance check covers both.
        if not isinstance(value, str):
            return False

        return self.is_valid_sql(value)

    def is_valid_sql(self, query: str) -> bool:
        """Check every ``;``-separated statement of ``query`` with ``sqlvalidator``.

        A statement counts as invalid when ``sqlvalidator.format_sql`` raises
        for it. Blank statements (for example after a trailing ``;``) are
        skipped, so a string with no statements at all yields ``True``.
        """
        # Imported lazily so the module can be imported without the optional
        # ``sqlvalidator`` dependency installed.
        import sqlvalidator

        for statement in self._split_statements(query):
            try:
                sqlvalidator.format_sql(statement)  # Validate SQL syntax
            except Exception:
                # Any failure of the formatter is deliberately read as
                # "invalid SQL" rather than propagated to the caller.
                return False

        return True  # No statement was rejected

    @staticmethod
    def _split_statements(query: str):
        """Split ``query`` on semicolons that are outside quotes.

        A plain ``str.split(";")`` would also cut inside string literals or
        quoted identifiers (``WHERE name = 'a;b'``) and hand the validator
        fragments that were never real statements. Single quotes, double
        quotes and backticks are tracked; a doubled quote (``''``) closes and
        immediately reopens the literal, so it is handled naturally.

        NOTE(review): SQL comments and backslash-escaped quotes are not parsed.

        Returns a list of stripped, non-empty statement strings.
        """
        pieces = []
        current = []
        open_quote = None
        for char in query:
            if open_quote is not None:
                if char == open_quote:
                    open_quote = None
            elif char in "'\"`":
                open_quote = char
            elif char == ";":
                pieces.append("".join(current))
                current = []
                continue
            current.append(char)
        pieces.append("".join(current))

        stripped = (piece.strip() for piece in pieces)
        return [piece for piece in stripped if piece]
29 changes: 29 additions & 0 deletions tests/features/test_is_valid_sql_feature.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import pandas as pd

from evidently.features.is_valid_sql_feature import IsValidSQL
from evidently.pipeline.column_mapping import ColumnMapping
from evidently.utils.data_preprocessing import create_data_definition


def test_is_valid_sql_feature():
    """IsValidSQL yields one boolean per row, in input order."""
    # Each case pairs an input query with the flag the feature should produce.
    cases = [
        ("SELECT * FROM users", True),  # Valid SQL (simple query)
        ("SELECT id, address FROM users; SELECT count(id) FROM users", True),  # Valid SQL (multiple SQL queries)
        ("INSERT INTO table", False),  # Invalid SQL (incomplete query)
        ("SLECT * FROM users", False),  # Invalid SQL (typo)
        ("SLECT * FROM users; SELECT id, address FROM users", False),  # Invalid SQL (1 invalid sub-query)
    ]
    queries = [sql for sql, _ in cases]
    verdicts = [is_valid for _, is_valid in cases]

    frame = pd.DataFrame({"column_1": queries})
    definition = create_data_definition(None, frame, ColumnMapping())

    actual = IsValidSQL("column_1").generate_feature(data=frame, data_definition=definition)

    assert actual.equals(pd.DataFrame({"column_1": verdicts}))

0 comments on commit 7c8fdfb

Please sign in to comment.