Skip to content

Commit

Permalink
Merge branch 'main' of github.com:radicalbit/radicalbit-ai-monitoring…
Browse files Browse the repository at this point in the history
… into refactoring/ROS-338-edit-metrics-dto
  • Loading branch information
dtria91 committed Jul 8, 2024
2 parents c29749e + 65f4f65 commit db54d1a
Show file tree
Hide file tree
Showing 38 changed files with 4,237 additions and 181 deletions.
42 changes: 42 additions & 0 deletions api/alembic/versions/3ec04e609ae9_set_correlation_id_optional.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""set_correlation_id_optional
Revision ID: 3ec04e609ae9
Revises: 086f26392cc4
Create Date: 2024-07-08 10:28:35.068312
"""
from typing import Sequence, Union, Text

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision: str = '3ec04e609ae9'
down_revision: Union[str, None] = '086f26392cc4'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Make current_dataset.CORRELATION_ID_COLUMN nullable and add UUID unique constraints."""
    # ### commands auto generated by Alembic - please adjust! ###
    op.alter_column('current_dataset', 'CORRELATION_ID_COLUMN',
               existing_type=sa.VARCHAR(),
               nullable=True)
    # NOTE(review): passing None as the constraint name makes Alembic derive it
    # from the metadata naming convention; if no naming_convention is configured,
    # the DB assigns its own names and downgrade() cannot reference them — confirm.
    op.create_unique_constraint(None, 'current_dataset', ['UUID'])
    op.create_unique_constraint(None, 'current_dataset_metrics', ['UUID'])
    op.create_unique_constraint(None, 'reference_dataset', ['UUID'])
    op.create_unique_constraint(None, 'reference_dataset_metrics', ['UUID'])
    # ### end Alembic commands ###


def downgrade() -> None:
    """Revert: drop the UUID unique constraints and make CORRELATION_ID_COLUMN required again."""
    # ### commands auto generated by Alembic - please adjust! ###
    # NOTE(review): op.drop_constraint needs an explicit constraint name —
    # Alembic raises ValueError for None unless a naming convention resolves it.
    # Verify this downgrade actually runs; otherwise fill in the real names.
    op.drop_constraint(None, 'reference_dataset_metrics', type_='unique')
    op.drop_constraint(None, 'reference_dataset', type_='unique')
    op.drop_constraint(None, 'current_dataset_metrics', type_='unique')
    op.drop_constraint(None, 'current_dataset', type_='unique')
    op.alter_column('current_dataset', 'CORRELATION_ID_COLUMN',
               existing_type=sa.VARCHAR(),
               nullable=False)
    # ### end Alembic commands ###
2 changes: 1 addition & 1 deletion api/app/db/tables/current_dataset_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,5 @@ class CurrentDataset(Reflected, BaseTable, BaseDAO):
)
path = Column('PATH', VARCHAR, nullable=False)
date = Column('DATE', TIMESTAMP(timezone=True), nullable=False)
correlation_id_column = Column('CORRELATION_ID_COLUMN', VARCHAR, nullable=False)
correlation_id_column = Column('CORRELATION_ID_COLUMN', VARCHAR, nullable=True)
status = Column('STATUS', VARCHAR, nullable=False, default=JobStatus.IMPORTING)
2 changes: 1 addition & 1 deletion api/app/models/dataset_dto.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ class CurrentDatasetDTO(BaseModel):
model_uuid: UUID
path: str
date: str
correlation_id_column: str
correlation_id_column: Optional[str]
status: str

model_config = ConfigDict(
Expand Down
1 change: 1 addition & 0 deletions api/app/models/metrics/data_quality_dto.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ class ClassMetrics(BaseModel):
class ClassificationDataQuality(BaseModel):
n_observations: int
class_metrics: List[ClassMetrics]
class_metrics_prediction: List[ClassMetrics]
feature_metrics: List[NumericalFeatureMetrics | CategoricalFeatureMetrics]

model_config = ConfigDict(
Expand Down
61 changes: 53 additions & 8 deletions api/app/models/metrics/model_quality_dto.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,25 +90,61 @@ class MultiClassificationModelQuality(BaseModel):
model_config = ConfigDict(populate_by_name=True, alias_generator=to_camel)


class RegressionModelQuality(BaseModel):
class RegressionMetricsBase(BaseModel):
    """Optional scalar regression quality metrics (camelCase aliased on (de)serialization)."""

    r2: Optional[float] = None
    mae: Optional[float] = None
    mse: Optional[float] = None
    # NOTE(review): both `var` and `variance` are declared here, while the
    # sibling BaseRegressionMetrics exposes only `variance` — this looks like a
    # merge leftover; confirm which field is canonical and remove the other.
    # Also nothing in view references RegressionMetricsBase — possibly dead code.
    var: Optional[float] = None
    variance: Optional[float] = None
    mape: Optional[float] = None
    rmse: Optional[float] = None
    adj_r2: Optional[float] = None

    model_config = ConfigDict(populate_by_name=True, alias_generator=to_camel)


class BaseRegressionMetrics(BaseModel):
    """Scalar regression quality metrics.

    Every field defaults to None so partially computed metric payloads still
    validate. Fields accept both snake_case names and camelCase aliases.
    """

    r2: Optional[float] = None
    mae: Optional[float] = None
    mse: Optional[float] = None
    variance: Optional[float] = None
    mape: Optional[float] = None
    rmse: Optional[float] = None
    adj_r2: Optional[float] = None

    model_config = ConfigDict(populate_by_name=True, alias_generator=to_camel)


class GroupedBaseRegressionMetrics(BaseModel):
    """Time-series view of the regression metrics: one Distribution list per metric.

    Field names mirror BaseRegressionMetrics, with each scalar replaced by a
    list of timestamped Distribution points. All lists are required.
    """

    r2: List[Distribution]
    mae: List[Distribution]
    mse: List[Distribution]
    variance: List[Distribution]
    mape: List[Distribution]
    rmse: List[Distribution]
    adj_r2: List[Distribution]

    model_config = ConfigDict(populate_by_name=True, alias_generator=to_camel)


class RegressionModelQuality(BaseRegressionMetrics):
    # Reference-dataset regression quality: same shape as BaseRegressionMetrics
    # (see _create_regression_model_quality, which builds it for REFERENCE).
    pass


class CurrentRegressionModelQuality(BaseModel):
    """Regression quality for a CURRENT dataset.

    Combines the latest scalar metrics (global_metrics) with their
    per-timestamp series (grouped_metrics).
    """

    global_metrics: BaseRegressionMetrics
    grouped_metrics: GroupedBaseRegressionMetrics

    model_config = ConfigDict(populate_by_name=True, alias_generator=to_camel)


class ModelQualityDTO(BaseModel):
job_status: JobStatus
model_quality: Optional[
BinaryClassificationModelQuality
| CurrentBinaryClassificationModelQuality
| MultiClassificationModelQuality
| RegressionModelQuality
| CurrentRegressionModelQuality
]

model_config = ConfigDict(populate_by_name=True, alias_generator=to_camel)
Expand Down Expand Up @@ -143,11 +179,6 @@ def _create_model_quality(
model_type: ModelType,
dataset_type: DatasetType,
model_quality_data: Dict,
) -> (
BinaryClassificationModelQuality
| CurrentBinaryClassificationModelQuality
| MultiClassificationModelQuality
| RegressionModelQuality
):
"""Create a specific model quality instance based on model type and dataset type."""
if model_type == ModelType.BINARY:
Expand All @@ -158,7 +189,9 @@ def _create_model_quality(
if model_type == ModelType.MULTI_CLASS:
return MultiClassificationModelQuality(**model_quality_data)
if model_type == ModelType.REGRESSION:
return RegressionModelQuality(**model_quality_data)
return ModelQualityDTO._create_regression_model_quality(
dataset_type=dataset_type, model_quality_data=model_quality_data
)
raise MetricsInternalError(f'Invalid model type {model_type}')

@staticmethod
Expand All @@ -172,3 +205,15 @@ def _create_binary_model_quality(
if dataset_type == DatasetType.CURRENT:
return CurrentBinaryClassificationModelQuality(**model_quality_data)
raise MetricsInternalError(f'Invalid dataset type {dataset_type}')

@staticmethod
def _create_regression_model_quality(
    dataset_type: DatasetType,
    model_quality_data: Dict,
) -> RegressionModelQuality | CurrentRegressionModelQuality:
    """Create a regression model quality instance based on dataset type.

    (Previous docstring said "binary" — copy-paste error from the sibling helper.)

    Args:
        dataset_type: whether the metrics belong to the reference or current dataset.
        model_quality_data: raw metrics dict used to populate the DTO.

    Raises:
        MetricsInternalError: if dataset_type is neither REFERENCE nor CURRENT.
    """
    if dataset_type == DatasetType.REFERENCE:
        return RegressionModelQuality(**model_quality_data)
    if dataset_type == DatasetType.CURRENT:
        return CurrentRegressionModelQuality(**model_quality_data)
    raise MetricsInternalError(f'Invalid dataset type {dataset_type}')
2 changes: 1 addition & 1 deletion api/app/routes/upload_dataset_route.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def upload_current_file(
model_uuid: UUID,
csv_file: UploadFile = File(...),
sep: str = Form(','),
correlation_id_column: str = Form(''),
correlation_id_column: Optional[str] = Form(None),
) -> CurrentDatasetDTO:
return file_service.upload_current_file(
model_uuid, csv_file, correlation_id_column, sep
Expand Down
2 changes: 1 addition & 1 deletion api/app/services/file_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def upload_current_file(
self,
model_uuid: UUID,
csv_file: UploadFile,
correlation_id_column: str,
correlation_id_column: Optional[str] = None,
sep: str = ',',
columns=None,
) -> CurrentDatasetDTO:
Expand Down
41 changes: 41 additions & 0 deletions api/tests/commons/db_mock.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,10 @@ def get_sample_current_dataset(
{'name': 'classA', 'count': 100, 'percentage': 50.0},
{'name': 'classB', 'count': 100, 'percentage': 50.0},
],
'classMetricsPrediction': [
{'name': 'classA', 'count': 100, 'percentage': 50.0},
{'name': 'classB', 'count': 100, 'percentage': 50.0},
],
'featureMetrics': [
{
'featureName': 'age',
Expand Down Expand Up @@ -319,6 +323,43 @@ def get_sample_current_dataset(
'mape': 35.19314237273801,
'rmse': 202.23194752188695,
'adj_r2': 0.9116805380966796,
'variance': 0.23
}

# Per-metric time series (timestamp/value points) keyed by metric name —
# mirrors the GroupedBaseRegressionMetrics DTO fields.
grouped_regression_model_quality_dict = {
    'r2': [
        {'timestamp': '2024-01-01T00:00:00Z', 'value': 0.8},
        {'timestamp': '2024-02-01T00:00:00Z', 'value': 0.85},
    ],
    'mae': [
        {'timestamp': '2024-01-01T00:00:00Z', 'value': 0.88},
        {'timestamp': '2024-02-01T00:00:00Z', 'value': 0.9},
    ],
    'mse': [
        {'timestamp': '2024-01-01T00:00:00Z', 'value': 0.86},
        {'timestamp': '2024-02-01T00:00:00Z', 'value': 0.88},
    ],
    'mape': [
        {'timestamp': '2024-01-01T00:00:00Z', 'value': 0.81},
        {'timestamp': '2024-02-01T00:00:00Z', 'value': 0.83},
    ],
    'rmse': [
        {'timestamp': '2024-01-01T00:00:00Z', 'value': 0.8},
        {'timestamp': '2024-02-01T00:00:00Z', 'value': 0.85},
    ],
    'adj_r2': [
        {'timestamp': '2024-01-01T00:00:00Z', 'value': 0.85},
        {'timestamp': '2024-02-01T00:00:00Z', 'value': 0.87},
    ],
    'variance': [
        {'timestamp': '2024-01-01T00:00:00Z', 'value': 0.82},
        {'timestamp': '2024-02-01T00:00:00Z', 'value': 0.84},
    ],
}

# Current-dataset regression payload: scalar metrics plus their grouped
# time series — matches the CurrentRegressionModelQuality DTO shape.
current_regression_model_quality_dict = {
    'global_metrics': regression_model_quality_dict,
    'grouped_metrics': grouped_regression_model_quality_dict,
}

regression_data_quality_dict = {
Expand Down
40 changes: 40 additions & 0 deletions api/tests/routes/upload_dataset_route_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,46 @@ def test_bind_reference(self):
assert res.status_code == 200
assert jsonable_encoder(upload_file_result) == res.json()

def test_upload_current(self):
    """POST a CSV to the current-dataset upload endpoint and check the DTO round-trips."""
    file = csv.get_correct_sample_csv_file()
    model_uuid = uuid.uuid4()
    # correlation_id_column is now optional, so uploading without it must succeed.
    upload_file_result = CurrentDatasetDTO(
        uuid=uuid.uuid4(),
        model_uuid=model_uuid,
        path='test',
        date=str(datetime.datetime.now(tz=datetime.UTC)),
        status=JobStatus.IMPORTING,
        correlation_id_column=None,
    )
    self.file_service.upload_current_file = MagicMock(
        return_value=upload_file_result
    )
    res = self.client.post(
        f'{self.prefix}/{model_uuid}/current/upload',
        files={'csv_file': (file.filename, file.file)},
    )
    assert res.status_code == 200
    assert jsonable_encoder(upload_file_result) == res.json()

def test_bind_current(self):
    """POST a file reference to the current-dataset bind endpoint and check the DTO round-trips."""
    file_ref = FileReference(file_url='/file')
    model_uuid = uuid.uuid4()
    # correlation_id_column=None exercises the newly-optional field.
    upload_file_result = CurrentDatasetDTO(
        uuid=uuid.uuid4(),
        model_uuid=model_uuid,
        path='test',
        date=str(datetime.datetime.now(tz=datetime.UTC)),
        status=JobStatus.IMPORTING,
        correlation_id_column=None,
    )
    self.file_service.bind_current_file = MagicMock(return_value=upload_file_result)
    res = self.client.post(
        f'{self.prefix}/{model_uuid}/current/bind',
        json=jsonable_encoder(file_ref),
    )
    assert res.status_code == 200
    assert jsonable_encoder(upload_file_result) == res.json()

def test_get_all_reference_datasets_by_model_uuid_paginated(self):
test_model_uuid = uuid.uuid4()
reference_upload_1 = db_mock.get_sample_reference_dataset(
Expand Down
6 changes: 3 additions & 3 deletions api/tests/services/file_service_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,13 +199,12 @@ def test_upload_current_file_ok(self):
)
object_name = f'{str(model.uuid)}/current/{file.filename}'
path = f's3://bucket/{object_name}'
correlation_id_column = 'correlation_id'
inserted_file = CurrentDataset(
uuid=uuid4(),
model_uuid=model_uuid,
path=path,
date=datetime.datetime.now(tz=datetime.UTC),
correlation_id_column=correlation_id_column,
correlation_id_column=None,
status=JobStatus.IMPORTING,
)
reference_file = get_sample_reference_dataset(model_uuid=model_uuid)
Expand All @@ -221,7 +220,8 @@ def test_upload_current_file_ok(self):
self.spark_k8s_client.submit_app = MagicMock()

result = self.files_service.upload_current_file(
model.uuid, file, correlation_id_column
model.uuid,
file,
)

self.model_svc.get_model_by_uuid.assert_called_once()
Expand Down
31 changes: 31 additions & 0 deletions api/tests/services/metrics_service_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -635,6 +635,37 @@ def test_get_current_multiclass_model_quality_by_model_by_uuid(self):
model_quality_data=current_metrics.model_quality,
)

def test_get_current_regression_model_quality_by_model_by_uuid(self):
    """Current-dataset regression metrics are mapped into a CurrentRegressionModelQuality DTO."""
    status = JobStatus.SUCCEEDED
    current_dataset = db_mock.get_sample_current_dataset(status=status.value)
    # Uses the current-regression fixture (global + grouped metrics).
    current_metrics = db_mock.get_sample_current_metrics(
        model_quality=db_mock.current_regression_model_quality_dict
    )
    model = db_mock.get_sample_model(model_type=ModelType.REGRESSION)
    self.model_service.get_model_by_uuid = MagicMock(return_value=model)
    self.current_dataset_dao.get_current_dataset_by_model_uuid = MagicMock(
        return_value=current_dataset
    )
    self.current_metrics_dao.get_current_metrics_by_model_uuid = MagicMock(
        return_value=current_metrics
    )
    res = self.metrics_service.get_current_model_quality_by_model_by_uuid(
        model_uuid, current_dataset.uuid
    )
    self.current_dataset_dao.get_current_dataset_by_model_uuid.assert_called_once_with(
        model_uuid, current_dataset.uuid
    )
    self.current_metrics_dao.get_current_metrics_by_model_uuid.assert_called_once_with(
        model_uuid, current_dataset.uuid
    )

    # The service result must equal the DTO built straight from the mock data.
    assert res == ModelQualityDTO.from_dict(
        dataset_type=DatasetType.CURRENT,
        model_type=model.model_type,
        job_status=current_dataset.status,
        model_quality_data=current_metrics.model_quality,
    )


model_uuid = db_mock.MODEL_UUID
current_uuid = db_mock.CURRENT_UUID
19 changes: 16 additions & 3 deletions docs/quickstarts/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,23 @@ After you have poetry installed you can install the project's dependencies run:
poetry install
```

Then, activate an environment:

```bash
poetry shell
```

If you change any of the project's dependencies, don't forget to run the following command:

```bash
poetry update
```


### Tutorials ###

| Task | Tutorial Name | Dataset to use | Description |
|-----------------------|---------------------------------------------------|-------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------|
| Binary Classification | notebooks/binary-classification-income-data.ipynb | data/binary_classification/reference-income.csv, data/binary_classification/current1-income.csv| In this tutorial we monitor data and performance of a ML used to classify if the income is > 50K given a set of features. |
| Task | Tutorial Name | Dataset to use | Dataset license | Description |
|-----------------------|---------------------------------------------------|-------------------------------------------------------------------------------------------------|-------------------|---------------------------------------------------------------------------------------------------------------------------|
| Binary Classification | notebooks/binary-classification-income-data.ipynb | data/binary_classification/reference-income.csv, data/binary_classification/current1-income.csv|Kohavi,Ron. (1996). Census Income. UCI Machine Learning Repository. https://doi.org/10.24432/C5GP7S. Adapted by Radicalbit. | In this tutorial we monitor data and performance of a ML used to classify if the income is > 50K given a set of features. |


Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,10 @@
"\n",
"In this example we will use a dataset build to classify between two different classes:\n",
"- **class 0:** income <= 50K\n",
"- **class 1:** income > 50K"
"- **class 1:** income > 50K\n",
"\n",
"\n",
"> **_Dataset license:_** Kohavi,Ron. (1996). Census Income. UCI Machine Learning Repository. https://doi.org/10.24432/C5GP7S. Adapted by Radicalbit."
]
},
{
Expand Down
Loading

0 comments on commit db54d1a

Please sign in to comment.