diff --git a/.github/workflows/authors.yaml b/.github/workflows/authors.yaml new file mode 100644 index 00000000..9c457a44 --- /dev/null +++ b/.github/workflows/authors.yaml @@ -0,0 +1,15 @@ +name: Update Authors +on: + push: + branches: + - main +jobs: + update-authors: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v2 + - name: Run update-authors.sh script file + run: | + chmod +x ./tools/update-authors.sh + ./tools/update-authors.sh + shell: bash \ No newline at end of file diff --git a/AUTHORS.md b/AUTHORS.md new file mode 100644 index 00000000..b48da5b9 --- /dev/null +++ b/AUTHORS.md @@ -0,0 +1,34 @@ + + +Contributors to radicalbit-ai-monitoring +============================ +radicalbit-ai-monitoring is developed and maintained by a community of people interested in providing a comprehensive solution for monitoring your Machine Learning and Large Language Models in production. + + +

+ +

+ + +Contributors (ordered by first contribution.) +------------------------------------- +[Full List of Contributors](https://github.com/radicalbit/radicalbit-ai-monitoring/graphs/contributors) + +- Paolo Filippelli () +- Roberto Bentivoglio () +- paoloyx () +- Luca Tagliabue (<32895300+lucataglia@users.noreply.github.com>) +- Mauro Cortellazzi () +- Stefano Zamboni (<39366866+SteZamboni@users.noreply.github.com>) +- mmariniello90 (<133773116+mmariniello90@users.noreply.github.com>) +- Davide Valleri (<146823822+dvalleri@users.noreply.github.com>) +- Lorenzo D'Agostino (<127778257+lorenzodagostinoradicalbit@users.noreply.github.com>) +- Daniele Tria (<36860433+dtria91@users.noreply.github.com>) +- rivamarco () +- d-croci () +- bigmoby () + + #### Generated by tools/update-authors.sh. diff --git a/api/README.md b/api/README.md index 8c821433..463b1153 100644 --- a/api/README.md +++ b/api/README.md @@ -33,6 +33,18 @@ to check. ## Test +Please install a PostgreSQL database locally. For example, on a macOS platform, execute: + +```bash +brew install postgresql +``` + +Note: If any errors occur during pytest runs, please stop the local database service by executing: + +```bash +brew services stop postgresql +``` + Tests are done with `pytest` Run diff --git a/api/app/models/model_dto.py b/api/app/models/model_dto.py index 85cd3738..f76fdbd6 100644 --- a/api/app/models/model_dto.py +++ b/api/app/models/model_dto.py @@ -1,12 +1,14 @@ import datetime from enum import Enum -from typing import List, Optional +from typing import List, Optional, Self from uuid import UUID -from pydantic import BaseModel, ConfigDict +from pydantic import BaseModel, ConfigDict, model_validator from pydantic.alias_generators import to_camel from app.db.dao.model_dao import Model +from app.models.inferred_schema_dto import SupportedTypes +from app.models.utils import is_none, is_number, is_number_or_string, is_optional_float class ModelType(str, Enum): @@ -28,26 +30,25 @@ class Granularity(str, Enum): MONTH = 'MONTH' -class ColumnDefinition(BaseModel): +class ColumnDefinition(BaseModel, validate_assignment=True): name: str - type: str + type: SupportedTypes def to_dict(self): return self.model_dump() -class OutputType(BaseModel): +class OutputType(BaseModel, validate_assignment=True): prediction: ColumnDefinition prediction_proba: Optional[ColumnDefinition] = None output: List[ColumnDefinition] - model_config = ConfigDict(populate_by_name=True, alias_generator=to_camel) def to_dict(self): return self.model_dump() -class ModelIn(BaseModel): +class ModelIn(BaseModel, validate_assignment=True): name: str description: Optional[str] = None model_type: ModelType @@ -64,6 +65,74 @@ class ModelIn(BaseModel): populate_by_name=True, alias_generator=to_camel, protected_namespaces=() ) + @model_validator(mode='after') + def validate_target(self) -> Self: + checked_model_type: ModelType = self.model_type + match checked_model_type: + case ModelType.BINARY: + if not is_number(self.target.type): + raise ValueError( + f'target must be a number for a ModelType.BINARY, has been provided [{self.target}]' + ) + return self + case ModelType.MULTI_CLASS: + if not is_number_or_string(self.target.type): + raise ValueError( + f'target must be a number or string for a ModelType.MULTI_CLASS, has been provided [{self.target}]' + ) + return self + case ModelType.REGRESSION: + if not is_number(self.target.type): + raise ValueError( + f'target must be a number for a ModelType.REGRESSION, has been provided [{self.target}]' + ) + return self + case _: + raise ValueError('not supported type for model_type') + + @model_validator(mode='after') + def validate_outputs(self) -> Self: + checked_model_type: ModelType = self.model_type + match checked_model_type: + case ModelType.BINARY: + if not is_number(self.outputs.prediction.type): + raise ValueError( + f'prediction must be a number for a ModelType.BINARY, has been provided [{self.outputs.prediction}]' + ) + if not is_optional_float(self.outputs.prediction_proba.type): + raise ValueError( + f'prediction_proba must be an optional float for a ModelType.BINARY, has been provided [{self.outputs.prediction_proba}]' + ) + return self + case ModelType.MULTI_CLASS: + if not is_number_or_string(self.outputs.prediction.type): + raise ValueError( + f'prediction must be a number or string for a ModelType.MULTI_CLASS, has been provided [{self.outputs.prediction}]' + ) + if not is_optional_float(self.outputs.prediction_proba.type): + raise ValueError( + f'prediction_proba must be an optional float for a ModelType.MULTI_CLASS, has been provided [{self.outputs.prediction_proba}]' + ) + return self + case ModelType.REGRESSION: + if not is_number(self.outputs.prediction.type): + raise ValueError( + f'prediction must be a number for a ModelType.REGRESSION, has been provided [{self.outputs.prediction}]' + ) + if not is_none(self.outputs.prediction_proba.type): + raise ValueError( + f'prediction_proba must be None for a ModelType.REGRESSION, has been provided [{self.outputs.prediction_proba}]' + ) + return self + case _: + raise ValueError('not supported type for model_type') + + @model_validator(mode='after') + def timestamp_must_be_datetime(self) -> Self: + if not self.timestamp.type == SupportedTypes.datetime: + raise ValueError('timestamp must be a datetime') + return self + def to_model(self) -> Model: now = datetime.datetime.now(tz=datetime.UTC) return Model( diff --git a/api/app/models/utils.py b/api/app/models/utils.py new file mode 100644 index 00000000..62d19ee9 --- /dev/null +++ b/api/app/models/utils.py @@ -0,0 +1,19 @@ +from typing import Any, Optional + +from app.models.inferred_schema_dto import SupportedTypes + + +def is_number(value: SupportedTypes): + return value in (SupportedTypes.int, SupportedTypes.float) + + +def is_number_or_string(value: SupportedTypes): + return value in (SupportedTypes.int, SupportedTypes.float, SupportedTypes.string) + + +def is_optional_float(value: Optional[SupportedTypes] = None) -> bool: + return value in (None, SupportedTypes.float) + + +def is_none(value: Any) -> bool: + return value is None diff --git a/api/tests/commons/db_mock.py b/api/tests/commons/db_mock.py index bd27b34f..b6adc8d0 100644 --- a/api/tests/commons/db_mock.py +++ b/api/tests/commons/db_mock.py @@ -8,7 +8,15 @@ from app.db.tables.reference_dataset_metrics_table import ReferenceDatasetMetrics from app.db.tables.reference_dataset_table import ReferenceDataset from app.models.job_status import JobStatus -from app.models.model_dto import DataType, Granularity, ModelIn, ModelType +from app.models.model_dto import ( + ColumnDefinition, + DataType, + Granularity, + ModelIn, + ModelType, + OutputType, + SupportedTypes, +) MODEL_UUID = uuid.uuid4() REFERENCE_UUID = uuid.uuid4() @@ -26,7 +34,7 @@ def get_sample_model( features: List[Dict] = [{'name': 'feature1', 'type': 'string'}], outputs: Dict = { 'prediction': {'name': 'pred1', 'type': 'int'}, - 'prediction_proba': {'name': 'prob1', 'type': 'double'}, + 'prediction_proba': {'name': 'prob1', 'type': 'float'}, 'output': [{'name': 'output1', 'type': 'string'}], }, target: Dict = {'name': 'target1', 'type': 'string'}, @@ -59,14 +67,20 @@ def get_sample_model_in( model_type: str = ModelType.BINARY.value, data_type: str = DataType.TEXT.value, granularity: str = Granularity.DAY.value, - features: List[Dict] = [{'name': 'feature1', 'type': 'string'}], - outputs: Dict = { - 'prediction': {'name': 'pred1', 'type': 'int'}, - 'prediction_proba': {'name': 'prob1', 'type': 'double'}, - 'output': [{'name': 'output1', 'type': 'string'}], - }, - target: Dict = {'name': 'target1', 'type': 'string'}, - timestamp: Dict = {'name': 'timestamp', 'type': 'datetime'}, + features: List[ColumnDefinition] = [ + ColumnDefinition(name='feature1', type=SupportedTypes.string) + ], + outputs: OutputType = OutputType( + prediction=ColumnDefinition(name='pred1', type=SupportedTypes.int), + prediction_proba=ColumnDefinition(name='prob1', type=SupportedTypes.float), + output=[ColumnDefinition(name='output1', type=SupportedTypes.string)], + ), + target: ColumnDefinition = ColumnDefinition( + name='target1', type=SupportedTypes.int + ), + timestamp: ColumnDefinition = ColumnDefinition( + name='timestamp', type=SupportedTypes.datetime + ), frameworks: Optional[str] = None, algorithm: Optional[str] = None, ): diff --git a/api/tests/commons/modelin_factory.py b/api/tests/commons/modelin_factory.py new file mode 100644 index 00000000..631b8dc5 --- /dev/null +++ b/api/tests/commons/modelin_factory.py @@ -0,0 +1,68 @@ +from app.models.inferred_schema_dto import SupportedTypes +from app.models.model_dto import ( + ColumnDefinition, + DataType, + Granularity, + ModelType, + OutputType, +) + + +def get_model_sample_wrong(fail_field: str, model_type: ModelType): + prediction = None + prediction_proba = None + if fail_field == 'outputs.prediction' and model_type == ModelType.BINARY: + prediction = ColumnDefinition(name='pred1', type=SupportedTypes.string) + elif fail_field == 'outputs.prediction' and model_type == ModelType.MULTI_CLASS: + prediction = ColumnDefinition(name='pred1', type=SupportedTypes.datetime) + elif fail_field == 'outputs.prediction' and model_type == ModelType.REGRESSION: + prediction = ColumnDefinition(name='pred1', type=SupportedTypes.string) + else: + prediction = ColumnDefinition(name='pred1', type=SupportedTypes.int) + + if ( + fail_field == 'outputs.prediction_proba' + and model_type == ModelType.BINARY + or fail_field == 'outputs.prediction_proba' + and model_type == ModelType.MULTI_CLASS + ): + prediction_proba = ColumnDefinition(name='prob1', type=SupportedTypes.int) + elif ( + fail_field == 'outputs.prediction_proba' and model_type == ModelType.REGRESSION + ): + prediction_proba = ColumnDefinition(name='prob1', type=SupportedTypes.float) + else: + prediction_proba = ColumnDefinition(name='prob1', type=SupportedTypes.float) + + target: ColumnDefinition = None + if fail_field == 'target' and model_type == ModelType.BINARY: + target = ColumnDefinition(name='target1', type=SupportedTypes.string) + elif fail_field == 'target' and model_type == ModelType.MULTI_CLASS: + target = ColumnDefinition(name='target1', type=SupportedTypes.datetime) + elif fail_field == 'target' and model_type == ModelType.REGRESSION: + target = ColumnDefinition(name='target1', type=SupportedTypes.string) + else: + target = ColumnDefinition(name='target1', type=SupportedTypes.int) + + timestamp: ColumnDefinition = None + if fail_field == 'timestamp': + timestamp = ColumnDefinition(name='timestamp', type=SupportedTypes.string) + else: + timestamp = ColumnDefinition(name='timestamp', type=SupportedTypes.datetime) + + return { + 'name': 'model_name', + 'model_type': model_type, + 'data_type': DataType.TEXT, + 'granularity': Granularity.DAY, + 'features': [ColumnDefinition(name='feature1', type=SupportedTypes.string)], + 'outputs': OutputType( + prediction=prediction, + prediction_proba=prediction_proba, + output=[ColumnDefinition(name='output1', type=SupportedTypes.string)], + ), + 'target': target, + 'timestamp': timestamp, + 'frameworks': None, + 'algorithm': None, + } diff --git a/api/tests/validation/__init__.py b/api/tests/validation/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/api/tests/validation/__init__.py @@ -0,0 +1 @@ + diff --git a/api/tests/validation/model_type_validator_test.py b/api/tests/validation/model_type_validator_test.py new file mode 100644 index 00000000..2a743d69 --- /dev/null +++ b/api/tests/validation/model_type_validator_test.py @@ -0,0 +1,106 @@ +from pydantic import ValidationError +import pytest + +from app.models.model_dto import ModelIn, ModelType +from tests.commons.modelin_factory import get_model_sample_wrong + + +def test_timestamp_not_datetime(): + """Tests that timestamp validator fails when timestamp is not valid.""" + with pytest.raises(ValidationError) as excinfo: + model_data = get_model_sample_wrong( + fail_field='timestamp', model_type=ModelType.BINARY + ) + ModelIn.model_validate(ModelIn(**model_data)) + assert 'timestamp must be a datetime' in str(excinfo.value) + + +def test_target_for_binary(): + """Tests that for ModelType.BINARY: target must be a number.""" + with pytest.raises(ValidationError) as excinfo: + model_data = get_model_sample_wrong('target', ModelType.BINARY) + ModelIn.model_validate(ModelIn(**model_data)) + assert 'target must be a number for a ModelType.BINARY' in str(excinfo.value) + + +def test_target_for_multiclass(): + """Tests that for ModelType.MULTI_CLASS: target must be a number or string.""" + with pytest.raises(ValidationError) as excinfo: + model_data = get_model_sample_wrong('target', ModelType.MULTI_CLASS) + ModelIn.model_validate(ModelIn(**model_data)) + assert 'target must be a number or string for a ModelType.MULTI_CLASS' in str( + excinfo.value + ) + + +def test_target_for_regression(): + """Tests that for ModelType.REGRESSION: target must be a number.""" + with pytest.raises(ValidationError) as excinfo: + model_data = get_model_sample_wrong('target', ModelType.REGRESSION) + ModelIn.model_validate(ModelIn(**model_data)) + assert 'target must be a number for a ModelType.REGRESSION' in str(excinfo.value) + + +def test_prediction_for_binary(): + """Tests that for ModelType.BINARY: prediction must be a number.""" + with pytest.raises(ValidationError) as excinfo: + model_data = get_model_sample_wrong('outputs.prediction', ModelType.BINARY) + ModelIn.model_validate(ModelIn(**model_data)) + assert 'prediction must be a number for a ModelType.BINARY' in str(excinfo.value) + + +def test_prediction_for_multiclass(): + """Tests that for ModelType.MULTI_CLASS: prediction must be a number or string.""" + with pytest.raises(ValidationError) as excinfo: + model_data = get_model_sample_wrong('outputs.prediction', ModelType.MULTI_CLASS) + ModelIn.model_validate(ModelIn(**model_data)) + assert 'prediction must be a number or string for a ModelType.MULTI_CLASS' in str( + excinfo.value + ) + + +def test_prediction_for_regression(): + """Tests that for ModelType.REGRESSION: prediction must be a number.""" + with pytest.raises(ValidationError) as excinfo: + model_data = get_model_sample_wrong('outputs.prediction', ModelType.REGRESSION) + ModelIn.model_validate(ModelIn(**model_data)) + assert 'prediction must be a number for a ModelType.REGRESSION' in str( + excinfo.value + ) + + +def test_prediction_proba_for_binary(): + """Tests that for ModelType.BINARY: prediction_proba must be a number.""" + with pytest.raises(ValidationError) as excinfo: + model_data = get_model_sample_wrong( + 'outputs.prediction_proba', ModelType.BINARY + ) + ModelIn.model_validate(ModelIn(**model_data)) + assert 'prediction_proba must be an optional float for a ModelType.BINARY' in str( + excinfo.value + ) + + +def test_prediction_proba_for_multiclass(): + """Tests that for ModelType.MULTI_CLASS: prediction_proba must be a number.""" + with pytest.raises(ValidationError) as excinfo: + model_data = get_model_sample_wrong( + 'outputs.prediction_proba', ModelType.MULTI_CLASS + ) + ModelIn.model_validate(ModelIn(**model_data)) + assert ( + 'prediction_proba must be an optional float for a ModelType.MULTI_CLASS' + in str(excinfo.value) + ) + + +def test_prediction_proba_for_regression(): + """Tests that for ModelType.REGRESSION: prediction_proba must be None.""" + with pytest.raises(ValidationError) as excinfo: + model_data = get_model_sample_wrong( + 'outputs.prediction_proba', ModelType.REGRESSION + ) + ModelIn.model_validate(ModelIn(**model_data)) + assert 'prediction_proba must be None for a ModelType.REGRESSION' in str( + excinfo.value + ) diff --git a/tools/update-authors.sh b/tools/update-authors.sh new file mode 100644 index 00000000..4063a4c5 --- /dev/null +++ b/tools/update-authors.sh @@ -0,0 +1,32 @@ +#!/bin/sh +#Update AUTHORS.md based on git history. + +git log --reverse --format='%aN (<%aE>)' | perl -we ' +BEGIN { + %seen = (), @authors = (); +} +while (<>) { + next if $seen{$_}; + next if /(dependabot\[bot\])/; + next if /(devops\@radicalbit.ai)/; + $seen{$_} = push @authors, "- ", $_; +} +END { + print "\n\n"; + print "Contributors to radicalbit-ai-monitoring\n"; + print "============================\n"; + print "radicalbit-ai-monitoring is developed and maintained by a community of people interested in providing a comprehensive solution for monitoring your Machine Learning and Large Language Models in production.\n\n\n"; + print "

\n"; + print "\n"; + print "

\n\n\n"; + print "Contributors (ordered by first contribution.)\n"; + print "-------------------------------------\n"; + print "[Full List of Contributors](https://github.com/radicalbit/radicalbit-ai-monitoring/graphs/contributors)\n\n"; + print @authors, "\n"; + print " "; + print "#### Generated by tools/update-authors.sh.\n"; +} +' > AUTHORS.md \ No newline at end of file