diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 9596204..dc933ba 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -1,13 +1,14 @@ -FROM adrienaury/go-devcontainer:v2.0 +FROM adrienaury/go-devcontainer:v3.1 USER root RUN apk add --update --progress --no-cache make gomplate -ARG VERSION_GOLICENSE=0.2.0 ARG VERSION_MILLER=6.2.0 -RUN wget -nv -O- https://github.com/mitchellh/golicense/releases/download/v${VERSION_GOLICENSE}/golicense_${VERSION_GOLICENSE}_linux_x86_64.tar.gz | tar xz -C /usr/bin golicense \ - && wget -nv -O- https://github.com/johnkerl/miller/releases/download/v${VERSION_MILLER}/miller-${VERSION_MILLER}-linux-amd64.tar.gz | tar xz --strip-components 1 -C /usr/bin miller-${VERSION_MILLER}-linux-amd64/mlr \ - && chmod +x /usr/bin/golicense /usr/bin/mlr +RUN wget -nv -O- https://github.com/johnkerl/miller/releases/download/v${VERSION_MILLER}/miller-${VERSION_MILLER}-linux-amd64.tar.gz | tar xz --strip-components 1 -C /usr/bin miller-${VERSION_MILLER}-linux-amd64/mlr \ + && chmod +x /usr/bin/mlr + +ARG VERSION_PIMO=1.19.0 +RUN wget -O- https://github.com/CGI-FR/PIMO/releases/download/v${VERSION_PIMO}/pimo_${VERSION_PIMO}_linux_amd64.tar.gz | tar xz -C /usr/bin pimo USER vscode diff --git a/.devcontainer/Dockerfile.ci b/.devcontainer/Dockerfile.ci index af9cf4f..a1d30e6 100644 --- a/.devcontainer/Dockerfile.ci +++ b/.devcontainer/Dockerfile.ci @@ -1,11 +1,9 @@ -FROM adrienaury/go-devcontainer-ci:v2.0 +FROM adrienaury/go-devcontainer-ci:v3.1 USER root RUN apk add --update --progress --no-cache make gomplate -ARG VERSION_GOLICENSE=0.2.0 ARG VERSION_MILLER=6.2.0 -RUN wget -nv -O- https://github.com/mitchellh/golicense/releases/download/v${VERSION_GOLICENSE}/golicense_${VERSION_GOLICENSE}_linux_x86_64.tar.gz | tar xz -C /usr/bin golicense \ - && wget -nv -O- https://github.com/johnkerl/miller/releases/download/v${VERSION_MILLER}/miller-${VERSION_MILLER}-linux-amd64.tar.gz | tar xz --strip-components 1 -C /usr/bin 
miller-${VERSION_MILLER}-linux-amd64/mlr \ - && chmod +x /usr/bin/golicense /usr/bin/mlr +RUN wget -nv -O- https://github.com/johnkerl/miller/releases/download/v${VERSION_MILLER}/miller-${VERSION_MILLER}-linux-amd64.tar.gz | tar xz --strip-components 1 -C /usr/bin miller-${VERSION_MILLER}-linux-amd64/mlr \ + && chmod +x /usr/bin/mlr diff --git a/CHANGELOG.md b/CHANGELOG.md index 0a924f1..333bab7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,10 @@ Types of changes - `Fixed` for any bug fixes. - `Security` in case of vulnerabilities. +## [0.2.0] + +- `Added` configuration file with `metrics[].exclude`, `metrics[].coherentWith` and `metrics[].constraints` parameters. + ## [0.1.0] -- `Added` First official version. +- `Added` first official version. diff --git a/README.md b/README.md index af2fbdc..269a2c9 100644 --- a/README.md +++ b/README.md @@ -8,26 +8,66 @@ # MIMO : Masked Input Metrics Output +Measure the quality of a pseudonymization transformation by masking. + +MIMO will compute the following indicators for each column : + +- masking rate : percentage of values actually masked, ignoring null or missing values in real data +- coherent rate : percentage of real unique values that are masked coherently (the same single pseudonym is used for each distinct real value) +- identifiant rate : percentage of unique pseudonyms that are attributed to a single real value + +The result is an HTML report that contains the computed indicators for each column. 
+ +![MIMO Report](docs/MIMO-report.png) + ## Usage +### Real time usage + ```console > mkfifo real.jsonl # create a pipe file to store the real json stream before pseudonymization > lino pull prod | tee real.jsonl | pimo | mimo real.jsonl | lino push dev 8:27AM WRN field is not completely masked fieldname=surname - - MIMO REPORT -=========================================== -fieldname | masking rate | collision rate | -----------|--------------|----------------| -name | 100 % | 0 % | -surname | 99 % | 0 % | > rm real.jsonl # pipe file can be removed after ``` +Here is a single command that runs an example on synthesized data (requires PIMO) : + ```bash pimo --empty-input --repeat 1000 --mask 'name=[{add:""},{randomChoiceInUri:"pimo://nameFR"}]' | tee real.jsonl | pimo --mask 'name={randomChoiceInUri:"pimo://nameFR"}' | mimo real.jsonl ``` +### After process usage + +MIMO can also be used on existing files on disk. + +```console +> cat masked.jsonl | mimo real.jsonl +``` + +### Configuration + +Here is an example configuration file. + +```yaml +version: "1" +metrics: + - name: "name" # required : name of the column to configure + exclude: [""] # optional : these values will be ignored during computation of the masking rate + coherentWith: ["name"] # optional : which values to use for the computation of the coherent rate + constraints: # optional : list of constraints to validate at the end of the execution + maskingRate: + shouldEqual: 1 + coherentRate: + shouldBeGreaterThan: 0.5 +``` + +You need to inform MIMO of this configuration file with the `--config` flag : + +```console +> cat masked.jsonl | mimo --config myconfig.yaml real.jsonl +``` + ## Contributing Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change. 
diff --git a/build.yml b/build.yml index b6ab76d..6c422a0 100644 --- a/build.yml +++ b/build.yml @@ -51,6 +51,7 @@ properties: "ifshort", "nosnakecase", "exhaustivestruct", + "depguard", ] snapshot: false # If true, do not upload release when publish target is used dockerfiles: # List of Dockerfiles to build, defined by a map of {key=Dockerfile name ; value=path to build context}, the image name will be determined by the extension of the Dockerfile diff --git a/cmd/mimo/main.go b/cmd/mimo/main.go index 2f5b9df..885d1e9 100644 --- a/cmd/mimo/main.go +++ b/cmd/mimo/main.go @@ -44,6 +44,8 @@ var ( jsonlog bool debug bool colormode string + + configfile string ) func main() { @@ -84,6 +86,7 @@ There is NO WARRANTY, to the extent permitted by law.`, version, commit, buildDa rootCmd.PersistentFlags().BoolVar(&debug, "debug", false, "add debug information to logs (very slow)") rootCmd.PersistentFlags().BoolVar(&jsonlog, "log-json", false, "output logs in JSON format") rootCmd.PersistentFlags().StringVar(&colormode, "color", "auto", "use colors in log outputs : yes, no or auto") + rootCmd.PersistentFlags().StringVar(&configfile, "config", "", "name of the YAML configuration file to use") if err := rootCmd.Execute(); err != nil { log.Err(err).Msg("error when executing command") @@ -100,6 +103,17 @@ func run(_ *cobra.Command, realJSONLineFileName string) { } driver := mimo.NewDriver(realReader, maskedReader, infra.SubscriberLogger{}) + + if configfile != "" { + if config, err := infra.LoadConfig(configfile); err != nil { + log.Fatal().Err(err).Msg("end MIMO") + } else { + driver.Configure(config) + } + } + + haserror := false + if report, err := driver.Analyze(); err != nil { log.Error().Err(err).Msg("end of program") } else { @@ -107,19 +121,37 @@ func run(_ *cobra.Command, realJSONLineFileName string) { sort.Strings(columns) for _, colname := range columns { metrics := report.ColumnMetric(colname) - log.Info(). - Str("field", colname). 
- Int64("count-nil", metrics.NilCount). - Int64("count-empty", metrics.EmptyCount). - Int64("count-masked", metrics.MaskedCount). - Int64("count-missed", metrics.NonMaskedCount()). - Float64("rate-masking", metrics.MaskedRate()). - Float64("rate-coherence", metrics.Coherence.Rate()). - Float64("rate-identifiable", metrics.Identifiant.Rate()). - Msg("summmary for column " + colname) + if metrics.Validate() >= 0 { + log.Info(). + Str("field", colname). + Int64("count-nil", metrics.NilCount). + Int64("count-empty", metrics.EmptyCount). + Int64("count-masked", metrics.MaskedCount). + Int64("count-missed", metrics.NonMaskedCount()). + Float64("rate-masking", metrics.MaskedRate()). + Float64("rate-coherence", metrics.Coherence.Rate()). + Float64("rate-identifiable", metrics.Identifiant.Rate()). + Msg("summmary for column " + colname) + } else { + log.Error(). + Str("field", colname). + Int64("count-nil", metrics.NilCount). + Int64("count-empty", metrics.EmptyCount). + Int64("count-masked", metrics.MaskedCount). + Int64("count-missed", metrics.NonMaskedCount()). + Float64("rate-masking", metrics.MaskedRate()). + Float64("rate-coherence", metrics.Coherence.Rate()). + Float64("rate-identifiable", metrics.Identifiant.Rate()). 
+ Msg("summmary for column " + colname) + haserror = true + } } _ = infra.NewReportExporter().Export(report, "report.html") } + + if haserror { + os.Exit(1) + } } func initLog() { diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000..3576d6e --- /dev/null +++ b/config.yaml @@ -0,0 +1,10 @@ +version: "1" +metrics: + - name: "name" + exclude: [""] + coherentWith: ["name"] + constraints: + maskingRate: + shouldEqual: 1 + coherentRate: + shouldBeGreaterThan: 0.5 diff --git a/docs/MIMO-report.png b/docs/MIMO-report.png new file mode 100644 index 0000000..2c28a36 Binary files /dev/null and b/docs/MIMO-report.png differ diff --git a/go.mod b/go.mod index 7b9b9d1..c4781c3 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/cgi-fr/mimo -go 1.20 +go 1.21 require ( github.com/Masterminds/sprig/v3 v3.2.3 @@ -8,6 +8,7 @@ require ( github.com/rs/zerolog v1.28.0 github.com/spf13/cobra v1.7.0 github.com/stretchr/testify v1.5.1 + gopkg.in/yaml.v3 v3.0.1 ) require ( diff --git a/go.sum b/go.sum index 816e45a..61cdcc1 100644 --- a/go.sum +++ b/go.sum @@ -82,4 +82,5 @@ gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8 gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.3.0 h1:clyUAQHOM3G0M3f5vQj7LuJrETvjVot3Z5el9nffUtU= gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/internal/infra/config_loader.go b/internal/infra/config_loader.go new file mode 100644 index 0000000..3b63122 --- /dev/null +++ b/internal/infra/config_loader.go @@ -0,0 +1,131 @@ +// Copyright (C) 2023 CGI France +// +// This file is part of MIMO. 
+// +// MIMO is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// MIMO is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with MIMO. If not, see <http://www.gnu.org/licenses/>. + +package infra + +import ( + "fmt" + "os" + + "github.com/cgi-fr/mimo/pkg/mimo" + "github.com/rs/zerolog/log" + "gopkg.in/yaml.v3" +) + +// Version of the YAML structure. +const Version string = "1" + +// YAMLStructure of the file. +type YAMLStructure struct { + Version string `yaml:"version"` + Columns []YAMLColumn `yaml:"metrics,omitempty"` +} + +// YAMLColumn defines how to store a column config in YAML format. 
+type YAMLColumn struct { + Name string `yaml:"name"` + Exclude []any `yaml:"exclude,omitempty"` + CoherentWith []string `yaml:"coherentWith,omitempty"` + Constraints map[string]YAMLConstraint `yaml:"constraints,omitempty"` +} + +type YAMLConstraint map[string]float64 + +func LoadConfig(filename string) (mimo.Config, error) { + config := &YAMLStructure{ + Version: Version, + Columns: []YAMLColumn{}, + } + + if _, err := os.Stat(filename); os.IsNotExist(err) { + return mimo.NewConfig(), fmt.Errorf("%w: %s", ErrConfigFileNotExists, filename) + } + + log.Debug().Str("file", filename).Msg("loading config from file") + + dat, err := os.ReadFile(filename) + if err != nil { + return mimo.NewConfig(), fmt.Errorf("%w: %s", err, filename) + } + + err = yaml.Unmarshal(dat, config) + if err != nil { + return mimo.NewConfig(), fmt.Errorf("%w: %s", err, filename) + } + + if config.Version != Version { + return mimo.NewConfig(), fmt.Errorf("%w: %s", ErrConfigInvalidVersion, filename) + } + + return CreateConfig(config) +} + +//nolint:cyclop +func CreateConfig(yamlconfig *YAMLStructure) (mimo.Config, error) { + config := mimo.NewConfig() + + for _, yamlcolumn := range yamlconfig.Columns { + column := mimo.ColumnConfig{ + Exclude: yamlcolumn.Exclude, + CoherentWith: yamlcolumn.CoherentWith, + Constraints: []mimo.Constraint{}, + } + + for target, yamlconstraint := range yamlcolumn.Constraints { + for constraintType, value := range yamlconstraint { + constraint := mimo.Constraint{ + Target: 0, + Type: 0, + Value: value, + } + + switch target { + case "maskingRate": + constraint.Target = mimo.MaskingRate + case "coherentRate": + constraint.Target = mimo.CohenrentRate + case "identifiantRate": + constraint.Target = mimo.IdentifiantRate + default: + return config, fmt.Errorf("%w: %s", ErrConfigInvalidConstraintTarget, target) + } + + switch constraintType { + case "shouldEqual": + constraint.Type = mimo.ShouldEqual + case "shouldBeGreaterThan": + constraint.Type = 
mimo.ShouldBeGreaterThan + case "shouldBeGreaterThanOrEqualTo": + constraint.Type = mimo.ShouldBeGreaterThanOrEqualTo + case "shouldBeLowerThan": + constraint.Type = mimo.ShouldBeLowerThan + case "shouldBeLessThanOrEqualTo": + constraint.Type = mimo.ShouldBeLessThanOrEqualTo + default: + return config, fmt.Errorf("%w: %s", ErrConfigInvalidConstraintType, constraintType) + } + + column.Constraints = append(column.Constraints, constraint) + } + } + + config.ColumnNames = append(config.ColumnNames, yamlcolumn.Name) + config.ColumnConfigs[yamlcolumn.Name] = column + } + + return config, nil +} diff --git a/internal/infra/errors.go b/internal/infra/errors.go new file mode 100644 index 0000000..8133f02 --- /dev/null +++ b/internal/infra/errors.go @@ -0,0 +1,34 @@ +// Copyright (C) 2023 CGI France +// +// This file is part of MIMO. +// +// MIMO is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// MIMO is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with MIMO. If not, see <http://www.gnu.org/licenses/>. + +package infra + +import "errors" + +var ( + // ErrConfigFileNotExists is returned when a config file doesn't exist. + ErrConfigFileNotExists = errors.New("error config file does not exist") + + // ErrConfigInvalidVersion is returned when a config file has an invalid version. + ErrConfigInvalidVersion = errors.New("invalid version in config file") + + // ErrConfigInvalidConstraintType is returned when a config file has an invalid constraint type. 
+ ErrConfigInvalidConstraintType = errors.New("invalid constraint type in config file") + + // ErrConfigInvalidConstraintTarget is returned when a config file has an invalid constraint target. + ErrConfigInvalidConstraintTarget = errors.New("invalid constraint target in config file") +) diff --git a/internal/infra/subscriber_log.go b/internal/infra/subscriber_log.go index 729bb72..bc47cb8 100644 --- a/internal/infra/subscriber_log.go +++ b/internal/infra/subscriber_log.go @@ -25,6 +25,6 @@ func (sl SubscriberLogger) NewField(fieldname string) { log.Info().Str("name", fieldname).Msg("new field") } -func (sl SubscriberLogger) FirstNonMaskedValue(fieldname string, value any) { +func (sl SubscriberLogger) FirstNonMaskedValue(fieldname string, _ any) { log.Info().Str("name", fieldname).Msg("unmasked value detected") } diff --git a/internal/infra/template/default.html b/internal/infra/template/default.html index 8c65fe1..fce023f 100644 --- a/internal/infra/template/default.html +++ b/internal/infra/template/default.html @@ -28,9 +28,9 @@

MIMO Report

{{ $value.EmptyCount }} {{ $value.MaskedCount }} {{ $value.NonMaskedCount }} - {{ $value.MaskedRate | mulf 100.00 | printf "%0.02f" }} % - {{ $value.Coherence.Rate | mulf 100.00 | printf "%0.02f" }} % - {{ $value.Identifiant.Rate | mulf 100.00 | printf "%0.02f" }} % + {{ $value.MaskedRate | mulf 100.00 | printf "%0.02f" }} % + {{ $value.Coherence.Rate | mulf 100.00 | printf "%0.02f" }} % + {{ $value.Identifiant.Rate | mulf 100.00 | printf "%0.02f" }} % {{ $value.K }} {{ end }} diff --git a/pkg/mimo/driver.go b/pkg/mimo/driver.go index 9f94c2c..6d937fb 100644 --- a/pkg/mimo/driver.go +++ b/pkg/mimo/driver.go @@ -35,11 +35,15 @@ func NewDriver(realReader DataRowReader, maskedReader DataRowReader, subs ...Eve realDataSource: realReader, maskDataSource: maskedReader, subscribers: subs, - report: NewReport(subs), + report: NewReport(subs, NewConfig()), } } -func (d Driver) Analyze() (Report, error) { +func (d *Driver) Configure(c Config) { + d.report.config = c +} + +func (d *Driver) Analyze() (Report, error) { for { realRow, err := d.realDataSource.ReadDataRow() if err != nil { diff --git a/pkg/mimo/model.go b/pkg/mimo/model.go index fd6d854..f3b20ad 100644 --- a/pkg/mimo/model.go +++ b/pkg/mimo/model.go @@ -20,7 +20,9 @@ package mimo import ( "encoding/json" "fmt" + "slices" "strconv" + "strings" ) type DataRow map[string]any @@ -40,15 +42,16 @@ func (subs Suscribers) PostFirstNonMaskedValue(fieldname string, value any) { } type Metrics struct { - TotalCount int64 // TotalCount is the number of values analyzed - NilCount int64 // NilCount is the number of null values in real data - EmptyCount int64 // EmptyCount is the number of empty values in real data (empty string or numbers at 0 value) - MaskedCount int64 // MaskedCount is the number of non-blank real values masked - Coherence Multimap // Coherence is a multimap used to compute the coherence rate - Identifiant Multimap // Identifiant is a multimap used to compute the identifiable rate + TotalCount int64 // 
TotalCount is the number of values analyzed + NilCount int64 // NilCount is the number of null values in real data + EmptyCount int64 // EmptyCount is the number of empty values in real data (empty string or numbers at 0 value) + MaskedCount int64 // MaskedCount is the number of non-blank real values masked + Coherence Multimap // Coherence is a multimap used to compute the coherence rate + Identifiant Multimap // Identifiant is a multimap used to compute the identifiable rate + Constraints []Constraint // Constraints is the set of rules to validate } -func NewMetrics() Metrics { +func NewMetrics(constraints ...Constraint) Metrics { return Metrics{ TotalCount: 0, NilCount: 0, @@ -56,10 +59,18 @@ func NewMetrics() Metrics { MaskedCount: 0, Coherence: Multimap{}, Identifiant: Multimap{}, + Constraints: constraints, } } -func (m *Metrics) Update(fieldname string, realValue any, maskedValue any, subs Suscribers) bool { +func (m *Metrics) Update( + fieldname string, + realValue any, + maskedValue any, + coherenceValue []any, + subs Suscribers, + config ColumnConfig, +) bool { nonBlankCount := m.NonBlankCount() realValueStr, realValueOk := toString(realValue) @@ -71,7 +82,7 @@ func (m *Metrics) Update(fieldname string, realValue any, maskedValue any, subs m.TotalCount++ - m.Coherence.Add(realValueStr, maskedValueStr) + m.Coherence.Add(toStringSlice(coherenceValue), maskedValueStr) m.Identifiant.Add(maskedValueStr, realValueStr) if realValue == nil { @@ -80,6 +91,12 @@ func (m *Metrics) Update(fieldname string, realValue any, maskedValue any, subs return true } + if slices.Contains(config.Exclude, realValue) { + m.EmptyCount++ + + return true + } + if realValueOk && maskedValueOk { if realValueStr != maskedValueStr { m.MaskedCount++ @@ -119,25 +136,127 @@ func (m Metrics) MaskedRate() float64 { return float64(m.MaskedCount) / float64(m.NonBlankCount()) } +// MaskedRateValidate returns : +// - -1 if at least one constraint fail on the MaskedRate, +// - 0 if no constraint 
exist on the MaskedRate, +// - 1 if all constraints succeed on the MaskedRate, +func (m Metrics) MaskedRateValidate() int { + result := 0 + + for _, constraint := range m.Constraints { + if constraint.Target == MaskingRate { + if !validate(constraint.Type, constraint.Value, m.MaskedRate()) { + return -1 + } + + result = 1 + } + } + + return result +} + +// CoherenceRateValidate returns : +// - -1 if at least one constraint fail on the CoherenceRate, +// - 0 if no constraint exist on the CoherenceRate, +// - 1 if all constraints succeed on the CoherenceRate, +func (m Metrics) CoherenceRateValidate() int { + result := 0 + + for _, constraint := range m.Constraints { + if constraint.Target == CohenrentRate { + if !validate(constraint.Type, constraint.Value, m.Coherence.Rate()) { + return -1 + } + + result = 1 + } + } + + return result +} + +// IdentifiantRateValidate returns : +// - -1 if at least one constraint fail on the IdentifiantRate, +// - 0 if no constraint exist on the IdentifiantRate, +// - 1 if all constraints succeed on the IdentifiantRate, +func (m Metrics) IdentifiantRateValidate() int { + result := 0 + + for _, constraint := range m.Constraints { + if constraint.Target == IdentifiantRate { + if !validate(constraint.Type, constraint.Value, m.Identifiant.Rate()) { + return -1 + } + + result = 1 + } + } + + return result +} + +// Validate returns : +// - -1 if at least one constraint fail, +// - 0 if no constraint exist, +// - 1 if all constraints succeed , +func (m Metrics) Validate() int { + resultMaskedRate := m.MaskedRateValidate() + if resultMaskedRate < 0 { + return -1 + } + + resultCoherentRate := m.CoherenceRateValidate() + if resultCoherentRate < 0 { + return -1 + } + + resultIdentifiantRate := m.IdentifiantRateValidate() + if resultIdentifiantRate < 0 { + return -1 + } + + if resultMaskedRate > 0 || resultCoherentRate > 0 || resultIdentifiantRate > 0 { + return 1 + } + + return 0 +} + type Report struct { Metrics map[string]Metrics subs 
Suscribers + config Config } -func NewReport(subs []EventSubscriber) Report { - return Report{make(map[string]Metrics), subs} +func NewReport(subs []EventSubscriber, config Config) Report { + return Report{make(map[string]Metrics), subs, config} } func (r Report) Update(realRow DataRow, maskedRow DataRow) { for key, realValue := range realRow { metrics, exists := r.Metrics[key] if !exists { - metrics = NewMetrics() + metrics = NewMetrics(r.config.ColumnConfigs[key].Constraints...) r.subs.PostNewField(key) } - if metrics.Update(key, realValue, maskedRow[key], r.subs) { + config := NewDefaultColumnConfig(key) + if cfg, ok := r.config.ColumnConfigs[key]; ok { + config = cfg + } + + coherenceValues := make([]any, len(config.CoherentWith)) + for i, coherentColumn := range config.CoherentWith { + coherenceValues[i] = realRow[coherentColumn] + } + + if len(coherenceValues) == 0 { + coherenceValues = []any{realValue} + } + + if metrics.Update(key, realValue, maskedRow[key], coherenceValues, r.subs, config) { r.Metrics[key] = metrics } } @@ -173,3 +292,34 @@ func toString(value any) (string, bool) { return str, true } + +func toStringSlice(values []any) string { + result := &strings.Builder{} + + for _, value := range values { + if str, ok := toString(value); ok { + result.WriteString(str) + } + + result.WriteString("_") + } + + return result.String() +} + +func validate(constraint ConstraintType, reference float64, value float64) bool { + switch constraint { + case ShouldEqual: + return value == reference + case ShouldBeGreaterThan: + return value > reference + case ShouldBeGreaterThanOrEqualTo: + return value >= reference + case ShouldBeLowerThan: + return value < reference + case ShouldBeLessThanOrEqualTo: + return value <= reference + default: + return false + } +} diff --git a/pkg/mimo/model_config.go b/pkg/mimo/model_config.go new file mode 100644 index 0000000..a53c1e4 --- /dev/null +++ b/pkg/mimo/model_config.go @@ -0,0 +1,68 @@ +// Copyright (C) 2023 CGI France +// 
+// +// This file is part of MIMO. +// +// MIMO is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// MIMO is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with MIMO. If not, see <http://www.gnu.org/licenses/>. + +package mimo + +type Config struct { + ColumnNames []string + ColumnConfigs map[string]ColumnConfig +} + +type ColumnConfig struct { + Exclude []any // exclude values from the masking rate computation (default: exclude only nil values) + CoherentWith []string // list of fields from which the coherent rate is computed (default: the current field) + Constraints []Constraint // list of constraints to validate +} + +type Constraint struct { + Target ConstraintTarget + Type ConstraintType + Value float64 +} + +type ConstraintTarget int + +const ( + MaskingRate ConstraintTarget = iota + CohenrentRate + IdentifiantRate +) + +type ConstraintType int + +const ( + ShouldEqual ConstraintType = iota + ShouldBeGreaterThan + ShouldBeGreaterThanOrEqualTo + ShouldBeLowerThan + ShouldBeLessThanOrEqualTo +) + +func NewConfig() Config { + return Config{ + ColumnNames: []string{}, + ColumnConfigs: map[string]ColumnConfig{}, + } +} + +func NewDefaultColumnConfig(columnname string) ColumnConfig { + return ColumnConfig{ + Exclude: []any{}, + CoherentWith: []string{columnname}, + Constraints: []Constraint{}, + } +} diff --git a/report-example.html b/report-example.html deleted file mode 100644 index f70ecc6..0000000 --- a/report-example.html +++ /dev/null @@ -1,45 +0,0 @@ - - - - - MIMO Report - - - - - -

MIMO Report yyyy-mm-ddThh:mm:ss

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
FieldCountsMasking RateCollision Rate
nameNil: 0 ; Preserved: 0 ; Masked: 100100.00 %5.00 %
surnameNil: 0 ; Preserved: 5 ; Masked: 9595.00 %0.00 %
ageNil: 15 ; Preserved: 12 ; Masked: 7385.88 %0.00 %
nationalityNil: 0 ; Preserved: 100 ; Masked: 00.00 %0.00 %
- - diff --git a/test/configs/config_coherent.yaml b/test/configs/config_coherent.yaml new file mode 100644 index 0000000..691eff1 --- /dev/null +++ b/test/configs/config_coherent.yaml @@ -0,0 +1,4 @@ +version: "1" +metrics: + - name: "value" + coherentWith: ["coherent"] diff --git a/test/configs/config_coherent2.yaml b/test/configs/config_coherent2.yaml new file mode 100644 index 0000000..815b258 --- /dev/null +++ b/test/configs/config_coherent2.yaml @@ -0,0 +1,4 @@ +version: "1" +metrics: + - name: "value" + coherentWith: ["value", "coherent"] diff --git a/test/configs/config_constraints_1.yaml b/test/configs/config_constraints_1.yaml new file mode 100644 index 0000000..014aafd --- /dev/null +++ b/test/configs/config_constraints_1.yaml @@ -0,0 +1,10 @@ +version: "1" +metrics: + - name: "value" + constraints: + maskingRate: + shouldEqual: 0.8 + coherentRate: + shouldBeGreaterThan: 0.5 #0.6 + identifiantRate: + shouldBeGreaterThan: 0.5 #0.8 diff --git a/test/configs/config_constraints_2.yaml b/test/configs/config_constraints_2.yaml new file mode 100644 index 0000000..9aa5f1f --- /dev/null +++ b/test/configs/config_constraints_2.yaml @@ -0,0 +1,10 @@ +version: "1" +metrics: + - name: "value" + constraints: + maskingRate: + shouldEqual: 1 + coherentRate: + shouldBeGreaterThan: 0.6 #0.6 + identifiantRate: + shouldBeGreaterThanOrEqualTo: 0.8 #0.8 diff --git a/test/configs/config_exclude.yaml b/test/configs/config_exclude.yaml new file mode 100644 index 0000000..569bb49 --- /dev/null +++ b/test/configs/config_exclude.yaml @@ -0,0 +1,4 @@ +version: "1" +metrics: + - name: "value" + exclude: ["C"] diff --git a/test/suites/report.html b/test/reports/report_1.html similarity index 100% rename from test/suites/report.html rename to test/reports/report_1.html diff --git a/report.html b/test/reports/report_2.html similarity index 74% rename from report.html rename to test/reports/report_2.html index 24e8795..6c780c0 100644 --- a/report.html +++ b/test/reports/report_2.html @@ 
-18,18 +18,20 @@

MIMO Report

Masking Rate Coherent Rate Identifiable Rate + K - name + value 0 - 0 - 1000 + 2 + 8 0 100.00 % - 54.04 % - 55.09 % + 60.00 % + 80.00 % + 1 diff --git a/test/reports/report_3.html b/test/reports/report_3.html new file mode 100644 index 0000000..ce0fad8 --- /dev/null +++ b/test/reports/report_3.html @@ -0,0 +1,52 @@ + + + + + MIMO Report + + + + +

MIMO Report

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
FieldNilEmptyMaskedMissedMasking RateCoherent RateIdentifiable RateK
coherent00100100.00 %100.00 %0.00 %2
value008280.00 %0.00 %80.00 %1
+ + diff --git a/test/reports/report_4.html b/test/reports/report_4.html new file mode 100644 index 0000000..8a7130b --- /dev/null +++ b/test/reports/report_4.html @@ -0,0 +1,52 @@ + + + + + MIMO Report + + + + +

MIMO Report

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
FieldNilEmptyMaskedMissedMasking RateCoherent RateIdentifiable RateK
coherent00100100.00 %100.00 %0.00 %2
value008280.00 %100.00 %80.00 %1
+ + diff --git a/test/reports/report_5.html b/test/reports/report_5.html new file mode 100644 index 0000000..d6fdee4 --- /dev/null +++ b/test/reports/report_5.html @@ -0,0 +1,40 @@ + + + + + MIMO Report + + + + +

MIMO Report

+ + + + + + + + + + + + + + + + + + + + + + + + + + + +
FieldNilEmptyMaskedMissedMasking RateCoherent RateIdentifiable RateK
value008280.00 %60.00 %80.00 %1
+ + diff --git a/test/reports/report_6.html b/test/reports/report_6.html new file mode 100644 index 0000000..0f493ce --- /dev/null +++ b/test/reports/report_6.html @@ -0,0 +1,40 @@ + + + + + MIMO Report + + + + +

MIMO Report

+ + + + + + + + + + + + + + + + + + + + + + + + + + + +
FieldNilEmptyMaskedMissedMasking RateCoherent RateIdentifiable RateK
value008280.00 %60.00 %80.00 %1
+ + diff --git a/test/suites/02-validate-metrics.yml b/test/suites/02-validate-metrics.yml index 1f59cc1..9a36bba 100644 --- a/test/suites/02-validate-metrics.yml +++ b/test/suites/02-validate-metrics.yml @@ -36,3 +36,112 @@ testcases: assertions: - result.code ShouldEqual 0 - result.systemerr ShouldContainSubstring value count-empty=0 count-masked=8 count-missed=2 count-nil=1 field=value rate-coherence=0.6666666666666666 rate-identifiable=0.6 rate-masking=0.8 + + - script: mv report.html ../reports/report_1.html + + - name: config exclude values + steps: + - script: echo '{"value":"A"}' > working/real.jsonl + - script: echo '{"value":"A"}' >> working/real.jsonl + - script: echo '{"value":"B"}' >> working/real.jsonl + - script: echo '{"value":"B"}' >> working/real.jsonl + - script: echo '{"value":"C"}' >> working/real.jsonl + - script: echo '{"value":"C"}' >> working/real.jsonl + - script: echo '{"value":"D"}' >> working/real.jsonl + - script: echo '{"value":"D"}' >> working/real.jsonl + - script: echo '{"value":"E"}' >> working/real.jsonl + - script: echo '{"value":"E"}' >> working/real.jsonl + + - script: echo '{"value":"X"}' > working/masked.jsonl + - script: echo '{"value":"Y"}' >> working/masked.jsonl + - script: echo '{"value":"C"}' >> working/masked.jsonl + - script: echo '{"value":"C"}' >> working/masked.jsonl + - script: echo '{"value":"C"}' >> working/masked.jsonl + - script: echo '{"value":"C"}' >> working/masked.jsonl + - script: echo '{"value":"W"}' >> working/masked.jsonl + - script: echo '{"value":"W"}' >> working/masked.jsonl + - script: echo '{"value":"V"}' >> working/masked.jsonl + - script: echo '{"value":"C"}' >> working/masked.jsonl + + - script: cat working/masked.jsonl | mimo --config ../configs/config_exclude.yaml -v3 working/real.jsonl + assertions: + - result.code ShouldEqual 0 + - result.systemerr ShouldContainSubstring value count-empty=2 count-masked=8 count-missed=0 count-nil=0 field=value rate-coherence=0.6 rate-identifiable=0.8 
rate-masking=1 + + - script: mv report.html ../reports/report_2.html + + - name: config coherence source + steps: + - script: echo '{"value":"A", "coherent":"1"}' > working/real.jsonl + - script: echo '{"value":"A", "coherent":"2"}' >> working/real.jsonl + - script: echo '{"value":"B", "coherent":"1"}' >> working/real.jsonl + - script: echo '{"value":"B", "coherent":"2"}' >> working/real.jsonl + - script: echo '{"value":"C", "coherent":"1"}' >> working/real.jsonl + - script: echo '{"value":"C", "coherent":"2"}' >> working/real.jsonl + - script: echo '{"value":"D", "coherent":"1"}' >> working/real.jsonl + - script: echo '{"value":"D", "coherent":"2"}' >> working/real.jsonl + - script: echo '{"value":"E", "coherent":"1"}' >> working/real.jsonl + - script: echo '{"value":"E", "coherent":"2"}' >> working/real.jsonl + + - script: echo '{"value":"X"}' > working/masked.jsonl + - script: echo '{"value":"Y"}' >> working/masked.jsonl + - script: echo '{"value":"C"}' >> working/masked.jsonl + - script: echo '{"value":"C"}' >> working/masked.jsonl + - script: echo '{"value":"C"}' >> working/masked.jsonl + - script: echo '{"value":"C"}' >> working/masked.jsonl + - script: echo '{"value":"W"}' >> working/masked.jsonl + - script: echo '{"value":"W"}' >> working/masked.jsonl + - script: echo '{"value":"V"}' >> working/masked.jsonl + - script: echo '{"value":"C"}' >> working/masked.jsonl + + - script: cat working/masked.jsonl | mimo --config ../configs/config_coherent.yaml -v3 working/real.jsonl + assertions: + - result.code ShouldEqual 0 + - result.systemerr ShouldContainSubstring value count-empty=0 count-masked=8 count-missed=2 count-nil=0 field=value rate-coherence=0 rate-identifiable=0.8 rate-masking=0.8 + + - script: mv report.html ../reports/report_3.html + + - script: cat working/masked.jsonl | mimo --config ../configs/config_coherent2.yaml -v3 working/real.jsonl + assertions: + - result.code ShouldEqual 0 + - result.systemerr ShouldContainSubstring value count-empty=0 
count-masked=8 count-missed=2 count-nil=0 field=value rate-coherence=1 rate-identifiable=0.8 rate-masking=0.8 + + - script: mv report.html ../reports/report_4.html + + - name: config constraints + steps: + - script: echo '{"value":"A"}' > working/real.jsonl + - script: echo '{"value":"A"}' >> working/real.jsonl + - script: echo '{"value":"B"}' >> working/real.jsonl + - script: echo '{"value":"B"}' >> working/real.jsonl + - script: echo '{"value":"C"}' >> working/real.jsonl + - script: echo '{"value":"C"}' >> working/real.jsonl + - script: echo '{"value":"D"}' >> working/real.jsonl + - script: echo '{"value":"D"}' >> working/real.jsonl + - script: echo '{"value":"E"}' >> working/real.jsonl + - script: echo '{"value":"E"}' >> working/real.jsonl + + - script: echo '{"value":"X"}' > working/masked.jsonl + - script: echo '{"value":"Y"}' >> working/masked.jsonl + - script: echo '{"value":"C"}' >> working/masked.jsonl + - script: echo '{"value":"C"}' >> working/masked.jsonl + - script: echo '{"value":"C"}' >> working/masked.jsonl + - script: echo '{"value":"C"}' >> working/masked.jsonl + - script: echo '{"value":"W"}' >> working/masked.jsonl + - script: echo '{"value":"W"}' >> working/masked.jsonl + - script: echo '{"value":"V"}' >> working/masked.jsonl + - script: echo '{"value":"C"}' >> working/masked.jsonl + + - script: cat working/masked.jsonl | mimo --config ../configs/config_constraints_1.yaml -v3 working/real.jsonl + assertions: + - result.code ShouldEqual 0 + - result.systemerr ShouldContainSubstring value count-empty=0 count-masked=8 count-missed=2 count-nil=0 field=value rate-coherence=0.6 rate-identifiable=0.8 rate-masking=0.8 + + - script: mv report.html ../reports/report_5.html + + - script: cat working/masked.jsonl | mimo --config ../configs/config_constraints_2.yaml -v3 working/real.jsonl + assertions: + - result.code ShouldEqual 1 + - result.systemerr ShouldContainSubstring value count-empty=0 count-masked=8 count-missed=2 count-nil=0 field=value 
rate-coherence=0.6 rate-identifiable=0.8 rate-masking=0.8 + + - script: mv report.html ../reports/report_6.html