From 97943be79fe6363005f443970046283254eba281 Mon Sep 17 00:00:00 2001 From: Adrien Aury <44274230+adrienaury@users.noreply.github.com> Date: Mon, 28 Aug 2023 13:37:14 +0200 Subject: [PATCH] feat: add column alias in config (#17) * feat: column alias * fix: double config read * fix: excluding numeric values --- CHANGELOG.md | 2 + internal/infra/config_loader.go | 2 + pkg/mimo/model.go | 51 ++++++++++++++----- pkg/mimo/model_config.go | 4 +- test/configs/config_alias.yaml | 4 ++ test/configs/config_exclude_numeric.yaml | 4 ++ .../report_bugfix_exclude_numeric.html | 40 +++++++++++++++ test/reports/report_reuse_previous_alias.html | 40 +++++++++++++++ test/suites/06-column-alias.yml | 46 +++++++++++++++++ test/suites/99-bugfixes.yml | 19 +++++++ 10 files changed, 197 insertions(+), 15 deletions(-) create mode 100644 test/configs/config_alias.yaml create mode 100644 test/configs/config_exclude_numeric.yaml create mode 100644 test/reports/report_bugfix_exclude_numeric.html create mode 100644 test/reports/report_reuse_previous_alias.html create mode 100644 test/suites/06-column-alias.yml diff --git a/CHANGELOG.md b/CHANGELOG.md index 872a969..66f1f06 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,8 @@ Types of changes - `Added` analysis of deep nested structures (arrays and objects). - `Added` validated constraints use a different shade of green in HTML report. - `Added` possibility to use template string to generate coherent source with `coherentSource` parameter. +- `Added` possibility to configure a column name alias with the `alias` parameter. +- `Fixed` excluding numeric values. 
## [0.2.1] diff --git a/internal/infra/config_loader.go b/internal/infra/config_loader.go index 22520fc..3aee320 100644 --- a/internal/infra/config_loader.go +++ b/internal/infra/config_loader.go @@ -42,6 +42,7 @@ type YAMLColumn struct { CoherentWith []string `yaml:"coherentWith,omitempty"` CoherentSource string `yaml:"coherentSource,omitempty"` Constraints map[string]YAMLConstraint `yaml:"constraints,omitempty"` + Alias string `yaml:"alias,omitempty"` } type YAMLConstraint map[string]float64 @@ -85,6 +86,7 @@ func CreateConfig(yamlconfig *YAMLStructure) (mimo.Config, error) { CoherentWith: yamlcolumn.CoherentWith, CoherentSource: yamlcolumn.CoherentSource, Constraints: []mimo.Constraint{}, + Alias: yamlcolumn.Alias, } for target, yamlconstraint := range yamlcolumn.Constraints { diff --git a/pkg/mimo/model.go b/pkg/mimo/model.go index b8b8179..2187d27 100644 --- a/pkg/mimo/model.go +++ b/pkg/mimo/model.go @@ -104,7 +104,7 @@ func (m *Metrics) Update( return true } - if slices.Contains(config.Exclude, realValue) { + if isExcluded(config.Exclude, realValue, realValueStr) { m.EmptyCount++ return true @@ -306,17 +306,35 @@ func (r Report) UpdateArray(root DataRow, realArray []any, maskedArray []any, st func (r Report) UpdateValue(root DataRow, realValue any, maskedValue any, stack []any, path ...string) { key := strings.Join(path, ".") + config := NewDefaultColumnConfig() + if cfg, ok := r.config.ColumnConfigs[key]; ok { + config = cfg + } + + if len(config.Alias) > 0 { + key = config.Alias + } + metrics, exists := r.Metrics[key] if !exists { metrics = NewMetrics(key, r.multiMapFactory, r.config.ColumnConfigs[key].Constraints...) 
r.subs.PostNewField(key) } - config := NewDefaultColumnConfig() - if cfg, ok := r.config.ColumnConfigs[key]; ok { - config = cfg + coherenceValues := computeCoherenceValues(config, root, stack) + if len(coherenceValues) == 0 { + coherenceValues = []any{realValue} + } + + if !metrics.Update(key, realValue, maskedValue, coherenceValues, r.subs, config) && !exists { + metrics.Coherence.Close() + metrics.Identifiant.Close() + } else { + r.Metrics[key] = metrics } +} +func computeCoherenceValues(config ColumnConfig, root DataRow, stack []any) []any { coherenceValues := make([]any, len(config.CoherentWith)) for i, coherentColumn := range config.CoherentWith { @@ -337,16 +355,7 @@ func (r Report) UpdateValue(root DataRow, realValue any, maskedValue any, stack } } - if len(coherenceValues) == 0 { - coherenceValues = []any{realValue} - } - - if !metrics.Update(key, realValue, maskedValue, coherenceValues, r.subs, config) && !exists { - metrics.Coherence.Close() - metrics.Identifiant.Close() - } else { - r.Metrics[key] = metrics - } + return coherenceValues } func (r Report) Update(realRow DataRow, maskedRow DataRow) { @@ -414,3 +423,17 @@ func validate(constraint ConstraintType, reference float64, value float64) bool return false } } + +func isExcluded(exclude []any, value any, valueStr string) bool { + if slices.Contains(exclude, value) { + return true + } + + for _, exVal := range exclude { + if exValStr, ok := toString(exVal); ok && valueStr == exValStr { + return true + } + } + + return false +} diff --git a/pkg/mimo/model_config.go b/pkg/mimo/model_config.go index 6b7ec7f..fa96f85 100644 --- a/pkg/mimo/model_config.go +++ b/pkg/mimo/model_config.go @@ -25,8 +25,9 @@ type Config struct { type ColumnConfig struct { Exclude []any // exclude values from the masking rate computation (default: exclude only nil values) CoherentWith []string // list of fields from witch the coherent rate is computed (default: the current field) - CoherentSource string // template to execute to 
create coherence source. + CoherentSource string // template to execute to create coherence source Constraints []Constraint // list of constraints to validate + Alias string // alias to use in persisted data } type Constraint struct { @@ -66,5 +67,6 @@ func NewDefaultColumnConfig() ColumnConfig { CoherentWith: []string{}, CoherentSource: "", Constraints: []Constraint{}, + Alias: "", } } diff --git a/test/configs/config_alias.yaml b/test/configs/config_alias.yaml new file mode 100644 index 0000000..cf77213 --- /dev/null +++ b/test/configs/config_alias.yaml @@ -0,0 +1,4 @@ +version: "1" +metrics: + - name: "value_renamed" + alias: "value" diff --git a/test/configs/config_exclude_numeric.yaml b/test/configs/config_exclude_numeric.yaml new file mode 100644 index 0000000..9970f4a --- /dev/null +++ b/test/configs/config_exclude_numeric.yaml @@ -0,0 +1,4 @@ +version: "1" +metrics: + - name: "value" + exclude: [2] diff --git a/test/reports/report_bugfix_exclude_numeric.html b/test/reports/report_bugfix_exclude_numeric.html new file mode 100644 index 0000000..bcf349c --- /dev/null +++ b/test/reports/report_bugfix_exclude_numeric.html @@ -0,0 +1,40 @@ + + + + + MIMO Report + + + + +

MIMO Report

+ + + + + + + + + + + + + + + + + + + + + + + + + + + +
Field | Nil | Empty | Masked | Missed | Masking Rate | Coherent Rate | Identifiable Rate | K
value | 0 | 1 | 3 | 0 | 100.00 % | 100.00 % | 100.00 % | 1
+ + diff --git a/test/reports/report_reuse_previous_alias.html b/test/reports/report_reuse_previous_alias.html new file mode 100644 index 0000000..889b61e --- /dev/null +++ b/test/reports/report_reuse_previous_alias.html @@ -0,0 +1,40 @@ + + + + + MIMO Report + + + + +

MIMO Report

+ + + + + + + + + + + + + + + + + + + + + + + + + + + +
Field | Nil | Empty | Masked | Missed | Masking Rate | Coherent Rate | Identifiable Rate | K
value | 1 | 0 | 4 | 1 | 80.00 % | 66.67 % | 60.00 % | 1
+ + diff --git a/test/suites/06-column-alias.yml b/test/suites/06-column-alias.yml new file mode 100644 index 0000000..f79a628 --- /dev/null +++ b/test/suites/06-column-alias.yml @@ -0,0 +1,46 @@ +# Venom Test Suite definition +# Check Venom documentation for more information : https://github.com/ovh/venom +name: use of columns aliases +testcases: + - name: reuse previous data directory with column alias + steps: + - script: rm -rf working/data + + - script: echo '{"value":"A"}' > working/real.jsonl + - script: echo '{"value":"A"}' >> working/real.jsonl + - script: echo '{"value":"B"}' >> working/real.jsonl + - script: echo '{"value":"B"}' >> working/real.jsonl + - script: echo '{"value":"C"}' >> working/real.jsonl + + - script: echo '{"value":"X"}' > working/masked.jsonl + - script: echo '{"value":"Y"}' >> working/masked.jsonl + - script: echo '{"value":"C"}' >> working/masked.jsonl + - script: echo '{"value":"C"}' >> working/masked.jsonl + - script: echo '{"value":"C"}' >> working/masked.jsonl + + - script: cat working/masked.jsonl | mimo -v3 --disk-storage --persist working/data/ working/real.jsonl + + - script: echo '{"value_renamed":"C"}' > working/real.jsonl + - script: echo '{"value_renamed":"D"}' >> working/real.jsonl + - script: echo '{"value_renamed":"D"}' >> working/real.jsonl + - script: echo '{"value_renamed":"E"}' >> working/real.jsonl + - script: echo '{"value_renamed":"E"}' >> working/real.jsonl + - script: echo '{"value_renamed":null}' >> working/real.jsonl + - script: echo '{}' >> working/real.jsonl + - script: echo '{"object":{"hello":{}}}' >> working/real.jsonl + + - script: echo '{"value_renamed":"C"}' > working/masked.jsonl + - script: echo '{"value_renamed":"W"}' >> working/masked.jsonl + - script: echo '{"value_renamed":"W"}' >> working/masked.jsonl + - script: echo '{"value_renamed":"V"}' >> working/masked.jsonl + - script: echo '{"value_renamed":"C"}' >> working/masked.jsonl + - script: echo '{"value_renamed":"W"}' >> working/masked.jsonl 
+ - script: echo '{}' >> working/masked.jsonl + - script: echo '{"object":{"hello":{}}}' >> working/masked.jsonl + + - script: cat working/masked.jsonl | mimo -v5 --config ../configs/config_alias.yaml --disk-storage --persist working/data/ working/real.jsonl + assertions: + - result.code ShouldEqual 0 + - result.systemerr ShouldContainSubstring value count-empty=0 count-masked=4 count-missed=1 count-nil=1 field=value rate-coherence=0.6666666666666666 rate-identifiable=0.6 rate-masking=0.8 + + - script: mv report.html ../reports/report_reuse_previous_alias.html diff --git a/test/suites/99-bugfixes.yml b/test/suites/99-bugfixes.yml index 27c477a..2edfe92 100644 --- a/test/suites/99-bugfixes.yml +++ b/test/suites/99-bugfixes.yml @@ -19,3 +19,22 @@ testcases: - result.systemerr ShouldContainSubstring value count-empty=0 count-masked=0 count-missed=0 count-nil=4 field=value rate-coherence=0 rate-identifiable=1 rate-masking=NaN - script: mv report.html ../reports/report_bugfix_1.html + + - name: exclude numeric value + steps: + - script: echo '{"value":1}' > working/real.jsonl + - script: echo '{"value":2}' >> working/real.jsonl + - script: echo '{"value":3}' >> working/real.jsonl + - script: echo '{"value":4}' >> working/real.jsonl + + - script: echo '{"value":"A"}' > working/masked.jsonl + - script: echo '{"value":"B"}' >> working/masked.jsonl + - script: echo '{"value":"C"}' >> working/masked.jsonl + - script: echo '{"value":"D"}' >> working/masked.jsonl + + - script: cat working/masked.jsonl | mimo --config ../configs/config_exclude_numeric.yaml -v3 working/real.jsonl + assertions: + - result.code ShouldEqual 0 + - result.systemerr ShouldContainSubstring value count-empty=1 count-masked=3 count-missed=0 count-nil=0 field=value rate-coherence=1 rate-identifiable=1 rate-masking=1 + + - script: mv report.html ../reports/report_bugfix_exclude_numeric.html