From 97943be79fe6363005f443970046283254eba281 Mon Sep 17 00:00:00 2001
From: Adrien Aury <44274230+adrienaury@users.noreply.github.com>
Date: Mon, 28 Aug 2023 13:37:14 +0200
Subject: [PATCH] feat: add column alias in config (#17)
* feat: column alias
* fix: double config read
* fix: excluding numeric values
---
CHANGELOG.md | 2 +
internal/infra/config_loader.go | 2 +
pkg/mimo/model.go | 51 ++++++++++++++-----
pkg/mimo/model_config.go | 4 +-
test/configs/config_alias.yaml | 4 ++
test/configs/config_exclude_numeric.yaml | 4 ++
.../report_bugfix_exclude_numeric.html | 40 +++++++++++++++
test/reports/report_reuse_previous_alias.html | 40 +++++++++++++++
test/suites/06-column-alias.yml | 46 +++++++++++++++++
test/suites/99-bugfixes.yml | 19 +++++++
10 files changed, 197 insertions(+), 15 deletions(-)
create mode 100644 test/configs/config_alias.yaml
create mode 100644 test/configs/config_exclude_numeric.yaml
create mode 100644 test/reports/report_bugfix_exclude_numeric.html
create mode 100644 test/reports/report_reuse_previous_alias.html
create mode 100644 test/suites/06-column-alias.yml
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 872a969..66f1f06 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,6 +21,8 @@ Types of changes
- `Added` analysis of deep nested structures (arrays and objects).
- `Added` validated constraints use a different shade of green in HTML report.
- `Added` possibility to use template string to generate coherent source with `coherentSource` parameter.
+- `Added` possibility to configure a column name alias with the `alias` parameter.
+- `Fixed` exclusion of numeric values configured with the `exclude` parameter.
## [0.2.1]
diff --git a/internal/infra/config_loader.go b/internal/infra/config_loader.go
index 22520fc..3aee320 100644
--- a/internal/infra/config_loader.go
+++ b/internal/infra/config_loader.go
@@ -42,6 +42,7 @@ type YAMLColumn struct {
CoherentWith []string `yaml:"coherentWith,omitempty"`
CoherentSource string `yaml:"coherentSource,omitempty"`
Constraints map[string]YAMLConstraint `yaml:"constraints,omitempty"`
+ Alias string `yaml:"alias,omitempty"`
}
type YAMLConstraint map[string]float64
@@ -85,6 +86,7 @@ func CreateConfig(yamlconfig *YAMLStructure) (mimo.Config, error) {
CoherentWith: yamlcolumn.CoherentWith,
CoherentSource: yamlcolumn.CoherentSource,
Constraints: []mimo.Constraint{},
+ Alias: yamlcolumn.Alias,
}
for target, yamlconstraint := range yamlcolumn.Constraints {
diff --git a/pkg/mimo/model.go b/pkg/mimo/model.go
index b8b8179..2187d27 100644
--- a/pkg/mimo/model.go
+++ b/pkg/mimo/model.go
@@ -104,7 +104,7 @@ func (m *Metrics) Update(
return true
}
- if slices.Contains(config.Exclude, realValue) {
+ if isExcluded(config.Exclude, realValue, realValueStr) {
m.EmptyCount++
return true
@@ -306,17 +306,35 @@ func (r Report) UpdateArray(root DataRow, realArray []any, maskedArray []any, st
func (r Report) UpdateValue(root DataRow, realValue any, maskedValue any, stack []any, path ...string) {
key := strings.Join(path, ".")
+ config := NewDefaultColumnConfig()
+ if cfg, ok := r.config.ColumnConfigs[key]; ok { // looked up by path, before key can become the alias
+ config = cfg
+ }
+
+ if len(config.Alias) > 0 {
+ key = config.Alias // from here on, metrics are stored and reported under the alias
+ }
+
metrics, exists := r.Metrics[key]
if !exists {
- metrics = NewMetrics(key, r.multiMapFactory, r.config.ColumnConfigs[key].Constraints...)
+ metrics = NewMetrics(key, r.multiMapFactory, config.Constraints...) // key may be the alias, which is not a ColumnConfigs key
r.subs.PostNewField(key)
}
- config := NewDefaultColumnConfig()
- if cfg, ok := r.config.ColumnConfigs[key]; ok {
- config = cfg
+ coherenceValues := computeCoherenceValues(config, root, stack)
+ if len(coherenceValues) == 0 {
+ coherenceValues = []any{realValue} // nothing computed: fall back to the real value
+ }
+
+ if !metrics.Update(key, realValue, maskedValue, coherenceValues, r.subs, config) && !exists {
+ metrics.Coherence.Close()
+ metrics.Identifiant.Close()
+ } else {
+ r.Metrics[key] = metrics
}
+}
+func computeCoherenceValues(config ColumnConfig, root DataRow, stack []any) []any {
coherenceValues := make([]any, len(config.CoherentWith))
for i, coherentColumn := range config.CoherentWith {
@@ -337,16 +355,7 @@ func (r Report) UpdateValue(root DataRow, realValue any, maskedValue any, stack
}
}
- if len(coherenceValues) == 0 {
- coherenceValues = []any{realValue}
- }
-
- if !metrics.Update(key, realValue, maskedValue, coherenceValues, r.subs, config) && !exists {
- metrics.Coherence.Close()
- metrics.Identifiant.Close()
- } else {
- r.Metrics[key] = metrics
- }
+ return coherenceValues
}
func (r Report) Update(realRow DataRow, maskedRow DataRow) {
@@ -414,3 +423,17 @@ func validate(constraint ConstraintType, reference float64, value float64) bool
return false
}
}
+
+// isExcluded reports whether value matches an entry of exclude, either by
+// direct equality or by comparing valueStr with the entry's string form.
+func isExcluded(exclude []any, value any, valueStr string) bool {
+	matched := slices.Contains(exclude, value)
+
+	// String fallback, tried entry by entry until one matches.
+	for i := 0; !matched && i < len(exclude); i++ {
+		text, ok := toString(exclude[i])
+		matched = ok && text == valueStr
+	}
+
+	return matched
+}
diff --git a/pkg/mimo/model_config.go b/pkg/mimo/model_config.go
index 6b7ec7f..fa96f85 100644
--- a/pkg/mimo/model_config.go
+++ b/pkg/mimo/model_config.go
@@ -25,8 +25,9 @@ type Config struct {
type ColumnConfig struct {
Exclude []any // exclude values from the masking rate computation (default: exclude only nil values)
CoherentWith []string // list of fields from witch the coherent rate is computed (default: the current field)
- CoherentSource string // template to execute to create coherence source.
+ CoherentSource string // template to execute to create coherence source
Constraints []Constraint // list of constraints to validate
+ Alias string // name under which the column's metrics are stored and reported instead of its path (default: no alias)
}
type Constraint struct {
@@ -66,5 +67,6 @@ func NewDefaultColumnConfig() ColumnConfig {
CoherentWith: []string{},
CoherentSource: "",
Constraints: []Constraint{},
+ Alias: "",
}
}
diff --git a/test/configs/config_alias.yaml b/test/configs/config_alias.yaml
new file mode 100644
index 0000000..cf77213
--- /dev/null
+++ b/test/configs/config_alias.yaml
@@ -0,0 +1,4 @@
+version: "1"
+metrics:
+ - name: "value_renamed"
+ alias: "value"
diff --git a/test/configs/config_exclude_numeric.yaml b/test/configs/config_exclude_numeric.yaml
new file mode 100644
index 0000000..9970f4a
--- /dev/null
+++ b/test/configs/config_exclude_numeric.yaml
@@ -0,0 +1,4 @@
+version: "1"
+metrics:
+ - name: "value"
+ exclude: [2]
diff --git a/test/reports/report_bugfix_exclude_numeric.html b/test/reports/report_bugfix_exclude_numeric.html
new file mode 100644
index 0000000..bcf349c
--- /dev/null
+++ b/test/reports/report_bugfix_exclude_numeric.html
@@ -0,0 +1,40 @@
+
+
+
+
+ MIMO Report
+
+
+
+
+ MIMO Report
+
+
+ Field |
+ Nil |
+ Empty |
+ Masked |
+ Missed |
+ Masking Rate |
+ Coherent Rate |
+ Identifiable Rate |
+ K |
+
+
+
+
+ value |
+ 0 |
+ 1 |
+ 3 |
+ 0 |
+ 100.00 % |
+ 100.00 % |
+ 100.00 % |
+ 1 |
+
+
+
+
+
+
diff --git a/test/reports/report_reuse_previous_alias.html b/test/reports/report_reuse_previous_alias.html
new file mode 100644
index 0000000..889b61e
--- /dev/null
+++ b/test/reports/report_reuse_previous_alias.html
@@ -0,0 +1,40 @@
+
+
+
+
+ MIMO Report
+
+
+
+
+ MIMO Report
+
+
+ Field |
+ Nil |
+ Empty |
+ Masked |
+ Missed |
+ Masking Rate |
+ Coherent Rate |
+ Identifiable Rate |
+ K |
+
+
+
+
+ value |
+ 1 |
+ 0 |
+ 4 |
+ 1 |
+ 80.00 % |
+ 66.67 % |
+ 60.00 % |
+ 1 |
+
+
+
+
+
+
diff --git a/test/suites/06-column-alias.yml b/test/suites/06-column-alias.yml
new file mode 100644
index 0000000..f79a628
--- /dev/null
+++ b/test/suites/06-column-alias.yml
@@ -0,0 +1,46 @@
+# Venom Test Suite definition
+# Check Venom documentation for more information : https://github.com/ovh/venom
+name: use of column aliases
+testcases:
+ - name: reuse previous data directory with column alias
+ steps:
+ - script: rm -rf working/data
+
+ - script: echo '{"value":"A"}' > working/real.jsonl
+ - script: echo '{"value":"A"}' >> working/real.jsonl
+ - script: echo '{"value":"B"}' >> working/real.jsonl
+ - script: echo '{"value":"B"}' >> working/real.jsonl
+ - script: echo '{"value":"C"}' >> working/real.jsonl
+
+ - script: echo '{"value":"X"}' > working/masked.jsonl
+ - script: echo '{"value":"Y"}' >> working/masked.jsonl
+ - script: echo '{"value":"C"}' >> working/masked.jsonl
+ - script: echo '{"value":"C"}' >> working/masked.jsonl
+ - script: echo '{"value":"C"}' >> working/masked.jsonl
+
+ - script: cat working/masked.jsonl | mimo -v3 --disk-storage --persist working/data/ working/real.jsonl
+
+ - script: echo '{"value_renamed":"C"}' > working/real.jsonl
+ - script: echo '{"value_renamed":"D"}' >> working/real.jsonl
+ - script: echo '{"value_renamed":"D"}' >> working/real.jsonl
+ - script: echo '{"value_renamed":"E"}' >> working/real.jsonl
+ - script: echo '{"value_renamed":"E"}' >> working/real.jsonl
+ - script: echo '{"value_renamed":null}' >> working/real.jsonl
+ - script: echo '{}' >> working/real.jsonl
+ - script: echo '{"object":{"hello":{}}}' >> working/real.jsonl
+
+ - script: echo '{"value_renamed":"C"}' > working/masked.jsonl
+ - script: echo '{"value_renamed":"W"}' >> working/masked.jsonl
+ - script: echo '{"value_renamed":"W"}' >> working/masked.jsonl
+ - script: echo '{"value_renamed":"V"}' >> working/masked.jsonl
+ - script: echo '{"value_renamed":"C"}' >> working/masked.jsonl
+ - script: echo '{"value_renamed":"W"}' >> working/masked.jsonl
+ - script: echo '{}' >> working/masked.jsonl
+ - script: echo '{"object":{"hello":{}}}' >> working/masked.jsonl
+
+ - script: cat working/masked.jsonl | mimo -v5 --config ../configs/config_alias.yaml --disk-storage --persist working/data/ working/real.jsonl
+ assertions:
+ - result.code ShouldEqual 0
+ - result.systemerr ShouldContainSubstring value count-empty=0 count-masked=4 count-missed=1 count-nil=1 field=value rate-coherence=0.6666666666666666 rate-identifiable=0.6 rate-masking=0.8
+
+ - script: mv report.html ../reports/report_reuse_previous_alias.html
diff --git a/test/suites/99-bugfixes.yml b/test/suites/99-bugfixes.yml
index 27c477a..2edfe92 100644
--- a/test/suites/99-bugfixes.yml
+++ b/test/suites/99-bugfixes.yml
@@ -19,3 +19,22 @@ testcases:
- result.systemerr ShouldContainSubstring value count-empty=0 count-masked=0 count-missed=0 count-nil=4 field=value rate-coherence=0 rate-identifiable=1 rate-masking=NaN
- script: mv report.html ../reports/report_bugfix_1.html
+
+ - name: exclude numeric value
+ steps:
+ - script: echo '{"value":1}' > working/real.jsonl
+ - script: echo '{"value":2}' >> working/real.jsonl
+ - script: echo '{"value":3}' >> working/real.jsonl
+ - script: echo '{"value":4}' >> working/real.jsonl
+
+ - script: echo '{"value":"A"}' > working/masked.jsonl
+ - script: echo '{"value":"B"}' >> working/masked.jsonl
+ - script: echo '{"value":"C"}' >> working/masked.jsonl
+ - script: echo '{"value":"D"}' >> working/masked.jsonl
+
+ - script: cat working/masked.jsonl | mimo --config ../configs/config_exclude_numeric.yaml -v3 working/real.jsonl
+ assertions:
+ - result.code ShouldEqual 0
+ - result.systemerr ShouldContainSubstring value count-empty=1 count-masked=3 count-missed=0 count-nil=0 field=value rate-coherence=1 rate-identifiable=1 rate-masking=1
+
+ - script: mv report.html ../reports/report_bugfix_exclude_numeric.html