Skip to content

Commit

Permalink
feat: add column alias in config (#17)
Browse files Browse the repository at this point in the history
* feat: column alias

* fix: double config read

* fix: excluding numeric values
  • Loading branch information
adrienaury authored Aug 28, 2023
1 parent 0fcec45 commit 97943be
Show file tree
Hide file tree
Showing 10 changed files with 197 additions and 15 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ Types of changes
- `Added` analysis of deep nested structures (arrays and objects).
- `Added` validated constraints use a different shade of green in HTML report.
- `Added` possibility to use template string to generate coherent source with `coherentSource` parameter.
- `Added` possibility to configure a column name alias with the `alias` parameter.
- `Fixed` excluding numeric values.

## [0.2.1]

Expand Down
2 changes: 2 additions & 0 deletions internal/infra/config_loader.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ type YAMLColumn struct {
CoherentWith []string `yaml:"coherentWith,omitempty"`
CoherentSource string `yaml:"coherentSource,omitempty"`
Constraints map[string]YAMLConstraint `yaml:"constraints,omitempty"`
Alias string `yaml:"alias,omitempty"`
}

// YAMLConstraint is the YAML form of one entry of a column's `constraints`
// map: a set of float64 values indexed by a string key.
// NOTE(review): keys are presumably constraint type names and values their
// reference thresholds — confirm against CreateConfig.
type YAMLConstraint map[string]float64
Expand Down Expand Up @@ -85,6 +86,7 @@ func CreateConfig(yamlconfig *YAMLStructure) (mimo.Config, error) {
CoherentWith: yamlcolumn.CoherentWith,
CoherentSource: yamlcolumn.CoherentSource,
Constraints: []mimo.Constraint{},
Alias: yamlcolumn.Alias,
}

for target, yamlconstraint := range yamlcolumn.Constraints {
Expand Down
51 changes: 37 additions & 14 deletions pkg/mimo/model.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ func (m *Metrics) Update(
return true
}

if slices.Contains(config.Exclude, realValue) {
if isExcluded(config.Exclude, realValue, realValueStr) {
m.EmptyCount++

return true
Expand Down Expand Up @@ -306,17 +306,35 @@ func (r Report) UpdateArray(root DataRow, realArray []any, maskedArray []any, st
func (r Report) UpdateValue(root DataRow, realValue any, maskedValue any, stack []any, path ...string) {
key := strings.Join(path, ".")

config := NewDefaultColumnConfig()
if cfg, ok := r.config.ColumnConfigs[key]; ok {
config = cfg
}

if len(config.Alias) > 0 {
key = config.Alias
}

metrics, exists := r.Metrics[key]
if !exists {
metrics = NewMetrics(key, r.multiMapFactory, r.config.ColumnConfigs[key].Constraints...)
r.subs.PostNewField(key)
}

config := NewDefaultColumnConfig()
if cfg, ok := r.config.ColumnConfigs[key]; ok {
config = cfg
coherenceValues := computeCoherenceValues(config, root, stack)
if len(coherenceValues) == 0 {
coherenceValues = []any{realValue}
}

if !metrics.Update(key, realValue, maskedValue, coherenceValues, r.subs, config) && !exists {
metrics.Coherence.Close()
metrics.Identifiant.Close()
} else {
r.Metrics[key] = metrics
}
}

func computeCoherenceValues(config ColumnConfig, root DataRow, stack []any) []any {
coherenceValues := make([]any, len(config.CoherentWith))

for i, coherentColumn := range config.CoherentWith {
Expand All @@ -337,16 +355,7 @@ func (r Report) UpdateValue(root DataRow, realValue any, maskedValue any, stack
}
}

if len(coherenceValues) == 0 {
coherenceValues = []any{realValue}
}

if !metrics.Update(key, realValue, maskedValue, coherenceValues, r.subs, config) && !exists {
metrics.Coherence.Close()
metrics.Identifiant.Close()
} else {
r.Metrics[key] = metrics
}
return coherenceValues
}

func (r Report) Update(realRow DataRow, maskedRow DataRow) {
Expand Down Expand Up @@ -414,3 +423,17 @@ func validate(constraint ConstraintType, reference float64, value float64) bool
return false
}
}

func isExcluded(exclude []any, value any, valueStr string) bool {
if slices.Contains(exclude, value) {
return true
}

for _, exVal := range exclude {
if exValStr, ok := toString(exVal); ok && valueStr == exValStr {
return true
}
}

return false
}
4 changes: 3 additions & 1 deletion pkg/mimo/model_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,9 @@ type Config struct {
type ColumnConfig struct {
Exclude []any // exclude values from the masking rate computation (default: exclude only nil values)
CoherentWith []string // list of fields from which the coherent rate is computed (default: the current field)
CoherentSource string // template to execute to create coherence source.
CoherentSource string // template to execute to create coherence source
Constraints []Constraint // list of constraints to validate
Alias string // alias to use in persisted data
}

type Constraint struct {
Expand Down Expand Up @@ -66,5 +67,6 @@ func NewDefaultColumnConfig() ColumnConfig {
CoherentWith: []string{},
CoherentSource: "",
Constraints: []Constraint{},
Alias: "",
}
}
4 changes: 4 additions & 0 deletions test/configs/config_alias.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
version: "1"
metrics:
- name: "value_renamed"
alias: "value"
4 changes: 4 additions & 0 deletions test/configs/config_exclude_numeric.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
version: "1"
metrics:
- name: "value"
exclude: [2]
40 changes: 40 additions & 0 deletions test/reports/report_bugfix_exclude_numeric.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<title>MIMO Report</title>
<meta name="viewport" content="width=device-width,initial-scale=1" />
<meta name="description" content="MIMO Report" />
</head>
<body>
<h1>MIMO Report</h1>
<table border="1" cellspacing="0" cellpadding="5">
<thead>
<th>Field</th>
<th>Nil</th>
<th>Empty</th>
<th>Masked</th>
<th>Missed</th>
<th>Masking Rate</th>
<th>Coherent Rate</th>
<th>Identifiable Rate</th>
<th>K</th>
</thead>
<tbody>

<tr>
<td>value</td>
<td>0</td>
<td>1</td>
<td>3</td>
<td>0</td>
<td style="background-color: lightgreen">100.00 %</td>
<td style="background-color: lightgreen">100.00 %</td>
<td style="background-color: lightgreen">100.00 %</td>
<td style="background-color: orange">1</td>
</tr>

</tbody>
</table>
</body>
</html>
40 changes: 40 additions & 0 deletions test/reports/report_reuse_previous_alias.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<title>MIMO Report</title>
<meta name="viewport" content="width=device-width,initial-scale=1" />
<meta name="description" content="MIMO Report" />
</head>
<body>
<h1>MIMO Report</h1>
<table border="1" cellspacing="0" cellpadding="5">
<thead>
<th>Field</th>
<th>Nil</th>
<th>Empty</th>
<th>Masked</th>
<th>Missed</th>
<th>Masking Rate</th>
<th>Coherent Rate</th>
<th>Identifiable Rate</th>
<th>K</th>
</thead>
<tbody>

<tr>
<td>value</td>
<td>1</td>
<td>0</td>
<td>4</td>
<td>1</td>
<td style="background-color: orange">80.00 %</td>
<td style="background-color: orange">66.67 %</td>
<td style="background-color: orange">60.00 %</td>
<td style="background-color: orange">1</td>
</tr>

</tbody>
</table>
</body>
</html>
46 changes: 46 additions & 0 deletions test/suites/06-column-alias.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Venom Test Suite definition
# Check Venom documentation for more information : https://github.com/ovh/venom
name: use of columns aliases
testcases:
- name: reuse previous data directory with column alias
steps:
- script: rm -rf working/data

- script: echo '{"value":"A"}' > working/real.jsonl
- script: echo '{"value":"A"}' >> working/real.jsonl
- script: echo '{"value":"B"}' >> working/real.jsonl
- script: echo '{"value":"B"}' >> working/real.jsonl
- script: echo '{"value":"C"}' >> working/real.jsonl

- script: echo '{"value":"X"}' > working/masked.jsonl
- script: echo '{"value":"Y"}' >> working/masked.jsonl
- script: echo '{"value":"C"}' >> working/masked.jsonl
- script: echo '{"value":"C"}' >> working/masked.jsonl
- script: echo '{"value":"C"}' >> working/masked.jsonl

- script: cat working/masked.jsonl | mimo -v3 --disk-storage --persist working/data/ working/real.jsonl

- script: echo '{"value_renamed":"C"}' > working/real.jsonl
- script: echo '{"value_renamed":"D"}' >> working/real.jsonl
- script: echo '{"value_renamed":"D"}' >> working/real.jsonl
- script: echo '{"value_renamed":"E"}' >> working/real.jsonl
- script: echo '{"value_renamed":"E"}' >> working/real.jsonl
- script: echo '{"value_renamed":null}' >> working/real.jsonl
- script: echo '{}' >> working/real.jsonl
- script: echo '{"object":{"hello":{}}}' >> working/real.jsonl

- script: echo '{"value_renamed":"C"}' > working/masked.jsonl
- script: echo '{"value_renamed":"W"}' >> working/masked.jsonl
- script: echo '{"value_renamed":"W"}' >> working/masked.jsonl
- script: echo '{"value_renamed":"V"}' >> working/masked.jsonl
- script: echo '{"value_renamed":"C"}' >> working/masked.jsonl
- script: echo '{"value_renamed":"W"}' >> working/masked.jsonl
- script: echo '{}' >> working/masked.jsonl
- script: echo '{"object":{"hello":{}}}' >> working/masked.jsonl

- script: cat working/masked.jsonl | mimo -v5 --config ../configs/config_alias.yaml --disk-storage --persist working/data/ working/real.jsonl
assertions:
- result.code ShouldEqual 0
- result.systemerr ShouldContainSubstring value count-empty=0 count-masked=4 count-missed=1 count-nil=1 field=value rate-coherence=0.6666666666666666 rate-identifiable=0.6 rate-masking=0.8

- script: mv report.html ../reports/report_reuse_previous_alias.html
19 changes: 19 additions & 0 deletions test/suites/99-bugfixes.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,22 @@ testcases:
- result.systemerr ShouldContainSubstring value count-empty=0 count-masked=0 count-missed=0 count-nil=4 field=value rate-coherence=0 rate-identifiable=1 rate-masking=NaN

- script: mv report.html ../reports/report_bugfix_1.html

- name: exclude numeric value
steps:
- script: echo '{"value":1}' > working/real.jsonl
- script: echo '{"value":2}' >> working/real.jsonl
- script: echo '{"value":3}' >> working/real.jsonl
- script: echo '{"value":4}' >> working/real.jsonl

- script: echo '{"value":"A"}' > working/masked.jsonl
- script: echo '{"value":"B"}' >> working/masked.jsonl
- script: echo '{"value":"C"}' >> working/masked.jsonl
- script: echo '{"value":"D"}' >> working/masked.jsonl

- script: cat working/masked.jsonl | mimo --config ../configs/config_exclude_numeric.yaml -v3 working/real.jsonl
assertions:
- result.code ShouldEqual 0
- result.systemerr ShouldContainSubstring value count-empty=1 count-masked=3 count-missed=0 count-nil=0 field=value rate-coherence=1 rate-identifiable=1 rate-masking=1

- script: mv report.html ../reports/report_bugfix_exclude_numeric.html

0 comments on commit 97943be

Please sign in to comment.