From 0a80167f63dcf58b15904a1ec279a6f81aa0dad5 Mon Sep 17 00:00:00 2001 From: adrienaury Date: Mon, 27 Nov 2023 14:18:38 +0000 Subject: [PATCH 01/26] refactor: wip! counter and sampler --- pkg/metric/metricv2.go | 124 +++++++++++++++++++++++++++++++++++++++++ pkg/metric/sampler.go | 43 ++++++++++++++ 2 files changed, 167 insertions(+) create mode 100644 pkg/metric/metricv2.go create mode 100644 pkg/metric/sampler.go diff --git a/pkg/metric/metricv2.go b/pkg/metric/metricv2.go new file mode 100644 index 0000000..be1d64f --- /dev/null +++ b/pkg/metric/metricv2.go @@ -0,0 +1,124 @@ +package metric + +import "golang.org/x/exp/constraints" + +type Analyser[T constraints.Ordered] interface { + Read(*T) +} + +type Stateless[T constraints.Ordered] interface { + CountTotal() uint + CountNulls() uint + CountEmpty() uint + Min() *T + Max() *T + Samples() []T +} + +type Statefull[T constraints.Ordered] interface { + Stateless[T] + CountDistinct() uint +} + +type counter[T constraints.Ordered] struct { + countTotal uint + countNulls uint + countEmpty uint + min *T + max *T + samples *Sampler[T] + zero T +} + +func NewCounter[T constraints.Ordered](samplerSize uint) Stateless[T] { + return &counter[T]{ + countTotal: 0, + countNulls: 0, + countEmpty: 0, + samples: NewSampler[T](samplerSize), + zero: *new(T), + } +} + +func (c *counter[T]) Read(value *T) { + c.countTotal++ + + switch { + case value == nil: + c.countNulls++ + case *value == c.zero: + c.countEmpty++ + } + + if value != nil { + c.samples.Add(*value) + + if *value < *c.min { + c.min = value + } else if *value > *c.max { + c.max = value + } + } +} + +// CountEmpty implements Stateless. +func (c *counter[T]) CountEmpty() uint { + return c.countEmpty +} + +// CountNulls implements Stateless. +func (c *counter[T]) CountNulls() uint { + return c.countNulls +} + +// CountTotal implements Stateless. +func (c *counter[T]) CountTotal() uint { + return c.countTotal +} + +// Samples implements Stateless. 
+func (c *counter[T]) Samples() []T { + return c.samples.Data() +} + +// Min implements Stateless. +func (c *counter[T]) Min() *T { + return c.min +} + +// Max implements Stateless. +func (c *counter[T]) Max() *T { + return c.max +} + +type uniquecounter[T constraints.Ordered] struct { + counter[T] + values map[T]int +} + +func NewDistinctCounter[T constraints.Ordered](samplerSize uint) Statefull[T] { + return &uniquecounter[T]{ + counter: counter[T]{ + countTotal: 0, + countNulls: 0, + countEmpty: 0, + samples: NewSampler[T](samplerSize), + zero: *new(T), + }, + values: make(map[T]int, 1024), //nolint:gomnd + } +} + +// Read implements Statefull. +func (c *uniquecounter[T]) Read(value *T) { + c.counter.Read(value) + + if value != nil { + c.values[*value] = 0 + } +} + +// CountDistinct implements Statefull. +func (c *uniquecounter[T]) CountDistinct() uint { + return uint(len(c.values)) +} diff --git a/pkg/metric/sampler.go b/pkg/metric/sampler.go new file mode 100644 index 0000000..560aa73 --- /dev/null +++ b/pkg/metric/sampler.go @@ -0,0 +1,43 @@ +package metric + +import ( + "math/rand" + + "golang.org/x/exp/constraints" +) + +// Sampler implement a basic sampling algorithm. +// +// see: https://en.wikipedia.org/wiki/Reservoir_sampling#Simple:_Algorithm_R) +type Sampler[T constraints.Ordered] struct { + size uint + count int + data []T +} + +func NewSampler[T constraints.Ordered](size uint) *Sampler[T] { + return &Sampler[T]{ + size: size, + count: 0, + data: make([]T, 0, size), + } +} + +func (s *Sampler[T]) Add(data T) { + s.count++ + + if len(s.data) < int(s.size) { + s.data = append(s.data, data) + + return + } + + index := rand.Intn(s.count) //nolint:gosec + if index < int(s.size) { + s.data[index] = data + } +} + +func (s *Sampler[T]) Data() []T { + return s.data +} From eadf030b83675356c85f0c4bc1a178c02f10f3b8 Mon Sep 17 00:00:00 2001 From: adrienaury Date: Mon, 27 Nov 2023 15:16:12 +0000 Subject: [PATCH 02/26] refactor: wip! 
metrics string --- pkg/metric/metricstring.go | 67 +++++++++++++++++++++++++++++++++ pkg/metric/metricstring_test.go | 21 +++++++++++ pkg/metric/metricv2.go | 55 ++++++++++++++++++--------- pkg/model/metricv2.go | 42 +++++++++++++++++++++ 4 files changed, 168 insertions(+), 17 deletions(-) create mode 100644 pkg/model/metricv2.go diff --git a/pkg/metric/metricstring.go b/pkg/metric/metricstring.go index a7d2ef5..a9d560f 100644 --- a/pkg/metric/metricstring.go +++ b/pkg/metric/metricstring.go @@ -138,3 +138,70 @@ func uniqueLengthSorted(lenCounter map[int]int) []int { return uniqueLengthSorted } + +type String struct { + sampleSize uint + main Stateless[string] + byLen map[int]Stateless[string] +} + +func NewString(sampleSize uint) *String { + return &String{ + sampleSize: sampleSize, + main: NewCounter[string](sampleSize), + byLen: map[int]Stateless[string]{}, + } +} + +func (s *String) Read(value *string) { + s.main.Read(value) + + if value != nil { + length := len(*value) + + analyser, exists := s.byLen[length] + if !exists { + analyser = NewCounter[string](s.sampleSize) + } + + analyser.Read(value) + + s.byLen[length] = analyser + } +} + +func (s *String) Build() model.Col[string] { + result := model.Col[string]{} + + result.MainMetric.Count = s.main.CountTotal() + result.MainMetric.Empty = s.main.CountEmpty() + result.MainMetric.Null = s.main.CountNulls() + result.MainMetric.Max = s.main.Max() + result.MainMetric.Min = s.main.Min() + result.MainMetric.Samples = s.main.Samples() + + lengths := make([]int, 0, len(s.byLen)) + for len := range s.byLen { + lengths = append(lengths, len) + } + + sort.Ints(lengths) + + result.StringMetric.CountLen = len(lengths) + result.StringMetric.MaxLen = lengths[0] + result.StringMetric.MaxLen = lengths[len(lengths)-1] + + for _, length := range lengths { + len := model.StringLen{} + len.Length = length + len.Metrics.Count = s.byLen[length].CountTotal() + len.Metrics.Empty = s.byLen[length].CountEmpty() + len.Metrics.Null = 
s.byLen[length].CountNulls() + len.Metrics.Max = s.byLen[length].Max() + len.Metrics.Min = s.byLen[length].Min() + len.Metrics.Samples = s.byLen[length].Samples() + result.StringMetric.Lengths = append(result.StringMetric.Lengths, len) + } + + return result +} diff --git a/pkg/metric/metricstring_test.go b/pkg/metric/metricstring_test.go index 254a923..921461d 100644 --- a/pkg/metric/metricstring_test.go +++ b/pkg/metric/metricstring_test.go @@ -18,6 +18,8 @@ package metric_test import ( + "encoding/json" + "fmt" "testing" "github.com/cgi-fr/rimo/pkg/metric" @@ -58,3 +60,22 @@ func TestStringMetric(t *testing.T) { assert.ElementsMatch(t, expectedMetric.LeastFreqLen[i].Sample, actualMetric.LeastFreqLen[i].Sample) } } + +func TestStringMetricV2(t *testing.T) { + analyser := metric.NewString(5) + + strings := []string{"1", "1", "1", "1", "22", "22", "22", "331", "332", "4441", ""} + + for _, s := range strings { + s := s + analyser.Read(&s) + } + + analyser.Read(nil) + + bytes, err := json.Marshal(analyser.Build()) + + assert.NoError(t, err) + + fmt.Printf("%s\n", string(bytes)) +} diff --git a/pkg/metric/metricv2.go b/pkg/metric/metricv2.go index be1d64f..4fd7e36 100644 --- a/pkg/metric/metricv2.go +++ b/pkg/metric/metricv2.go @@ -2,11 +2,24 @@ package metric import "golang.org/x/exp/constraints" +type Factory[T constraints.Ordered] interface { + Create() Analyser[T] +} + +type DefaultFactory[T constraints.Ordered] struct { + SampleSize uint +} + +func (f DefaultFactory[T]) Create() Analyser[T] { + return NewCounter[T](f.SampleSize) +} + type Analyser[T constraints.Ordered] interface { Read(*T) } type Stateless[T constraints.Ordered] interface { + Analyser[T] CountTotal() uint CountNulls() uint CountEmpty() uint @@ -20,7 +33,7 @@ type Statefull[T constraints.Ordered] interface { CountDistinct() uint } -type counter[T constraints.Ordered] struct { +type Counter[T constraints.Ordered] struct { countTotal uint countNulls uint countEmpty uint @@ -30,8 +43,8 @@ type 
counter[T constraints.Ordered] struct { zero T } -func NewCounter[T constraints.Ordered](samplerSize uint) Stateless[T] { - return &counter[T]{ +func NewCounter[T constraints.Ordered](samplerSize uint) *Counter[T] { + return &Counter[T]{ countTotal: 0, countNulls: 0, countEmpty: 0, @@ -40,7 +53,7 @@ func NewCounter[T constraints.Ordered](samplerSize uint) Stateless[T] { } } -func (c *counter[T]) Read(value *T) { +func (c *Counter[T]) Read(value *T) { c.countTotal++ switch { @@ -53,6 +66,14 @@ func (c *counter[T]) Read(value *T) { if value != nil { c.samples.Add(*value) + if c.min == nil { + c.min = value + } + + if c.max == nil { + c.max = value + } + if *value < *c.min { c.min = value } else if *value > *c.max { @@ -62,43 +83,43 @@ func (c *counter[T]) Read(value *T) { } // CountEmpty implements Stateless. -func (c *counter[T]) CountEmpty() uint { +func (c *Counter[T]) CountEmpty() uint { return c.countEmpty } // CountNulls implements Stateless. -func (c *counter[T]) CountNulls() uint { +func (c *Counter[T]) CountNulls() uint { return c.countNulls } // CountTotal implements Stateless. -func (c *counter[T]) CountTotal() uint { +func (c *Counter[T]) CountTotal() uint { return c.countTotal } // Samples implements Stateless. -func (c *counter[T]) Samples() []T { +func (c *Counter[T]) Samples() []T { return c.samples.Data() } // Min implements Stateless. -func (c *counter[T]) Min() *T { +func (c *Counter[T]) Min() *T { return c.min } // Max implements Stateless. 
-func (c *counter[T]) Max() *T { +func (c *Counter[T]) Max() *T { return c.max } -type uniquecounter[T constraints.Ordered] struct { - counter[T] +type Distinctcounter[T constraints.Ordered] struct { + Counter[T] values map[T]int } func NewDistinctCounter[T constraints.Ordered](samplerSize uint) Statefull[T] { - return &uniquecounter[T]{ - counter: counter[T]{ + return &Distinctcounter[T]{ + Counter: Counter[T]{ countTotal: 0, countNulls: 0, countEmpty: 0, @@ -110,8 +131,8 @@ func NewDistinctCounter[T constraints.Ordered](samplerSize uint) Statefull[T] { } // Read implements Statefull. -func (c *uniquecounter[T]) Read(value *T) { - c.counter.Read(value) +func (c *Distinctcounter[T]) Read(value *T) { + c.Counter.Read(value) if value != nil { c.values[*value] = 0 @@ -119,6 +140,6 @@ func (c *uniquecounter[T]) Read(value *T) { } // CountDistinct implements Statefull. -func (c *uniquecounter[T]) CountDistinct() uint { +func (c *Distinctcounter[T]) CountDistinct() uint { return uint(len(c.values)) } diff --git a/pkg/model/metricv2.go b/pkg/model/metricv2.go new file mode 100644 index 0000000..14abdcb --- /dev/null +++ b/pkg/model/metricv2.go @@ -0,0 +1,42 @@ +package model + +import "golang.org/x/exp/constraints" + +type Col[T constraints.Ordered] struct { + Name string `json:"name" jsonschema:"required" yaml:"name"` + Type ValueType `json:"type" jsonschema:"required" validate:"oneof=string numeric boolean" yaml:"type"` //nolint:lll + + // The 3 following parameter should be part of a Config struct + Concept string `json:"concept" jsonschema:"required" yaml:"concept"` + Constraint []string `json:"constraint" jsonschema:"required" yaml:"constraint"` + Confidential *bool `json:"confidential" jsonschema:"required" yaml:"confidential"` + + MainMetric Generic[T] `json:"mainMetric" jsonschema:"required" yaml:"mainMetric"` + + StringMetric String `json:"stringMetric,omitempty" jsonschema:"required" yaml:"stringMetric,omitempty"` + // NumericMetric NumericMetric 
`json:"numericMetric,omitempty" jsonschema:"required" yaml:"numericMetric,omitempty"` + // BoolMetric BoolMetric `json:"boolMetric,omitempty" jsonschema:"required" yaml:"boolMetric,omitempty"` +} + +type Generic[T constraints.Ordered] struct { + Count uint `json:"count" jsonschema:"required" yaml:"count"` + Empty uint `json:"empty" jsonschema:"required" yaml:"empty"` + Null uint `json:"null" jsonschema:"required" yaml:"null"` + Distinct uint `json:"distinct" jsonschema:"required" yaml:"distinct"` + Min *T `json:"min" jsonschema:"required" yaml:"min"` + Max *T `json:"max" jsonschema:"required" yaml:"max"` + Samples []T `json:"samples" jsonschema:"required" yaml:"samples"` +} + +type String struct { + MinLen int `json:"minLen" jsonschema:"required" yaml:"minLen"` + MaxLen int `json:"maxLen" jsonschema:"required" yaml:"maxLen"` + CountLen int `json:"countLen" jsonschema:"required" yaml:"countLen"` + Lengths []StringLen `json:"lengths" jsonschema:"required" yaml:"lengths"` +} + +type StringLen struct { + Length int `json:"length" jsonschema:"required" yaml:"length"` + Freq float64 `json:"freq" jsonschema:"required" yaml:"freq"` + Metrics Generic[string] `json:"metrics" jsonschema:"required" yaml:"metrics"` +} From 868ddfb3b411d644e5d7a764a3a8491bd45e8262 Mon Sep 17 00:00:00 2001 From: adrienaury Date: Mon, 27 Nov 2023 15:21:23 +0000 Subject: [PATCH 03/26] refactor: wip! 
string len freq --- pkg/metric/metricstring.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/metric/metricstring.go b/pkg/metric/metricstring.go index a9d560f..ed50692 100644 --- a/pkg/metric/metricstring.go +++ b/pkg/metric/metricstring.go @@ -194,6 +194,7 @@ func (s *String) Build() model.Col[string] { for _, length := range lengths { len := model.StringLen{} len.Length = length + len.Freq = float64(s.byLen[length].CountTotal()) / float64(s.main.CountTotal()) len.Metrics.Count = s.byLen[length].CountTotal() len.Metrics.Empty = s.byLen[length].CountEmpty() len.Metrics.Null = s.byLen[length].CountNulls() From 1c3c5d6a315b8b2c4013a05a0fb945fd7af947a3 Mon Sep 17 00:00:00 2001 From: adrienaury Date: Mon, 27 Nov 2023 16:47:48 +0000 Subject: [PATCH 04/26] refactor: wip! string sort by freq --- pkg/metric/metricstring.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pkg/metric/metricstring.go b/pkg/metric/metricstring.go index ed50692..b984e18 100644 --- a/pkg/metric/metricstring.go +++ b/pkg/metric/metricstring.go @@ -185,7 +185,12 @@ func (s *String) Build() model.Col[string] { lengths = append(lengths, len) } - sort.Ints(lengths) + sort.Slice(lengths, func(i, j int) bool { + freqi := float64(s.byLen[lengths[i]].CountTotal()) / float64(s.main.CountTotal()) + freqj := float64(s.byLen[lengths[j]].CountTotal()) / float64(s.main.CountTotal()) + + return freqi > freqj + }) result.StringMetric.CountLen = len(lengths) result.StringMetric.MaxLen = lengths[0] From 3278b235ccf32c91cd34eefde175905ebd76a78e Mon Sep 17 00:00:00 2001 From: Adrien Aury <44274230+adrienaury@users.noreply.github.com> Date: Mon, 27 Nov 2023 22:18:38 +0000 Subject: [PATCH 05/26] refactor: wip! 
modelv2 --- pkg/metric/metricv2.go | 12 ----- pkg/metricv2/analyser.go | 27 +++++++++++ pkg/metricv2/counter.go | 39 +++++++++++++++ pkg/metricv2/distinct.go | 26 ++++++++++ pkg/metricv2/metric_test.go | 85 ++++++++++++++++++++++++++++++++ pkg/metricv2/minmax.go | 41 ++++++++++++++++ pkg/metricv2/sampler.go | 43 +++++++++++++++++ pkg/metricv2/string.go | 96 +++++++++++++++++++++++++++++++++++++ pkg/model/metricv2.go | 26 +++++----- pkg/modelv2/metrics.go | 27 +++++++++++ 10 files changed, 397 insertions(+), 25 deletions(-) create mode 100644 pkg/metricv2/analyser.go create mode 100644 pkg/metricv2/counter.go create mode 100644 pkg/metricv2/distinct.go create mode 100644 pkg/metricv2/metric_test.go create mode 100644 pkg/metricv2/minmax.go create mode 100644 pkg/metricv2/sampler.go create mode 100644 pkg/metricv2/string.go create mode 100644 pkg/modelv2/metrics.go diff --git a/pkg/metric/metricv2.go b/pkg/metric/metricv2.go index 4fd7e36..d6bfa6e 100644 --- a/pkg/metric/metricv2.go +++ b/pkg/metric/metricv2.go @@ -2,18 +2,6 @@ package metric import "golang.org/x/exp/constraints" -type Factory[T constraints.Ordered] interface { - Create() Analyser[T] -} - -type DefaultFactory[T constraints.Ordered] struct { - SampleSize uint -} - -func (f DefaultFactory[T]) Create() Analyser[T] { - return NewCounter[T](f.SampleSize) -} - type Analyser[T constraints.Ordered] interface { Read(*T) } diff --git a/pkg/metricv2/analyser.go b/pkg/metricv2/analyser.go new file mode 100644 index 0000000..f314db9 --- /dev/null +++ b/pkg/metricv2/analyser.go @@ -0,0 +1,27 @@ +package metricv2 + +import ( + "github.com/cgi-fr/rimo/pkg/modelv2" + "golang.org/x/exp/constraints" +) + +type Analyser[T constraints.Ordered] interface { + Read(*T) + Build(*modelv2.Generic[T]) +} + +type Multi[T constraints.Ordered] struct { + analyser []Analyser[T] +} + +func (m Multi[T]) Read(value *T) { + for _, a := range m.analyser { + a.Read(value) + } +} + +func (m Multi[T]) Build(metric *modelv2.Generic[T]) { + 
for _, a := range m.analyser { + a.Build(metric) + } +} diff --git a/pkg/metricv2/counter.go b/pkg/metricv2/counter.go new file mode 100644 index 0000000..beaf9b1 --- /dev/null +++ b/pkg/metricv2/counter.go @@ -0,0 +1,39 @@ +package metricv2 + +import ( + "github.com/cgi-fr/rimo/pkg/modelv2" + "golang.org/x/exp/constraints" +) + +type Counter[T constraints.Ordered] struct { + countTotal uint + countNulls uint + countEmpty uint + zero T +} + +func NewCounter[T constraints.Ordered]() *Counter[T] { + return &Counter[T]{ + countTotal: 0, + countNulls: 0, + countEmpty: 0, + zero: *new(T), + } +} + +func (c *Counter[T]) Read(value *T) { + c.countTotal++ + + switch { + case value == nil: + c.countNulls++ + case *value == c.zero: + c.countEmpty++ + } +} + +func (c *Counter[T]) Build(metric *modelv2.Generic[T]) { + metric.Count = c.countTotal + metric.Null = c.countNulls + metric.Empty = c.countEmpty +} diff --git a/pkg/metricv2/distinct.go b/pkg/metricv2/distinct.go new file mode 100644 index 0000000..4e6d724 --- /dev/null +++ b/pkg/metricv2/distinct.go @@ -0,0 +1,26 @@ +package metricv2 + +import ( + "github.com/cgi-fr/rimo/pkg/modelv2" + "golang.org/x/exp/constraints" +) + +type Distinct[T constraints.Ordered] struct { + values map[T]int +} + +func NewDistinct[T constraints.Ordered]() *Distinct[T] { + return &Distinct[T]{ + values: make(map[T]int, 1024), //nolint:gomnd + } +} + +func (a *Distinct[T]) Read(value *T) { + if value != nil { + a.values[*value] = 0 + } +} + +func (a *Distinct[T]) Build(metric *modelv2.Generic[T]) { + metric.Distinct = uint(len(a.values)) +} diff --git a/pkg/metricv2/metric_test.go b/pkg/metricv2/metric_test.go new file mode 100644 index 0000000..8a51a49 --- /dev/null +++ b/pkg/metricv2/metric_test.go @@ -0,0 +1,85 @@ +package metricv2_test + +import ( + "fmt" + "testing" + + "github.com/cgi-fr/rimo/pkg/metricv2" + "github.com/cgi-fr/rimo/pkg/modelv2" + "github.com/stretchr/testify/assert" + "gopkg.in/yaml.v3" +) + +// Ensure that 1. 
frequency is correct, 2. order is correct, 3. ties are break by length. +func TestStringMetric(t *testing.T) { //nolint:funlen + t.Parallel() + + text := []string{"1", "1", "1", "1", "22", "22", "22", "331", "332", "4441", ""} + + min := "" + max := "4441" + + expectedMetric := modelv2.Generic[string]{ + Count: 12, + Empty: 1, + Null: 1, + Distinct: 6, + Samples: []string{"22"}, + Min: &min, + Max: &max, + String: &modelv2.String{ + MinLen: 0, + MaxLen: 4, + CountLen: 5, + Lengths: []modelv2.StringLen{ + { + Length: 1, + Freq: 0.3333333333333333, + Metrics: modelv2.Generic[string]{ + Count: 4, + Empty: 0, + Null: 0, + Distinct: 1, + Min: &text[0], + Max: &text[0], + Samples: []string{"1", "1", "1", "1"}, + String: nil, + }, + }, + }, + }, + } + + actualMetric := modelv2.Generic[string]{} + + analyser := metricv2.NewString(5, true) + for index := range text { + analyser.Read(&text[index]) + } + + analyser.Read(nil) + + analyser.Build(&actualMetric) + + out, err := yaml.Marshal(actualMetric) + + assert.NoError(t, err) + + fmt.Println(string(out)) + + assert.Equal(t, expectedMetric.Count, actualMetric.Count) + assert.Equal(t, expectedMetric.Empty, actualMetric.Empty) + assert.Equal(t, expectedMetric.Null, actualMetric.Null) + assert.Equal(t, expectedMetric.Distinct, actualMetric.Distinct) + assert.Equal(t, expectedMetric.Min, actualMetric.Min) + assert.Equal(t, expectedMetric.Max, actualMetric.Max) + assert.Equal(t, expectedMetric.String.MinLen, actualMetric.String.MinLen) + assert.Equal(t, expectedMetric.String.MaxLen, actualMetric.String.MaxLen) + assert.Equal(t, expectedMetric.String.CountLen, actualMetric.String.CountLen) + + for i := 0; i < len(expectedMetric.String.Lengths); i++ { + assert.Equal(t, expectedMetric.String.Lengths[i].Length, actualMetric.String.Lengths[i].Length) + assert.Equal(t, expectedMetric.String.Lengths[i].Freq, actualMetric.String.Lengths[i].Freq) + assert.Equal(t, expectedMetric.String.Lengths[i].Metrics.Samples, 
actualMetric.String.Lengths[i].Metrics.Samples) + } +} diff --git a/pkg/metricv2/minmax.go b/pkg/metricv2/minmax.go new file mode 100644 index 0000000..d9ffce0 --- /dev/null +++ b/pkg/metricv2/minmax.go @@ -0,0 +1,41 @@ +package metricv2 + +import ( + "github.com/cgi-fr/rimo/pkg/modelv2" + "golang.org/x/exp/constraints" +) + +type MinMax[T constraints.Ordered] struct { + min *T + max *T +} + +func NewMinMax[T constraints.Ordered]() *MinMax[T] { + return &MinMax[T]{ + min: nil, + max: nil, + } +} + +func (a *MinMax[T]) Read(value *T) { + if value != nil { + if a.min == nil { + a.min = value + } + + if a.max == nil { + a.max = value + } + + if *value < *a.min { + a.min = value + } else if *value > *a.max { + a.max = value + } + } +} + +func (a *MinMax[T]) Build(metric *modelv2.Generic[T]) { + metric.Min = a.min + metric.Max = a.max +} diff --git a/pkg/metricv2/sampler.go b/pkg/metricv2/sampler.go new file mode 100644 index 0000000..b193c55 --- /dev/null +++ b/pkg/metricv2/sampler.go @@ -0,0 +1,43 @@ +package metricv2 + +import ( + "math/rand" + + "github.com/cgi-fr/rimo/pkg/modelv2" + "golang.org/x/exp/constraints" +) + +type Sampler[T constraints.Ordered] struct { + size uint + count int + samples []T +} + +func NewSampler[T constraints.Ordered](size uint) *Sampler[T] { + return &Sampler[T]{ + size: size, + count: 0, + samples: make([]T, 0, size), + } +} + +func (s *Sampler[T]) Read(value *T) { + if value != nil { + s.count++ + + if len(s.samples) < int(s.size) { + s.samples = append(s.samples, *value) + + return + } + + index := rand.Intn(s.count) //nolint:gosec + if index < int(s.size) { + s.samples[index] = *value + } + } +} + +func (s *Sampler[T]) Build(metric *modelv2.Generic[T]) { + metric.Samples = s.samples +} diff --git a/pkg/metricv2/string.go b/pkg/metricv2/string.go new file mode 100644 index 0000000..bec4716 --- /dev/null +++ b/pkg/metricv2/string.go @@ -0,0 +1,96 @@ +package metricv2 + +import ( + "sort" + + "github.com/cgi-fr/rimo/pkg/modelv2" + 
"golang.org/x/exp/maps" + "golang.org/x/exp/slices" +) + +type String struct { + sampleSize uint + distinct bool + main Multi[string] + byLen map[int]Multi[string] +} + +func NewString(sampleSize uint, countDistinct bool) *String { + mainAnalyser := []Analyser[string]{ + NewCounter[string](), // count total, count null, count empty + NewMinMax[string](), // store min and max values + NewSampler[string](sampleSize), // store few samples + } + + if countDistinct { + mainAnalyser = append(mainAnalyser, NewDistinct[string]()) + } + + return &String{ + sampleSize: sampleSize, + distinct: countDistinct, + main: Multi[string]{mainAnalyser}, + byLen: make(map[int]Multi[string], 0), + } +} + +func (a *String) Read(value *string) { + a.main.Read(value) + + if value != nil { + length := len(*value) + + analyser, exists := a.byLen[length] + if !exists { + analyser = Multi[string]{ + []Analyser[string]{ + NewCounter[string](), // count total, count null, count empty + NewMinMax[string](), // store min and max values + NewSampler[string](a.sampleSize), // store few samples + }, + } + + if a.distinct { + analyser.analyser = append(analyser.analyser, NewDistinct[string]()) + } + } + + analyser.Read(value) + + a.byLen[length] = analyser + } +} + +func (a *String) Build(metric *modelv2.Generic[string]) { + a.main.Build(metric) + + metric.String = &modelv2.String{ + MinLen: slices.Min(maps.Keys(a.byLen)), + MaxLen: slices.Max(maps.Keys(a.byLen)), + CountLen: len(a.byLen), + Lengths: make([]modelv2.StringLen, 0, len(a.byLen)), + } + + for length, analyser := range a.byLen { + lenMetric := modelv2.Generic[string]{} + analyser.Build(&lenMetric) + + strlen := modelv2.StringLen{ + Length: length, + Freq: float64(lenMetric.Count) / float64(metric.Count), + Metrics: modelv2.Generic[string]{}, + } + strlen.Metrics.Count = lenMetric.Count + strlen.Metrics.Empty = lenMetric.Empty + strlen.Metrics.Null = lenMetric.Null + strlen.Metrics.Distinct = lenMetric.Distinct + strlen.Metrics.Max = 
lenMetric.Max + strlen.Metrics.Min = lenMetric.Min + strlen.Metrics.Samples = lenMetric.Samples + metric.String.Lengths = append(metric.String.Lengths, strlen) + } + + sort.Slice(metric.String.Lengths, func(i, j int) bool { + return metric.String.Lengths[i].Freq > metric.String.Lengths[j].Freq + }) +} diff --git a/pkg/model/metricv2.go b/pkg/model/metricv2.go index 14abdcb..54ba31a 100644 --- a/pkg/model/metricv2.go +++ b/pkg/model/metricv2.go @@ -7,25 +7,25 @@ type Col[T constraints.Ordered] struct { Type ValueType `json:"type" jsonschema:"required" validate:"oneof=string numeric boolean" yaml:"type"` //nolint:lll // The 3 following parameter should be part of a Config struct - Concept string `json:"concept" jsonschema:"required" yaml:"concept"` - Constraint []string `json:"constraint" jsonschema:"required" yaml:"constraint"` - Confidential *bool `json:"confidential" jsonschema:"required" yaml:"confidential"` + Concept string `json:"concept,omitempty" yaml:"concept,omitempty"` + Constraint []string `json:"constraint,omitempty" yaml:"constraint,omitempty"` + Confidential *bool `json:"confidential,omitempty" yaml:"confidential,omitempty"` MainMetric Generic[T] `json:"mainMetric" jsonschema:"required" yaml:"mainMetric"` - StringMetric String `json:"stringMetric,omitempty" jsonschema:"required" yaml:"stringMetric,omitempty"` - // NumericMetric NumericMetric `json:"numericMetric,omitempty" jsonschema:"required" yaml:"numericMetric,omitempty"` - // BoolMetric BoolMetric `json:"boolMetric,omitempty" jsonschema:"required" yaml:"boolMetric,omitempty"` + StringMetric String `json:"stringMetric,omitempty" yaml:"stringMetric,omitempty"` + NumericMetric NumericMetric `json:"numericMetric,omitempty" yaml:"numericMetric,omitempty"` + BoolMetric BoolMetric `json:"boolMetric,omitempty" yaml:"boolMetric,omitempty"` } type Generic[T constraints.Ordered] struct { - Count uint `json:"count" jsonschema:"required" yaml:"count"` - Empty uint `json:"empty" jsonschema:"required" 
yaml:"empty"` - Null uint `json:"null" jsonschema:"required" yaml:"null"` - Distinct uint `json:"distinct" jsonschema:"required" yaml:"distinct"` - Min *T `json:"min" jsonschema:"required" yaml:"min"` - Max *T `json:"max" jsonschema:"required" yaml:"max"` - Samples []T `json:"samples" jsonschema:"required" yaml:"samples"` + Count uint `json:"count" jsonschema:"required" yaml:"count"` + Empty uint `json:"empty" jsonschema:"required" yaml:"empty"` + Null uint `json:"null" jsonschema:"required" yaml:"null"` + Distinct *uint `json:"distinct" jsonschema:"required" yaml:"distinct"` + Min *T `json:"min" jsonschema:"required" yaml:"min"` + Max *T `json:"max" jsonschema:"required" yaml:"max"` + Samples []T `json:"samples" jsonschema:"required" yaml:"samples"` } type String struct { diff --git a/pkg/modelv2/metrics.go b/pkg/modelv2/metrics.go new file mode 100644 index 0000000..6508e4a --- /dev/null +++ b/pkg/modelv2/metrics.go @@ -0,0 +1,27 @@ +package modelv2 + +import "golang.org/x/exp/constraints" + +type Generic[T constraints.Ordered] struct { + Count uint `json:"count" yaml:"count" jsonschema:"required"` + Empty uint `json:"empty,omitempty" yaml:"empty,omitempty"` + Null uint `json:"nulls,omitempty" yaml:"nulls,omitempty"` + Distinct uint `json:"distinct,omitempty" yaml:"distinct,omitempty"` + Min *T `json:"min,omitempty" yaml:"min,omitempty"` + Max *T `json:"max,omitempty" yaml:"max,omitempty"` + Samples []T `json:"samples" yaml:"samples" jsonschema:"required"` + String *String `json:"string,omitempty" yaml:"string,omitempty"` +} + +type String struct { + MinLen int `json:"minLen" yaml:"minLen"` + MaxLen int `json:"maxLen" yaml:"maxLen"` + CountLen int `json:"countLen,omitempty" yaml:"countLen,omitempty"` + Lengths []StringLen `json:"lengths,omitempty" yaml:"lengths,omitempty"` +} + +type StringLen struct { + Length int `json:"length" yaml:"length" jsonschema:"required"` + Freq float64 `json:"freq" yaml:"freq" jsonschema:"required"` + Metrics Generic[string] 
`json:"metrics" yaml:"metrics" jsonschema:"required"` +} From 2f728c79bb676c9d43e951f5daaf8449b52deee0 Mon Sep 17 00:00:00 2001 From: adrienaury Date: Wed, 29 Nov 2023 15:11:19 +0000 Subject: [PATCH 06/26] refactor: build column --- pkg/metricv2/analyser.go | 4 +-- pkg/metricv2/counter.go | 8 ++--- pkg/metricv2/distinct.go | 4 +-- pkg/metricv2/metric_test.go | 59 ++++++++++++++++++------------------- pkg/metricv2/minmax.go | 6 ++-- pkg/metricv2/sampler.go | 4 +-- pkg/metricv2/string.go | 28 +++++++++--------- pkg/modelv2/column.go | 15 ++++++++++ pkg/modelv2/config.go | 7 +++++ pkg/modelv2/metrics.go | 19 +++++++----- 10 files changed, 88 insertions(+), 66 deletions(-) create mode 100644 pkg/modelv2/column.go create mode 100644 pkg/modelv2/config.go diff --git a/pkg/metricv2/analyser.go b/pkg/metricv2/analyser.go index f314db9..22087a9 100644 --- a/pkg/metricv2/analyser.go +++ b/pkg/metricv2/analyser.go @@ -7,7 +7,7 @@ import ( type Analyser[T constraints.Ordered] interface { Read(*T) - Build(*modelv2.Generic[T]) + Build(*modelv2.Column[T]) } type Multi[T constraints.Ordered] struct { @@ -20,7 +20,7 @@ func (m Multi[T]) Read(value *T) { } } -func (m Multi[T]) Build(metric *modelv2.Generic[T]) { +func (m Multi[T]) Build(metric *modelv2.Column[T]) { for _, a := range m.analyser { a.Build(metric) } diff --git a/pkg/metricv2/counter.go b/pkg/metricv2/counter.go index beaf9b1..12a4e03 100644 --- a/pkg/metricv2/counter.go +++ b/pkg/metricv2/counter.go @@ -32,8 +32,8 @@ func (c *Counter[T]) Read(value *T) { } } -func (c *Counter[T]) Build(metric *modelv2.Generic[T]) { - metric.Count = c.countTotal - metric.Null = c.countNulls - metric.Empty = c.countEmpty +func (c *Counter[T]) Build(metric *modelv2.Column[T]) { + metric.MainMetric.Count = c.countTotal + metric.MainMetric.Null = c.countNulls + metric.MainMetric.Empty = c.countEmpty } diff --git a/pkg/metricv2/distinct.go b/pkg/metricv2/distinct.go index 4e6d724..f44a242 100644 --- a/pkg/metricv2/distinct.go +++ 
b/pkg/metricv2/distinct.go @@ -21,6 +21,6 @@ func (a *Distinct[T]) Read(value *T) { } } -func (a *Distinct[T]) Build(metric *modelv2.Generic[T]) { - metric.Distinct = uint(len(a.values)) +func (a *Distinct[T]) Build(metric *modelv2.Column[T]) { + metric.MainMetric.Distinct = uint(len(a.values)) } diff --git a/pkg/metricv2/metric_test.go b/pkg/metricv2/metric_test.go index 8a51a49..ff4ac35 100644 --- a/pkg/metricv2/metric_test.go +++ b/pkg/metricv2/metric_test.go @@ -1,13 +1,11 @@ package metricv2_test import ( - "fmt" "testing" "github.com/cgi-fr/rimo/pkg/metricv2" "github.com/cgi-fr/rimo/pkg/modelv2" "github.com/stretchr/testify/assert" - "gopkg.in/yaml.v3" ) // Ensure that 1. frequency is correct, 2. order is correct, 3. ties are break by length. @@ -19,15 +17,17 @@ func TestStringMetric(t *testing.T) { //nolint:funlen min := "" max := "4441" - expectedMetric := modelv2.Generic[string]{ - Count: 12, - Empty: 1, - Null: 1, - Distinct: 6, - Samples: []string{"22"}, - Min: &min, - Max: &max, - String: &modelv2.String{ + expectedMetric := modelv2.Column[string]{ + MainMetric: modelv2.Generic[string]{ + Count: 12, + Empty: 1, + Null: 1, + Distinct: 6, + Samples: []string{"22"}, + Min: &min, + Max: &max, + }, + StringMetric: modelv2.String{ MinLen: 0, MaxLen: 4, CountLen: 5, @@ -43,14 +43,13 @@ func TestStringMetric(t *testing.T) { //nolint:funlen Min: &text[0], Max: &text[0], Samples: []string{"1", "1", "1", "1"}, - String: nil, }, }, }, }, } - actualMetric := modelv2.Generic[string]{} + actualMetric := modelv2.Column[string]{} analyser := metricv2.NewString(5, true) for index := range text { @@ -61,25 +60,23 @@ func TestStringMetric(t *testing.T) { //nolint:funlen analyser.Build(&actualMetric) - out, err := yaml.Marshal(actualMetric) - - assert.NoError(t, err) - - fmt.Println(string(out)) + // out, err := yaml.Marshal(actualMetric) + // assert.NoError(t, err) + // fmt.Println(string(out)) - assert.Equal(t, expectedMetric.Count, actualMetric.Count) - assert.Equal(t, 
expectedMetric.Empty, actualMetric.Empty) - assert.Equal(t, expectedMetric.Null, actualMetric.Null) - assert.Equal(t, expectedMetric.Distinct, actualMetric.Distinct) - assert.Equal(t, expectedMetric.Min, actualMetric.Min) - assert.Equal(t, expectedMetric.Max, actualMetric.Max) - assert.Equal(t, expectedMetric.String.MinLen, actualMetric.String.MinLen) - assert.Equal(t, expectedMetric.String.MaxLen, actualMetric.String.MaxLen) - assert.Equal(t, expectedMetric.String.CountLen, actualMetric.String.CountLen) + assert.Equal(t, expectedMetric.MainMetric.Count, actualMetric.MainMetric.Count) + assert.Equal(t, expectedMetric.MainMetric.Empty, actualMetric.MainMetric.Empty) + assert.Equal(t, expectedMetric.MainMetric.Null, actualMetric.MainMetric.Null) + assert.Equal(t, expectedMetric.MainMetric.Distinct, actualMetric.MainMetric.Distinct) + assert.Equal(t, expectedMetric.MainMetric.Min, actualMetric.MainMetric.Min) + assert.Equal(t, expectedMetric.MainMetric.Max, actualMetric.MainMetric.Max) + assert.Equal(t, expectedMetric.StringMetric.MinLen, actualMetric.StringMetric.MinLen) + assert.Equal(t, expectedMetric.StringMetric.MaxLen, actualMetric.StringMetric.MaxLen) + assert.Equal(t, expectedMetric.StringMetric.CountLen, actualMetric.StringMetric.CountLen) - for i := 0; i < len(expectedMetric.String.Lengths); i++ { - assert.Equal(t, expectedMetric.String.Lengths[i].Length, actualMetric.String.Lengths[i].Length) - assert.Equal(t, expectedMetric.String.Lengths[i].Freq, actualMetric.String.Lengths[i].Freq) - assert.Equal(t, expectedMetric.String.Lengths[i].Metrics.Samples, actualMetric.String.Lengths[i].Metrics.Samples) + for i := 0; i < len(expectedMetric.StringMetric.Lengths); i++ { + assert.Equal(t, expectedMetric.StringMetric.Lengths[i].Length, actualMetric.StringMetric.Lengths[i].Length) + assert.Equal(t, expectedMetric.StringMetric.Lengths[i].Freq, actualMetric.StringMetric.Lengths[i].Freq) + assert.Equal(t, expectedMetric.StringMetric.Lengths[i].Metrics.Samples, 
actualMetric.StringMetric.Lengths[i].Metrics.Samples) } } diff --git a/pkg/metricv2/minmax.go b/pkg/metricv2/minmax.go index d9ffce0..1d45ba1 100644 --- a/pkg/metricv2/minmax.go +++ b/pkg/metricv2/minmax.go @@ -35,7 +35,7 @@ func (a *MinMax[T]) Read(value *T) { } } -func (a *MinMax[T]) Build(metric *modelv2.Generic[T]) { - metric.Min = a.min - metric.Max = a.max +func (a *MinMax[T]) Build(metric *modelv2.Column[T]) { + metric.MainMetric.Min = a.min + metric.MainMetric.Max = a.max } diff --git a/pkg/metricv2/sampler.go b/pkg/metricv2/sampler.go index b193c55..de769fc 100644 --- a/pkg/metricv2/sampler.go +++ b/pkg/metricv2/sampler.go @@ -38,6 +38,6 @@ func (s *Sampler[T]) Read(value *T) { } } -func (s *Sampler[T]) Build(metric *modelv2.Generic[T]) { - metric.Samples = s.samples +func (s *Sampler[T]) Build(metric *modelv2.Column[T]) { + metric.MainMetric.Samples = s.samples } diff --git a/pkg/metricv2/string.go b/pkg/metricv2/string.go index bec4716..15897ae 100644 --- a/pkg/metricv2/string.go +++ b/pkg/metricv2/string.go @@ -61,10 +61,10 @@ func (a *String) Read(value *string) { } } -func (a *String) Build(metric *modelv2.Generic[string]) { +func (a *String) Build(metric *modelv2.Column[string]) { a.main.Build(metric) - metric.String = &modelv2.String{ + metric.StringMetric = modelv2.String{ MinLen: slices.Min(maps.Keys(a.byLen)), MaxLen: slices.Max(maps.Keys(a.byLen)), CountLen: len(a.byLen), @@ -72,25 +72,25 @@ func (a *String) Build(metric *modelv2.Generic[string]) { } for length, analyser := range a.byLen { - lenMetric := modelv2.Generic[string]{} + lenMetric := modelv2.Column[string]{} analyser.Build(&lenMetric) strlen := modelv2.StringLen{ Length: length, - Freq: float64(lenMetric.Count) / float64(metric.Count), + Freq: float64(lenMetric.MainMetric.Count) / float64(metric.MainMetric.Count), Metrics: modelv2.Generic[string]{}, } - strlen.Metrics.Count = lenMetric.Count - strlen.Metrics.Empty = lenMetric.Empty - strlen.Metrics.Null = lenMetric.Null - 
strlen.Metrics.Distinct = lenMetric.Distinct - strlen.Metrics.Max = lenMetric.Max - strlen.Metrics.Min = lenMetric.Min - strlen.Metrics.Samples = lenMetric.Samples - metric.String.Lengths = append(metric.String.Lengths, strlen) + strlen.Metrics.Count = lenMetric.MainMetric.Count + strlen.Metrics.Empty = lenMetric.MainMetric.Empty + strlen.Metrics.Null = lenMetric.MainMetric.Null + strlen.Metrics.Distinct = lenMetric.MainMetric.Distinct + strlen.Metrics.Max = lenMetric.MainMetric.Max + strlen.Metrics.Min = lenMetric.MainMetric.Min + strlen.Metrics.Samples = lenMetric.MainMetric.Samples + metric.StringMetric.Lengths = append(metric.StringMetric.Lengths, strlen) } - sort.Slice(metric.String.Lengths, func(i, j int) bool { - return metric.String.Lengths[i].Freq > metric.String.Lengths[j].Freq + sort.Slice(metric.StringMetric.Lengths, func(i, j int) bool { + return metric.StringMetric.Lengths[i].Freq > metric.StringMetric.Lengths[j].Freq }) } diff --git a/pkg/modelv2/column.go b/pkg/modelv2/column.go new file mode 100644 index 0000000..67af4a5 --- /dev/null +++ b/pkg/modelv2/column.go @@ -0,0 +1,15 @@ +package modelv2 + +import "golang.org/x/exp/constraints" + +type Column[T constraints.Ordered] struct { + Name string `json:"name" yaml:"name" jsonschema:"required"` + Type string `json:"type" yaml:"type" jsonschema:"required" validate:"oneof=string numeric boolean"` + + Config + + MainMetric Generic[T] `json:"mainMetric" yaml:"mainMetric" jsonschema:"required"` + StringMetric String `json:"stringMetric,omitempty" yaml:"stringMetric,omitempty"` + NumericMetric Numeric `json:"numericMetric,omitempty" yaml:"numericMetric,omitempty"` + BoolMetric Bool `json:"boolMetric,omitempty" yaml:"boolMetric,omitempty"` +} diff --git a/pkg/modelv2/config.go b/pkg/modelv2/config.go new file mode 100644 index 0000000..97d393b --- /dev/null +++ b/pkg/modelv2/config.go @@ -0,0 +1,7 @@ +package modelv2 + +type Config struct { + Concept string `json:"concept" yaml:"concept" 
jsonschema:"required"` + Constraint []string `json:"constraint" yaml:"constraint" jsonschema:"required"` + Confidential *bool `json:"confidential" yaml:"confidential" jsonschema:"required"` +} diff --git a/pkg/modelv2/metrics.go b/pkg/modelv2/metrics.go index 6508e4a..f214e75 100644 --- a/pkg/modelv2/metrics.go +++ b/pkg/modelv2/metrics.go @@ -3,14 +3,13 @@ package modelv2 import "golang.org/x/exp/constraints" type Generic[T constraints.Ordered] struct { - Count uint `json:"count" yaml:"count" jsonschema:"required"` - Empty uint `json:"empty,omitempty" yaml:"empty,omitempty"` - Null uint `json:"nulls,omitempty" yaml:"nulls,omitempty"` - Distinct uint `json:"distinct,omitempty" yaml:"distinct,omitempty"` - Min *T `json:"min,omitempty" yaml:"min,omitempty"` - Max *T `json:"max,omitempty" yaml:"max,omitempty"` - Samples []T `json:"samples" yaml:"samples" jsonschema:"required"` - String *String `json:"string,omitempty" yaml:"string,omitempty"` + Count uint `json:"count" yaml:"count" jsonschema:"required"` + Empty uint `json:"empty,omitempty" yaml:"empty,omitempty"` + Null uint `json:"nulls,omitempty" yaml:"nulls,omitempty"` + Distinct uint `json:"distinct,omitempty" yaml:"distinct,omitempty"` + Min *T `json:"min,omitempty" yaml:"min,omitempty"` + Max *T `json:"max,omitempty" yaml:"max,omitempty"` + Samples []T `json:"samples" yaml:"samples" jsonschema:"required"` } type String struct { @@ -25,3 +24,7 @@ type StringLen struct { Freq float64 `json:"freq" yaml:"freq" jsonschema:"required"` Metrics Generic[string] `json:"metrics" yaml:"metrics" jsonschema:"required"` } + +type Numeric struct{} + +type Bool struct{} From eb3a6333a81b02d5c42799cf9e8dce2ce5067f3d Mon Sep 17 00:00:00 2001 From: adrienaury Date: Wed, 29 Nov 2023 15:59:05 +0000 Subject: [PATCH 07/26] refactor: driver --- pkg/metricv2/analyser.go | 4 +- pkg/metricv2/counter.go | 2 +- pkg/metricv2/distinct.go | 2 +- pkg/metricv2/metric_test.go | 14 ++--- pkg/metricv2/minmax.go | 2 +- pkg/metricv2/sampler.go | 7 
++- pkg/metricv2/string.go | 8 +-- pkg/modelv2/base.go | 17 ++++++ pkg/modelv2/column.go | 12 ++-- pkg/modelv2/metrics.go | 24 ++++---- pkg/rimo/driven.go | 17 ++++-- pkg/rimo/driven_test.go | 108 ------------------------------------ pkg/rimo/driver.go | 84 +++++++++++++++++++++++----- 13 files changed, 136 insertions(+), 165 deletions(-) create mode 100644 pkg/modelv2/base.go delete mode 100644 pkg/rimo/driven_test.go diff --git a/pkg/metricv2/analyser.go b/pkg/metricv2/analyser.go index 22087a9..1191ce1 100644 --- a/pkg/metricv2/analyser.go +++ b/pkg/metricv2/analyser.go @@ -7,7 +7,7 @@ import ( type Analyser[T constraints.Ordered] interface { Read(*T) - Build(*modelv2.Column[T]) + Build(*modelv2.Column) } type Multi[T constraints.Ordered] struct { @@ -20,7 +20,7 @@ func (m Multi[T]) Read(value *T) { } } -func (m Multi[T]) Build(metric *modelv2.Column[T]) { +func (m Multi[T]) Build(metric *modelv2.Column) { for _, a := range m.analyser { a.Build(metric) } diff --git a/pkg/metricv2/counter.go b/pkg/metricv2/counter.go index 12a4e03..7f4f9a6 100644 --- a/pkg/metricv2/counter.go +++ b/pkg/metricv2/counter.go @@ -32,7 +32,7 @@ func (c *Counter[T]) Read(value *T) { } } -func (c *Counter[T]) Build(metric *modelv2.Column[T]) { +func (c *Counter[T]) Build(metric *modelv2.Column) { metric.MainMetric.Count = c.countTotal metric.MainMetric.Null = c.countNulls metric.MainMetric.Empty = c.countEmpty diff --git a/pkg/metricv2/distinct.go b/pkg/metricv2/distinct.go index f44a242..f74d0ab 100644 --- a/pkg/metricv2/distinct.go +++ b/pkg/metricv2/distinct.go @@ -21,6 +21,6 @@ func (a *Distinct[T]) Read(value *T) { } } -func (a *Distinct[T]) Build(metric *modelv2.Column[T]) { +func (a *Distinct[T]) Build(metric *modelv2.Column) { metric.MainMetric.Distinct = uint(len(a.values)) } diff --git a/pkg/metricv2/metric_test.go b/pkg/metricv2/metric_test.go index ff4ac35..73c3a95 100644 --- a/pkg/metricv2/metric_test.go +++ b/pkg/metricv2/metric_test.go @@ -17,17 +17,17 @@ func 
TestStringMetric(t *testing.T) { //nolint:funlen min := "" max := "4441" - expectedMetric := modelv2.Column[string]{ - MainMetric: modelv2.Generic[string]{ + expectedMetric := modelv2.Column{ + MainMetric: modelv2.Generic{ Count: 12, Empty: 1, Null: 1, Distinct: 6, - Samples: []string{"22"}, + Samples: []any{"22"}, Min: &min, Max: &max, }, - StringMetric: modelv2.String{ + StringMetric: &modelv2.String{ MinLen: 0, MaxLen: 4, CountLen: 5, @@ -35,21 +35,21 @@ func TestStringMetric(t *testing.T) { //nolint:funlen { Length: 1, Freq: 0.3333333333333333, - Metrics: modelv2.Generic[string]{ + Metrics: modelv2.Generic{ Count: 4, Empty: 0, Null: 0, Distinct: 1, Min: &text[0], Max: &text[0], - Samples: []string{"1", "1", "1", "1"}, + Samples: []any{"1", "1", "1", "1"}, }, }, }, }, } - actualMetric := modelv2.Column[string]{} + actualMetric := modelv2.Column{} analyser := metricv2.NewString(5, true) for index := range text { diff --git a/pkg/metricv2/minmax.go b/pkg/metricv2/minmax.go index 1d45ba1..7056434 100644 --- a/pkg/metricv2/minmax.go +++ b/pkg/metricv2/minmax.go @@ -35,7 +35,7 @@ func (a *MinMax[T]) Read(value *T) { } } -func (a *MinMax[T]) Build(metric *modelv2.Column[T]) { +func (a *MinMax[T]) Build(metric *modelv2.Column) { metric.MainMetric.Min = a.min metric.MainMetric.Max = a.max } diff --git a/pkg/metricv2/sampler.go b/pkg/metricv2/sampler.go index de769fc..6e103f9 100644 --- a/pkg/metricv2/sampler.go +++ b/pkg/metricv2/sampler.go @@ -38,6 +38,9 @@ func (s *Sampler[T]) Read(value *T) { } } -func (s *Sampler[T]) Build(metric *modelv2.Column[T]) { - metric.MainMetric.Samples = s.samples +func (s *Sampler[T]) Build(metric *modelv2.Column) { + metric.MainMetric.Samples = make([]any, len(s.samples)) + for i, s := range s.samples { + metric.MainMetric.Samples[i] = s + } } diff --git a/pkg/metricv2/string.go b/pkg/metricv2/string.go index 15897ae..9ac37b2 100644 --- a/pkg/metricv2/string.go +++ b/pkg/metricv2/string.go @@ -61,10 +61,10 @@ func (a *String) Read(value 
*string) { } } -func (a *String) Build(metric *modelv2.Column[string]) { +func (a *String) Build(metric *modelv2.Column) { a.main.Build(metric) - metric.StringMetric = modelv2.String{ + metric.StringMetric = &modelv2.String{ MinLen: slices.Min(maps.Keys(a.byLen)), MaxLen: slices.Max(maps.Keys(a.byLen)), CountLen: len(a.byLen), @@ -72,13 +72,13 @@ func (a *String) Build(metric *modelv2.Column[string]) { } for length, analyser := range a.byLen { - lenMetric := modelv2.Column[string]{} + lenMetric := modelv2.Column{} analyser.Build(&lenMetric) strlen := modelv2.StringLen{ Length: length, Freq: float64(lenMetric.MainMetric.Count) / float64(metric.MainMetric.Count), - Metrics: modelv2.Generic[string]{}, + Metrics: modelv2.Generic{}, } strlen.Metrics.Count = lenMetric.MainMetric.Count strlen.Metrics.Empty = lenMetric.MainMetric.Empty diff --git a/pkg/modelv2/base.go b/pkg/modelv2/base.go new file mode 100644 index 0000000..663c562 --- /dev/null +++ b/pkg/modelv2/base.go @@ -0,0 +1,17 @@ +package modelv2 + +type Base struct { + Name string `json:"database" yaml:"database" jsonschema:"required"` + Tables map[string]Table `json:"tables" yaml:"tables" jsonschema:"required"` +} + +type Table struct { + Columns []Column `json:"columns" yaml:"columns" jsonschema:"required" ` +} + +func NewBase(name string) *Base { + return &Base{ + Name: name, + Tables: make(map[string]Table, 10), + } +} diff --git a/pkg/modelv2/column.go b/pkg/modelv2/column.go index 67af4a5..e776fe5 100644 --- a/pkg/modelv2/column.go +++ b/pkg/modelv2/column.go @@ -1,15 +1,13 @@ package modelv2 -import "golang.org/x/exp/constraints" - -type Column[T constraints.Ordered] struct { +type Column struct { Name string `json:"name" yaml:"name" jsonschema:"required"` Type string `json:"type" yaml:"type" jsonschema:"required" validate:"oneof=string numeric boolean"` Config - MainMetric Generic[T] `json:"mainMetric" yaml:"mainMetric" jsonschema:"required"` - StringMetric String `json:"stringMetric,omitempty" 
yaml:"stringMetric,omitempty"` - NumericMetric Numeric `json:"numericMetric,omitempty" yaml:"numericMetric,omitempty"` - BoolMetric Bool `json:"boolMetric,omitempty" yaml:"boolMetric,omitempty"` + MainMetric Generic `json:"mainMetric" yaml:"mainMetric" jsonschema:"required"` + StringMetric *String `json:"stringMetric,omitempty" yaml:"stringMetric,omitempty"` + NumericMetric *Numeric `json:"numericMetric,omitempty" yaml:"numericMetric,omitempty"` + BoolMetric *Bool `json:"boolMetric,omitempty" yaml:"boolMetric,omitempty"` } diff --git a/pkg/modelv2/metrics.go b/pkg/modelv2/metrics.go index f214e75..1d6fb66 100644 --- a/pkg/modelv2/metrics.go +++ b/pkg/modelv2/metrics.go @@ -1,15 +1,13 @@ package modelv2 -import "golang.org/x/exp/constraints" - -type Generic[T constraints.Ordered] struct { - Count uint `json:"count" yaml:"count" jsonschema:"required"` - Empty uint `json:"empty,omitempty" yaml:"empty,omitempty"` - Null uint `json:"nulls,omitempty" yaml:"nulls,omitempty"` - Distinct uint `json:"distinct,omitempty" yaml:"distinct,omitempty"` - Min *T `json:"min,omitempty" yaml:"min,omitempty"` - Max *T `json:"max,omitempty" yaml:"max,omitempty"` - Samples []T `json:"samples" yaml:"samples" jsonschema:"required"` +type Generic struct { + Count uint `json:"count" yaml:"count" jsonschema:"required"` + Empty uint `json:"empty,omitempty" yaml:"empty,omitempty"` + Null uint `json:"nulls,omitempty" yaml:"nulls,omitempty"` + Distinct uint `json:"distinct,omitempty" yaml:"distinct,omitempty"` + Min any `json:"min,omitempty" yaml:"min,omitempty"` + Max any `json:"max,omitempty" yaml:"max,omitempty"` + Samples []any `json:"samples" yaml:"samples" jsonschema:"required"` } type String struct { @@ -20,9 +18,9 @@ type String struct { } type StringLen struct { - Length int `json:"length" yaml:"length" jsonschema:"required"` - Freq float64 `json:"freq" yaml:"freq" jsonschema:"required"` - Metrics Generic[string] `json:"metrics" yaml:"metrics" jsonschema:"required"` + Length int 
`json:"length" yaml:"length" jsonschema:"required"` + Freq float64 `json:"freq" yaml:"freq" jsonschema:"required"` + Metrics Generic `json:"metrics" yaml:"metrics" jsonschema:"required"` } type Numeric struct{} diff --git a/pkg/rimo/driven.go b/pkg/rimo/driven.go index 1928b2a..f5cb43e 100644 --- a/pkg/rimo/driven.go +++ b/pkg/rimo/driven.go @@ -17,16 +17,21 @@ package rimo -import ( - "github.com/cgi-fr/rimo/pkg/model" -) +import "github.com/cgi-fr/rimo/pkg/modelv2" + +type ColReader interface { + ColName() string + TableName() string + Next() bool + Value() (any, error) +} type Reader interface { BaseName() string - Next() bool // itère sur les colonnes. - Value() ([]interface{}, string, string, error) // colValues, colName, tableName + Next() bool + Col() (ColReader, error) } type Writer interface { - Export(base *model.Base) error + Export(base *modelv2.Base) error } diff --git a/pkg/rimo/driven_test.go b/pkg/rimo/driven_test.go deleted file mode 100644 index da635c9..0000000 --- a/pkg/rimo/driven_test.go +++ /dev/null @@ -1,108 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . 
- -package rimo_test - -import ( - "log" - "math" - "testing" - - "github.com/cgi-fr/rimo/pkg/model" - "github.com/cgi-fr/rimo/pkg/rimo" -) - -// TESTS - -func TestTestInterface(t *testing.T) { - t.Parallel() - - var _ rimo.Reader = (*TestReader)(nil) - - var _ rimo.Writer = (*TestWriter)(nil) -} - -// TestReader implementation - -type colInput struct { - ColName string - ColValues []interface{} -} - -type TestReader struct { - baseName string - data []colInput - tableNames []string // Next() will progressively change tableName - // internal - index int - currentValues []interface{} - currentColName string - currentTableName string -} - -func (r *TestReader) BaseName() string { - return r.baseName -} - -func (r *TestReader) Next() bool { - if r.index == len(r.data) { - log.Println("End of data") - - return false - } - - // update tableName - if len(r.tableNames) == len(r.data) { - r.currentTableName = r.tableNames[r.index] - } else { - // use a percentage to determine the table name to use from the list - percentageComplete := float64(r.index) / float64(len(r.data)) - expectedTableIndex := percentageComplete * float64(len(r.tableNames)) - roundedTableIndex := math.Floor(expectedTableIndex) - tableNameIndex := int(roundedTableIndex) - - r.currentTableName = r.tableNames[tableNameIndex] - } - - r.currentColName = r.data[r.index].ColName - r.currentValues = r.data[r.index].ColValues - r.index++ - - return true -} - -func (r *TestReader) Value() ([]interface{}, string, string, error) { //nolint:wsl - // log.Printf("Processing %s column in %s table", r.currentTableName, r.currentColName) - - return r.currentValues, r.currentColName, r.currentTableName, nil -} - -// TestWriter implementation - -type TestWriter struct { - base model.Base -} - -func (w *TestWriter) Export(base *model.Base) error { - w.base = *base - - return nil -} - -func (w *TestWriter) Base() *model.Base { - return &w.base -} diff --git a/pkg/rimo/driver.go b/pkg/rimo/driver.go index e626bbc..803b9e5 
100644 --- a/pkg/rimo/driver.go +++ b/pkg/rimo/driver.go @@ -20,8 +20,8 @@ package rimo import ( "fmt" - "github.com/cgi-fr/rimo/pkg/metric" - "github.com/cgi-fr/rimo/pkg/model" + "github.com/cgi-fr/rimo/pkg/metricv2" + "github.com/cgi-fr/rimo/pkg/modelv2" "github.com/rs/zerolog/log" ) @@ -32,26 +32,46 @@ func AnalyseBase(reader Reader, writer Writer) error { // log.Debug().Msgf("Processing [%s base]", baseName) - base := model.NewBase(baseName) + base := modelv2.NewBase(baseName) for reader.Next() { // itère colonne par colonne - colValues, colName, tableName, err := reader.Value() + valreader, err := reader.Col() if err != nil { - return fmt.Errorf("failed to get column value : %w", err) + return fmt.Errorf("failed to get column reader : %w", err) } - column, err := metric.ComputeMetric(colName, colValues) - if err != nil { - return fmt.Errorf("failed to compute column : %w", err) - } + nilcount := 0 + + for valreader.Next() { + val, err := valreader.Value() + if err != nil { + return fmt.Errorf("failed to read value : %w", err) + } + + log.Debug().Msgf("Processing [%s base][%s table][%s column]", baseName, valreader.TableName(), valreader.ColName()) + + switch valtyped := val.(type) { + case string: + col, err := AnalyseString(nilcount, valtyped, valreader) + if err != nil { + return fmt.Errorf("failed to analyse column : %w", err) + } + + table, exists := base.Tables[valreader.TableName()] + if !exists { + table = modelv2.Table{ + Columns: []modelv2.Column{}, + } + } - log.Debug().Msgf("Processing [%s base][%s table][%s column]", baseName, tableName, column.Name) - // log.Debug().Msg(valast.String(column)) + table.Columns = append(table.Columns, col) - base.AddColumn(column, tableName) + base.Tables[valreader.TableName()] = table + } + } } - base.SortBase() + // base.SortBase() // log.Debug().Msg("---------- Finish processing base :") // log.Debug().Msg(valast.String(*base)) @@ -64,3 +84,41 @@ func AnalyseBase(reader Reader, writer Writer) error { return nil } 
+ +func AnalyseString(nilcount int, firstValue string, reader ColReader) (modelv2.Column, error) { + column := modelv2.Column{ + Name: reader.ColName(), + Type: "string", + Config: modelv2.Config{}, //nolint:exhaustruct + MainMetric: modelv2.Generic{}, //nolint:exhaustruct + StringMetric: &modelv2.String{}, //nolint:exhaustruct + NumericMetric: nil, + BoolMetric: nil, + } + + analyser := metricv2.NewString(5, true) + + for i := 0; i < nilcount; i++ { + analyser.Read(nil) + } + + analyser.Read(&firstValue) + + for reader.Next() { + val, err := reader.Value() + if err != nil { + return column, fmt.Errorf("failed to read value : %w", err) + } + + switch valtyped := val.(type) { + case string: + analyser.Read(&valtyped) + default: + return column, fmt.Errorf("invalue value type : %w", err) + } + } + + analyser.Build(&column) + + return column, nil +} From 47d9e50a97b648f9a8abedcc53dc20d9c8774bf9 Mon Sep 17 00:00:00 2001 From: adrienaury Date: Fri, 1 Dec 2023 15:33:56 +0000 Subject: [PATCH 08/26] refactor: infra file reader --- go.mod | 5 +- go.sum | 2 + internal/infra/fileWriter.go | 3 +- internal/infra/filesReader_v2.go | 106 +++++++++++++++++++++++++++++++ internal/infra/infra_test.go | 2 +- 5 files changed, 115 insertions(+), 3 deletions(-) create mode 100644 internal/infra/filesReader_v2.go diff --git a/go.mod b/go.mod index cddbdd1..08ad07f 100644 --- a/go.mod +++ b/go.mod @@ -11,7 +11,10 @@ require ( gopkg.in/yaml.v3 v3.0.1 ) -require gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect +require ( + github.com/goccy/go-json v0.10.2 // indirect + gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect +) require ( github.com/davecgh/go-spew v1.1.1 // indirect diff --git a/go.sum b/go.sum index 4893f5a..515a36f 100644 --- a/go.sum +++ b/go.sum @@ -4,6 +4,8 @@ github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/frankban/quicktest v1.14.4 h1:g2rn0vABPOOXmZUj+vbmUp0lPoXEMuhTpIluN0XL9UY= +github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU= +github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= diff --git a/internal/infra/fileWriter.go b/internal/infra/fileWriter.go index d652811..c20379e 100644 --- a/internal/infra/fileWriter.go +++ b/internal/infra/fileWriter.go @@ -22,6 +22,7 @@ import ( "os" "github.com/cgi-fr/rimo/pkg/model" + "github.com/cgi-fr/rimo/pkg/modelv2" "gopkg.in/yaml.v3" ) @@ -35,7 +36,7 @@ func StdoutWriterFactory() *StdoutWriter { return &writer } -func (w *StdoutWriter) Export(base *model.Base) error { +func (w *StdoutWriter) Export(base *modelv2.Base) error { fmt.Printf("%v\n", base) return nil diff --git a/internal/infra/filesReader_v2.go b/internal/infra/filesReader_v2.go new file mode 100644 index 0000000..7902788 --- /dev/null +++ b/internal/infra/filesReader_v2.go @@ -0,0 +1,106 @@ +package infra + +import ( + "errors" + "fmt" + "os" + "path" + "strings" + + "github.com/cgi-fr/rimo/pkg/rimo" + "github.com/goccy/go-json" +) + +var ErrReadFile = errors.New("error while reading file") + +type JSONLFileReader struct { + tablename string + source *os.File + columns []string + current int + decoder *json.Decoder + basename string +} + +func NewJSONLFileReader(basename string, filepath string) (*JSONLFileReader, error) { + source, err := os.Open(filepath) + if err != nil { + return nil, fmt.Errorf("%w", err) + } + + template := map[string]any{} + + decoder := json.NewDecoder(source) + if err := decoder.Decode(&template); err != nil { + return nil, 
fmt.Errorf("%w: %w", ErrReadFile, err) + } + + source.Seek(0, 0) + + columns := make([]string, 0, len(template)) + for column := range template { + columns = append(columns, column) + } + + return &JSONLFileReader{ + tablename: strings.TrimSuffix(path.Base(filepath), path.Ext(filepath)), + source: source, + columns: columns, + current: -1, + decoder: json.NewDecoder(source), + basename: basename, + }, nil +} + +func (fr *JSONLFileReader) BaseName() string { + return fr.basename +} + +func (fr *JSONLFileReader) Next() bool { + fr.current++ + + fr.source.Seek(0, 0) + fr.decoder = json.NewDecoder(fr.source) + + return fr.current < len(fr.columns) +} + +func (fr *JSONLFileReader) Col() (rimo.ColReader, error) { //nolint:ireturn + return NewJSONLColReader(fr.tablename, fr.columns[fr.current], fr.decoder), nil +} + +type JSONLColReader struct { + table string + column string + decoder *json.Decoder +} + +func NewJSONLColReader(table, column string, decoder *json.Decoder) *JSONLColReader { + return &JSONLColReader{ + table: table, + column: column, + decoder: decoder, + } +} + +func (cr *JSONLColReader) ColName() string { + return cr.column +} + +func (cr *JSONLColReader) TableName() string { + return cr.table +} + +func (cr *JSONLColReader) Next() bool { + return cr.decoder.More() +} + +func (cr *JSONLColReader) Value() (any, error) { + row := map[string]any{} + + if err := cr.decoder.Decode(&row); err != nil { + return nil, fmt.Errorf("%w: %w", ErrReadFile, err) + } + + return row[cr.column], nil +} diff --git a/internal/infra/infra_test.go b/internal/infra/infra_test.go index ad40f79..f126d6d 100644 --- a/internal/infra/infra_test.go +++ b/internal/infra/infra_test.go @@ -36,7 +36,7 @@ func TestPipeline(t *testing.T) { inputPath := filepath.Join(testdataDir, "data1/data_input.jsonl") - reader, err := infra.FilesReaderFactory([]string{inputPath}) + reader, err := infra.NewJSONLFileReader("base", inputPath) require.NoError(t, err) writer := infra.StdoutWriterFactory() 
From 77541904e3da90e5bd1a6fb562b228785bd43b8e Mon Sep 17 00:00:00 2001 From: adrienaury Date: Fri, 1 Dec 2023 16:01:25 +0000 Subject: [PATCH 09/26] refactor: fix driver --- pkg/rimo/driver.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/rimo/driver.go b/pkg/rimo/driver.go index 803b9e5..30b143b 100644 --- a/pkg/rimo/driver.go +++ b/pkg/rimo/driver.go @@ -67,6 +67,8 @@ func AnalyseBase(reader Reader, writer Writer) error { table.Columns = append(table.Columns, col) base.Tables[valreader.TableName()] = table + case nil: + nilcount++ } } } From ced7cc3026b4a93ca43cf54985942bb50001d7d3 Mon Sep 17 00:00:00 2001 From: Adrien Aury <44274230+adrienaury@users.noreply.github.com> Date: Sat, 2 Dec 2023 21:04:53 +0000 Subject: [PATCH 10/26] refactor: reader v2 --- cmd/rimo/main.go | 17 +-- go.mod | 6 +- internal/infra/fileWriter.go | 3 +- internal/infra/fileWriter_test.go | 84 -------------- internal/infra/filesReader_test.go | 83 -------------- internal/infra/filesReader_v2.go | 50 +++++++++ pkg/rimo/driver_test.go | 175 ----------------------------- 7 files changed, 55 insertions(+), 363 deletions(-) delete mode 100644 internal/infra/fileWriter_test.go delete mode 100644 internal/infra/filesReader_test.go delete mode 100644 pkg/rimo/driver_test.go diff --git a/cmd/rimo/main.go b/cmd/rimo/main.go index e75290a..8e51ee9 100644 --- a/cmd/rimo/main.go +++ b/cmd/rimo/main.go @@ -77,25 +77,12 @@ func main() { //nolint:funlen outputDir := args[1] // Reader - - inputList, err := BuildFilepathList(inputDir, ".jsonl") - if err != nil { - log.Fatal().Msgf("error listing files: %v", err) - } - - reader, err := infra.FilesReaderFactory(inputList) + reader, err := infra.NewJSONLFolderReader(inputDir) if err != nil { log.Fatal().Msgf("error creating reader: %v", err) } - // Writer - // (could be relocated to infra.FilesReader) - baseName, _, err := infra.ExtractName(inputList[0]) - if err != nil { - log.Fatal().Msgf("error extracting base name: %v", err) - } - - outputPath := 
filepath.Join(outputDir, fmt.Sprintf("%s.yaml", baseName)) + outputPath := filepath.Join(outputDir, fmt.Sprintf("%s.yaml", reader.BaseName())) writer, err := infra.YAMLWriterFactory(outputPath) if err != nil { diff --git a/go.mod b/go.mod index 08ad07f..50a9bd2 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module github.com/cgi-fr/rimo go 1.20 require ( + github.com/goccy/go-json v0.10.2 github.com/hexops/valast v1.4.4 github.com/rs/zerolog v1.30.0 github.com/spf13/cobra v1.7.0 @@ -11,10 +12,7 @@ require ( gopkg.in/yaml.v3 v3.0.1 ) -require ( - github.com/goccy/go-json v0.10.2 // indirect - gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect -) +require gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect require ( github.com/davecgh/go-spew v1.1.1 // indirect diff --git a/internal/infra/fileWriter.go b/internal/infra/fileWriter.go index c20379e..617770d 100644 --- a/internal/infra/fileWriter.go +++ b/internal/infra/fileWriter.go @@ -21,7 +21,6 @@ import ( "fmt" "os" - "github.com/cgi-fr/rimo/pkg/model" "github.com/cgi-fr/rimo/pkg/modelv2" "gopkg.in/yaml.v3" ) @@ -62,7 +61,7 @@ func YAMLWriterFactory(filepath string) (*YAMLWriter, error) { } // Write a YAML file from RIMO base at outputPath. -func (w *YAMLWriter) Export(base *model.Base) error { +func (w *YAMLWriter) Export(base *modelv2.Base) error { outputFile, err := os.Create(w.outputPath) if err != nil { return fmt.Errorf("failed to create output file: %w", err) diff --git a/internal/infra/fileWriter_test.go b/internal/infra/fileWriter_test.go deleted file mode 100644 index cd34651..0000000 --- a/internal/infra/fileWriter_test.go +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. 
-// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package infra_test - -import ( - "os" - "path/filepath" - "testing" - - "github.com/cgi-fr/rimo/internal/infra" - "github.com/cgi-fr/rimo/pkg/model" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -const ( - dataDir = "../../testdata/" -) - -func TestWriterYAML(t *testing.T) { - t.Parallel() - - base := model.Base{ - Name: "databaseName", - Tables: []model.Table{ - { - Name: "tableName", - Columns: []model.Column{}, - }, - }, - } - - // Create a temporary directory for the test - tempDir, err := os.MkdirTemp(dataDir, "export_test") - require.NoError(t, err) - - defer os.RemoveAll(tempDir) - - // Create a temporary file for the output - outputFile := filepath.Join(tempDir, "output.yaml") - - // Create the writer - writer, err := infra.YAMLWriterFactory(outputFile) - require.NoError(t, err) - - err = writer.Export(&base) - require.NoError(t, err) - - // Read the output file and check its contents - file, err := os.Open(outputFile) - require.NoError(t, err) - - defer file.Close() - - stat, err := file.Stat() - require.NoError(t, err) - - outputData := make([]byte, stat.Size()) - _, err = file.Read(outputData) - require.NoError(t, err) - - expectedData := `database: databaseName -tables: - - name: tableName - columns: [] -` - - assert.Equal(t, expectedData, string(outputData)) -} diff --git a/internal/infra/filesReader_test.go b/internal/infra/filesReader_test.go deleted file mode 100644 index ed6cdc0..0000000 --- a/internal/infra/filesReader_test.go +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. 
-// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package infra_test - -import ( - "fmt" - "path/filepath" - "testing" - - "github.com/cgi-fr/rimo/internal/infra" - "github.com/stretchr/testify/assert" -) - -func TestReader(t *testing.T) { - t.Parallel() - - inputFile := filepath.Join(dataDir, "data0/data_input.jsonl") - - reader, err := infra.FilesReaderFactory([]string{inputFile}) - assert.NoError(t, err) - - // Assertions. - - actualBaseName := reader.BaseName() - expectedBaseName := "data" - assert.Equal(t, expectedBaseName, actualBaseName) - - expectedTableName := "input" - expectedDataMap := map[string][]interface{}{ - "address": {"PSC", "095", "06210"}, - "age": {nil, nil, float64(61)}, - "major": {true, false, true}, - "empty": {nil, nil, nil}, - } - - for reader.Next() { - values, colName, tableName, err := reader.Value() - if err != nil { - assert.NoError(t, err) - } - - expectedColData, ok := expectedDataMap[colName] - if !ok { - assert.Fail(t, "column name not found : %s", colName) - } - - assert.Equal(t, expectedColData, values) - assert.Equal(t, expectedTableName, tableName) - } -} - -func TestReaderMultipleFiles(t *testing.T) { - t.Parallel() - - inputFile := filepath.Join(dataDir, "data0/data_input.jsonl") - inputFile2 := filepath.Join(dataDir, "data0/data_input2.jsonl") - reader, err := infra.FilesReaderFactory([]string{inputFile, inputFile2}) - assert.NoError(t, err) - - for reader.Next() { - 
values, colName, tableName, err := reader.Value() - if err != nil { - assert.NoError(t, err) - } - - fmt.Printf("%s.%s: %v\n", tableName, colName, values) - } -} diff --git a/internal/infra/filesReader_v2.go b/internal/infra/filesReader_v2.go index 7902788..bef06a6 100644 --- a/internal/infra/filesReader_v2.go +++ b/internal/infra/filesReader_v2.go @@ -5,6 +5,7 @@ import ( "fmt" "os" "path" + "path/filepath" "strings" "github.com/cgi-fr/rimo/pkg/rimo" @@ -13,6 +14,55 @@ import ( var ErrReadFile = errors.New("error while reading file") +type JSONLFolderReader struct { + basename string + readers []*JSONLFileReader + current int +} + +func NewJSONLFolderReader(folderpath string) (*JSONLFolderReader, error) { + basename := path.Base(folderpath) + + pattern := filepath.Join(folderpath, "*.jsonl") + + files, err := filepath.Glob(pattern) + if err != nil { + return nil, fmt.Errorf("error listing files: %w", err) + } + + readers := make([]*JSONLFileReader, len(files)) + for index, filepath := range files { + readers[index], err = NewJSONLFileReader(basename, filepath) + if err != nil { + return nil, fmt.Errorf("error opening files: %w", err) + } + } + + return &JSONLFolderReader{ + basename: basename, + readers: readers, + current: -1, + }, nil +} + +func (r *JSONLFolderReader) BaseName() string { + return r.basename +} + +func (r *JSONLFolderReader) Next() bool { + if r.current < len(r.readers) && !r.readers[r.current].Next() { + r.current++ + + return r.Next() + } + + return r.current < len(r.readers) +} + +func (r *JSONLFolderReader) Col() (rimo.ColReader, error) { //nolint:ireturn + return r.readers[r.current].Col() +} + type JSONLFileReader struct { tablename string source *os.File diff --git a/pkg/rimo/driver_test.go b/pkg/rimo/driver_test.go deleted file mode 100644 index 186e18f..0000000 --- a/pkg/rimo/driver_test.go +++ /dev/null @@ -1,175 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. 
-// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package rimo_test - -import ( - "fmt" - "path/filepath" - "testing" - "time" - - "github.com/cgi-fr/rimo/internal/infra" - "github.com/cgi-fr/rimo/pkg/model" - "github.com/cgi-fr/rimo/pkg/rimo" - - "github.com/hexops/valast" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -// Run Analyse pipeline with FilesReader and TestWriter and compare with expected result. -const ( - dataDir = "../../testdata/" - inputName = "data_input.jsonl" - outputName = "interface_data_output.yaml" - expectedName = "data_expected.yaml" -) - -type testCase struct { - name string - inputPath string - expectedPath string -} - -func getTestCase(dataFolder string) testCase { - return testCase{ - name: filepath.Base(dataFolder), - inputPath: filepath.Join(dataFolder, inputName), - expectedPath: filepath.Join(dataFolder, expectedName), - } -} - -// PIPELINE TESTS - -// Note : numeric value should be converted to float64. 
-func TestManualPipeline(t *testing.T) { - t.Parallel() - - // Set up TestReader - baseName := "databaseName" - tableNames := []string{"tableTest"} - testInput := []colInput{ - { - ColName: "string", - ColValues: []interface{}{"val1", "val2", "val3"}, - }, - { - ColName: "col2", - ColValues: []interface{}{true, false, nil}, - }, - { - ColName: "col9", - ColValues: []interface{}{float64(31), float64(29), float64(42)}, - }, - { - ColName: "empty", - ColValues: []interface{}{nil, nil, nil}, - }, - } - - testReader := TestReader{ //nolint:exhaustruct - baseName: baseName, - tableNames: tableNames, - data: testInput, - index: 0, - } - - testWriter := TestWriter{} //nolint:exhaustruct - - err := rimo.AnalyseBase(&testReader, &testWriter) - if err != nil { - t.Errorf("Error: %v", err) - } - - t.Logf("Base returned : %s", valast.String(*testWriter.Base())) -} - -// Ensure that the pipeline produce the same base as expected. -func TestPipeline(t *testing.T) { - t.Parallel() - - testCases := []testCase{} - testCases = append(testCases, getTestCase("../../testdata/data1/")) - // testCases = append(testCases, getTestCase("../../testdata/data2/")) - - for _, testCase := range testCases { - testCase := testCase // capture range variable - t.Run(testCase.name, func(t *testing.T) { - t.Parallel() - - // Actual base - - reader, err := infra.FilesReaderFactory([]string{testCase.inputPath}) - assert.NoError(t, err) - - writer := &TestWriter{} //nolint:exhaustruct - - err = rimo.AnalyseBase(reader, writer) - assert.NoError(t, err) - - actualBase := writer.Base() - - // Expected base - expectedBase, err := model.LoadBase(testCase.expectedPath) - if err != nil { - t.Errorf("Error: %v", err) - } - - // Remove sample - model.RemoveSampleFromBase(expectedBase) - model.RemoveSampleFromBase(actualBase) - - fmt.Printf("Actual base : %s\n", valast.String(*actualBase)) - // Compare - equal, diff := model.SameBase(expectedBase, actualBase) - if !equal { - t.Errorf("Base are not equal:\n%s", 
diff) - } - }) - } -} - -// Benchmark (same as previous analyse_test.go benchmark). -func BenchmarkAnalyseInterface(b *testing.B) { - for _, numLines := range []int{100, 1000, 10000, 100000} { - inputPath := filepath.Join(dataDir, fmt.Sprintf("benchmark/mixed/%d_input.jsonl", numLines)) - inputList := []string{inputPath} - outputPath := filepath.Join(dataDir, fmt.Sprintf("benchmark/mixed/%dinterface_output.yaml", numLines)) - - b.Run(fmt.Sprintf("numLines=%d", numLines), func(b *testing.B) { - startTime := time.Now() - - reader, err := infra.FilesReaderFactory(inputList) - require.NoError(b, err) - - writer, err := infra.YAMLWriterFactory(outputPath) - require.NoError(b, err) - - b.ResetTimer() - for n := 0; n < b.N; n++ { - err := rimo.AnalyseBase(reader, writer) - require.NoError(b, err) - } - b.StopTimer() - - elapsed := time.Since(startTime) - linesPerSecond := float64(numLines*b.N) / elapsed.Seconds() - b.ReportMetric(linesPerSecond, "lines/s") - }) - } -} From f805374b16b26991bd6ce86fa01d1c7372be393f Mon Sep 17 00:00:00 2001 From: Adrien Aury <44274230+adrienaury@users.noreply.github.com> Date: Sat, 2 Dec 2023 21:06:30 +0000 Subject: [PATCH 11/26] refactor: reader v2 --- internal/infra/filesReader_v2.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/infra/filesReader_v2.go b/internal/infra/filesReader_v2.go index bef06a6..c37dfe5 100644 --- a/internal/infra/filesReader_v2.go +++ b/internal/infra/filesReader_v2.go @@ -41,7 +41,7 @@ func NewJSONLFolderReader(folderpath string) (*JSONLFolderReader, error) { return &JSONLFolderReader{ basename: basename, readers: readers, - current: -1, + current: 0, }, nil } From a5b067ea9fbb0473014f8bead4641f44c8bcfa0f Mon Sep 17 00:00:00 2001 From: Adrien Aury <44274230+adrienaury@users.noreply.github.com> Date: Mon, 4 Dec 2023 11:14:55 +0000 Subject: [PATCH 12/26] refactor: lint + sample-size flag --- cmd/rimo/main.go | 10 +++- internal/infra/filesReader_v2.go | 9 +++- internal/infra/infra_test.go 
| 93 -------------------------------- internal/infra/loader_test.go | 39 -------------- pkg/metric/metricstring.go | 20 +++---- pkg/metric/metricstring_test.go | 2 + pkg/metricv2/metric_test.go | 6 +-- pkg/metricv2/string.go | 4 +- pkg/modelv2/base.go | 4 +- pkg/rimo/driver.go | 12 +++-- 10 files changed, 44 insertions(+), 155 deletions(-) delete mode 100644 internal/infra/infra_test.go delete mode 100644 internal/infra/loader_test.go diff --git a/cmd/rimo/main.go b/cmd/rimo/main.go index 8e51ee9..2b0f6a5 100644 --- a/cmd/rimo/main.go +++ b/cmd/rimo/main.go @@ -30,6 +30,8 @@ import ( "github.com/spf13/cobra" ) +const DefaultSampleSize = uint(5) + // Provisioned by ldflags. var ( name string //nolint: gochecknoglobals @@ -37,6 +39,8 @@ var ( commit string //nolint: gochecknoglobals buildDate string //nolint: gochecknoglobals builtBy string //nolint: gochecknoglobals + + sampleSize uint //nolint: gochecknoglobals ) func main() { //nolint:funlen @@ -89,7 +93,9 @@ func main() { //nolint:funlen log.Fatal().Msgf("error creating writer: %v", err) } - err = rimo.AnalyseBase(reader, writer) + driver := rimo.Driver{SampleSize: sampleSize} + + err = driver.AnalyseBase(reader, writer) if err != nil { log.Fatal().Msgf("error generating rimo.yaml: %v", err) } @@ -98,6 +104,8 @@ func main() { //nolint:funlen }, } + rimoAnalyseCmd.Flags().UintVar(&sampleSize, "sample-size", DefaultSampleSize, "number of sample value to collect") + rootCmd.AddCommand(rimoAnalyseCmd) rootCmd.AddCommand(rimoSchemaCmd) diff --git a/internal/infra/filesReader_v2.go b/internal/infra/filesReader_v2.go index c37dfe5..1e4a5d9 100644 --- a/internal/infra/filesReader_v2.go +++ b/internal/infra/filesReader_v2.go @@ -85,7 +85,9 @@ func NewJSONLFileReader(basename string, filepath string) (*JSONLFileReader, err return nil, fmt.Errorf("%w: %w", ErrReadFile, err) } - source.Seek(0, 0) + if _, err := source.Seek(0, 0); err != nil { + return nil, fmt.Errorf("%w: %w", ErrReadFile, err) + } columns := make([]string, 
0, len(template)) for column := range template { @@ -109,7 +111,10 @@ func (fr *JSONLFileReader) BaseName() string { func (fr *JSONLFileReader) Next() bool { fr.current++ - fr.source.Seek(0, 0) + if _, err := fr.source.Seek(0, 0); err != nil { + panic(err) + } + fr.decoder = json.NewDecoder(fr.source) return fr.current < len(fr.columns) diff --git a/internal/infra/infra_test.go b/internal/infra/infra_test.go deleted file mode 100644 index f126d6d..0000000 --- a/internal/infra/infra_test.go +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package infra_test - -import ( - "path/filepath" - "testing" - - "github.com/cgi-fr/rimo/internal/infra" - "github.com/cgi-fr/rimo/pkg/rimo" - "github.com/stretchr/testify/require" -) - -const ( - testdataDir = "../../testdata/" -) - -// Test RIMO pipeline with FilesReader, JSONLinesLoader and YAMLWriter. -func TestPipeline(t *testing.T) { - t.Parallel() - - inputPath := filepath.Join(testdataDir, "data1/data_input.jsonl") - - reader, err := infra.NewJSONLFileReader("base", inputPath) - require.NoError(t, err) - - writer := infra.StdoutWriterFactory() - - err = rimo.AnalyseBase(reader, writer) - require.NoError(t, err) -} - -// var ( -// Readers []*rimo.Reader -// Writers []*rimo.Writer -// ) - -// // List of implemented readers and writers. 
-// func GetReaders(filepathList []string) []*rimo.Reader { -// filesReader, err := infra.FilesReaderFactory(filepathList) -// if err != nil { -// panic(err) -// } - -// Readers = []*rimo.Reader{filesReader} - -// return Readers -// } - -// func GetWriters() []*rimo.Writer { -// yamlWriter := infra.YAMLWriterFactory("../../testdata/data1/data_output.yaml") - -// Writers = []*rimo.Writer{yamlWriter, infra.StdoutWriter{}} - -// return Writers -// } - -// func TestInterface(t *testing.T) { -// t.Parallel() - -// Writers = GetWriters() -// Readers = GetReaders([]string{"../../testdata/data1/data_input.jsonl"}) -// // Assert that all readers and writers implement the Reader and Writer interfaces. -// for _, reader := range Readers { -// var _ rimo.Reader = (reader)(nil) -// } -// for _, writer := range Writers { -// var _ rimo.Reader = (writer)(nil) -// } - -// // Assert that all combinations of readers and writers can be used in the pipeline. -// for _, reader := range Readers { -// for _, writer := range Writers { -// err := rimo.AnalyseBase(reader, writer) -// require.NoError(t, err) -// } -// } -// } diff --git a/internal/infra/loader_test.go b/internal/infra/loader_test.go deleted file mode 100644 index d55186d..0000000 --- a/internal/infra/loader_test.go +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . 
- -package infra_test - -import ( - "fmt" - "path/filepath" - "testing" - - "github.com/cgi-fr/rimo/internal/infra" - "github.com/stretchr/testify/require" -) - -func TestLoaderJSONL(t *testing.T) { - t.Parallel() - - path := filepath.Join(testdataDir, "data1/data_input.jsonl") - - LoaderJSONL := infra.JSONLinesLoader{} - - data, err := LoaderJSONL.Load(path) - require.NoError(t, err) - fmt.Printf("dataMap: %v\n", data) -} diff --git a/pkg/metric/metricstring.go b/pkg/metric/metricstring.go index b984e18..9765b97 100644 --- a/pkg/metric/metricstring.go +++ b/pkg/metric/metricstring.go @@ -197,16 +197,16 @@ func (s *String) Build() model.Col[string] { result.StringMetric.MaxLen = lengths[len(lengths)-1] for _, length := range lengths { - len := model.StringLen{} - len.Length = length - len.Freq = float64(s.byLen[length].CountTotal()) / float64(s.main.CountTotal()) - len.Metrics.Count = s.byLen[length].CountTotal() - len.Metrics.Empty = s.byLen[length].CountEmpty() - len.Metrics.Null = s.byLen[length].CountNulls() - len.Metrics.Max = s.byLen[length].Max() - len.Metrics.Min = s.byLen[length].Min() - len.Metrics.Samples = s.byLen[length].Samples() - result.StringMetric.Lengths = append(result.StringMetric.Lengths, len) + strlen := model.StringLen{} //nolint:exhaustruct + strlen.Length = length + strlen.Freq = float64(s.byLen[length].CountTotal()) / float64(s.main.CountTotal()) + strlen.Metrics.Count = s.byLen[length].CountTotal() + strlen.Metrics.Empty = s.byLen[length].CountEmpty() + strlen.Metrics.Null = s.byLen[length].CountNulls() + strlen.Metrics.Max = s.byLen[length].Max() + strlen.Metrics.Min = s.byLen[length].Min() + strlen.Metrics.Samples = s.byLen[length].Samples() + result.StringMetric.Lengths = append(result.StringMetric.Lengths, strlen) } return result diff --git a/pkg/metric/metricstring_test.go b/pkg/metric/metricstring_test.go index 921461d..c97c247 100644 --- a/pkg/metric/metricstring_test.go +++ b/pkg/metric/metricstring_test.go @@ -62,6 +62,8 @@ func 
TestStringMetric(t *testing.T) { } func TestStringMetricV2(t *testing.T) { + t.Parallel() + analyser := metric.NewString(5) strings := []string{"1", "1", "1", "1", "22", "22", "22", "331", "332", "4441", ""} diff --git a/pkg/metricv2/metric_test.go b/pkg/metricv2/metric_test.go index 73c3a95..93a646d 100644 --- a/pkg/metricv2/metric_test.go +++ b/pkg/metricv2/metric_test.go @@ -17,7 +17,7 @@ func TestStringMetric(t *testing.T) { //nolint:funlen min := "" max := "4441" - expectedMetric := modelv2.Column{ + expectedMetric := modelv2.Column{ //nolint:exhaustruct MainMetric: modelv2.Generic{ Count: 12, Empty: 1, @@ -49,7 +49,7 @@ func TestStringMetric(t *testing.T) { //nolint:funlen }, } - actualMetric := modelv2.Column{} + actualMetric := modelv2.Column{} //nolint:exhaustruct analyser := metricv2.NewString(5, true) for index := range text { @@ -77,6 +77,6 @@ func TestStringMetric(t *testing.T) { //nolint:funlen for i := 0; i < len(expectedMetric.StringMetric.Lengths); i++ { assert.Equal(t, expectedMetric.StringMetric.Lengths[i].Length, actualMetric.StringMetric.Lengths[i].Length) assert.Equal(t, expectedMetric.StringMetric.Lengths[i].Freq, actualMetric.StringMetric.Lengths[i].Freq) - assert.Equal(t, expectedMetric.StringMetric.Lengths[i].Metrics.Samples, actualMetric.StringMetric.Lengths[i].Metrics.Samples) + assert.Equal(t, expectedMetric.StringMetric.Lengths[i].Metrics.Samples, actualMetric.StringMetric.Lengths[i].Metrics.Samples) //nolint:lll } } diff --git a/pkg/metricv2/string.go b/pkg/metricv2/string.go index 9ac37b2..fc55069 100644 --- a/pkg/metricv2/string.go +++ b/pkg/metricv2/string.go @@ -72,13 +72,13 @@ func (a *String) Build(metric *modelv2.Column) { } for length, analyser := range a.byLen { - lenMetric := modelv2.Column{} + lenMetric := modelv2.Column{} //nolint:exhaustruct analyser.Build(&lenMetric) strlen := modelv2.StringLen{ Length: length, Freq: float64(lenMetric.MainMetric.Count) / float64(metric.MainMetric.Count), - Metrics: modelv2.Generic{}, + 
Metrics: modelv2.Generic{}, //nolint:exhaustruct } strlen.Metrics.Count = lenMetric.MainMetric.Count strlen.Metrics.Empty = lenMetric.MainMetric.Empty diff --git a/pkg/modelv2/base.go b/pkg/modelv2/base.go index 663c562..653ee7c 100644 --- a/pkg/modelv2/base.go +++ b/pkg/modelv2/base.go @@ -1,5 +1,7 @@ package modelv2 +const DefaultTableSize = 10 + type Base struct { Name string `json:"database" yaml:"database" jsonschema:"required"` Tables map[string]Table `json:"tables" yaml:"tables" jsonschema:"required"` @@ -12,6 +14,6 @@ type Table struct { func NewBase(name string) *Base { return &Base{ Name: name, - Tables: make(map[string]Table, 10), + Tables: make(map[string]Table, DefaultTableSize), } } diff --git a/pkg/rimo/driver.go b/pkg/rimo/driver.go index 30b143b..cb93d67 100644 --- a/pkg/rimo/driver.go +++ b/pkg/rimo/driver.go @@ -26,7 +26,11 @@ import ( "github.com/rs/zerolog/log" ) -func AnalyseBase(reader Reader, writer Writer) error { +type Driver struct { + SampleSize uint +} + +func (d Driver) AnalyseBase(reader Reader, writer Writer) error { // log.Logger = zerolog.New(os.Stdout).Level(zerolog.DebugLevel) baseName := reader.BaseName() @@ -52,7 +56,7 @@ func AnalyseBase(reader Reader, writer Writer) error { switch valtyped := val.(type) { case string: - col, err := AnalyseString(nilcount, valtyped, valreader) + col, err := d.AnalyseString(nilcount, valtyped, valreader) if err != nil { return fmt.Errorf("failed to analyse column : %w", err) } @@ -87,7 +91,7 @@ func AnalyseBase(reader Reader, writer Writer) error { return nil } -func AnalyseString(nilcount int, firstValue string, reader ColReader) (modelv2.Column, error) { +func (d Driver) AnalyseString(nilcount int, firstValue string, reader ColReader) (modelv2.Column, error) { column := modelv2.Column{ Name: reader.ColName(), Type: "string", @@ -98,7 +102,7 @@ func AnalyseString(nilcount int, firstValue string, reader ColReader) (modelv2.C BoolMetric: nil, } - analyser := metricv2.NewString(5, true) + 
analyser := metricv2.NewString(d.SampleSize, true) for i := 0; i < nilcount; i++ { analyser.Read(nil) From 132a4142cb61bcc7847fc8232c5d05c61140af3e Mon Sep 17 00:00:00 2001 From: Adrien Aury <44274230+adrienaury@users.noreply.github.com> Date: Mon, 4 Dec 2023 13:18:49 +0000 Subject: [PATCH 13/26] refactor: numeric metric --- pkg/metricv2/mean.go | 31 +++++++++++++ pkg/metricv2/numeric.go | 22 +++++++++ pkg/modelv2/metrics.go | 4 +- pkg/rimo/driver.go | 100 ++++++++++++++++++++++++++++++++++++---- pkg/rimo/error.go | 5 ++ 5 files changed, 152 insertions(+), 10 deletions(-) create mode 100644 pkg/metricv2/mean.go create mode 100644 pkg/metricv2/numeric.go create mode 100644 pkg/rimo/error.go diff --git a/pkg/metricv2/mean.go b/pkg/metricv2/mean.go new file mode 100644 index 0000000..8338a08 --- /dev/null +++ b/pkg/metricv2/mean.go @@ -0,0 +1,31 @@ +package metricv2 + +import "github.com/cgi-fr/rimo/pkg/modelv2" + +type Mean struct { + count uint + mean float64 +} + +func NewMean() *Mean { + return &Mean{ + count: 0, + mean: 0, + } +} + +func (a *Mean) Read(value *float64) { + if value == nil { + return + } + + a.count++ + + a.mean += (*value - a.mean) / float64(a.count) +} + +func (a *Mean) Build(metric *modelv2.Column) { + metric.NumericMetric = &modelv2.Numeric{ + Mean: a.mean, + } +} diff --git a/pkg/metricv2/numeric.go b/pkg/metricv2/numeric.go new file mode 100644 index 0000000..65d2eae --- /dev/null +++ b/pkg/metricv2/numeric.go @@ -0,0 +1,22 @@ +package metricv2 + +type Numeric struct { + Multi[float64] +} + +func NewNumeric(sampleSize uint, countDistinct bool) *Numeric { + mainAnalyser := []Analyser[float64]{ + NewCounter[float64](), // count total, count null, count empty + NewMinMax[float64](), // store min and max values + NewSampler[float64](sampleSize), // store few samples + NewMean(), // calculate running mean + } + + if countDistinct { + mainAnalyser = append(mainAnalyser, NewDistinct[float64]()) + } + + return &Numeric{ + Multi: 
Multi[float64]{mainAnalyser}, + } +} diff --git a/pkg/modelv2/metrics.go b/pkg/modelv2/metrics.go index 1d6fb66..212bbc3 100644 --- a/pkg/modelv2/metrics.go +++ b/pkg/modelv2/metrics.go @@ -23,6 +23,8 @@ type StringLen struct { Metrics Generic `json:"metrics" yaml:"metrics" jsonschema:"required"` } -type Numeric struct{} +type Numeric struct { + Mean float64 `json:"mean" yaml:"mean" jsonschema:"required"` +} type Bool struct{} diff --git a/pkg/rimo/driver.go b/pkg/rimo/driver.go index cb93d67..534ed6e 100644 --- a/pkg/rimo/driver.go +++ b/pkg/rimo/driver.go @@ -30,12 +30,10 @@ type Driver struct { SampleSize uint } +//nolint:funlen,cyclop func (d Driver) AnalyseBase(reader Reader, writer Writer) error { - // log.Logger = zerolog.New(os.Stdout).Level(zerolog.DebugLevel) baseName := reader.BaseName() - // log.Debug().Msgf("Processing [%s base]", baseName) - base := modelv2.NewBase(baseName) for reader.Next() { // itère colonne par colonne @@ -70,6 +68,22 @@ func (d Driver) AnalyseBase(reader Reader, writer Writer) error { table.Columns = append(table.Columns, col) + base.Tables[valreader.TableName()] = table + case float64: + col, err := d.AnalyseNumeric(nilcount, valtyped, valreader) + if err != nil { + return fmt.Errorf("failed to analyse column : %w", err) + } + + table, exists := base.Tables[valreader.TableName()] + if !exists { + table = modelv2.Table{ + Columns: []modelv2.Column{}, + } + } + + table.Columns = append(table.Columns, col) + base.Tables[valreader.TableName()] = table case nil: nilcount++ @@ -77,12 +91,6 @@ func (d Driver) AnalyseBase(reader Reader, writer Writer) error { } } - // base.SortBase() - - // log.Debug().Msg("---------- Finish processing base :") - // log.Debug().Msg(valast.String(*base)) - // log.Debug().Msg("----------") - err := writer.Export(base) if err != nil { return fmt.Errorf("failed to export base : %w", err) @@ -128,3 +136,77 @@ func (d Driver) AnalyseString(nilcount int, firstValue string, reader ColReader) return column, nil 
} + +func (d Driver) AnalyseNumeric(nilcount int, firstValue float64, reader ColReader) (modelv2.Column, error) { + column := modelv2.Column{ + Name: reader.ColName(), + Type: "string", + Config: modelv2.Config{}, //nolint:exhaustruct + MainMetric: modelv2.Generic{}, //nolint:exhaustruct + StringMetric: nil, + NumericMetric: &modelv2.Numeric{}, //nolint:exhaustruct + BoolMetric: nil, + } + + analyser := metricv2.NewNumeric(d.SampleSize, true) + + for i := 0; i < nilcount; i++ { + analyser.Read(nil) + } + + analyser.Read(&firstValue) + + for reader.Next() { + val, err := reader.Value() + if err != nil { + return column, fmt.Errorf("failed to read value : %w", err) + } + + valtyped, err := GetFloat64(val) + if err != nil { + return column, fmt.Errorf("failed to read value : %w", err) + } + + analyser.Read(valtyped) + } + + analyser.Build(&column) + + return column, nil +} + +//nolint:cyclop +func GetFloat64(value any) (*float64, error) { + var converted float64 + + switch valtyped := value.(type) { + case float64: + converted = valtyped + case float32: + converted = float64(valtyped) + case int: + converted = float64(valtyped) + case int8: + converted = float64(valtyped) + case int16: + converted = float64(valtyped) + case int32: + converted = float64(valtyped) + case int64: + converted = float64(valtyped) + case uint: + converted = float64(valtyped) + case uint8: + converted = float64(valtyped) + case uint16: + converted = float64(valtyped) + case uint32: + converted = float64(valtyped) + case uint64: + converted = float64(valtyped) + default: + return nil, fmt.Errorf("%w : %T", ErrInvalidValueType, value) + } + + return &converted, nil +} diff --git a/pkg/rimo/error.go b/pkg/rimo/error.go new file mode 100644 index 0000000..083daa5 --- /dev/null +++ b/pkg/rimo/error.go @@ -0,0 +1,5 @@ +package rimo + +import "errors" + +var ErrInvalidValueType = errors.New("invalue value type") From 742c0caa20c2b1070064c5306b8caa3581116c8f Mon Sep 17 00:00:00 2001 From: Adrien Aury 
<44274230+adrienaury@users.noreply.github.com> Date: Mon, 4 Dec 2023 13:34:15 +0000 Subject: [PATCH 14/26] refactor: stable output --- pkg/modelv2/base.go | 9 +++++---- pkg/rimo/driver.go | 24 ++++++++++++++++++++---- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/pkg/modelv2/base.go b/pkg/modelv2/base.go index 653ee7c..2a45f81 100644 --- a/pkg/modelv2/base.go +++ b/pkg/modelv2/base.go @@ -3,17 +3,18 @@ package modelv2 const DefaultTableSize = 10 type Base struct { - Name string `json:"database" yaml:"database" jsonschema:"required"` - Tables map[string]Table `json:"tables" yaml:"tables" jsonschema:"required"` + Name string `json:"database" yaml:"database" jsonschema:"required"` + Tables []Table `json:"tables" yaml:"tables" jsonschema:"required"` } type Table struct { - Columns []Column `json:"columns" yaml:"columns" jsonschema:"required" ` + Name string `json:"name" yaml:"name" jsonschema:"required"` + Columns []Column `json:"columns" yaml:"columns" jsonschema:"required"` } func NewBase(name string) *Base { return &Base{ Name: name, - Tables: make(map[string]Table, DefaultTableSize), + Tables: make([]Table, 0, DefaultTableSize), } } diff --git a/pkg/rimo/driver.go b/pkg/rimo/driver.go index 534ed6e..ebf28a0 100644 --- a/pkg/rimo/driver.go +++ b/pkg/rimo/driver.go @@ -19,6 +19,7 @@ package rimo import ( "fmt" + "sort" "github.com/cgi-fr/rimo/pkg/metricv2" "github.com/cgi-fr/rimo/pkg/modelv2" @@ -35,6 +36,7 @@ func (d Driver) AnalyseBase(reader Reader, writer Writer) error { baseName := reader.BaseName() base := modelv2.NewBase(baseName) + tables := map[string]modelv2.Table{} for reader.Next() { // itère colonne par colonne valreader, err := reader.Col() @@ -59,38 +61,52 @@ func (d Driver) AnalyseBase(reader Reader, writer Writer) error { return fmt.Errorf("failed to analyse column : %w", err) } - table, exists := base.Tables[valreader.TableName()] + table, exists := tables[valreader.TableName()] if !exists { table = modelv2.Table{ + Name: 
valreader.TableName(), Columns: []modelv2.Column{}, } } table.Columns = append(table.Columns, col) - base.Tables[valreader.TableName()] = table + tables[valreader.TableName()] = table case float64: col, err := d.AnalyseNumeric(nilcount, valtyped, valreader) if err != nil { return fmt.Errorf("failed to analyse column : %w", err) } - table, exists := base.Tables[valreader.TableName()] + table, exists := tables[valreader.TableName()] if !exists { table = modelv2.Table{ + Name: valreader.TableName(), Columns: []modelv2.Column{}, } } table.Columns = append(table.Columns, col) - base.Tables[valreader.TableName()] = table + tables[valreader.TableName()] = table case nil: nilcount++ } } } + for _, table := range tables { + sort.SliceStable(table.Columns, func(i, j int) bool { + return table.Columns[i].Name < table.Columns[j].Name + }) + + base.Tables = append(base.Tables, table) + } + + sort.SliceStable(base.Tables, func(i, j int) bool { + return base.Tables[i].Name < base.Tables[j].Name + }) + err := writer.Export(base) if err != nil { return fmt.Errorf("failed to export base : %w", err) From 7e0f04c75c0689c82270a3b30c0b4c2d28046bdd Mon Sep 17 00:00:00 2001 From: Adrien Aury <44274230+adrienaury@users.noreply.github.com> Date: Mon, 4 Dec 2023 13:55:08 +0000 Subject: [PATCH 15/26] refactor: bool metric --- pkg/metricv2/analyser.go | 5 ++- pkg/metricv2/bool.go | 20 +++++++++++ pkg/metricv2/counter.go | 5 ++- pkg/metricv2/distinct.go | 5 ++- pkg/metricv2/sampler.go | 5 ++- pkg/metricv2/trueratio.go | 33 ++++++++++++++++++ pkg/metricv2/types.go | 7 ++++ pkg/modelv2/metrics.go | 4 ++- pkg/rimo/driver.go | 70 ++++++++++++++++++++++++++++++++++++--- 9 files changed, 136 insertions(+), 18 deletions(-) create mode 100644 pkg/metricv2/bool.go create mode 100644 pkg/metricv2/trueratio.go create mode 100644 pkg/metricv2/types.go diff --git a/pkg/metricv2/analyser.go b/pkg/metricv2/analyser.go index 1191ce1..d6d5187 100644 --- a/pkg/metricv2/analyser.go +++ b/pkg/metricv2/analyser.go 
@@ -2,15 +2,14 @@ package metricv2 import ( "github.com/cgi-fr/rimo/pkg/modelv2" - "golang.org/x/exp/constraints" ) -type Analyser[T constraints.Ordered] interface { +type Analyser[T Accepted] interface { Read(*T) Build(*modelv2.Column) } -type Multi[T constraints.Ordered] struct { +type Multi[T Accepted] struct { analyser []Analyser[T] } diff --git a/pkg/metricv2/bool.go b/pkg/metricv2/bool.go new file mode 100644 index 0000000..2a219b7 --- /dev/null +++ b/pkg/metricv2/bool.go @@ -0,0 +1,20 @@ +package metricv2 + +type Bool struct { + Multi[bool] +} + +func NewBool(sampleSize uint, countDistinct bool) *Bool { + mainAnalyser := []Analyser[bool]{ + NewCounter[bool](), // count total, count null + NewTrueRatio(), // calculate true ratio + } + + if countDistinct { + mainAnalyser = append(mainAnalyser, NewDistinct[bool]()) + } + + return &Bool{ + Multi: Multi[bool]{mainAnalyser}, + } +} diff --git a/pkg/metricv2/counter.go b/pkg/metricv2/counter.go index 7f4f9a6..e4d8c58 100644 --- a/pkg/metricv2/counter.go +++ b/pkg/metricv2/counter.go @@ -2,17 +2,16 @@ package metricv2 import ( "github.com/cgi-fr/rimo/pkg/modelv2" - "golang.org/x/exp/constraints" ) -type Counter[T constraints.Ordered] struct { +type Counter[T Accepted] struct { countTotal uint countNulls uint countEmpty uint zero T } -func NewCounter[T constraints.Ordered]() *Counter[T] { +func NewCounter[T Accepted]() *Counter[T] { return &Counter[T]{ countTotal: 0, countNulls: 0, diff --git a/pkg/metricv2/distinct.go b/pkg/metricv2/distinct.go index f74d0ab..64b180f 100644 --- a/pkg/metricv2/distinct.go +++ b/pkg/metricv2/distinct.go @@ -2,14 +2,13 @@ package metricv2 import ( "github.com/cgi-fr/rimo/pkg/modelv2" - "golang.org/x/exp/constraints" ) -type Distinct[T constraints.Ordered] struct { +type Distinct[T Accepted] struct { values map[T]int } -func NewDistinct[T constraints.Ordered]() *Distinct[T] { +func NewDistinct[T Accepted]() *Distinct[T] { return &Distinct[T]{ values: make(map[T]int, 1024), 
//nolint:gomnd } diff --git a/pkg/metricv2/sampler.go b/pkg/metricv2/sampler.go index 6e103f9..2ab449b 100644 --- a/pkg/metricv2/sampler.go +++ b/pkg/metricv2/sampler.go @@ -4,16 +4,15 @@ import ( "math/rand" "github.com/cgi-fr/rimo/pkg/modelv2" - "golang.org/x/exp/constraints" ) -type Sampler[T constraints.Ordered] struct { +type Sampler[T Accepted] struct { size uint count int samples []T } -func NewSampler[T constraints.Ordered](size uint) *Sampler[T] { +func NewSampler[T Accepted](size uint) *Sampler[T] { return &Sampler[T]{ size: size, count: 0, diff --git a/pkg/metricv2/trueratio.go b/pkg/metricv2/trueratio.go new file mode 100644 index 0000000..5c3106d --- /dev/null +++ b/pkg/metricv2/trueratio.go @@ -0,0 +1,33 @@ +package metricv2 + +import "github.com/cgi-fr/rimo/pkg/modelv2" + +type TrueRatio struct { + countTrue uint + count uint +} + +func NewTrueRatio() *TrueRatio { + return &TrueRatio{ + countTrue: 0, + count: 0, + } +} + +func (a *TrueRatio) Read(value *bool) { + if value == nil { + return + } + + a.count++ + + if *value { + a.countTrue++ + } +} + +func (a *TrueRatio) Build(metric *modelv2.Column) { + metric.BoolMetric = &modelv2.Bool{ + TrueRatio: float64(a.countTrue) / float64(a.count), + } +} diff --git a/pkg/metricv2/types.go b/pkg/metricv2/types.go new file mode 100644 index 0000000..a58e60e --- /dev/null +++ b/pkg/metricv2/types.go @@ -0,0 +1,7 @@ +package metricv2 + +import "golang.org/x/exp/constraints" + +type Accepted interface { + constraints.Ordered | ~bool +} diff --git a/pkg/modelv2/metrics.go b/pkg/modelv2/metrics.go index 212bbc3..8c9069b 100644 --- a/pkg/modelv2/metrics.go +++ b/pkg/modelv2/metrics.go @@ -27,4 +27,6 @@ type Numeric struct { Mean float64 `json:"mean" yaml:"mean" jsonschema:"required"` } -type Bool struct{} +type Bool struct { + TrueRatio float64 `json:"trueRatio" yaml:"trueRatio" jsonschema:"required"` +} diff --git a/pkg/rimo/driver.go b/pkg/rimo/driver.go index ebf28a0..d4a142e 100644 --- a/pkg/rimo/driver.go +++ 
b/pkg/rimo/driver.go @@ -31,7 +31,7 @@ type Driver struct { SampleSize uint } -//nolint:funlen,cyclop +//nolint:funlen,cyclop,gocognit func (d Driver) AnalyseBase(reader Reader, writer Writer) error { baseName := reader.BaseName() @@ -72,7 +72,7 @@ func (d Driver) AnalyseBase(reader Reader, writer Writer) error { table.Columns = append(table.Columns, col) tables[valreader.TableName()] = table - case float64: + case float64, float32, int, int8, int16, int32, int64, uint, uint8, uint16, uint32, uint64: col, err := d.AnalyseNumeric(nilcount, valtyped, valreader) if err != nil { return fmt.Errorf("failed to analyse column : %w", err) @@ -88,6 +88,23 @@ func (d Driver) AnalyseBase(reader Reader, writer Writer) error { table.Columns = append(table.Columns, col) + tables[valreader.TableName()] = table + case bool: + col, err := d.AnalyseBool(nilcount, valtyped, valreader) + if err != nil { + return fmt.Errorf("failed to analyse column : %w", err) + } + + table, exists := tables[valreader.TableName()] + if !exists { + table = modelv2.Table{ + Name: valreader.TableName(), + Columns: []modelv2.Column{}, + } + } + + table.Columns = append(table.Columns, col) + tables[valreader.TableName()] = table case nil: nilcount++ @@ -153,10 +170,10 @@ func (d Driver) AnalyseString(nilcount int, firstValue string, reader ColReader) return column, nil } -func (d Driver) AnalyseNumeric(nilcount int, firstValue float64, reader ColReader) (modelv2.Column, error) { +func (d Driver) AnalyseNumeric(nilcount int, firstValue any, reader ColReader) (modelv2.Column, error) { column := modelv2.Column{ Name: reader.ColName(), - Type: "string", + Type: "numeric", Config: modelv2.Config{}, //nolint:exhaustruct MainMetric: modelv2.Generic{}, //nolint:exhaustruct StringMetric: nil, @@ -170,7 +187,12 @@ func (d Driver) AnalyseNumeric(nilcount int, firstValue float64, reader ColReade analyser.Read(nil) } - analyser.Read(&firstValue) + valtyped, err := GetFloat64(firstValue) + if err != nil { + return 
column, fmt.Errorf("failed to read value : %w", err) + } + + analyser.Read(valtyped) for reader.Next() { val, err := reader.Value() @@ -191,6 +213,44 @@ func (d Driver) AnalyseNumeric(nilcount int, firstValue float64, reader ColReade return column, nil } +func (d Driver) AnalyseBool(nilcount int, firstValue bool, reader ColReader) (modelv2.Column, error) { + column := modelv2.Column{ + Name: reader.ColName(), + Type: "bool", + Config: modelv2.Config{}, //nolint:exhaustruct + MainMetric: modelv2.Generic{}, //nolint:exhaustruct + StringMetric: nil, + NumericMetric: nil, + BoolMetric: &modelv2.Bool{}, //nolint:exhaustruct + } + + analyser := metricv2.NewBool(d.SampleSize, true) + + for i := 0; i < nilcount; i++ { + analyser.Read(nil) + } + + analyser.Read(&firstValue) + + for reader.Next() { + val, err := reader.Value() + if err != nil { + return column, fmt.Errorf("failed to read value : %w", err) + } + + switch valtyped := val.(type) { + case bool: + analyser.Read(&valtyped) + default: + return column, fmt.Errorf("invalue value type : %w", err) + } + } + + analyser.Build(&column) + + return column, nil +} + //nolint:cyclop func GetFloat64(value any) (*float64, error) { var converted float64 From 8470fcc0ae15114415bd4a0c1dd7e8c071e551bb Mon Sep 17 00:00:00 2001 From: Adrien Aury <44274230+adrienaury@users.noreply.github.com> Date: Mon, 4 Dec 2023 13:57:42 +0000 Subject: [PATCH 16/26] refactor: cleanup --- cmd/rimo/main.go | 4 +- go.mod | 11 +- go.sum | 20 +-- pkg/metric/build.go | 118 ----------------- pkg/metric/generic.go | 124 ------------------ pkg/metric/generic_test.go | 128 ------------------- pkg/metric/metricbool.go | 51 -------- pkg/metric/metricbool_test.go | 42 ------ pkg/metric/metricnumeric.go | 69 ---------- pkg/metric/metricnumeric_test.go | 46 ------- pkg/metric/metricstring.go | 213 ------------------------------- pkg/metric/metricstring_test.go | 83 ------------ pkg/metric/metricv2.go | 133 ------------------- pkg/metric/sampler.go | 43 ------- 
pkg/model/base.go | 50 -------- pkg/model/column.go | 44 ------- pkg/model/metric.go | 66 ---------- pkg/model/metricv2.go | 42 ------ pkg/model/utils.go | 123 ------------------ pkg/model/utils_test.go | 66 ---------- pkg/modelv2/schema.go | 17 +++ 21 files changed, 30 insertions(+), 1463 deletions(-) delete mode 100644 pkg/metric/build.go delete mode 100644 pkg/metric/generic.go delete mode 100644 pkg/metric/generic_test.go delete mode 100644 pkg/metric/metricbool.go delete mode 100644 pkg/metric/metricbool_test.go delete mode 100644 pkg/metric/metricnumeric.go delete mode 100644 pkg/metric/metricnumeric_test.go delete mode 100644 pkg/metric/metricstring.go delete mode 100644 pkg/metric/metricstring_test.go delete mode 100644 pkg/metric/metricv2.go delete mode 100644 pkg/metric/sampler.go delete mode 100644 pkg/model/base.go delete mode 100644 pkg/model/column.go delete mode 100644 pkg/model/metric.go delete mode 100644 pkg/model/metricv2.go delete mode 100644 pkg/model/utils.go delete mode 100644 pkg/model/utils_test.go create mode 100644 pkg/modelv2/schema.go diff --git a/cmd/rimo/main.go b/cmd/rimo/main.go index 2b0f6a5..645cad6 100644 --- a/cmd/rimo/main.go +++ b/cmd/rimo/main.go @@ -23,7 +23,7 @@ import ( "path/filepath" "github.com/cgi-fr/rimo/internal/infra" - "github.com/cgi-fr/rimo/pkg/model" + "github.com/cgi-fr/rimo/pkg/modelv2" "github.com/cgi-fr/rimo/pkg/rimo" "github.com/rs/zerolog" "github.com/rs/zerolog/log" @@ -63,7 +63,7 @@ func main() { //nolint:funlen Short: "Return rimo jsonschema", Args: cobra.NoArgs, Run: func(cmd *cobra.Command, args []string) { - jsonschema, err := model.GetJSONSchema() + jsonschema, err := modelv2.GetJSONSchema() if err != nil { os.Exit(1) } diff --git a/go.mod b/go.mod index 50a9bd2..ec3a675 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,6 @@ go 1.20 require ( github.com/goccy/go-json v0.10.2 - github.com/hexops/valast v1.4.4 github.com/rs/zerolog v1.30.0 github.com/spf13/cobra v1.7.0 github.com/stretchr/testify v1.8.4 @@ 
-12,11 +11,14 @@ require ( gopkg.in/yaml.v3 v3.0.1 ) -require gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect +require ( + github.com/kr/pretty v0.3.1 // indirect + github.com/rogpeppe/go-internal v1.10.0 // indirect + gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect +) require ( github.com/davecgh/go-spew v1.1.1 // indirect - github.com/google/go-cmp v0.5.9 // indirect github.com/iancoleman/orderedmap v0.3.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/invopop/jsonschema v0.7.0 // direct @@ -24,8 +26,5 @@ require ( github.com/mattn/go-isatty v0.0.19 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/spf13/pflag v1.0.5 // indirect - golang.org/x/mod v0.13.0 // indirect golang.org/x/sys v0.13.0 // indirect - golang.org/x/tools v0.14.0 // indirect - mvdan.cc/gofumpt v0.5.0 // indirect ) diff --git a/go.sum b/go.sum index 515a36f..8577e51 100644 --- a/go.sum +++ b/go.sum @@ -1,18 +1,12 @@ github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/frankban/quicktest v1.14.4 h1:g2rn0vABPOOXmZUj+vbmUp0lPoXEMuhTpIluN0XL9UY= github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU= github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= -github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= -github.com/google/go-cmp v0.5.9/go.mod 
h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= -github.com/hexops/autogold v0.8.1 h1:wvyd/bAJ+Dy+DcE09BoLk6r4Fa5R5W+O+GUzmR985WM= -github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM= -github.com/hexops/valast v1.4.4 h1:rETyycw+/L2ZVJHHNxEBgh8KUn+87WugH9MxcEv9PGs= -github.com/hexops/valast v1.4.4/go.mod h1:Jcy1pNH7LNraVaAZDLyv21hHg2WBv9Nf9FL6fGxU7o4= github.com/iancoleman/orderedmap v0.0.0-20190318233801-ac98e3ecb4b0/go.mod h1:N0Wam8K1arqPXNWjMo21EXnBPOPp36vB07FNRdD2geA= github.com/iancoleman/orderedmap v0.3.0 h1:5cbR2grmZR/DiVt+VJopEhtVs9YGInGIxAoMJn+Ichc= github.com/iancoleman/orderedmap v0.3.0/go.mod h1:XuLcCUkdL5owUCQeF2Ue9uuw1EptkJDkXXS7VoV7XGE= @@ -21,7 +15,9 @@ github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLf github.com/invopop/jsonschema v0.7.0 h1:2vgQcBz1n256N+FpX3Jq7Y17AjYt46Ig3zIWyy770So= github.com/invopop/jsonschema v0.7.0/go.mod h1:O9uiLokuu0+MGFlyiaqtWxwqJm41/+8Nj0lD7A36YH0= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/mattn/go-colorable v0.1.12/go.mod h1:u5H1YNBxpqRaxsYJYSkiCWKzEfiAb1Gb520KVy5xxl4= github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= @@ -29,10 +25,13 @@ github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27k github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA= github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod 
h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= +github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg= github.com/rs/zerolog v1.30.0 h1:SymVODrcRsaRaSInD9yQtKbtWqwsfoPcRff/oRXLj4c= github.com/rs/zerolog v1.30.0/go.mod h1:/tk+P47gFdPXq4QYjvCmT5/Gsug2nagsFWBWhAiSi1w= @@ -47,21 +46,14 @@ github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcU github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= golang.org/x/exp v0.0.0-20231006140011-7918f672742d h1:jtJma62tbqLibJ5sFQz8bKtEM8rJBtfilJ2qTU199MI= golang.org/x/exp v0.0.0-20231006140011-7918f672742d/go.mod h1:ldy0pHrwJyGW56pPQzzkH36rKxoZW1tw7ZJpeKx+hdo= -golang.org/x/mod v0.13.0 h1:I/DsJXRlw/8l/0c24sM9yb0T4z9liZTduXvdAWYiysY= -golang.org/x/mod v0.13.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= -golang.org/x/sync v0.4.0 h1:zxkM55ReGkDlKSM+Fu41A+zmbZuaPVbGMzvvdUPznYQ= golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210927094055-39ccf1dd6fa6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= golang.org/x/sys v0.13.0/go.mod 
h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/tools v0.14.0 h1:jvNa2pY0M4r62jkRQ6RwEZZyPcymeL9XZMLBbV7U2nc= -golang.org/x/tools v0.14.0/go.mod h1:uYBEerGOWcJyEORxN+Ek8+TT266gXkNlHdJBwexUsBg= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -mvdan.cc/gofumpt v0.5.0 h1:0EQ+Z56k8tXjj/6TQD25BFNKQXpCvT0rnansIc7Ug5E= -mvdan.cc/gofumpt v0.5.0/go.mod h1:HBeVDtMKRZpXyxFciAirzdKklDlGu8aAy1wEbH5Y9js= diff --git a/pkg/metric/build.go b/pkg/metric/build.go deleted file mode 100644 index 058c539..0000000 --- a/pkg/metric/build.go +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package metric - -import ( - "encoding/json" - "errors" - "fmt" - - "github.com/cgi-fr/rimo/pkg/model" -) - -var ErrValueType = errors.New("value type error") - -// Return a model.Column. 
-func ComputeMetric(colName string, values []interface{}) (model.Column, error) { - var confidential *bool = nil //nolint - - // Create the column. - col := model.Column{ - Name: colName, - Type: GetColType(values), - Concept: "", - Constraint: []string{}, - Confidential: confidential, - MainMetric: model.GenericMetric{}, //nolint:exhaustruct - StringMetric: model.StringMetric{}, //nolint:exhaustruct - NumericMetric: model.NumericMetric{}, //nolint:exhaustruct - BoolMetric: model.BoolMetric{}, //nolint:exhaustruct - } - - // Generic metric - err := SetGenericMetric(values, &col.MainMetric) - if err != nil { - return model.Column{}, fmt.Errorf("error computing generic metric in column %v : %w", col.Name, err) - } - - // Type specific metric - switch col.Type { - case model.ColType.String: - err := SetStringMetric(values, &col.StringMetric) - if err != nil { - return model.Column{}, fmt.Errorf("error computing string metric in column %v : %w", col.Name, err) - } - - case model.ColType.Numeric: - err := SetNumericMetric(values, &col.NumericMetric) - if err != nil { - return model.Column{}, fmt.Errorf("error computing numeric metric in column %v : %w", col.Name, err) - } - - case model.ColType.Bool: - err := SetBoolMetric(values, &col.BoolMetric) - if err != nil { - return model.Column{}, fmt.Errorf("error computing bool metric in column %v : %w", col.Name, err) - } - } - - return col, nil -} - -func GetColType(values []interface{}) model.ValueType { - colType := model.ColType.Undefined - for i := 0; i < len(values) && colType == model.ColType.Undefined; i++ { - colType = ColType(values[i]) - } - - return colType -} - -// Utils functions. - -func GetFrequency(occurrence int, count int) float64 { - return float64(occurrence) / float64(count) -} - -// To check why not using isNil() ? 
-func GetFirstValue(values []interface{}) interface{} { - for _, value := range values { - if value != nil { - return value - } - } - - return nil -} - -func ColType(value interface{}) model.ValueType { - switch value.(type) { - case int: - return model.ColType.Numeric - case float64: - return model.ColType.Numeric - case json.Number: - return model.ColType.Numeric - case string: - return model.ColType.String - case bool: - return model.ColType.Bool - default: - return model.ColType.Undefined - } -} diff --git a/pkg/metric/generic.go b/pkg/metric/generic.go deleted file mode 100644 index ac85bbd..0000000 --- a/pkg/metric/generic.go +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . 
- -package metric - -import ( - "errors" - "fmt" - "math/rand" - - "github.com/cgi-fr/rimo/pkg/model" - "golang.org/x/exp/constraints" -) - -var ErrEmptySlice = errors.New("slice is empty") - -func SetGenericMetric(values []interface{}, metric *model.GenericMetric) error { - sample, err := Sample(values, model.SampleSize) - if err != nil { - return fmt.Errorf("error computing sample: %w", err) - } - - metric.Count = len(values) - metric.Unique = CountUnique(values) - metric.Empty = CountEmpty(values) - metric.Sample = sample - - return nil -} - -func CountEmpty[T comparable](values []T) int { - empty := 0 - - for _, value := range values { - if isNil(value) { - empty++ - } - } - - return empty -} - -// Return a sample of size sampleSize from values. -func Sample[T comparable](values []T, sampleSize int) ([]T, error) { - uniqueValues := Unique(values) - - if sampleSize >= len(uniqueValues) { - return uniqueValues, nil - } - - sample := make([]T, sampleSize) - for i := 0; i < sampleSize; i++ { - sample[i] = uniqueValues[rand.Intn(len(uniqueValues)-1)] //nolint:gosec - } - - return sample, nil -} - -func CountUnique[T comparable](values []T) int { - unique := make(map[T]bool) - - for _, value := range values { - if isNil(value) { - continue - } - - unique[value] = true - } - - return len(unique) -} - -func Unique[T comparable](values []T) []T { - unique := make(map[T]bool) - - for _, value := range values { - if isNil(value) { - continue - } - - unique[value] = true - } - - uniqueValues := make([]T, 0, len(unique)) - for value := range unique { - uniqueValues = append(uniqueValues, value) - } - - return uniqueValues -} - -func isNil[T comparable](v T) bool { - return v == *new(T) -} - -func min[T constraints.Ordered](a, b T) T { - if a < b { - return a - } - - return b -} - -func max[T constraints.Ordered](a, b T) T { - if a > b { - return a - } - - return b -} diff --git a/pkg/metric/generic_test.go b/pkg/metric/generic_test.go deleted file mode 100644 index 
4af5071..0000000 --- a/pkg/metric/generic_test.go +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package metric_test - -import ( - "testing" - - "github.com/cgi-fr/rimo/pkg/metric" - "github.com/cgi-fr/rimo/pkg/model" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestCountEmpty(t *testing.T) { - t.Parallel() - - slice := []interface{}{1, 2, 3, nil} - expected := 1 - actual := metric.CountEmpty(slice) - - assert.Equal(t, expected, actual) -} - -func TestGetColType(t *testing.T) { - t.Parallel() - - t.Run("numeric", func(t *testing.T) { - t.Parallel() - - slice := []interface{}{nil, 2, 3} - expected := model.ColType.Numeric - - actual := metric.GetColType(slice) - require.Equal(t, expected, actual) - }) - - t.Run("string", func(t *testing.T) { - t.Parallel() - - slice := []interface{}{nil, "text", nil} - expected := model.ColType.String - - actual := metric.GetColType(slice) - require.Equal(t, expected, actual) - }) - - t.Run("boolean", func(t *testing.T) { - t.Parallel() - - slice := []interface{}{nil, true, false} - expected := model.ColType.Bool - - actual := metric.GetColType(slice) - require.Equal(t, expected, actual) - }) - - // Treat this case as error would imply to type assert each element of the slice when Loading. 
- t.Run("mixed", func(t *testing.T) { - t.Parallel() - - slice := []interface{}{"text", 2, false} - expected := model.ColType.String - - actual := metric.GetColType(slice) - require.Equal(t, expected, actual) - }) - - t.Run("unknown", func(t *testing.T) { - t.Parallel() - - slice := []interface{}{nil, nil, nil} - expected := model.ColType.Undefined - - actual := metric.GetColType(slice) - require.Equal(t, expected, actual) - }) -} - -// Implementation questions : -// should Unique() append nil element ? -// should CountUnique() count nil as a unique value ? - -func TestUnique(t *testing.T) { - t.Parallel() - - values := []interface{}{1, 1, 2, 3, nil} - expected := []interface{}{1, 2, 3} - actual := metric.Unique(values) - - assert.ElementsMatch(t, expected, actual) -} - -func TestCountUnique(t *testing.T) { - t.Parallel() - - values := []interface{}{1, 1, 2, 3, nil} - expected := 3 - actual := metric.CountUnique(values) - - assert.Equal(t, expected, actual) -} - -func TestSample(t *testing.T) { - t.Parallel() - - values := []interface{}{1, 2, 3, nil, 5, 6} - actualOutput, _ := metric.Sample(values, 5) - - assert.Len(t, actualOutput, 5) - - actualOutput, _ = metric.Sample(values, 10) - assert.Len(t, actualOutput, 5) -} diff --git a/pkg/metric/metricbool.go b/pkg/metric/metricbool.go deleted file mode 100644 index a3a38ed..0000000 --- a/pkg/metric/metricbool.go +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// rimo is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// rimo is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. 
-// -// You should have received a copy of the GNU General Public License -// along with rimo. If not, see . - -package metric - -import ( - "fmt" - - "github.com/cgi-fr/rimo/pkg/model" -) - -// Bool metric : TrueRatio. -func SetBoolMetric(values []interface{}, metric *model.BoolMetric) error { - nullCount := 0 - trueCount := 0 - - for _, value := range values { - if value == nil { - nullCount++ - - continue - } - - boolValue, ok := value.(bool) - if !ok { - return fmt.Errorf("%w : expected numeric found %T: %v", ErrValueType, value, value) - } - - if boolValue { - trueCount++ - } - } - - metric.TrueRatio = GetFrequency(trueCount, len(values)-nullCount) - - return nil -} diff --git a/pkg/metric/metricbool_test.go b/pkg/metric/metricbool_test.go deleted file mode 100644 index 5db4576..0000000 --- a/pkg/metric/metricbool_test.go +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . 
- -package metric_test - -import ( - "testing" - - "github.com/cgi-fr/rimo/pkg/metric" - "github.com/cgi-fr/rimo/pkg/model" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestBooleanMetric(t *testing.T) { - t.Parallel() - - values := []interface{}{true, true, nil, false} - expectedMetric := model.BoolMetric{ - TrueRatio: float64(2) / float64(3), - } - - actualMetric := model.BoolMetric{} //nolint:exhaustruct - err := metric.SetBoolMetric(values, &actualMetric) - require.NoError(t, err) - - assert.Equal(t, expectedMetric, actualMetric) -} diff --git a/pkg/metric/metricnumeric.go b/pkg/metric/metricnumeric.go deleted file mode 100644 index 77f50bd..0000000 --- a/pkg/metric/metricnumeric.go +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . 
- -package metric - -import ( - "fmt" - - "github.com/cgi-fr/rimo/pkg/model" -) - -func SetNumericMetric(values []interface{}, metric *model.NumericMetric) error { - nonNullCount := 0 - - value := GetFirstValue(values) - - floatValue, ok := value.(float64) - if !ok { - return fmt.Errorf("%w : expected numeric found %T: %v", ErrValueType, value, value) - } - - min := floatValue - max := floatValue - sum := 0.0 - - for _, value := range values { - floatValue, ok := value.(float64) - if !ok { - if value == nil { - continue - } - - return fmt.Errorf("%w : expected numeric found %T: %v", ErrValueType, value, value) - } - - sum += floatValue - nonNullCount++ - - if floatValue > max { - max = floatValue - } - - if floatValue < min { - min = floatValue - } - } - - mean := sum / float64(nonNullCount) - - metric.Min = min - metric.Max = max - metric.Mean = mean - - return nil -} diff --git a/pkg/metric/metricnumeric_test.go b/pkg/metric/metricnumeric_test.go deleted file mode 100644 index 997b506..0000000 --- a/pkg/metric/metricnumeric_test.go +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . 
- -package metric_test - -import ( - "testing" - - "github.com/cgi-fr/rimo/pkg/metric" - "github.com/cgi-fr/rimo/pkg/model" - "github.com/stretchr/testify/assert" -) - -func TestNumericMetric(t *testing.T) { - t.Parallel() - - values := []interface{}{1.0, 2.0, 3.0, nil} - expectedMetric := model.NumericMetric{ - Min: 1, - Max: 3, - Mean: 2, - } - - actualMetric := model.NumericMetric{} //nolint:exhaustruct - - err := metric.SetNumericMetric(values, &actualMetric) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - - assert.Equal(t, expectedMetric, actualMetric) -} diff --git a/pkg/metric/metricstring.go b/pkg/metric/metricstring.go deleted file mode 100644 index 9765b97..0000000 --- a/pkg/metric/metricstring.go +++ /dev/null @@ -1,213 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package metric - -import ( - "fmt" - "math" - "sort" - - "github.com/cgi-fr/rimo/pkg/model" -) - -func SetStringMetric(values []interface{}, metric *model.StringMetric) error { - // Store strings by length. - lenMap := make(map[int][]string) - // Count length occurrence. 
- lenCounter := make(map[int]int) - totalCount := len(values) - - metric.MinLen = math.MaxInt - metric.MaxLen = 0 - - for _, value := range values { - if value == nil { - continue - } - - stringValue, ok := value.(string) - if !ok { - return fmt.Errorf("%w : expected string found %T: %v", ErrValueType, value, value) - } - - length := len(stringValue) - lenMap[length] = append(lenMap[length], stringValue) - lenCounter[length]++ - - metric.MinLen = min(metric.MinLen, length) - metric.MaxLen = max(metric.MaxLen, length) - } - - // Create a list of unique lengths sorted by descending frequency, break ties with ascending length - sortedLength := uniqueLengthSorted(lenCounter) - - // Get size of MostFreqLen and LeastFreqLen - mostFrequentLenSize, leastFrequentLenSize := getFreqSize(len(sortedLength)) - - // Get ordered slice of least and most frequent length - lenMostFreqLen := sortedLength[0:mostFrequentLenSize] - - lenLeastFreqLen := make([]int, leastFrequentLenSize) - - for i := 0; i < leastFrequentLenSize; i++ { - index := len(sortedLength) - 1 - i - lenLeastFreqLen[i] = sortedLength[index] - } - - leastFreqLen, err := buildFreqLen(lenLeastFreqLen, lenMap, lenCounter, totalCount, model.LeastFrequentSampleSize) - if err != nil { - return fmt.Errorf("error building least frequent length : %w", err) - } - - metric.LeastFreqLen = leastFreqLen - - mostFreqLen, err := buildFreqLen(lenMostFreqLen, lenMap, lenCounter, totalCount, model.MostFrequentSampleSize) - if err != nil { - return fmt.Errorf("error building most frequent length : %w", err) - } - - metric.MostFreqLen = mostFreqLen - - return nil -} - -func buildFreqLen(freqLen []int, lenMap map[int][]string, lenCounter map[int]int, totalCount int, sampleLen int) ([]model.LenFreq, error) { //nolint - lenFreqs := make([]model.LenFreq, len(freqLen)) - - for index, len := range freqLen { - // Get unique value from lenMap[len].. 
- sample, err := Sample(lenMap[len], sampleLen) - if err != nil { - return lenFreqs, fmt.Errorf("error getting sample for length %v : %w", len, err) - } - - lenFreqs[index] = model.LenFreq{ - Length: len, - Freq: GetFrequency(lenCounter[len], totalCount), - Sample: sample, - } - } - - return lenFreqs, nil -} - -func getFreqSize(nunique int) (int, int) { - mostFrequentLenSize := model.MostFrequentLenSize - leastFrequentLenSize := model.LeastFrequentLenSize - - if nunique < model.MostFrequentLenSize+model.LeastFrequentLenSize { - // Modify MostFrequentLenSize and LeastFrequentLenSize to fit the number of unique length. - // Should keep ratio of MostFrequentLenSize and LeastFrequentLenSize. - ratio := float64(model.MostFrequentLenSize) / float64(model.MostFrequentLenSize+model.LeastFrequentLenSize) - mostFrequentLenSize = int(math.Round(float64(nunique) * ratio)) - leastFrequentLenSize = nunique - mostFrequentLenSize - } - - return mostFrequentLenSize, leastFrequentLenSize -} - -func uniqueLengthSorted(lenCounter map[int]int) []int { - uniqueLengthSorted := make([]int, 0, len(lenCounter)) - for l := range lenCounter { - uniqueLengthSorted = append(uniqueLengthSorted, l) - } - - // Sort the string lengths by descending count of occurrence, breaks ties with ascending length - sort.Slice(uniqueLengthSorted, func(i, j int) bool { - if lenCounter[uniqueLengthSorted[i]] == lenCounter[uniqueLengthSorted[j]] { - return uniqueLengthSorted[i] < uniqueLengthSorted[j] - } - - return lenCounter[uniqueLengthSorted[i]] > lenCounter[uniqueLengthSorted[j]] - }) - - return uniqueLengthSorted -} - -type String struct { - sampleSize uint - main Stateless[string] - byLen map[int]Stateless[string] -} - -func NewString(sampleSize uint) *String { - return &String{ - sampleSize: sampleSize, - main: NewCounter[string](sampleSize), - byLen: map[int]Stateless[string]{}, - } -} - -func (s *String) Read(value *string) { - s.main.Read(value) - - if value != nil { - length := len(*value) - - 
analyser, exists := s.byLen[length] - if !exists { - analyser = NewCounter[string](s.sampleSize) - } - - analyser.Read(value) - - s.byLen[length] = analyser - } -} - -func (s *String) Build() model.Col[string] { - result := model.Col[string]{} - - result.MainMetric.Count = s.main.CountTotal() - result.MainMetric.Empty = s.main.CountEmpty() - result.MainMetric.Null = s.main.CountNulls() - result.MainMetric.Max = s.main.Max() - result.MainMetric.Min = s.main.Min() - result.MainMetric.Samples = s.main.Samples() - - lengths := make([]int, 0, len(s.byLen)) - for len := range s.byLen { - lengths = append(lengths, len) - } - - sort.Slice(lengths, func(i, j int) bool { - freqi := float64(s.byLen[lengths[i]].CountTotal()) / float64(s.main.CountTotal()) - freqj := float64(s.byLen[lengths[j]].CountTotal()) / float64(s.main.CountTotal()) - - return freqi > freqj - }) - - result.StringMetric.CountLen = len(lengths) - result.StringMetric.MaxLen = lengths[0] - result.StringMetric.MaxLen = lengths[len(lengths)-1] - - for _, length := range lengths { - strlen := model.StringLen{} //nolint:exhaustruct - strlen.Length = length - strlen.Freq = float64(s.byLen[length].CountTotal()) / float64(s.main.CountTotal()) - strlen.Metrics.Count = s.byLen[length].CountTotal() - strlen.Metrics.Empty = s.byLen[length].CountEmpty() - strlen.Metrics.Null = s.byLen[length].CountNulls() - strlen.Metrics.Max = s.byLen[length].Max() - strlen.Metrics.Min = s.byLen[length].Min() - strlen.Metrics.Samples = s.byLen[length].Samples() - result.StringMetric.Lengths = append(result.StringMetric.Lengths, strlen) - } - - return result -} diff --git a/pkg/metric/metricstring_test.go b/pkg/metric/metricstring_test.go deleted file mode 100644 index c97c247..0000000 --- a/pkg/metric/metricstring_test.go +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. 
-// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package metric_test - -import ( - "encoding/json" - "fmt" - "testing" - - "github.com/cgi-fr/rimo/pkg/metric" - "github.com/cgi-fr/rimo/pkg/model" - "github.com/stretchr/testify/assert" -) - -// Ensure that 1. frequency is correct, 2. order is correct, 3. ties are break by length. -func TestStringMetric(t *testing.T) { - t.Parallel() - - text := []interface{}{"1", "1", "1", "1", "22", "22", "22", "331", "332", "4441"} - expectedMetric := model.StringMetric{ - MinLen: 1, - MaxLen: 4, - MostFreqLen: []model.LenFreq{{Length: 1, Freq: 0.4, Sample: []string{"1"}}, {Length: 2, Freq: 0.3, Sample: []string{"22"}}}, //nolint:lll - LeastFreqLen: []model.LenFreq{{Length: 4, Freq: 0.1, Sample: []string{"4441"}}, {Length: 3, Freq: 0.2, Sample: []string{"331", "332"}}}, //nolint:lll - } - - actualMetric := model.StringMetric{} //nolint:exhaustruct - - err := metric.SetStringMetric(text, &actualMetric) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - - // t.Logf(valast.String(actualMetric)) - - for i := 0; i < len(expectedMetric.MostFreqLen); i++ { - assert.Equal(t, expectedMetric.MostFreqLen[i].Length, actualMetric.MostFreqLen[i].Length) - assert.Equal(t, expectedMetric.MostFreqLen[i].Freq, actualMetric.MostFreqLen[i].Freq) - assert.Equal(t, expectedMetric.MostFreqLen[i].Sample, actualMetric.MostFreqLen[i].Sample) - } - - for i := 0; i < 
len(expectedMetric.LeastFreqLen); i++ { - assert.Equal(t, expectedMetric.LeastFreqLen[i].Length, actualMetric.LeastFreqLen[i].Length) - assert.Equal(t, expectedMetric.LeastFreqLen[i].Freq, actualMetric.LeastFreqLen[i].Freq) - assert.ElementsMatch(t, expectedMetric.LeastFreqLen[i].Sample, actualMetric.LeastFreqLen[i].Sample) - } -} - -func TestStringMetricV2(t *testing.T) { - t.Parallel() - - analyser := metric.NewString(5) - - strings := []string{"1", "1", "1", "1", "22", "22", "22", "331", "332", "4441", ""} - - for _, s := range strings { - s := s - analyser.Read(&s) - } - - analyser.Read(nil) - - bytes, err := json.Marshal(analyser.Build()) - - assert.NoError(t, err) - - fmt.Printf("%s\n", string(bytes)) -} diff --git a/pkg/metric/metricv2.go b/pkg/metric/metricv2.go deleted file mode 100644 index d6bfa6e..0000000 --- a/pkg/metric/metricv2.go +++ /dev/null @@ -1,133 +0,0 @@ -package metric - -import "golang.org/x/exp/constraints" - -type Analyser[T constraints.Ordered] interface { - Read(*T) -} - -type Stateless[T constraints.Ordered] interface { - Analyser[T] - CountTotal() uint - CountNulls() uint - CountEmpty() uint - Min() *T - Max() *T - Samples() []T -} - -type Statefull[T constraints.Ordered] interface { - Stateless[T] - CountDistinct() uint -} - -type Counter[T constraints.Ordered] struct { - countTotal uint - countNulls uint - countEmpty uint - min *T - max *T - samples *Sampler[T] - zero T -} - -func NewCounter[T constraints.Ordered](samplerSize uint) *Counter[T] { - return &Counter[T]{ - countTotal: 0, - countNulls: 0, - countEmpty: 0, - samples: NewSampler[T](samplerSize), - zero: *new(T), - } -} - -func (c *Counter[T]) Read(value *T) { - c.countTotal++ - - switch { - case value == nil: - c.countNulls++ - case *value == c.zero: - c.countEmpty++ - } - - if value != nil { - c.samples.Add(*value) - - if c.min == nil { - c.min = value - } - - if c.max == nil { - c.max = value - } - - if *value < *c.min { - c.min = value - } else if *value > *c.max { - 
c.max = value - } - } -} - -// CountEmpty implements Stateless. -func (c *Counter[T]) CountEmpty() uint { - return c.countEmpty -} - -// CountNulls implements Stateless. -func (c *Counter[T]) CountNulls() uint { - return c.countNulls -} - -// CountTotal implements Stateless. -func (c *Counter[T]) CountTotal() uint { - return c.countTotal -} - -// Samples implements Stateless. -func (c *Counter[T]) Samples() []T { - return c.samples.Data() -} - -// Min implements Stateless. -func (c *Counter[T]) Min() *T { - return c.min -} - -// Max implements Stateless. -func (c *Counter[T]) Max() *T { - return c.max -} - -type Distinctcounter[T constraints.Ordered] struct { - Counter[T] - values map[T]int -} - -func NewDistinctCounter[T constraints.Ordered](samplerSize uint) Statefull[T] { - return &Distinctcounter[T]{ - Counter: Counter[T]{ - countTotal: 0, - countNulls: 0, - countEmpty: 0, - samples: NewSampler[T](samplerSize), - zero: *new(T), - }, - values: make(map[T]int, 1024), //nolint:gomnd - } -} - -// Read implements Statefull. -func (c *Distinctcounter[T]) Read(value *T) { - c.Counter.Read(value) - - if value != nil { - c.values[*value] = 0 - } -} - -// CountDistinct implements Statefull. -func (c *Distinctcounter[T]) CountDistinct() uint { - return uint(len(c.values)) -} diff --git a/pkg/metric/sampler.go b/pkg/metric/sampler.go deleted file mode 100644 index 560aa73..0000000 --- a/pkg/metric/sampler.go +++ /dev/null @@ -1,43 +0,0 @@ -package metric - -import ( - "math/rand" - - "golang.org/x/exp/constraints" -) - -// Sampler implement a basic sampling algorithm. 
-// -// see: https://en.wikipedia.org/wiki/Reservoir_sampling#Simple:_Algorithm_R) -type Sampler[T constraints.Ordered] struct { - size uint - count int - data []T -} - -func NewSampler[T constraints.Ordered](size uint) *Sampler[T] { - return &Sampler[T]{ - size: size, - count: 0, - data: make([]T, 0, size), - } -} - -func (s *Sampler[T]) Add(data T) { - s.count++ - - if len(s.data) < int(s.size) { - s.data = append(s.data, data) - - return - } - - index := rand.Intn(s.count) //nolint:gosec - if index < int(s.size) { - s.data[index] = data - } -} - -func (s *Sampler[T]) Data() []T { - return s.data -} diff --git a/pkg/model/base.go b/pkg/model/base.go deleted file mode 100644 index 49eed63..0000000 --- a/pkg/model/base.go +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package model - -import ( - "fmt" - "reflect" - - "github.com/hexops/valast" -) - -// RIMO YAML structure. -type ( - Base struct { - Name string `json:"database" jsonschema:"required" yaml:"database"` - // Tables should be map[string][]Column - Tables []Table `json:"tables" jsonschema:"required" yaml:"tables"` - } - - Table struct { - Name string `json:"name" jsonschema:"required" yaml:"name"` - Columns []Column `json:"columns" jsonschema:"required" yaml:"columns"` - } -) - -// Should be improved with more detail about difference. 
-func SameBase(base1, base2 *Base) (bool, string) { - if !reflect.DeepEqual(base1, base2) { - msg := fmt.Sprintf("base is different : %s \n \n %s", valast.String(base1), valast.String(base2)) - - return false, msg - } - - return true, "" -} diff --git a/pkg/model/column.go b/pkg/model/column.go deleted file mode 100644 index 223bcb0..0000000 --- a/pkg/model/column.go +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . 
- -package model - -const ( - SampleSize int = 5 - MostFrequentLenSize int = 5 - MostFrequentSampleSize int = 5 - LeastFrequentLenSize int = 5 - LeastFrequentSampleSize int = 5 -) - -type ( - Column struct { - Name string `json:"name" jsonschema:"required" yaml:"name"` - Type ValueType `json:"type" jsonschema:"required" validate:"oneof=string numeric boolean" yaml:"type"` //nolint:lll - - // The 3 following parameter should be part of a Config struct - Concept string `json:"concept" jsonschema:"required" yaml:"concept"` - Constraint []string `json:"constraint" jsonschema:"required" yaml:"constraint"` - Confidential *bool `json:"confidential" jsonschema:"required" yaml:"confidential"` - - MainMetric GenericMetric `json:"mainMetric" jsonschema:"required" yaml:"mainMetric"` - - StringMetric StringMetric `json:"stringMetric,omitempty" jsonschema:"required" yaml:"stringMetric,omitempty"` - NumericMetric NumericMetric `json:"numericMetric,omitempty" jsonschema:"required" yaml:"numericMetric,omitempty"` - BoolMetric BoolMetric `json:"boolMetric,omitempty" jsonschema:"required" yaml:"boolMetric,omitempty"` - } -) diff --git a/pkg/model/metric.go b/pkg/model/metric.go deleted file mode 100644 index c0e8d03..0000000 --- a/pkg/model/metric.go +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package model - -// RIMO YAML metrics. 
-type ( - GenericMetric struct { - Count int `json:"count" jsonschema:"required" yaml:"count"` - Empty int `json:"empty" jsonschema:"required" yaml:"empty"` - Unique int `json:"unique" jsonschema:"required" yaml:"unique"` - Sample []interface{} `json:"sample" jsonschema:"required" yaml:"sample"` - } - - StringMetric struct { - MinLen int `json:"minLen" jsonschema:"required" yaml:"minLen"` - MaxLen int `json:"maxLen" jsonschema:"required" yaml:"maxLen"` - MostFreqLen []LenFreq `json:"mostFrequentLen" jsonschema:"required" yaml:"mostFrequentLen"` - LeastFreqLen []LenFreq `json:"leastFrequentLen" jsonschema:"required" yaml:"leastFrequentLen"` - } - - LenFreq struct { - Length int `json:"length" jsonschema:"required" yaml:"length"` - Freq float64 `json:"freq" jsonschema:"required" yaml:"freq"` - Sample []string `json:"sample" jsonschema:"required" yaml:"sample"` - } - - NumericMetric struct { - Min float64 `json:"min" jsonschema:"required" yaml:"min"` - Max float64 `json:"max" jsonschema:"required" yaml:"max"` - Mean float64 `json:"mean" jsonschema:"required" yaml:"mean"` - } - - BoolMetric struct { - TrueRatio float64 `json:"trueRatio" jsonschema:"required" yaml:"trueRatio"` - } -) - -// Type that a column can be. 
-type ValueType string - -var ColType = struct { //nolint:gochecknoglobals - String ValueType - Numeric ValueType - Bool ValueType - Undefined ValueType -}{ - String: "string", - Numeric: "numeric", - Bool: "bool", - Undefined: "undefined", -} diff --git a/pkg/model/metricv2.go b/pkg/model/metricv2.go deleted file mode 100644 index 54ba31a..0000000 --- a/pkg/model/metricv2.go +++ /dev/null @@ -1,42 +0,0 @@ -package model - -import "golang.org/x/exp/constraints" - -type Col[T constraints.Ordered] struct { - Name string `json:"name" jsonschema:"required" yaml:"name"` - Type ValueType `json:"type" jsonschema:"required" validate:"oneof=string numeric boolean" yaml:"type"` //nolint:lll - - // The 3 following parameter should be part of a Config struct - Concept string `json:"concept,omitempty" yaml:"concept,omitempty"` - Constraint []string `json:"constraint,omitempty" yaml:"constraint,omitempty"` - Confidential *bool `json:"confidential,omitempty" yaml:"confidential,omitempty"` - - MainMetric Generic[T] `json:"mainMetric" jsonschema:"required" yaml:"mainMetric"` - - StringMetric String `json:"stringMetric,omitempty" yaml:"stringMetric,omitempty"` - NumericMetric NumericMetric `json:"numericMetric,omitempty" yaml:"numericMetric,omitempty"` - BoolMetric BoolMetric `json:"boolMetric,omitempty" yaml:"boolMetric,omitempty"` -} - -type Generic[T constraints.Ordered] struct { - Count uint `json:"count" jsonschema:"required" yaml:"count"` - Empty uint `json:"empty" jsonschema:"required" yaml:"empty"` - Null uint `json:"null" jsonschema:"required" yaml:"null"` - Distinct *uint `json:"distinct" jsonschema:"required" yaml:"distinct"` - Min *T `json:"min" jsonschema:"required" yaml:"min"` - Max *T `json:"max" jsonschema:"required" yaml:"max"` - Samples []T `json:"samples" jsonschema:"required" yaml:"samples"` -} - -type String struct { - MinLen int `json:"minLen" jsonschema:"required" yaml:"minLen"` - MaxLen int `json:"maxLen" jsonschema:"required" yaml:"maxLen"` - CountLen int 
`json:"countLen" jsonschema:"required" yaml:"countLen"` - Lengths []StringLen `json:"lengths" jsonschema:"required" yaml:"lengths"` -} - -type StringLen struct { - Length int `json:"length" jsonschema:"required" yaml:"length"` - Freq float64 `json:"freq" jsonschema:"required" yaml:"freq"` - Metrics Generic[string] `json:"metrics" jsonschema:"required" yaml:"metrics"` -} diff --git a/pkg/model/utils.go b/pkg/model/utils.go deleted file mode 100644 index f80a6e2..0000000 --- a/pkg/model/utils.go +++ /dev/null @@ -1,123 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package model - -import ( - "encoding/json" - "errors" - "fmt" - "os" - "sort" - - "github.com/invopop/jsonschema" - "gopkg.in/yaml.v3" -) - -func GetJSONSchema() (string, error) { - resBytes, err := json.MarshalIndent(jsonschema.Reflect(&Base{}), "", " ") //nolint:exhaustruct - if err != nil { - return "", fmt.Errorf("couldn't unmarshall Base in JSON : %w", err) - } - - return string(resBytes), nil -} - -func NewBase(name string) *Base { - return &Base{ - Name: name, - Tables: make([]Table, 0), - } -} - -var ErrBaseFormat = errors.New("error while decoding yaml file in a Base struct") - -// Can be improved. 
-func LoadBase(path string) (*Base, error) { - file, err := os.Open(path) - if err != nil { - return nil, fmt.Errorf("error while opening file: %w", err) - } - - decoder := yaml.NewDecoder(file) - - var base Base - - err = decoder.Decode(&base) - if err != nil { - return nil, ErrBaseFormat - } - - file.Close() - - return &base, nil -} - -func RemoveSampleFromBase(base *Base) { - for tableI, table := range base.Tables { - for columnJ, column := range table.Columns { - column.MainMetric.Sample = nil - - if column.Type == ColType.String { - for freqLen := range column.StringMetric.MostFreqLen { - column.StringMetric.MostFreqLen[freqLen].Sample = nil - } - - for freqLen := range column.StringMetric.LeastFreqLen { - column.StringMetric.LeastFreqLen[freqLen].Sample = nil - } - } - - base.Tables[tableI].Columns[columnJ] = column - } - } -} - -func (base *Base) SortBase() { - for _, table := range base.Tables { - sort.Slice(table.Columns, func(i, j int) bool { - return table.Columns[i].Name < table.Columns[j].Name - }) - } - - sort.Slice(base.Tables, func(i, j int) bool { - return base.Tables[i].Name < base.Tables[j].Name - }) -} - -func (base *Base) AddColumn(column Column, tableName string) { - mapTableName := make(map[string]int) - for index, table := range base.Tables { - mapTableName[table.Name] = index - } - - if index, ok := mapTableName[tableName]; ok { - // If the table exists, append the column to the table - base.Tables[index].Columns = append(base.Tables[index].Columns, column) - } else { - // If the table does not exist, create a new table and add it to the base - table := Table{ - Name: tableName, - Columns: []Column{column}, - } - base.Tables = append(base.Tables, table) - } -} - -// If the table does not exist, create a new table and add it to the base -// table := Table{Name: tableName, Columns: []Column{column}} -// base.Tables = append(base.Tables, table) diff --git a/pkg/model/utils_test.go b/pkg/model/utils_test.go deleted file mode 100644 index 
16c619e..0000000 --- a/pkg/model/utils_test.go +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package model_test - -import ( - "testing" - - "github.com/cgi-fr/rimo/pkg/model" -) - -func TestAddColumn(t *testing.T) { - t.Parallel() - - base := model.NewBase("test_base") - - column := model.Column{ //nolint:exhaustruct - Name: "test_column", - Type: model.ColType.String, - Concept: "test_concept", - } - - tableName := "test_table" - - base.AddColumn(column, tableName) - - // fmt.Print(valast.String(base)) - - if len(base.Tables) != 1 { - t.Errorf("expected 1 table, got %d", len(base.Tables)) - } - - if base.Tables[0].Name != tableName { - t.Errorf("expected table name %q, got %q", tableName, base.Tables[0].Name) - } - - if len(base.Tables[0].Columns) != 1 { - t.Errorf("expected 1 column, got %d", len(base.Tables[0].Columns)) - } - - if base.Tables[0].Columns[0].Name != column.Name { - t.Errorf("expected column name %q, got %q", column.Name, base.Tables[0].Columns[0].Name) - } - - if base.Tables[0].Columns[0].Type != column.Type { - t.Errorf("expected column type %q, got %q", column.Type, base.Tables[0].Columns[0].Type) - } - - if base.Tables[0].Columns[0].Concept != column.Concept { - t.Errorf("expected column concept %q, got %q", column.Concept, base.Tables[0].Columns[0].Concept) - } -} diff --git 
a/pkg/modelv2/schema.go b/pkg/modelv2/schema.go new file mode 100644 index 0000000..552bd90 --- /dev/null +++ b/pkg/modelv2/schema.go @@ -0,0 +1,17 @@ +package modelv2 + +import ( + "encoding/json" + "fmt" + + "github.com/invopop/jsonschema" +) + +func GetJSONSchema() (string, error) { + resBytes, err := json.MarshalIndent(jsonschema.Reflect(&Base{}), "", " ") //nolint:exhaustruct + if err != nil { + return "", fmt.Errorf("couldn't unmarshall Base in JSON : %w", err) + } + + return string(resBytes), nil +} From 2445cbc05b8d4bf2de4db5e0b302e4cdca1e0dd5 Mon Sep 17 00:00:00 2001 From: Adrien Aury <44274230+adrienaury@users.noreply.github.com> Date: Mon, 4 Dec 2023 13:58:55 +0000 Subject: [PATCH 17/26] refactor: update schema --- cmd/rimo/main.go | 4 +- internal/infra/fileWriter.go | 6 +- pkg/{metricv2 => metric}/analyser.go | 8 +-- pkg/{metricv2 => metric}/bool.go | 2 +- pkg/{metricv2 => metric}/counter.go | 6 +- pkg/{metricv2 => metric}/distinct.go | 6 +- pkg/{metricv2 => metric}/mean.go | 8 +-- pkg/{metricv2 => metric}/metric_test.go | 20 +++--- pkg/{metricv2 => metric}/minmax.go | 6 +- pkg/{metricv2 => metric}/numeric.go | 2 +- pkg/{metricv2 => metric}/sampler.go | 6 +- pkg/{metricv2 => metric}/string.go | 16 ++--- pkg/{metricv2 => metric}/trueratio.go | 8 +-- pkg/{metricv2 => metric}/types.go | 2 +- pkg/{modelv2 => model}/base.go | 2 +- pkg/{modelv2 => model}/column.go | 2 +- pkg/{modelv2 => model}/config.go | 2 +- pkg/{modelv2 => model}/metrics.go | 2 +- pkg/{modelv2 => model}/schema.go | 2 +- pkg/rimo/driven.go | 4 +- pkg/rimo/driver.go | 56 +++++++-------- schema/v1/rimo.schema.json | 95 +++++++++++-------------- 22 files changed, 126 insertions(+), 139 deletions(-) rename pkg/{metricv2 => metric}/analyser.go (67%) rename pkg/{metricv2 => metric}/bool.go (95%) rename pkg/{metricv2 => metric}/counter.go (83%) rename pkg/{metricv2 => metric}/distinct.go (75%) rename pkg/{metricv2 => metric}/mean.go (64%) rename pkg/{metricv2 => metric}/metric_test.go (85%) rename 
pkg/{metricv2 => metric}/minmax.go (82%) rename pkg/{metricv2 => metric}/numeric.go (97%) rename pkg/{metricv2 => metric}/sampler.go (85%) rename pkg/{metricv2 => metric}/string.go (86%) rename pkg/{metricv2 => metric}/trueratio.go (68%) rename pkg/{metricv2 => metric}/types.go (85%) rename pkg/{modelv2 => model}/base.go (96%) rename pkg/{modelv2 => model}/column.go (97%) rename pkg/{modelv2 => model}/config.go (94%) rename pkg/{modelv2 => model}/metrics.go (98%) rename pkg/{modelv2 => model}/schema.go (95%) diff --git a/cmd/rimo/main.go b/cmd/rimo/main.go index 645cad6..2b0f6a5 100644 --- a/cmd/rimo/main.go +++ b/cmd/rimo/main.go @@ -23,7 +23,7 @@ import ( "path/filepath" "github.com/cgi-fr/rimo/internal/infra" - "github.com/cgi-fr/rimo/pkg/modelv2" + "github.com/cgi-fr/rimo/pkg/model" "github.com/cgi-fr/rimo/pkg/rimo" "github.com/rs/zerolog" "github.com/rs/zerolog/log" @@ -63,7 +63,7 @@ func main() { //nolint:funlen Short: "Return rimo jsonschema", Args: cobra.NoArgs, Run: func(cmd *cobra.Command, args []string) { - jsonschema, err := modelv2.GetJSONSchema() + jsonschema, err := model.GetJSONSchema() if err != nil { os.Exit(1) } diff --git a/internal/infra/fileWriter.go b/internal/infra/fileWriter.go index 617770d..d652811 100644 --- a/internal/infra/fileWriter.go +++ b/internal/infra/fileWriter.go @@ -21,7 +21,7 @@ import ( "fmt" "os" - "github.com/cgi-fr/rimo/pkg/modelv2" + "github.com/cgi-fr/rimo/pkg/model" "gopkg.in/yaml.v3" ) @@ -35,7 +35,7 @@ func StdoutWriterFactory() *StdoutWriter { return &writer } -func (w *StdoutWriter) Export(base *modelv2.Base) error { +func (w *StdoutWriter) Export(base *model.Base) error { fmt.Printf("%v\n", base) return nil @@ -61,7 +61,7 @@ func YAMLWriterFactory(filepath string) (*YAMLWriter, error) { } // Write a YAML file from RIMO base at outputPath. 
-func (w *YAMLWriter) Export(base *modelv2.Base) error { +func (w *YAMLWriter) Export(base *model.Base) error { outputFile, err := os.Create(w.outputPath) if err != nil { return fmt.Errorf("failed to create output file: %w", err) diff --git a/pkg/metricv2/analyser.go b/pkg/metric/analyser.go similarity index 67% rename from pkg/metricv2/analyser.go rename to pkg/metric/analyser.go index d6d5187..9d1d936 100644 --- a/pkg/metricv2/analyser.go +++ b/pkg/metric/analyser.go @@ -1,12 +1,12 @@ -package metricv2 +package metric import ( - "github.com/cgi-fr/rimo/pkg/modelv2" + "github.com/cgi-fr/rimo/pkg/model" ) type Analyser[T Accepted] interface { Read(*T) - Build(*modelv2.Column) + Build(*model.Column) } type Multi[T Accepted] struct { @@ -19,7 +19,7 @@ func (m Multi[T]) Read(value *T) { } } -func (m Multi[T]) Build(metric *modelv2.Column) { +func (m Multi[T]) Build(metric *model.Column) { for _, a := range m.analyser { a.Build(metric) } diff --git a/pkg/metricv2/bool.go b/pkg/metric/bool.go similarity index 95% rename from pkg/metricv2/bool.go rename to pkg/metric/bool.go index 2a219b7..289dfcc 100644 --- a/pkg/metricv2/bool.go +++ b/pkg/metric/bool.go @@ -1,4 +1,4 @@ -package metricv2 +package metric type Bool struct { Multi[bool] diff --git a/pkg/metricv2/counter.go b/pkg/metric/counter.go similarity index 83% rename from pkg/metricv2/counter.go rename to pkg/metric/counter.go index e4d8c58..ede347a 100644 --- a/pkg/metricv2/counter.go +++ b/pkg/metric/counter.go @@ -1,7 +1,7 @@ -package metricv2 +package metric import ( - "github.com/cgi-fr/rimo/pkg/modelv2" + "github.com/cgi-fr/rimo/pkg/model" ) type Counter[T Accepted] struct { @@ -31,7 +31,7 @@ func (c *Counter[T]) Read(value *T) { } } -func (c *Counter[T]) Build(metric *modelv2.Column) { +func (c *Counter[T]) Build(metric *model.Column) { metric.MainMetric.Count = c.countTotal metric.MainMetric.Null = c.countNulls metric.MainMetric.Empty = c.countEmpty diff --git a/pkg/metricv2/distinct.go 
b/pkg/metric/distinct.go similarity index 75% rename from pkg/metricv2/distinct.go rename to pkg/metric/distinct.go index 64b180f..c01e7b0 100644 --- a/pkg/metricv2/distinct.go +++ b/pkg/metric/distinct.go @@ -1,7 +1,7 @@ -package metricv2 +package metric import ( - "github.com/cgi-fr/rimo/pkg/modelv2" + "github.com/cgi-fr/rimo/pkg/model" ) type Distinct[T Accepted] struct { @@ -20,6 +20,6 @@ func (a *Distinct[T]) Read(value *T) { } } -func (a *Distinct[T]) Build(metric *modelv2.Column) { +func (a *Distinct[T]) Build(metric *model.Column) { metric.MainMetric.Distinct = uint(len(a.values)) } diff --git a/pkg/metricv2/mean.go b/pkg/metric/mean.go similarity index 64% rename from pkg/metricv2/mean.go rename to pkg/metric/mean.go index 8338a08..35137cc 100644 --- a/pkg/metricv2/mean.go +++ b/pkg/metric/mean.go @@ -1,6 +1,6 @@ -package metricv2 +package metric -import "github.com/cgi-fr/rimo/pkg/modelv2" +import "github.com/cgi-fr/rimo/pkg/model" type Mean struct { count uint @@ -24,8 +24,8 @@ func (a *Mean) Read(value *float64) { a.mean += (*value - a.mean) / float64(a.count) } -func (a *Mean) Build(metric *modelv2.Column) { - metric.NumericMetric = &modelv2.Numeric{ +func (a *Mean) Build(metric *model.Column) { + metric.NumericMetric = &model.Numeric{ Mean: a.mean, } } diff --git a/pkg/metricv2/metric_test.go b/pkg/metric/metric_test.go similarity index 85% rename from pkg/metricv2/metric_test.go rename to pkg/metric/metric_test.go index 93a646d..3bad918 100644 --- a/pkg/metricv2/metric_test.go +++ b/pkg/metric/metric_test.go @@ -1,10 +1,10 @@ -package metricv2_test +package metric_test import ( "testing" - "github.com/cgi-fr/rimo/pkg/metricv2" - "github.com/cgi-fr/rimo/pkg/modelv2" + "github.com/cgi-fr/rimo/pkg/metric" + "github.com/cgi-fr/rimo/pkg/model" "github.com/stretchr/testify/assert" ) @@ -17,8 +17,8 @@ func TestStringMetric(t *testing.T) { //nolint:funlen min := "" max := "4441" - expectedMetric := modelv2.Column{ //nolint:exhaustruct - MainMetric: 
modelv2.Generic{ + expectedMetric := model.Column{ //nolint:exhaustruct + MainMetric: model.Generic{ Count: 12, Empty: 1, Null: 1, @@ -27,15 +27,15 @@ func TestStringMetric(t *testing.T) { //nolint:funlen Min: &min, Max: &max, }, - StringMetric: &modelv2.String{ + StringMetric: &model.String{ MinLen: 0, MaxLen: 4, CountLen: 5, - Lengths: []modelv2.StringLen{ + Lengths: []model.StringLen{ { Length: 1, Freq: 0.3333333333333333, - Metrics: modelv2.Generic{ + Metrics: model.Generic{ Count: 4, Empty: 0, Null: 0, @@ -49,9 +49,9 @@ func TestStringMetric(t *testing.T) { //nolint:funlen }, } - actualMetric := modelv2.Column{} //nolint:exhaustruct + actualMetric := model.Column{} //nolint:exhaustruct - analyser := metricv2.NewString(5, true) + analyser := metric.NewString(5, true) for index := range text { analyser.Read(&text[index]) } diff --git a/pkg/metricv2/minmax.go b/pkg/metric/minmax.go similarity index 82% rename from pkg/metricv2/minmax.go rename to pkg/metric/minmax.go index 7056434..9bf0ccb 100644 --- a/pkg/metricv2/minmax.go +++ b/pkg/metric/minmax.go @@ -1,7 +1,7 @@ -package metricv2 +package metric import ( - "github.com/cgi-fr/rimo/pkg/modelv2" + "github.com/cgi-fr/rimo/pkg/model" "golang.org/x/exp/constraints" ) @@ -35,7 +35,7 @@ func (a *MinMax[T]) Read(value *T) { } } -func (a *MinMax[T]) Build(metric *modelv2.Column) { +func (a *MinMax[T]) Build(metric *model.Column) { metric.MainMetric.Min = a.min metric.MainMetric.Max = a.max } diff --git a/pkg/metricv2/numeric.go b/pkg/metric/numeric.go similarity index 97% rename from pkg/metricv2/numeric.go rename to pkg/metric/numeric.go index 65d2eae..2b34fe0 100644 --- a/pkg/metricv2/numeric.go +++ b/pkg/metric/numeric.go @@ -1,4 +1,4 @@ -package metricv2 +package metric type Numeric struct { Multi[float64] diff --git a/pkg/metricv2/sampler.go b/pkg/metric/sampler.go similarity index 85% rename from pkg/metricv2/sampler.go rename to pkg/metric/sampler.go index 2ab449b..db59f59 100644 --- a/pkg/metricv2/sampler.go 
+++ b/pkg/metric/sampler.go @@ -1,9 +1,9 @@ -package metricv2 +package metric import ( "math/rand" - "github.com/cgi-fr/rimo/pkg/modelv2" + "github.com/cgi-fr/rimo/pkg/model" ) type Sampler[T Accepted] struct { @@ -37,7 +37,7 @@ func (s *Sampler[T]) Read(value *T) { } } -func (s *Sampler[T]) Build(metric *modelv2.Column) { +func (s *Sampler[T]) Build(metric *model.Column) { metric.MainMetric.Samples = make([]any, len(s.samples)) for i, s := range s.samples { metric.MainMetric.Samples[i] = s diff --git a/pkg/metricv2/string.go b/pkg/metric/string.go similarity index 86% rename from pkg/metricv2/string.go rename to pkg/metric/string.go index fc55069..7e5861d 100644 --- a/pkg/metricv2/string.go +++ b/pkg/metric/string.go @@ -1,9 +1,9 @@ -package metricv2 +package metric import ( "sort" - "github.com/cgi-fr/rimo/pkg/modelv2" + "github.com/cgi-fr/rimo/pkg/model" "golang.org/x/exp/maps" "golang.org/x/exp/slices" ) @@ -61,24 +61,24 @@ func (a *String) Read(value *string) { } } -func (a *String) Build(metric *modelv2.Column) { +func (a *String) Build(metric *model.Column) { a.main.Build(metric) - metric.StringMetric = &modelv2.String{ + metric.StringMetric = &model.String{ MinLen: slices.Min(maps.Keys(a.byLen)), MaxLen: slices.Max(maps.Keys(a.byLen)), CountLen: len(a.byLen), - Lengths: make([]modelv2.StringLen, 0, len(a.byLen)), + Lengths: make([]model.StringLen, 0, len(a.byLen)), } for length, analyser := range a.byLen { - lenMetric := modelv2.Column{} //nolint:exhaustruct + lenMetric := model.Column{} //nolint:exhaustruct analyser.Build(&lenMetric) - strlen := modelv2.StringLen{ + strlen := model.StringLen{ Length: length, Freq: float64(lenMetric.MainMetric.Count) / float64(metric.MainMetric.Count), - Metrics: modelv2.Generic{}, //nolint:exhaustruct + Metrics: model.Generic{}, //nolint:exhaustruct } strlen.Metrics.Count = lenMetric.MainMetric.Count strlen.Metrics.Empty = lenMetric.MainMetric.Empty diff --git a/pkg/metricv2/trueratio.go b/pkg/metric/trueratio.go 
similarity index 68% rename from pkg/metricv2/trueratio.go rename to pkg/metric/trueratio.go index 5c3106d..e9d8271 100644 --- a/pkg/metricv2/trueratio.go +++ b/pkg/metric/trueratio.go @@ -1,6 +1,6 @@ -package metricv2 +package metric -import "github.com/cgi-fr/rimo/pkg/modelv2" +import "github.com/cgi-fr/rimo/pkg/model" type TrueRatio struct { countTrue uint @@ -26,8 +26,8 @@ func (a *TrueRatio) Read(value *bool) { } } -func (a *TrueRatio) Build(metric *modelv2.Column) { - metric.BoolMetric = &modelv2.Bool{ +func (a *TrueRatio) Build(metric *model.Column) { + metric.BoolMetric = &model.Bool{ TrueRatio: float64(a.countTrue) / float64(a.count), } } diff --git a/pkg/metricv2/types.go b/pkg/metric/types.go similarity index 85% rename from pkg/metricv2/types.go rename to pkg/metric/types.go index a58e60e..05de82d 100644 --- a/pkg/metricv2/types.go +++ b/pkg/metric/types.go @@ -1,4 +1,4 @@ -package metricv2 +package metric import "golang.org/x/exp/constraints" diff --git a/pkg/modelv2/base.go b/pkg/model/base.go similarity index 96% rename from pkg/modelv2/base.go rename to pkg/model/base.go index 2a45f81..2641719 100644 --- a/pkg/modelv2/base.go +++ b/pkg/model/base.go @@ -1,4 +1,4 @@ -package modelv2 +package model const DefaultTableSize = 10 diff --git a/pkg/modelv2/column.go b/pkg/model/column.go similarity index 97% rename from pkg/modelv2/column.go rename to pkg/model/column.go index e776fe5..8eadc62 100644 --- a/pkg/modelv2/column.go +++ b/pkg/model/column.go @@ -1,4 +1,4 @@ -package modelv2 +package model type Column struct { Name string `json:"name" yaml:"name" jsonschema:"required"` diff --git a/pkg/modelv2/config.go b/pkg/model/config.go similarity index 94% rename from pkg/modelv2/config.go rename to pkg/model/config.go index 97d393b..d6742e4 100644 --- a/pkg/modelv2/config.go +++ b/pkg/model/config.go @@ -1,4 +1,4 @@ -package modelv2 +package model type Config struct { Concept string `json:"concept" yaml:"concept" jsonschema:"required"` diff --git 
a/pkg/modelv2/metrics.go b/pkg/model/metrics.go similarity index 98% rename from pkg/modelv2/metrics.go rename to pkg/model/metrics.go index 8c9069b..912d077 100644 --- a/pkg/modelv2/metrics.go +++ b/pkg/model/metrics.go @@ -1,4 +1,4 @@ -package modelv2 +package model type Generic struct { Count uint `json:"count" yaml:"count" jsonschema:"required"` diff --git a/pkg/modelv2/schema.go b/pkg/model/schema.go similarity index 95% rename from pkg/modelv2/schema.go rename to pkg/model/schema.go index 552bd90..5258dca 100644 --- a/pkg/modelv2/schema.go +++ b/pkg/model/schema.go @@ -1,4 +1,4 @@ -package modelv2 +package model import ( "encoding/json" diff --git a/pkg/rimo/driven.go b/pkg/rimo/driven.go index f5cb43e..fb79719 100644 --- a/pkg/rimo/driven.go +++ b/pkg/rimo/driven.go @@ -17,7 +17,7 @@ package rimo -import "github.com/cgi-fr/rimo/pkg/modelv2" +import "github.com/cgi-fr/rimo/pkg/model" type ColReader interface { ColName() string @@ -33,5 +33,5 @@ type Reader interface { } type Writer interface { - Export(base *modelv2.Base) error + Export(base *model.Base) error } diff --git a/pkg/rimo/driver.go b/pkg/rimo/driver.go index d4a142e..d5ced38 100644 --- a/pkg/rimo/driver.go +++ b/pkg/rimo/driver.go @@ -21,8 +21,8 @@ import ( "fmt" "sort" - "github.com/cgi-fr/rimo/pkg/metricv2" - "github.com/cgi-fr/rimo/pkg/modelv2" + "github.com/cgi-fr/rimo/pkg/metric" + "github.com/cgi-fr/rimo/pkg/model" "github.com/rs/zerolog/log" ) @@ -35,8 +35,8 @@ type Driver struct { func (d Driver) AnalyseBase(reader Reader, writer Writer) error { baseName := reader.BaseName() - base := modelv2.NewBase(baseName) - tables := map[string]modelv2.Table{} + base := model.NewBase(baseName) + tables := map[string]model.Table{} for reader.Next() { // itère colonne par colonne valreader, err := reader.Col() @@ -63,9 +63,9 @@ func (d Driver) AnalyseBase(reader Reader, writer Writer) error { table, exists := tables[valreader.TableName()] if !exists { - table = modelv2.Table{ + table = model.Table{ 
Name: valreader.TableName(), - Columns: []modelv2.Column{}, + Columns: []model.Column{}, } } @@ -80,9 +80,9 @@ func (d Driver) AnalyseBase(reader Reader, writer Writer) error { table, exists := tables[valreader.TableName()] if !exists { - table = modelv2.Table{ + table = model.Table{ Name: valreader.TableName(), - Columns: []modelv2.Column{}, + Columns: []model.Column{}, } } @@ -97,9 +97,9 @@ func (d Driver) AnalyseBase(reader Reader, writer Writer) error { table, exists := tables[valreader.TableName()] if !exists { - table = modelv2.Table{ + table = model.Table{ Name: valreader.TableName(), - Columns: []modelv2.Column{}, + Columns: []model.Column{}, } } @@ -132,18 +132,18 @@ func (d Driver) AnalyseBase(reader Reader, writer Writer) error { return nil } -func (d Driver) AnalyseString(nilcount int, firstValue string, reader ColReader) (modelv2.Column, error) { - column := modelv2.Column{ +func (d Driver) AnalyseString(nilcount int, firstValue string, reader ColReader) (model.Column, error) { + column := model.Column{ Name: reader.ColName(), Type: "string", - Config: modelv2.Config{}, //nolint:exhaustruct - MainMetric: modelv2.Generic{}, //nolint:exhaustruct - StringMetric: &modelv2.String{}, //nolint:exhaustruct + Config: model.Config{}, //nolint:exhaustruct + MainMetric: model.Generic{}, //nolint:exhaustruct + StringMetric: &model.String{}, //nolint:exhaustruct NumericMetric: nil, BoolMetric: nil, } - analyser := metricv2.NewString(d.SampleSize, true) + analyser := metric.NewString(d.SampleSize, true) for i := 0; i < nilcount; i++ { analyser.Read(nil) @@ -170,18 +170,18 @@ func (d Driver) AnalyseString(nilcount int, firstValue string, reader ColReader) return column, nil } -func (d Driver) AnalyseNumeric(nilcount int, firstValue any, reader ColReader) (modelv2.Column, error) { - column := modelv2.Column{ +func (d Driver) AnalyseNumeric(nilcount int, firstValue any, reader ColReader) (model.Column, error) { + column := model.Column{ Name: reader.ColName(), Type: 
"numeric", - Config: modelv2.Config{}, //nolint:exhaustruct - MainMetric: modelv2.Generic{}, //nolint:exhaustruct + Config: model.Config{}, //nolint:exhaustruct + MainMetric: model.Generic{}, //nolint:exhaustruct StringMetric: nil, - NumericMetric: &modelv2.Numeric{}, //nolint:exhaustruct + NumericMetric: &model.Numeric{}, //nolint:exhaustruct BoolMetric: nil, } - analyser := metricv2.NewNumeric(d.SampleSize, true) + analyser := metric.NewNumeric(d.SampleSize, true) for i := 0; i < nilcount; i++ { analyser.Read(nil) @@ -213,18 +213,18 @@ func (d Driver) AnalyseNumeric(nilcount int, firstValue any, reader ColReader) ( return column, nil } -func (d Driver) AnalyseBool(nilcount int, firstValue bool, reader ColReader) (modelv2.Column, error) { - column := modelv2.Column{ +func (d Driver) AnalyseBool(nilcount int, firstValue bool, reader ColReader) (model.Column, error) { + column := model.Column{ Name: reader.ColName(), Type: "bool", - Config: modelv2.Config{}, //nolint:exhaustruct - MainMetric: modelv2.Generic{}, //nolint:exhaustruct + Config: model.Config{}, //nolint:exhaustruct + MainMetric: model.Generic{}, //nolint:exhaustruct StringMetric: nil, NumericMetric: nil, - BoolMetric: &modelv2.Bool{}, //nolint:exhaustruct + BoolMetric: &model.Bool{}, //nolint:exhaustruct } - analyser := metricv2.NewBool(d.SampleSize, true) + analyser := metric.NewBool(d.SampleSize, true) for i := 0; i < nilcount; i++ { analyser.Read(nil) diff --git a/schema/v1/rimo.schema.json b/schema/v1/rimo.schema.json index be205df..df9c60a 100644 --- a/schema/v1/rimo.schema.json +++ b/schema/v1/rimo.schema.json @@ -22,7 +22,7 @@ "tables" ] }, - "BoolMetric": { + "Bool": { "properties": { "trueRatio": { "type": "number" @@ -55,16 +55,16 @@ "type": "boolean" }, "mainMetric": { - "$ref": "#/$defs/GenericMetric" + "$ref": "#/$defs/Generic" }, "stringMetric": { - "$ref": "#/$defs/StringMetric" + "$ref": "#/$defs/String" }, "numericMetric": { - "$ref": "#/$defs/NumericMetric" + "$ref": "#/$defs/Numeric" 
}, "boolMetric": { - "$ref": "#/$defs/BoolMetric" + "$ref": "#/$defs/Bool" } }, "additionalProperties": false, @@ -78,7 +78,7 @@ "mainMetric" ] }, - "GenericMetric": { + "Generic": { "properties": { "count": { "type": "integer" @@ -86,54 +86,28 @@ "empty": { "type": "integer" }, - "unique": { + "nulls": { "type": "integer" }, - "sample": { - "items": true, - "type": "array" - } - }, - "additionalProperties": false, - "type": "object", - "required": [ - "count", - "empty", - "unique", - "sample" - ] - }, - "LenFreq": { - "properties": { - "length": { + "distinct": { "type": "integer" }, - "freq": { - "type": "number" - }, - "sample": { - "items": { - "type": "string" - }, + "min": true, + "max": true, + "samples": { + "items": true, "type": "array" } }, "additionalProperties": false, "type": "object", "required": [ - "length", - "freq", - "sample" + "count", + "samples" ] }, - "NumericMetric": { + "Numeric": { "properties": { - "min": { - "type": "number" - }, - "max": { - "type": "number" - }, "mean": { "type": "number" } @@ -141,12 +115,10 @@ "additionalProperties": false, "type": "object", "required": [ - "min", - "max", "mean" ] }, - "StringMetric": { + "String": { "properties": { "minLen": { "type": "integer" @@ -154,15 +126,12 @@ "maxLen": { "type": "integer" }, - "mostFrequentLen": { - "items": { - "$ref": "#/$defs/LenFreq" - }, - "type": "array" + "countLen": { + "type": "integer" }, - "leastFrequentLen": { + "lengths": { "items": { - "$ref": "#/$defs/LenFreq" + "$ref": "#/$defs/StringLen" }, "type": "array" } @@ -171,9 +140,27 @@ "type": "object", "required": [ "minLen", - "maxLen", - "mostFrequentLen", - "leastFrequentLen" + "maxLen" + ] + }, + "StringLen": { + "properties": { + "length": { + "type": "integer" + }, + "freq": { + "type": "number" + }, + "metrics": { + "$ref": "#/$defs/Generic" + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "length", + "freq", + "metrics" ] }, "Table": { From 
279a456600ba41e4fcce2d942b2bac30a1dff0f9 Mon Sep 17 00:00:00 2001 From: Adrien Aury <44274230+adrienaury@users.noreply.github.com> Date: Mon, 4 Dec 2023 14:26:07 +0000 Subject: [PATCH 18/26] refactor: count distinct --- cmd/rimo/main.go | 56 +++------------------------------------------- pkg/rimo/driver.go | 7 +++--- 2 files changed, 7 insertions(+), 56 deletions(-) diff --git a/cmd/rimo/main.go b/cmd/rimo/main.go index 2b0f6a5..39d9ef8 100644 --- a/cmd/rimo/main.go +++ b/cmd/rimo/main.go @@ -41,6 +41,7 @@ var ( builtBy string //nolint: gochecknoglobals sampleSize uint //nolint: gochecknoglobals + distinct bool //nolint: gochecknoglobals ) func main() { //nolint:funlen @@ -93,7 +94,7 @@ func main() { //nolint:funlen log.Fatal().Msgf("error creating writer: %v", err) } - driver := rimo.Driver{SampleSize: sampleSize} + driver := rimo.Driver{SampleSize: sampleSize, Distinct: distinct} err = driver.AnalyseBase(reader, writer) if err != nil { @@ -105,6 +106,7 @@ func main() { //nolint:funlen } rimoAnalyseCmd.Flags().UintVar(&sampleSize, "sample-size", DefaultSampleSize, "number of sample value to collect") + rimoAnalyseCmd.Flags().BoolVarP(&distinct, "distinct", "d", false, "count distinct values") rootCmd.AddCommand(rimoAnalyseCmd) rootCmd.AddCommand(rimoSchemaCmd) @@ -114,55 +116,3 @@ func main() { //nolint:funlen os.Exit(1) } } - -func FilesList(path string, extension string) ([]string, error) { - pattern := filepath.Join(path, "*"+extension) - - files, err := filepath.Glob(pattern) - if err != nil { - return nil, fmt.Errorf("error listing files: %w", err) - } - - return files, nil -} - -var ErrNoFile = fmt.Errorf("no file found") - -func BuildFilepathList(path string, extension string) ([]string, error) { - err := ValidateDirPath(path) - if err != nil { - return nil, fmt.Errorf("failed to validate input directory: %w", err) - } - - pattern := filepath.Join(path, "*"+extension) - - files, err := filepath.Glob(pattern) - if err != nil { - return nil, fmt.Errorf("error 
listing files: %w", err) - } - - if len(files) == 0 { - return nil, fmt.Errorf("%w : no %s files found in %s", ErrNoFile, extension, path) - } - - return files, nil -} - -func ValidateDirPath(path string) error { - fileInfo, err := os.Stat(path) - if os.IsNotExist(err) { - return fmt.Errorf("%w: %s", infra.ErrDirDoesNotExist, path) - } else if err != nil { - return fmt.Errorf("failed to get directory info: %w", err) - } - - if !fileInfo.IsDir() { - return fmt.Errorf("%w: %s", infra.ErrPathIsNotDir, path) - } - - if fileInfo.Mode().Perm()&infra.WriteDirPerm != infra.WriteDirPerm { - return fmt.Errorf("%w: %s", infra.ErrWriteDirPermission, path) - } - - return nil -} diff --git a/pkg/rimo/driver.go b/pkg/rimo/driver.go index d5ced38..144218c 100644 --- a/pkg/rimo/driver.go +++ b/pkg/rimo/driver.go @@ -29,6 +29,7 @@ import ( type Driver struct { SampleSize uint + Distinct bool } //nolint:funlen,cyclop,gocognit @@ -143,7 +144,7 @@ func (d Driver) AnalyseString(nilcount int, firstValue string, reader ColReader) BoolMetric: nil, } - analyser := metric.NewString(d.SampleSize, true) + analyser := metric.NewString(d.SampleSize, d.Distinct) for i := 0; i < nilcount; i++ { analyser.Read(nil) @@ -181,7 +182,7 @@ func (d Driver) AnalyseNumeric(nilcount int, firstValue any, reader ColReader) ( BoolMetric: nil, } - analyser := metric.NewNumeric(d.SampleSize, true) + analyser := metric.NewNumeric(d.SampleSize, d.Distinct) for i := 0; i < nilcount; i++ { analyser.Read(nil) @@ -224,7 +225,7 @@ func (d Driver) AnalyseBool(nilcount int, firstValue bool, reader ColReader) (mo BoolMetric: &model.Bool{}, //nolint:exhaustruct } - analyser := metric.NewBool(d.SampleSize, true) + analyser := metric.NewBool(d.SampleSize, d.Distinct) for i := 0; i < nilcount; i++ { analyser.Read(nil) From 1db1082561faea468b835461846fa2c619f9e9f3 Mon Sep 17 00:00:00 2001 From: Adrien Aury <44274230+adrienaury@users.noreply.github.com> Date: Mon, 4 Dec 2023 14:39:33 +0000 Subject: [PATCH 19/26] refactor: bool 
samples --- pkg/metric/bool.go | 5 +++-- pkg/model/metrics.go | 4 ++-- schema/v1/rimo.schema.json | 2 ++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pkg/metric/bool.go b/pkg/metric/bool.go index 289dfcc..47fa462 100644 --- a/pkg/metric/bool.go +++ b/pkg/metric/bool.go @@ -6,8 +6,9 @@ type Bool struct { func NewBool(sampleSize uint, countDistinct bool) *Bool { mainAnalyser := []Analyser[bool]{ - NewCounter[bool](), // count total, count null - NewTrueRatio(), // calculate true ratio + NewCounter[bool](), // count total, count null, count empty + NewSampler[bool](sampleSize), // store few samples + NewTrueRatio(), // calculate true ratio } if countDistinct { diff --git a/pkg/model/metrics.go b/pkg/model/metrics.go index 912d077..5073cc6 100644 --- a/pkg/model/metrics.go +++ b/pkg/model/metrics.go @@ -2,8 +2,8 @@ package model type Generic struct { Count uint `json:"count" yaml:"count" jsonschema:"required"` - Empty uint `json:"empty,omitempty" yaml:"empty,omitempty"` - Null uint `json:"nulls,omitempty" yaml:"nulls,omitempty"` + Empty uint `json:"empty" yaml:"empty" jsonschema:"required"` + Null uint `json:"nulls" yaml:"nulls" jsonschema:"required"` Distinct uint `json:"distinct,omitempty" yaml:"distinct,omitempty"` Min any `json:"min,omitempty" yaml:"min,omitempty"` Max any `json:"max,omitempty" yaml:"max,omitempty"` diff --git a/schema/v1/rimo.schema.json b/schema/v1/rimo.schema.json index df9c60a..187194b 100644 --- a/schema/v1/rimo.schema.json +++ b/schema/v1/rimo.schema.json @@ -103,6 +103,8 @@ "type": "object", "required": [ "count", + "empty", + "nulls", "samples" ] }, From 2982a660090d5ec3dea55945482b36e30adf56b3 Mon Sep 17 00:00:00 2001 From: Adrien Aury <44274230+adrienaury@users.noreply.github.com> Date: Mon, 4 Dec 2023 15:27:23 +0000 Subject: [PATCH 20/26] refactor: put benchmark back --- internal/infra/filesReader.go | 244 +++++++++++++-------------- internal/infra/filesReader_v2.go | 161 ------------------ pkg/rimo/driver_test.go | 50 
++++++ testdata/benchmark/buildBenchData.sh | 56 +++--- 4 files changed, 196 insertions(+), 315 deletions(-) delete mode 100644 internal/infra/filesReader_v2.go create mode 100644 pkg/rimo/driver_test.go diff --git a/internal/infra/filesReader.go b/internal/infra/filesReader.go index 8430ae9..1e4a5d9 100644 --- a/internal/infra/filesReader.go +++ b/internal/infra/filesReader.go @@ -1,173 +1,161 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - package infra import ( "errors" "fmt" -) + "os" + "path" + "path/filepath" + "strings" -// Errors declaration. -var ( - ErrInvalidFilePath = errors.New("failed to validate path") - ErrNoFilePath = errors.New("no file path provided") - ErrNonUniqueBase = errors.New("base name is not unique") + "github.com/cgi-fr/rimo/pkg/rimo" + "github.com/goccy/go-json" ) -// FilesReader can read multiple type of file and feed data to rimo. -// FilesReader is responsible of : -// - BaseName() return the name of the base -// - Next() return true if there is a next value to read -// - Value() return the value of the current column, the name of the column and the name of the table -// Interface itself with a Loader interface. Which currently only supports YAML files. -// Loader and FilesReader can be initialized with LoaderFactory and FilesReaderFactory. 
-type FilesReader struct { - filepathList []string - loader JSONLinesLoader // responsible of loading a file format - baseName string - // variable for looping over columns - fileIndex int - colNameMapIndex map[int]string // map of column name by index - colIndex int // value of current column index - // given by Value() - dataMap map[string][]interface{} - tableName string // filled by FilesReader -} - -// Constructor for FilesReader. -func FilesReaderFactory(filepathList []string) (*FilesReader, error) { - var err error - - // Process inputDirList - if len(filepathList) == 0 { - return nil, ErrNoFilePath +var ErrReadFile = errors.New("error while reading file") + +type JSONLFolderReader struct { + basename string + readers []*JSONLFileReader + current int +} + +func NewJSONLFolderReader(folderpath string) (*JSONLFolderReader, error) { + basename := path.Base(folderpath) + + pattern := filepath.Join(folderpath, "*.jsonl") + + files, err := filepath.Glob(pattern) + if err != nil { + return nil, fmt.Errorf("error listing files: %w", err) } - for _, path := range filepathList { - err := ValidateFilePath(path) + readers := make([]*JSONLFileReader, len(files)) + for index, filepath := range files { + readers[index], err = NewJSONLFileReader(basename, filepath) if err != nil { - return nil, ErrInvalidFilePath + return nil, fmt.Errorf("error opening files: %w", err) } } - // Initialize FilesReader - var filesReader FilesReader - filesReader.filepathList = filepathList - filesReader.fileIndex = -1 + return &JSONLFolderReader{ + basename: basename, + readers: readers, + current: 0, + }, nil +} - filesReader.baseName, err = filesReader.isBaseUnique() - if err != nil { - return nil, fmt.Errorf("base is not unique: %w", err) - } +func (r *JSONLFolderReader) BaseName() string { + return r.basename +} - // Use of JSONLinesLoader - filesReader.loader = JSONLinesLoader{} +func (r *JSONLFolderReader) Next() bool { + if r.current < len(r.readers) && !r.readers[r.current].Next() { + 
r.current++ - return &filesReader, nil + return r.Next() + } + + return r.current < len(r.readers) } -// Reader interface implementation +func (r *JSONLFolderReader) Col() (rimo.ColReader, error) { //nolint:ireturn + return r.readers[r.current].Col() +} -func (r *FilesReader) BaseName() string { - return r.baseName +type JSONLFileReader struct { + tablename string + source *os.File + columns []string + current int + decoder *json.Decoder + basename string } -func (r *FilesReader) Next() bool { - // First call to Next() - if r.fileIndex == -1 { - r.fileIndex = 0 - r.colIndex = 0 +func NewJSONLFileReader(basename string, filepath string) (*JSONLFileReader, error) { + source, err := os.Open(filepath) + if err != nil { + return nil, fmt.Errorf("%w", err) + } + + template := map[string]any{} - return true + decoder := json.NewDecoder(source) + if err := decoder.Decode(&template); err != nil { + return nil, fmt.Errorf("%w: %w", ErrReadFile, err) } - // Current file contain column left to process. - if r.colIndex < len(r.dataMap) { - r.colIndex++ + if _, err := source.Seek(0, 0); err != nil { + return nil, fmt.Errorf("%w: %w", ErrReadFile, err) } - // Current file contain no columns left to process. - if r.colIndex == len(r.dataMap) { - // Current file is last file. - if r.fileIndex == len(r.filepathList)-1 { - return false - } - // There is a next file. - r.fileIndex++ - r.colIndex = 0 + columns := make([]string, 0, len(template)) + for column := range template { + columns = append(columns, column) } - return true + return &JSONLFileReader{ + tablename: strings.TrimSuffix(path.Base(filepath), path.Ext(filepath)), + source: source, + columns: columns, + current: -1, + decoder: json.NewDecoder(source), + basename: basename, + }, nil +} + +func (fr *JSONLFileReader) BaseName() string { + return fr.basename } -// Charger les fichiers un à un dans une dataMap. -// Retourne les valeurs d'une colonne, son nom et le nom de table. 
-func (r *FilesReader) Value() ([]interface{}, string, string, error) { - var err error +func (fr *JSONLFileReader) Next() bool { + fr.current++ - // colIndex = 0 : new file to load - if r.colIndex == 0 { - filepath := r.filepathList[r.fileIndex] + if _, err := fr.source.Seek(0, 0); err != nil { + panic(err) + } - // Extract table name from file name - _, r.tableName, err = ExtractName(filepath) - if err != nil { - return nil, "", "", fmt.Errorf("failed to extract table name: %w", err) - } + fr.decoder = json.NewDecoder(fr.source) - // Load file in dataMap - r.dataMap, err = r.loader.Load(r.filepathList[r.fileIndex]) - if err != nil { - panic(err) - } + return fr.current < len(fr.columns) +} - // Create a map of column name by index - r.colNameMapIndex = make(map[int]string, 0) - i := 0 +func (fr *JSONLFileReader) Col() (rimo.ColReader, error) { //nolint:ireturn + return NewJSONLColReader(fr.tablename, fr.columns[fr.current], fr.decoder), nil +} - for k := range r.dataMap { - r.colNameMapIndex[i] = k - i++ - } +type JSONLColReader struct { + table string + column string + decoder *json.Decoder +} + +func NewJSONLColReader(table, column string, decoder *json.Decoder) *JSONLColReader { + return &JSONLColReader{ + table: table, + column: column, + decoder: decoder, } +} - // colIndex = n : current file have been partially processed - currentColName := r.colNameMapIndex[r.colIndex] - // return values, colName, tableName - return r.dataMap[currentColName], currentColName, r.tableName, nil +func (cr *JSONLColReader) ColName() string { + return cr.column } -func (r *FilesReader) isBaseUnique() (string, error) { - baseName, _, err := ExtractName(r.filepathList[0]) - if err != nil { - return "", err - } +func (cr *JSONLColReader) TableName() string { + return cr.table +} - for _, path := range r.filepathList { - baseNameI, _, err := ExtractName(path) - if err != nil { - return "", err - } +func (cr *JSONLColReader) Next() bool { + return cr.decoder.More() +} - if baseName 
!= baseNameI { - return "", fmt.Errorf("%w : %s and %s", ErrNonUniqueBase, baseName, baseNameI) - } +func (cr *JSONLColReader) Value() (any, error) { + row := map[string]any{} + + if err := cr.decoder.Decode(&row); err != nil { + return nil, fmt.Errorf("%w: %w", ErrReadFile, err) } - return baseName, nil + return row[cr.column], nil } diff --git a/internal/infra/filesReader_v2.go b/internal/infra/filesReader_v2.go deleted file mode 100644 index 1e4a5d9..0000000 --- a/internal/infra/filesReader_v2.go +++ /dev/null @@ -1,161 +0,0 @@ -package infra - -import ( - "errors" - "fmt" - "os" - "path" - "path/filepath" - "strings" - - "github.com/cgi-fr/rimo/pkg/rimo" - "github.com/goccy/go-json" -) - -var ErrReadFile = errors.New("error while reading file") - -type JSONLFolderReader struct { - basename string - readers []*JSONLFileReader - current int -} - -func NewJSONLFolderReader(folderpath string) (*JSONLFolderReader, error) { - basename := path.Base(folderpath) - - pattern := filepath.Join(folderpath, "*.jsonl") - - files, err := filepath.Glob(pattern) - if err != nil { - return nil, fmt.Errorf("error listing files: %w", err) - } - - readers := make([]*JSONLFileReader, len(files)) - for index, filepath := range files { - readers[index], err = NewJSONLFileReader(basename, filepath) - if err != nil { - return nil, fmt.Errorf("error opening files: %w", err) - } - } - - return &JSONLFolderReader{ - basename: basename, - readers: readers, - current: 0, - }, nil -} - -func (r *JSONLFolderReader) BaseName() string { - return r.basename -} - -func (r *JSONLFolderReader) Next() bool { - if r.current < len(r.readers) && !r.readers[r.current].Next() { - r.current++ - - return r.Next() - } - - return r.current < len(r.readers) -} - -func (r *JSONLFolderReader) Col() (rimo.ColReader, error) { //nolint:ireturn - return r.readers[r.current].Col() -} - -type JSONLFileReader struct { - tablename string - source *os.File - columns []string - current int - decoder *json.Decoder - 
basename string -} - -func NewJSONLFileReader(basename string, filepath string) (*JSONLFileReader, error) { - source, err := os.Open(filepath) - if err != nil { - return nil, fmt.Errorf("%w", err) - } - - template := map[string]any{} - - decoder := json.NewDecoder(source) - if err := decoder.Decode(&template); err != nil { - return nil, fmt.Errorf("%w: %w", ErrReadFile, err) - } - - if _, err := source.Seek(0, 0); err != nil { - return nil, fmt.Errorf("%w: %w", ErrReadFile, err) - } - - columns := make([]string, 0, len(template)) - for column := range template { - columns = append(columns, column) - } - - return &JSONLFileReader{ - tablename: strings.TrimSuffix(path.Base(filepath), path.Ext(filepath)), - source: source, - columns: columns, - current: -1, - decoder: json.NewDecoder(source), - basename: basename, - }, nil -} - -func (fr *JSONLFileReader) BaseName() string { - return fr.basename -} - -func (fr *JSONLFileReader) Next() bool { - fr.current++ - - if _, err := fr.source.Seek(0, 0); err != nil { - panic(err) - } - - fr.decoder = json.NewDecoder(fr.source) - - return fr.current < len(fr.columns) -} - -func (fr *JSONLFileReader) Col() (rimo.ColReader, error) { //nolint:ireturn - return NewJSONLColReader(fr.tablename, fr.columns[fr.current], fr.decoder), nil -} - -type JSONLColReader struct { - table string - column string - decoder *json.Decoder -} - -func NewJSONLColReader(table, column string, decoder *json.Decoder) *JSONLColReader { - return &JSONLColReader{ - table: table, - column: column, - decoder: decoder, - } -} - -func (cr *JSONLColReader) ColName() string { - return cr.column -} - -func (cr *JSONLColReader) TableName() string { - return cr.table -} - -func (cr *JSONLColReader) Next() bool { - return cr.decoder.More() -} - -func (cr *JSONLColReader) Value() (any, error) { - row := map[string]any{} - - if err := cr.decoder.Decode(&row); err != nil { - return nil, fmt.Errorf("%w: %w", ErrReadFile, err) - } - - return row[cr.column], nil -} diff --git 
a/pkg/rimo/driver_test.go b/pkg/rimo/driver_test.go new file mode 100644 index 0000000..453101e --- /dev/null +++ b/pkg/rimo/driver_test.go @@ -0,0 +1,50 @@ +package rimo_test + +import ( + "fmt" + "path/filepath" + "testing" + "time" + + "github.com/cgi-fr/rimo/internal/infra" + "github.com/cgi-fr/rimo/pkg/rimo" + "github.com/stretchr/testify/require" +) + +const ( + dataDir = "../../testdata/" + inputName = "data_input.jsonl" + outputName = "interface_data_output.yaml" + expectedName = "data_expected.yaml" +) + +// Benchmark (same as previous analyse_test.go benchmark). +func BenchmarkAnalyseInterface(b *testing.B) { + for _, numLines := range []int{100, 1000, 10000, 100000} { + inputPath := filepath.Join(dataDir, fmt.Sprintf("benchmark/mixed/%d", numLines)) + outputPath := filepath.Join(dataDir, fmt.Sprintf("benchmark/mixed/%dinterface_output.yaml", numLines)) + + b.Run(fmt.Sprintf("numLines=%d", numLines), func(b *testing.B) { + startTime := time.Now() + + reader, err := infra.NewJSONLFolderReader(inputPath) + require.NoError(b, err) + + writer, err := infra.YAMLWriterFactory(outputPath) + require.NoError(b, err) + + driver := rimo.Driver{SampleSize: 5, Distinct: true} + + b.ResetTimer() + for n := 0; n < b.N; n++ { + err := driver.AnalyseBase(reader, writer) + require.NoError(b, err) + } + b.StopTimer() + + elapsed := time.Since(startTime) + linesPerSecond := float64(numLines*b.N) / elapsed.Seconds() + b.ReportMetric(linesPerSecond, "lines/s") + }) + } +} diff --git a/testdata/benchmark/buildBenchData.sh b/testdata/benchmark/buildBenchData.sh index fa046ce..60b2138 100755 --- a/testdata/benchmark/buildBenchData.sh +++ b/testdata/benchmark/buildBenchData.sh @@ -3,53 +3,57 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) cd "${SCRIPT_DIR}/mixed/" -if [ ! -f 100_input.jsonl ]; then - pimo --empty-input --repeat=100 > 100_input.jsonl +mkdir 100 1000 10000 100000 +if [ ! 
-f 100/input.jsonl ]; then + pimo --empty-input --repeat=100 > 100/input.jsonl fi -if [ ! -f 1000_input.jsonl ]; then - pimo --empty-input --repeat=1000 > 1000_input.jsonl +if [ ! -f 1000/input.jsonl ]; then + pimo --empty-input --repeat=1000 > 1000/input.jsonl fi -if [ ! -f 10000_input.jsonl ]; then - pimo --empty-input --repeat=10000 > 10000_input.jsonl +if [ ! -f 10000/input.jsonl ]; then + pimo --empty-input --repeat=10000 > 10000/input.jsonl fi -if [ ! -f 100000_input.jsonl ]; then - pimo --empty-input --repeat=100000 > 100000_input.jsonl +if [ ! -f 100000/input.jsonl ]; then + pimo --empty-input --repeat=100000 > 100000/input.jsonl fi echo "data for mixed : OK" cd "${SCRIPT_DIR}/bool/" -if [ ! -f 100_input.jsonl ]; then - pimo --empty-input --repeat=100 > 100_input.jsonl +mkdir 100 1000 10000 100000 +if [ ! -f 100/input.jsonl ]; then + pimo --empty-input --repeat=100 > 100/input.jsonl fi -if [ ! -f 1000_input.jsonl ]; then - pimo --empty-input --repeat=1000 > 1000_input.jsonl +if [ ! -f 1000/input.jsonl ]; then + pimo --empty-input --repeat=1000 > 1000/input.jsonl fi -if [ ! -f 10000_input.jsonl ]; then - pimo --empty-input --repeat=10000 > 10000_input.jsonl +if [ ! -f 10000/input.jsonl ]; then + pimo --empty-input --repeat=10000 > 10000/input.jsonl fi echo "data for mixed : OK" cd "${SCRIPT_DIR}/numeric/" -if [ ! -f 100_input.jsonl ]; then - pimo --empty-input --repeat=100 > 100_input.jsonl +mkdir 100 1000 10000 100000 +if [ ! -f 100/input.jsonl ]; then + pimo --empty-input --repeat=100 > 100/input.jsonl fi -if [ ! -f 1000_input.jsonl ]; then - pimo --empty-input --repeat=1000 > 1000_input.jsonl +if [ ! -f 1000/input.jsonl ]; then + pimo --empty-input --repeat=1000 > 1000/input.jsonl fi -if [ ! -f 10000_input.jsonl ]; then - pimo --empty-input --repeat=10000 > 10000_input.jsonl +if [ ! -f 10000/input.jsonl ]; then + pimo --empty-input --repeat=10000 > 10000/input.jsonl fi echo "data for numeric : OK" cd "${SCRIPT_DIR}/text/" -if [ ! 
-f 100_input.jsonl ]; then - pimo --empty-input --repeat=100 > 100_input.jsonl +mkdir 100 1000 10000 100000 +if [ ! -f 100/input.jsonl ]; then + pimo --empty-input --repeat=100 > 100/input.jsonl fi -if [ ! -f 1000_input.jsonl ]; then - pimo --empty-input --repeat=1000 > 1000_input.jsonl +if [ ! -f 1000/input.jsonl ]; then + pimo --empty-input --repeat=1000 > 1000/input.jsonl fi -if [ ! -f 10000_input.jsonl ]; then - pimo --empty-input --repeat=10000 > 10000_input.jsonl +if [ ! -f 10000/input.jsonl ]; then + pimo --empty-input --repeat=10000 > 10000/input.jsonl fi echo "data generated for text : OK" From 4a98222d7949d415cd44b9391c212553bf8d0c29 Mon Sep 17 00:00:00 2001 From: Adrien Aury <44274230+adrienaury@users.noreply.github.com> Date: Mon, 4 Dec 2023 15:37:26 +0000 Subject: [PATCH 21/26] refactor: disable 100000 lines bench --- pkg/rimo/driver_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/rimo/driver_test.go b/pkg/rimo/driver_test.go index 453101e..f9900fe 100644 --- a/pkg/rimo/driver_test.go +++ b/pkg/rimo/driver_test.go @@ -20,7 +20,7 @@ const ( // Benchmark (same as previous analyse_test.go benchmark). 
func BenchmarkAnalyseInterface(b *testing.B) { - for _, numLines := range []int{100, 1000, 10000, 100000} { + for _, numLines := range []int{100, 1000, 10000} { inputPath := filepath.Join(dataDir, fmt.Sprintf("benchmark/mixed/%d", numLines)) outputPath := filepath.Join(dataDir, fmt.Sprintf("benchmark/mixed/%dinterface_output.yaml", numLines)) From 3769e06d55d3d4f12024594434ee4f8837e05de9 Mon Sep 17 00:00:00 2001 From: Adrien Aury <44274230+adrienaury@users.noreply.github.com> Date: Mon, 4 Dec 2023 16:05:49 +0000 Subject: [PATCH 22/26] refactor: add logs --- cmd/rimo/main.go | 75 ++++++++++++++++++++++++---- go.mod | 2 +- internal/infra/filesReader.go | 19 +++++++ pkg/rimo/driver.go | 6 +++ test/suites/cli/metrics.yml | 8 +++ test/suites/testdata/main/data.jsonl | 10 ++++ 6 files changed, 109 insertions(+), 11 deletions(-) create mode 100644 test/suites/cli/metrics.yml create mode 100644 test/suites/testdata/main/data.jsonl diff --git a/cmd/rimo/main.go b/cmd/rimo/main.go index 39d9ef8..57bdaac 100644 --- a/cmd/rimo/main.go +++ b/cmd/rimo/main.go @@ -21,10 +21,13 @@ import ( "fmt" "os" "path/filepath" + "runtime" + "strings" "github.com/cgi-fr/rimo/internal/infra" "github.com/cgi-fr/rimo/pkg/model" "github.com/cgi-fr/rimo/pkg/rimo" + "github.com/mattn/go-isatty" "github.com/rs/zerolog" "github.com/rs/zerolog/log" "github.com/spf13/cobra" @@ -32,21 +35,25 @@ import ( const DefaultSampleSize = uint(5) -// Provisioned by ldflags. 
+//nolint:gochecknoglobals var ( - name string //nolint: gochecknoglobals - version string //nolint: gochecknoglobals - commit string //nolint: gochecknoglobals - buildDate string //nolint: gochecknoglobals - builtBy string //nolint: gochecknoglobals - - sampleSize uint //nolint: gochecknoglobals + name string // provisioned by ldflags + version string // provisioned by ldflags + commit string // provisioned by ldflags + buildDate string // provisioned by ldflags + builtBy string // provisioned by ldflags + + verbosity string + jsonlog bool + debug bool + colormode string + + sampleSize uint distinct bool //nolint: gochecknoglobals ) func main() { //nolint:funlen - log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr}) //nolint: exhaustruct - + cobra.OnInitialize(initLog) log.Info().Msgf("%v %v (commit=%v date=%v by=%v)", name, version, commit, buildDate, builtBy) rootCmd := &cobra.Command{ //nolint:exhaustruct @@ -59,6 +66,12 @@ func main() { //nolint:funlen There is NO WARRANTY, to the extent permitted by law.`, version, commit, buildDate, builtBy), } + rootCmd.PersistentFlags().StringVarP(&verbosity, "verbosity", "v", "warn", + "set level of log verbosity : none (0), error (1), warn (2), info (3), debug (4), trace (5)") + rootCmd.PersistentFlags().BoolVar(&debug, "debug", false, "add debug information to logs (very slow)") + rootCmd.PersistentFlags().BoolVar(&jsonlog, "log-json", false, "output logs in JSON format") + rootCmd.PersistentFlags().StringVar(&colormode, "color", "auto", "use colors in log outputs : yes, no or auto") + rimoSchemaCmd := &cobra.Command{ //nolint:exhaustruct Use: "jsonschema", Short: "Return rimo jsonschema", @@ -116,3 +129,45 @@ func main() { //nolint:funlen os.Exit(1) } } + +func initLog() { + color := false + + switch strings.ToLower(colormode) { + case "auto": + if isatty.IsTerminal(os.Stdout.Fd()) && runtime.GOOS != "windows" { + color = true + } + case "yes", "true", "1", "on", "enable": + color = true + } + + if jsonlog { 
+ log.Logger = zerolog.New(os.Stderr) + } else { + log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr, NoColor: !color}) //nolint:exhaustruct + } + + if debug { + log.Logger = log.Logger.With().Caller().Logger() + } + + setVerbosity() +} + +func setVerbosity() { + switch verbosity { + case "trace", "5": + zerolog.SetGlobalLevel(zerolog.TraceLevel) + case "debug", "4": + zerolog.SetGlobalLevel(zerolog.DebugLevel) + case "info", "3": + zerolog.SetGlobalLevel(zerolog.InfoLevel) + case "warn", "2": + zerolog.SetGlobalLevel(zerolog.WarnLevel) + case "error", "1": + zerolog.SetGlobalLevel(zerolog.ErrorLevel) + default: + zerolog.SetGlobalLevel(zerolog.Disabled) + } +} diff --git a/go.mod b/go.mod index ec3a675..a472ce0 100644 --- a/go.mod +++ b/go.mod @@ -23,7 +23,7 @@ require ( github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/invopop/jsonschema v0.7.0 // direct github.com/mattn/go-colorable v0.1.13 // indirect - github.com/mattn/go-isatty v0.0.19 // indirect + github.com/mattn/go-isatty v0.0.19 github.com/pmezard/go-difflib v1.0.0 // indirect github.com/spf13/pflag v1.0.5 // indirect golang.org/x/sys v0.13.0 // indirect diff --git a/internal/infra/filesReader.go b/internal/infra/filesReader.go index 1e4a5d9..9503edd 100644 --- a/internal/infra/filesReader.go +++ b/internal/infra/filesReader.go @@ -10,6 +10,7 @@ import ( "github.com/cgi-fr/rimo/pkg/rimo" "github.com/goccy/go-json" + "github.com/rs/zerolog/log" ) var ErrReadFile = errors.New("error while reading file") @@ -21,6 +22,8 @@ type JSONLFolderReader struct { } func NewJSONLFolderReader(folderpath string) (*JSONLFolderReader, error) { + log.Trace().Str("path", folderpath).Msg("reading folder") + basename := path.Base(folderpath) pattern := filepath.Join(folderpath, "*.jsonl") @@ -31,7 +34,10 @@ func NewJSONLFolderReader(folderpath string) (*JSONLFolderReader, error) { } readers := make([]*JSONLFileReader, len(files)) + for index, filepath := range files { + log.Trace().Str("path", 
filepath).Msg("scanning file") + readers[index], err = NewJSONLFileReader(basename, filepath) if err != nil { return nil, fmt.Errorf("error opening files: %w", err) @@ -73,6 +79,8 @@ type JSONLFileReader struct { } func NewJSONLFileReader(basename string, filepath string) (*JSONLFileReader, error) { + log.Trace().Str("path", filepath).Msg("opening file") + source, err := os.Open(filepath) if err != nil { return nil, fmt.Errorf("%w", err) @@ -80,17 +88,24 @@ func NewJSONLFileReader(basename string, filepath string) (*JSONLFileReader, err template := map[string]any{} + log.Trace().Str("path", filepath).Msg("decoding line template") + decoder := json.NewDecoder(source) if err := decoder.Decode(&template); err != nil { return nil, fmt.Errorf("%w: %w", ErrReadFile, err) } + log.Trace().Str("path", filepath).Any("template", template).Msg("decoded line template") + if _, err := source.Seek(0, 0); err != nil { return nil, fmt.Errorf("%w: %w", ErrReadFile, err) } columns := make([]string, 0, len(template)) + for column := range template { + log.Trace().Str("path", filepath).Any("column", column).Msg("registering column") + columns = append(columns, column) } @@ -117,6 +132,8 @@ func (fr *JSONLFileReader) Next() bool { fr.decoder = json.NewDecoder(fr.source) + log.Trace().Str("base", fr.basename).Any("index", fr.current).Msg("successful jump to next column") + return fr.current < len(fr.columns) } @@ -157,5 +174,7 @@ func (cr *JSONLColReader) Value() (any, error) { return nil, fmt.Errorf("%w: %w", ErrReadFile, err) } + log.Trace().Str("table", cr.table).Str("column", cr.column).Any("value", row[cr.column]).Msg("read value") + return row[cr.column], nil } diff --git a/pkg/rimo/driver.go b/pkg/rimo/driver.go index 144218c..6e28979 100644 --- a/pkg/rimo/driver.go +++ b/pkg/rimo/driver.go @@ -161,6 +161,8 @@ func (d Driver) AnalyseString(nilcount int, firstValue string, reader ColReader) switch valtyped := val.(type) { case string: analyser.Read(&valtyped) + case nil: + 
analyser.Read(nil) default: return column, fmt.Errorf("invalue value type : %w", err) } @@ -242,6 +244,8 @@ func (d Driver) AnalyseBool(nilcount int, firstValue bool, reader ColReader) (mo switch valtyped := val.(type) { case bool: analyser.Read(&valtyped) + case nil: + analyser.Read(nil) default: return column, fmt.Errorf("invalue value type : %w", err) } @@ -281,6 +285,8 @@ func GetFloat64(value any) (*float64, error) { converted = float64(valtyped) case uint64: converted = float64(valtyped) + case nil: + return nil, nil //nolint:nilnil default: return nil, fmt.Errorf("%w : %T", ErrInvalidValueType, value) } diff --git a/test/suites/cli/metrics.yml b/test/suites/cli/metrics.yml new file mode 100644 index 0000000..047adab --- /dev/null +++ b/test/suites/cli/metrics.yml @@ -0,0 +1,8 @@ +name: test metrics +testcases: + - name: main metric + steps: + - script: |- + rimo analyse ../testdata/data1/ ../testdata/data1/output/ + assertions: + - result.code ShouldEqual 0 diff --git a/test/suites/testdata/main/data.jsonl b/test/suites/testdata/main/data.jsonl new file mode 100644 index 0000000..869be65 --- /dev/null +++ b/test/suites/testdata/main/data.jsonl @@ -0,0 +1,10 @@ +{"string":"","bool":true,"numeric":0} +{"string":" ","bool":false,"numeric":1} +{"string":"__","bool":true,"numeric":3.1415} +{"string":"new\nline","bool":false,"numeric":1.0E+2} +{"string":"hello world","bool":true,"numeric":21.2e-7} +{"string":"tabs\t","bool":false,"numeric":-235} +{"string":"教育漢字","bool":true,"numeric":-0} +{"string":"\\","bool":false,"numeric":0.0} +{"string":"\u20ac","bool":true,"numeric":9.0} +{"string":null,"bool":null,"numeric":null} From 0f4d51df33a0ff3f4d486261e88581e32f6529ec Mon Sep 17 00:00:00 2001 From: Adrien Aury <44274230+adrienaury@users.noreply.github.com> Date: Mon, 4 Dec 2023 16:10:36 +0000 Subject: [PATCH 23/26] refactor: bench set global level warn --- pkg/rimo/driver_test.go | 3 +++ test/suites/cli/metrics.yml | 2 +- 2 files changed, 4 insertions(+), 1 
deletion(-) diff --git a/pkg/rimo/driver_test.go b/pkg/rimo/driver_test.go index f9900fe..f8dd16f 100644 --- a/pkg/rimo/driver_test.go +++ b/pkg/rimo/driver_test.go @@ -8,6 +8,7 @@ import ( "github.com/cgi-fr/rimo/internal/infra" "github.com/cgi-fr/rimo/pkg/rimo" + "github.com/rs/zerolog" "github.com/stretchr/testify/require" ) @@ -20,6 +21,8 @@ const ( // Benchmark (same as previous analyse_test.go benchmark). func BenchmarkAnalyseInterface(b *testing.B) { + zerolog.SetGlobalLevel(zerolog.WarnLevel) + for _, numLines := range []int{100, 1000, 10000} { inputPath := filepath.Join(dataDir, fmt.Sprintf("benchmark/mixed/%d", numLines)) outputPath := filepath.Join(dataDir, fmt.Sprintf("benchmark/mixed/%dinterface_output.yaml", numLines)) diff --git a/test/suites/cli/metrics.yml b/test/suites/cli/metrics.yml index 047adab..4f4c5ea 100644 --- a/test/suites/cli/metrics.yml +++ b/test/suites/cli/metrics.yml @@ -3,6 +3,6 @@ testcases: - name: main metric steps: - script: |- - rimo analyse ../testdata/data1/ ../testdata/data1/output/ + rimo analyse -d ../testdata/main ../testdata/main/output assertions: - result.code ShouldEqual 0 From d7519192fd2b62f69da689610cc627224f4da2b0 Mon Sep 17 00:00:00 2001 From: Adrien Aury <44274230+adrienaury@users.noreply.github.com> Date: Mon, 4 Dec 2023 16:52:39 +0000 Subject: [PATCH 24/26] refactor: test int --- .devcontainer/Dockerfile | 4 +- build.yml | 6 + test/suites/cli/metrics.yml | 240 +++++++++++++++++++++- test/suites/testdata/main/data.jsonl | 10 +- test/suites/testdata/main/output/.gitkeep | 0 5 files changed, 252 insertions(+), 8 deletions(-) create mode 100644 test/suites/testdata/main/output/.gitkeep diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index ba5566f..24f3fe4 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -10,9 +10,11 @@ RUN apk add --update --progress --no-cache make gomplate ARG VERSION_GOLICENSE=0.2.0 ARG VERSION_MILLER=6.2.0 +ARG VERSION_YQ=4.40.4 RUN wget -nv -O- 
https://github.com/mitchellh/golicense/releases/download/v${VERSION_GOLICENSE}/golicense_${VERSION_GOLICENSE}_linux_x86_64.tar.gz | tar xz -C /usr/bin golicense \ && wget -nv -O- https://github.com/johnkerl/miller/releases/download/v${VERSION_MILLER}/miller-${VERSION_MILLER}-linux-amd64.tar.gz | tar xz --strip-components 1 -C /usr/bin miller-${VERSION_MILLER}-linux-amd64/mlr \ - && chmod +x /usr/bin/golicense /usr/bin/mlr + && wget -nv -O /usr/bin/yq https://github.com/mikefarah/yq/releases/download/v${VERSION_YQ}/yq_linux_amd64 \ + && chmod +x /usr/bin/golicense /usr/bin/mlr /usr/bin/yq COPY --from=pimo /usr/bin/pimo /usr/bin/pimo diff --git a/build.yml b/build.yml index 394b39f..af94db6 100644 --- a/build.yml +++ b/build.yml @@ -255,6 +255,12 @@ targets: - ldflags = ldflags + " -s -w" # Omit the DWARF symbol table. Omit the symbol table and debug information. - call: compile + test-int-debug: + doc: "Run all integration tests" + depends: ["info"] + steps: + - $: venom run test/suites/* + test-int: doc: "Run all integration tests" depends: ["info", "refresh", "lint", "test", "benchmark", "release"] diff --git a/test/suites/cli/metrics.yml b/test/suites/cli/metrics.yml index 4f4c5ea..108eeac 100644 --- a/test/suites/cli/metrics.yml +++ b/test/suites/cli/metrics.yml @@ -2,7 +2,243 @@ name: test metrics testcases: - name: main metric steps: - - script: |- - rimo analyse -d ../testdata/main ../testdata/main/output + - script: rimo analyse -d ../testdata/main ../testdata/main/output assertions: - result.code ShouldEqual 0 + - script: yq '.tables[0].name' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "data" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[0].name' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "bool" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[0].type' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "bool" + - 
result.code ShouldEqual 0 + - script: yq '.tables[0].columns[0].mainMetric.count' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "10" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[0].mainMetric.empty' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "4" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[0].mainMetric.nulls' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "1" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[0].mainMetric.distinct' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "2" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[0].mainMetric.samples|length' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "5" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[0].boolMetric.trueRatio' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "0.5555555555555556" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[1].name' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "numeric" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[1].type' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "numeric" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[1].mainMetric.count' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "10" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[1].mainMetric.empty' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "3" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[1].mainMetric.nulls' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "1" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[1].mainMetric.distinct' 
../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "7" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[1].mainMetric.min' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "-235" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[1].mainMetric.max' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "100" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[1].mainMetric.samples|length' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "5" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[1].numericMetric.mean' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "-13.539833097777777" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].name' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "string" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].type' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "string" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].mainMetric.count' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "10" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].mainMetric.empty' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "1" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].mainMetric.nulls' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "1" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].mainMetric.distinct' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "9" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].mainMetric.min' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "" + - result.code 
ShouldEqual 0 + - script: yq '.tables[0].columns[2].mainMetric.max' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "教育漢字" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].mainMetric.samples|length' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "5" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.minLen' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "0" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.maxLen' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "12" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.countLen' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "3" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths|length' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "3" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[0].length' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "12" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[0].freq' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "0.5" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[0].metrics.count' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "5" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[0].metrics.empty' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "0" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[0].metrics.nulls' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "0" + - 
result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[0].metrics.distinct' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "5" + - result.code ShouldEqual 0 + - script: yq -o json '.tables[0].columns[2].stringMetric.lengths[0].metrics.min' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual '"hello world "' + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[0].metrics.max' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "教育漢字" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[0].metrics.samples|length' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "5" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[1].length' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "1" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[1].freq' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "0.3" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[1].metrics.count' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "3" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[1].metrics.empty' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "0" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[1].metrics.nulls' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "0" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[1].metrics.distinct' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "3" + - result.code ShouldEqual 0 + - script: yq -o json 
'.tables[0].columns[2].stringMetric.lengths[1].metrics.min' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual '" "' + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[1].metrics.max' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "_" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[1].metrics.samples|length' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "3" + - result.code ShouldEqual 0 + + - script: yq '.tables[0].columns[2].stringMetric.lengths[2].length' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "0" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[2].freq' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "0.1" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[2].metrics.count' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "1" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[2].metrics.empty' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "1" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[2].metrics.nulls' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "0" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[2].metrics.distinct' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "1" + - result.code ShouldEqual 0 + - script: yq -o json '.tables[0].columns[2].stringMetric.lengths[2].metrics.min' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual '""' + - result.code ShouldEqual 0 + - script: yq -o json '.tables[0].columns[2].stringMetric.lengths[2].metrics.max' 
../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual '""' + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[2].metrics.samples|length' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "1" + - result.code ShouldEqual 0 diff --git a/test/suites/testdata/main/data.jsonl b/test/suites/testdata/main/data.jsonl index 869be65..260040e 100644 --- a/test/suites/testdata/main/data.jsonl +++ b/test/suites/testdata/main/data.jsonl @@ -1,10 +1,10 @@ {"string":"","bool":true,"numeric":0} {"string":" ","bool":false,"numeric":1} -{"string":"__","bool":true,"numeric":3.1415} -{"string":"new\nline","bool":false,"numeric":1.0E+2} -{"string":"hello world","bool":true,"numeric":21.2e-7} -{"string":"tabs\t","bool":false,"numeric":-235} +{"string":"_","bool":true,"numeric":3.1415} +{"string":"new\nline ","bool":false,"numeric":1.0E+2} +{"string":"hello world ","bool":true,"numeric":21.2e-7} +{"string":"tabs\t ","bool":false,"numeric":-235} {"string":"教育漢字","bool":true,"numeric":-0} {"string":"\\","bool":false,"numeric":0.0} -{"string":"\u20ac","bool":true,"numeric":9.0} +{"string":"\u20ac ","bool":true,"numeric":9.0} {"string":null,"bool":null,"numeric":null} diff --git a/test/suites/testdata/main/output/.gitkeep b/test/suites/testdata/main/output/.gitkeep new file mode 100644 index 0000000..e69de29 From 4903622755665d19762a6e602758c438a6523a29 Mon Sep 17 00:00:00 2001 From: Adrien Aury <44274230+adrienaury@users.noreply.github.com> Date: Mon, 4 Dec 2023 16:57:54 +0000 Subject: [PATCH 25/26] chore: add yq in ci --- .devcontainer/Dockerfile.ci | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.devcontainer/Dockerfile.ci b/.devcontainer/Dockerfile.ci index 0e547cd..f679260 100644 --- a/.devcontainer/Dockerfile.ci +++ b/.devcontainer/Dockerfile.ci @@ -10,8 +10,10 @@ RUN apk add --update --progress --no-cache make gomplate ARG VERSION_GOLICENSE=0.2.0 ARG VERSION_MILLER=6.2.0 
+ARG VERSION_YQ=4.40.4 RUN wget -nv -O- https://github.com/mitchellh/golicense/releases/download/v${VERSION_GOLICENSE}/golicense_${VERSION_GOLICENSE}_linux_x86_64.tar.gz | tar xz -C /usr/bin golicense \ && wget -nv -O- https://github.com/johnkerl/miller/releases/download/v${VERSION_MILLER}/miller-${VERSION_MILLER}-linux-amd64.tar.gz | tar xz --strip-components 1 -C /usr/bin miller-${VERSION_MILLER}-linux-amd64/mlr \ - && chmod +x /usr/bin/golicense /usr/bin/mlr + && wget -nv -O /usr/bin/yq https://github.com/mikefarah/yq/releases/download/v${VERSION_YQ}/yq_linux_amd64 \ + && chmod +x /usr/bin/golicense /usr/bin/mlr /usr/bin/yq COPY --from=pimo /usr/bin/pimo /usr/bin/pimo From 74e7d4b572bf552316eb7440dfdfd9a7f3bd7243 Mon Sep 17 00:00:00 2001 From: Adrien Aury <44274230+adrienaury@users.noreply.github.com> Date: Fri, 8 Dec 2023 15:05:57 +0000 Subject: [PATCH 26/26] docs: update readme --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 06380a3..792ec12 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,13 @@ Types of changes - `Fixed` for any bug fixes. - `Security` in case of vulnerabilities. +## [0.3.0] + +- `Added` moved `min` and `max` to the main metric. +- `Added` `countNulls` to the main metric. +- `Added` all main metrics to the lengths section in string metrics. +- `Removed` `leastFrequentLen` and `mostFrequentLen`; all lengths are now listed, with the most frequent length in first position. + ## [0.2.0] - `Added` new string metrics `minLen` and `maxLen`