From 58c3e27806bef7d33dc04c5c16507a11a8c3e14a Mon Sep 17 00:00:00 2001
From: Adrien Aury <44274230+adrienaury@users.noreply.github.com>
Date: Fri, 8 Dec 2023 16:33:54 +0100
Subject: [PATCH] refactor: stateless (#33)
* refactor: wip! counter and sampler
* refactor: wip! metrics string
* refactor: wip! string len freq
* refactor: wip! string sort by freq
* refactor: wip! modelv2
* refactor: build column
* refactor: driver
* refactor: infra file reader
* refactor: fix driver
* refactor: reader v2
* refactor: reader v2
* refactor: lint + sample-size flag
* refactor: numeric metric
* refactor: stable output
* refactor: bool metric
* refactor: cleanup
* refactor: update schema
* refactor: count distinct
* refactor: bool samples
* refactor: put benchmark back
* refactor: disable 100000 lines bench
* refactor: add logs
* refactor: bench set global level warn
* refactor: test int
* chore: add yq in ci
* docs: update readme
---
.devcontainer/Dockerfile | 4 +-
.devcontainer/Dockerfile.ci | 4 +-
CHANGELOG.md | 7 +
build.yml | 6 +
cmd/rimo/main.go | 128 +++++------
go.mod | 14 +-
go.sum | 22 +-
internal/infra/fileWriter_test.go | 84 -------
internal/infra/filesReader.go | 263 +++++++++++-----------
internal/infra/filesReader_test.go | 83 -------
internal/infra/infra_test.go | 93 --------
internal/infra/loader_test.go | 39 ----
pkg/metric/analyser.go | 26 +++
pkg/metric/bool.go | 21 ++
pkg/metric/build.go | 118 ----------
pkg/metric/counter.go | 38 ++++
pkg/metric/distinct.go | 25 ++
pkg/metric/generic.go | 124 ----------
pkg/metric/generic_test.go | 128 -----------
pkg/metric/mean.go | 31 +++
pkg/metric/metric_test.go | 82 +++++++
pkg/metric/metricbool.go | 51 -----
pkg/metric/metricbool_test.go | 42 ----
pkg/metric/metricnumeric.go | 69 ------
pkg/metric/metricnumeric_test.go | 46 ----
pkg/metric/metricstring.go | 140 ------------
pkg/metric/metricstring_test.go | 60 -----
pkg/metric/minmax.go | 41 ++++
pkg/metric/numeric.go | 22 ++
pkg/metric/sampler.go | 45 ++++
pkg/metric/string.go | 96 ++++++++
pkg/metric/trueratio.go | 33 +++
pkg/metric/types.go | 7 +
pkg/model/base.go | 56 ++---
pkg/model/column.go | 49 +---
pkg/model/config.go | 7 +
pkg/model/metric.go | 66 ------
pkg/model/metrics.go | 32 +++
pkg/model/schema.go | 17 ++
pkg/model/utils.go | 123 ----------
pkg/model/utils_test.go | 66 ------
pkg/rimo/driven.go | 15 +-
pkg/rimo/driven_test.go | 108 ---------
pkg/rimo/driver.go | 263 ++++++++++++++++++++--
pkg/rimo/driver_test.go | 140 +-----------
pkg/rimo/error.go | 5 +
schema/v1/rimo.schema.json | 95 ++++----
test/suites/cli/metrics.yml | 244 ++++++++++++++++++++
test/suites/testdata/main/data.jsonl | 10 +
test/suites/testdata/main/output/.gitkeep | 0
testdata/benchmark/buildBenchData.sh | 56 ++---
51 files changed, 1374 insertions(+), 1970 deletions(-)
delete mode 100644 internal/infra/fileWriter_test.go
delete mode 100644 internal/infra/filesReader_test.go
delete mode 100644 internal/infra/infra_test.go
delete mode 100644 internal/infra/loader_test.go
create mode 100644 pkg/metric/analyser.go
create mode 100644 pkg/metric/bool.go
delete mode 100644 pkg/metric/build.go
create mode 100644 pkg/metric/counter.go
create mode 100644 pkg/metric/distinct.go
delete mode 100644 pkg/metric/generic.go
delete mode 100644 pkg/metric/generic_test.go
create mode 100644 pkg/metric/mean.go
create mode 100644 pkg/metric/metric_test.go
delete mode 100644 pkg/metric/metricbool.go
delete mode 100644 pkg/metric/metricbool_test.go
delete mode 100644 pkg/metric/metricnumeric.go
delete mode 100644 pkg/metric/metricnumeric_test.go
delete mode 100644 pkg/metric/metricstring.go
delete mode 100644 pkg/metric/metricstring_test.go
create mode 100644 pkg/metric/minmax.go
create mode 100644 pkg/metric/numeric.go
create mode 100644 pkg/metric/sampler.go
create mode 100644 pkg/metric/string.go
create mode 100644 pkg/metric/trueratio.go
create mode 100644 pkg/metric/types.go
create mode 100644 pkg/model/config.go
delete mode 100644 pkg/model/metric.go
create mode 100644 pkg/model/metrics.go
create mode 100644 pkg/model/schema.go
delete mode 100644 pkg/model/utils.go
delete mode 100644 pkg/model/utils_test.go
delete mode 100644 pkg/rimo/driven_test.go
create mode 100644 pkg/rimo/error.go
create mode 100644 test/suites/cli/metrics.yml
create mode 100644 test/suites/testdata/main/data.jsonl
create mode 100644 test/suites/testdata/main/output/.gitkeep
diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index ba5566f..24f3fe4 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -10,9 +10,11 @@ RUN apk add --update --progress --no-cache make gomplate
ARG VERSION_GOLICENSE=0.2.0
ARG VERSION_MILLER=6.2.0
+ARG VERSION_YQ=4.40.4
RUN wget -nv -O- https://github.com/mitchellh/golicense/releases/download/v${VERSION_GOLICENSE}/golicense_${VERSION_GOLICENSE}_linux_x86_64.tar.gz | tar xz -C /usr/bin golicense \
&& wget -nv -O- https://github.com/johnkerl/miller/releases/download/v${VERSION_MILLER}/miller-${VERSION_MILLER}-linux-amd64.tar.gz | tar xz --strip-components 1 -C /usr/bin miller-${VERSION_MILLER}-linux-amd64/mlr \
- && chmod +x /usr/bin/golicense /usr/bin/mlr
+ && wget -nv -O /usr/bin/yq https://github.com/mikefarah/yq/releases/download/v${VERSION_YQ}/yq_linux_amd64 \
+ && chmod +x /usr/bin/golicense /usr/bin/mlr /usr/bin/yq
COPY --from=pimo /usr/bin/pimo /usr/bin/pimo
diff --git a/.devcontainer/Dockerfile.ci b/.devcontainer/Dockerfile.ci
index 0e547cd..f679260 100644
--- a/.devcontainer/Dockerfile.ci
+++ b/.devcontainer/Dockerfile.ci
@@ -10,8 +10,10 @@ RUN apk add --update --progress --no-cache make gomplate
ARG VERSION_GOLICENSE=0.2.0
ARG VERSION_MILLER=6.2.0
+ARG VERSION_YQ=4.40.4
RUN wget -nv -O- https://github.com/mitchellh/golicense/releases/download/v${VERSION_GOLICENSE}/golicense_${VERSION_GOLICENSE}_linux_x86_64.tar.gz | tar xz -C /usr/bin golicense \
&& wget -nv -O- https://github.com/johnkerl/miller/releases/download/v${VERSION_MILLER}/miller-${VERSION_MILLER}-linux-amd64.tar.gz | tar xz --strip-components 1 -C /usr/bin miller-${VERSION_MILLER}-linux-amd64/mlr \
- && chmod +x /usr/bin/golicense /usr/bin/mlr
+ && wget -nv -O /usr/bin/yq https://github.com/mikefarah/yq/releases/download/v${VERSION_YQ}/yq_linux_amd64 \
+ && chmod +x /usr/bin/golicense /usr/bin/mlr /usr/bin/yq
COPY --from=pimo /usr/bin/pimo /usr/bin/pimo
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 06380a3..792ec12 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,13 @@ Types of changes
- `Fixed` for any bug fixes.
- `Security` in case of vulnerabilities.
+## [0.3.0]
+
+- `Added` `min` and `max` to the main metric (moved from their previous location).
+- `Added` `countNulls` to the main metric.
+- `Added` all main metrics to the lengths section in string metrics.
+- `Removed` `leastFrequentLen` and `mostFrequentLen`; all lengths are now listed, with the most frequent length in first position.
+
## [0.2.0]
- `Added` new string metrics `minLen` and `maxLen`
diff --git a/build.yml b/build.yml
index 394b39f..af94db6 100644
--- a/build.yml
+++ b/build.yml
@@ -255,6 +255,12 @@ targets:
- ldflags = ldflags + " -s -w" # Omit the DWARF symbol table. Omit the symbol table and debug information.
- call: compile
+ test-int-debug:
+ doc: "Run integration tests only (skips refresh, lint, unit tests, benchmark and release)"
+ depends: ["info"]
+ steps:
+ - $: venom run test/suites/*
+
test-int:
doc: "Run all integration tests"
depends: ["info", "refresh", "lint", "test", "benchmark", "release"]
diff --git a/cmd/rimo/main.go b/cmd/rimo/main.go
index e75290a..57bdaac 100644
--- a/cmd/rimo/main.go
+++ b/cmd/rimo/main.go
@@ -21,27 +21,39 @@ import (
"fmt"
"os"
"path/filepath"
+ "runtime"
+ "strings"
"github.com/cgi-fr/rimo/internal/infra"
"github.com/cgi-fr/rimo/pkg/model"
"github.com/cgi-fr/rimo/pkg/rimo"
+ "github.com/mattn/go-isatty"
"github.com/rs/zerolog"
"github.com/rs/zerolog/log"
"github.com/spf13/cobra"
)
-// Provisioned by ldflags.
+const DefaultSampleSize = uint(5)
+
+//nolint:gochecknoglobals
var (
- name string //nolint: gochecknoglobals
- version string //nolint: gochecknoglobals
- commit string //nolint: gochecknoglobals
- buildDate string //nolint: gochecknoglobals
- builtBy string //nolint: gochecknoglobals
+ name string // provisioned by ldflags
+ version string // provisioned by ldflags
+ commit string // provisioned by ldflags
+ buildDate string // provisioned by ldflags
+ builtBy string // provisioned by ldflags
+
+ verbosity string
+ jsonlog bool
+ debug bool
+ colormode string
+
+ sampleSize uint
+ distinct bool //nolint: gochecknoglobals
)
func main() { //nolint:funlen
- log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr}) //nolint: exhaustruct
-
+ cobra.OnInitialize(initLog)
log.Info().Msgf("%v %v (commit=%v date=%v by=%v)", name, version, commit, buildDate, builtBy)
rootCmd := &cobra.Command{ //nolint:exhaustruct
@@ -54,6 +66,12 @@ func main() { //nolint:funlen
There is NO WARRANTY, to the extent permitted by law.`, version, commit, buildDate, builtBy),
}
+ rootCmd.PersistentFlags().StringVarP(&verbosity, "verbosity", "v", "warn",
+ "set level of log verbosity : none (0), error (1), warn (2), info (3), debug (4), trace (5)")
+ rootCmd.PersistentFlags().BoolVar(&debug, "debug", false, "add debug information to logs (very slow)")
+ rootCmd.PersistentFlags().BoolVar(&jsonlog, "log-json", false, "output logs in JSON format")
+ rootCmd.PersistentFlags().StringVar(&colormode, "color", "auto", "use colors in log outputs : yes, no or auto")
+
rimoSchemaCmd := &cobra.Command{ //nolint:exhaustruct
Use: "jsonschema",
Short: "Return rimo jsonschema",
@@ -77,32 +95,21 @@ func main() { //nolint:funlen
outputDir := args[1]
// Reader
-
- inputList, err := BuildFilepathList(inputDir, ".jsonl")
- if err != nil {
- log.Fatal().Msgf("error listing files: %v", err)
- }
-
- reader, err := infra.FilesReaderFactory(inputList)
+ reader, err := infra.NewJSONLFolderReader(inputDir)
if err != nil {
log.Fatal().Msgf("error creating reader: %v", err)
}
- // Writer
- // (could be relocated to infra.FilesReader)
- baseName, _, err := infra.ExtractName(inputList[0])
- if err != nil {
- log.Fatal().Msgf("error extracting base name: %v", err)
- }
-
- outputPath := filepath.Join(outputDir, fmt.Sprintf("%s.yaml", baseName))
+ outputPath := filepath.Join(outputDir, fmt.Sprintf("%s.yaml", reader.BaseName()))
writer, err := infra.YAMLWriterFactory(outputPath)
if err != nil {
log.Fatal().Msgf("error creating writer: %v", err)
}
- err = rimo.AnalyseBase(reader, writer)
+ driver := rimo.Driver{SampleSize: sampleSize, Distinct: distinct}
+
+ err = driver.AnalyseBase(reader, writer)
if err != nil {
log.Fatal().Msgf("error generating rimo.yaml: %v", err)
}
@@ -111,6 +118,9 @@ func main() { //nolint:funlen
},
}
+ rimoAnalyseCmd.Flags().UintVar(&sampleSize, "sample-size", DefaultSampleSize, "number of sample value to collect")
+ rimoAnalyseCmd.Flags().BoolVarP(&distinct, "distinct", "d", false, "count distinct values")
+
rootCmd.AddCommand(rimoAnalyseCmd)
rootCmd.AddCommand(rimoSchemaCmd)
@@ -120,54 +130,44 @@ func main() { //nolint:funlen
}
}
-func FilesList(path string, extension string) ([]string, error) {
- pattern := filepath.Join(path, "*"+extension)
+func initLog() {
+ color := false
- files, err := filepath.Glob(pattern)
- if err != nil {
- return nil, fmt.Errorf("error listing files: %w", err)
+ switch strings.ToLower(colormode) {
+ case "auto":
+ if isatty.IsTerminal(os.Stdout.Fd()) && runtime.GOOS != "windows" {
+ color = true
+ }
+ case "yes", "true", "1", "on", "enable":
+ color = true
}
- return files, nil
-}
-
-var ErrNoFile = fmt.Errorf("no file found")
-
-func BuildFilepathList(path string, extension string) ([]string, error) {
- err := ValidateDirPath(path)
- if err != nil {
- return nil, fmt.Errorf("failed to validate input directory: %w", err)
- }
-
- pattern := filepath.Join(path, "*"+extension)
-
- files, err := filepath.Glob(pattern)
- if err != nil {
- return nil, fmt.Errorf("error listing files: %w", err)
+ if jsonlog {
+ log.Logger = zerolog.New(os.Stderr)
+ } else {
+ log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr, NoColor: !color}) //nolint:exhaustruct
}
- if len(files) == 0 {
- return nil, fmt.Errorf("%w : no %s files found in %s", ErrNoFile, extension, path)
+ if debug {
+ log.Logger = log.Logger.With().Caller().Logger()
}
- return files, nil
+ setVerbosity()
}
-func ValidateDirPath(path string) error {
- fileInfo, err := os.Stat(path)
- if os.IsNotExist(err) {
- return fmt.Errorf("%w: %s", infra.ErrDirDoesNotExist, path)
- } else if err != nil {
- return fmt.Errorf("failed to get directory info: %w", err)
+func setVerbosity() {
+ switch verbosity {
+ case "trace", "5":
+ zerolog.SetGlobalLevel(zerolog.TraceLevel)
+ case "debug", "4":
+ zerolog.SetGlobalLevel(zerolog.DebugLevel)
+ case "info", "3":
+ zerolog.SetGlobalLevel(zerolog.InfoLevel)
+ case "warn", "2":
+ zerolog.SetGlobalLevel(zerolog.WarnLevel)
+ case "error", "1":
+ zerolog.SetGlobalLevel(zerolog.ErrorLevel)
+ default:
+ zerolog.SetGlobalLevel(zerolog.Disabled)
}
-
- if !fileInfo.IsDir() {
- return fmt.Errorf("%w: %s", infra.ErrPathIsNotDir, path)
- }
-
- if fileInfo.Mode().Perm()&infra.WriteDirPerm != infra.WriteDirPerm {
- return fmt.Errorf("%w: %s", infra.ErrWriteDirPermission, path)
- }
-
- return nil
}
diff --git a/go.mod b/go.mod
index cddbdd1..a472ce0 100644
--- a/go.mod
+++ b/go.mod
@@ -3,7 +3,7 @@ module github.com/cgi-fr/rimo
go 1.20
require (
- github.com/hexops/valast v1.4.4
+ github.com/goccy/go-json v0.10.2
github.com/rs/zerolog v1.30.0
github.com/spf13/cobra v1.7.0
github.com/stretchr/testify v1.8.4
@@ -11,20 +11,20 @@ require (
gopkg.in/yaml.v3 v3.0.1
)
-require gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect
+require (
+ github.com/kr/pretty v0.3.1 // indirect
+ github.com/rogpeppe/go-internal v1.10.0 // indirect
+ gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect
+)
require (
github.com/davecgh/go-spew v1.1.1 // indirect
- github.com/google/go-cmp v0.5.9 // indirect
github.com/iancoleman/orderedmap v0.3.0 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/invopop/jsonschema v0.7.0 // direct
github.com/mattn/go-colorable v0.1.13 // indirect
- github.com/mattn/go-isatty v0.0.19 // indirect
+ github.com/mattn/go-isatty v0.0.19
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/spf13/pflag v1.0.5 // indirect
- golang.org/x/mod v0.13.0 // indirect
golang.org/x/sys v0.13.0 // indirect
- golang.org/x/tools v0.14.0 // indirect
- mvdan.cc/gofumpt v0.5.0 // indirect
)
diff --git a/go.sum b/go.sum
index 4893f5a..8577e51 100644
--- a/go.sum
+++ b/go.sum
@@ -1,16 +1,12 @@
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
+github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
-github.com/frankban/quicktest v1.14.4 h1:g2rn0vABPOOXmZUj+vbmUp0lPoXEMuhTpIluN0XL9UY=
+github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU=
+github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
-github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
-github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
-github.com/hexops/autogold v0.8.1 h1:wvyd/bAJ+Dy+DcE09BoLk6r4Fa5R5W+O+GUzmR985WM=
-github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM=
-github.com/hexops/valast v1.4.4 h1:rETyycw+/L2ZVJHHNxEBgh8KUn+87WugH9MxcEv9PGs=
-github.com/hexops/valast v1.4.4/go.mod h1:Jcy1pNH7LNraVaAZDLyv21hHg2WBv9Nf9FL6fGxU7o4=
github.com/iancoleman/orderedmap v0.0.0-20190318233801-ac98e3ecb4b0/go.mod h1:N0Wam8K1arqPXNWjMo21EXnBPOPp36vB07FNRdD2geA=
github.com/iancoleman/orderedmap v0.3.0 h1:5cbR2grmZR/DiVt+VJopEhtVs9YGInGIxAoMJn+Ichc=
github.com/iancoleman/orderedmap v0.3.0/go.mod h1:XuLcCUkdL5owUCQeF2Ue9uuw1EptkJDkXXS7VoV7XGE=
@@ -19,7 +15,9 @@ github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLf
github.com/invopop/jsonschema v0.7.0 h1:2vgQcBz1n256N+FpX3Jq7Y17AjYt46Ig3zIWyy770So=
github.com/invopop/jsonschema v0.7.0/go.mod h1:O9uiLokuu0+MGFlyiaqtWxwqJm41/+8Nj0lD7A36YH0=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
+github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
+github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/mattn/go-colorable v0.1.12/go.mod h1:u5H1YNBxpqRaxsYJYSkiCWKzEfiAb1Gb520KVy5xxl4=
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
@@ -27,10 +25,13 @@ github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27k
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA=
github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
+github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
+github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
github.com/rs/zerolog v1.30.0 h1:SymVODrcRsaRaSInD9yQtKbtWqwsfoPcRff/oRXLj4c=
github.com/rs/zerolog v1.30.0/go.mod h1:/tk+P47gFdPXq4QYjvCmT5/Gsug2nagsFWBWhAiSi1w=
@@ -45,21 +46,14 @@ github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcU
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
golang.org/x/exp v0.0.0-20231006140011-7918f672742d h1:jtJma62tbqLibJ5sFQz8bKtEM8rJBtfilJ2qTU199MI=
golang.org/x/exp v0.0.0-20231006140011-7918f672742d/go.mod h1:ldy0pHrwJyGW56pPQzzkH36rKxoZW1tw7ZJpeKx+hdo=
-golang.org/x/mod v0.13.0 h1:I/DsJXRlw/8l/0c24sM9yb0T4z9liZTduXvdAWYiysY=
-golang.org/x/mod v0.13.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
-golang.org/x/sync v0.4.0 h1:zxkM55ReGkDlKSM+Fu41A+zmbZuaPVbGMzvvdUPznYQ=
golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20210927094055-39ccf1dd6fa6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE=
golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/tools v0.14.0 h1:jvNa2pY0M4r62jkRQ6RwEZZyPcymeL9XZMLBbV7U2nc=
-golang.org/x/tools v0.14.0/go.mod h1:uYBEerGOWcJyEORxN+Ek8+TT266gXkNlHdJBwexUsBg=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
-mvdan.cc/gofumpt v0.5.0 h1:0EQ+Z56k8tXjj/6TQD25BFNKQXpCvT0rnansIc7Ug5E=
-mvdan.cc/gofumpt v0.5.0/go.mod h1:HBeVDtMKRZpXyxFciAirzdKklDlGu8aAy1wEbH5Y9js=
diff --git a/internal/infra/fileWriter_test.go b/internal/infra/fileWriter_test.go
deleted file mode 100644
index cd34651..0000000
--- a/internal/infra/fileWriter_test.go
+++ /dev/null
@@ -1,84 +0,0 @@
-// Copyright (C) 2023 CGI France
-//
-// This file is part of RIMO.
-//
-// RIMO is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// RIMO is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with RIMO. If not, see <https://www.gnu.org/licenses/>.
-
-package infra_test
-
-import (
- "os"
- "path/filepath"
- "testing"
-
- "github.com/cgi-fr/rimo/internal/infra"
- "github.com/cgi-fr/rimo/pkg/model"
- "github.com/stretchr/testify/assert"
- "github.com/stretchr/testify/require"
-)
-
-const (
- dataDir = "../../testdata/"
-)
-
-func TestWriterYAML(t *testing.T) {
- t.Parallel()
-
- base := model.Base{
- Name: "databaseName",
- Tables: []model.Table{
- {
- Name: "tableName",
- Columns: []model.Column{},
- },
- },
- }
-
- // Create a temporary directory for the test
- tempDir, err := os.MkdirTemp(dataDir, "export_test")
- require.NoError(t, err)
-
- defer os.RemoveAll(tempDir)
-
- // Create a temporary file for the output
- outputFile := filepath.Join(tempDir, "output.yaml")
-
- // Create the writer
- writer, err := infra.YAMLWriterFactory(outputFile)
- require.NoError(t, err)
-
- err = writer.Export(&base)
- require.NoError(t, err)
-
- // Read the output file and check its contents
- file, err := os.Open(outputFile)
- require.NoError(t, err)
-
- defer file.Close()
-
- stat, err := file.Stat()
- require.NoError(t, err)
-
- outputData := make([]byte, stat.Size())
- _, err = file.Read(outputData)
- require.NoError(t, err)
-
- expectedData := `database: databaseName
-tables:
- - name: tableName
- columns: []
-`
-
- assert.Equal(t, expectedData, string(outputData))
-}
diff --git a/internal/infra/filesReader.go b/internal/infra/filesReader.go
index 8430ae9..9503edd 100644
--- a/internal/infra/filesReader.go
+++ b/internal/infra/filesReader.go
@@ -1,173 +1,180 @@
-// Copyright (C) 2023 CGI France
-//
-// This file is part of RIMO.
-//
-// RIMO is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// RIMO is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with RIMO. If not, see <https://www.gnu.org/licenses/>.
-
package infra
import (
"errors"
"fmt"
+ "os"
+ "path"
+ "path/filepath"
+ "strings"
+
+ "github.com/cgi-fr/rimo/pkg/rimo"
+ "github.com/goccy/go-json"
+ "github.com/rs/zerolog/log"
)
-// Errors declaration.
-var (
- ErrInvalidFilePath = errors.New("failed to validate path")
- ErrNoFilePath = errors.New("no file path provided")
- ErrNonUniqueBase = errors.New("base name is not unique")
-)
+var ErrReadFile = errors.New("error while reading file")
+
+type JSONLFolderReader struct {
+ basename string
+ readers []*JSONLFileReader
+ current int
+}
+
+func NewJSONLFolderReader(folderpath string) (*JSONLFolderReader, error) {
+ log.Trace().Str("path", folderpath).Msg("reading folder")
+
+ basename := path.Base(folderpath)
+
+ pattern := filepath.Join(folderpath, "*.jsonl")
-// FilesReader can read multiple type of file and feed data to rimo.
-// FilesReader is responsible of :
-// - BaseName() return the name of the base
-// - Next() return true if there is a next value to read
-// - Value() return the value of the current column, the name of the column and the name of the table
-// Interface itself with a Loader interface. Which currently only supports YAML files.
-// Loader and FilesReader can be initialized with LoaderFactory and FilesReaderFactory.
-type FilesReader struct {
- filepathList []string
- loader JSONLinesLoader // responsible of loading a file format
- baseName string
- // variable for looping over columns
- fileIndex int
- colNameMapIndex map[int]string // map of column name by index
- colIndex int // value of current column index
- // given by Value()
- dataMap map[string][]interface{}
- tableName string // filled by FilesReader
-}
-
-// Constructor for FilesReader.
-func FilesReaderFactory(filepathList []string) (*FilesReader, error) {
- var err error
-
- // Process inputDirList
- if len(filepathList) == 0 {
- return nil, ErrNoFilePath
+ files, err := filepath.Glob(pattern)
+ if err != nil {
+ return nil, fmt.Errorf("error listing files: %w", err)
}
- for _, path := range filepathList {
- err := ValidateFilePath(path)
+ readers := make([]*JSONLFileReader, len(files))
+
+ for index, filepath := range files {
+ log.Trace().Str("path", filepath).Msg("scanning file")
+
+ readers[index], err = NewJSONLFileReader(basename, filepath)
if err != nil {
- return nil, ErrInvalidFilePath
+ return nil, fmt.Errorf("error opening files: %w", err)
}
}
- // Initialize FilesReader
- var filesReader FilesReader
- filesReader.filepathList = filepathList
- filesReader.fileIndex = -1
+ return &JSONLFolderReader{
+ basename: basename,
+ readers: readers,
+ current: 0,
+ }, nil
+}
- filesReader.baseName, err = filesReader.isBaseUnique()
- if err != nil {
- return nil, fmt.Errorf("base is not unique: %w", err)
- }
+func (r *JSONLFolderReader) BaseName() string {
+ return r.basename
+}
+
+func (r *JSONLFolderReader) Next() bool {
+ if r.current < len(r.readers) && !r.readers[r.current].Next() {
+ r.current++
- // Use of JSONLinesLoader
- filesReader.loader = JSONLinesLoader{}
+ return r.Next()
+ }
- return &filesReader, nil
+ return r.current < len(r.readers)
}
-// Reader interface implementation
+func (r *JSONLFolderReader) Col() (rimo.ColReader, error) { //nolint:ireturn
+ return r.readers[r.current].Col()
+}
-func (r *FilesReader) BaseName() string {
- return r.baseName
+type JSONLFileReader struct {
+ tablename string
+ source *os.File
+ columns []string
+ current int
+ decoder *json.Decoder
+ basename string
}
-func (r *FilesReader) Next() bool {
- // First call to Next()
- if r.fileIndex == -1 {
- r.fileIndex = 0
- r.colIndex = 0
+func NewJSONLFileReader(basename string, filepath string) (*JSONLFileReader, error) {
+ log.Trace().Str("path", filepath).Msg("opening file")
- return true
+ source, err := os.Open(filepath)
+ if err != nil {
+ return nil, fmt.Errorf("%w", err)
}
- // Current file contain column left to process.
- if r.colIndex < len(r.dataMap) {
- r.colIndex++
+ template := map[string]any{}
+
+ log.Trace().Str("path", filepath).Msg("decoding line template")
+
+ decoder := json.NewDecoder(source)
+ if err := decoder.Decode(&template); err != nil {
+ return nil, fmt.Errorf("%w: %w", ErrReadFile, err)
}
- // Current file contain no columns left to process.
- if r.colIndex == len(r.dataMap) {
- // Current file is last file.
- if r.fileIndex == len(r.filepathList)-1 {
- return false
- }
- // There is a next file.
- r.fileIndex++
- r.colIndex = 0
+ log.Trace().Str("path", filepath).Any("template", template).Msg("decoded line template")
+
+ if _, err := source.Seek(0, 0); err != nil {
+ return nil, fmt.Errorf("%w: %w", ErrReadFile, err)
}
- return true
-}
+ columns := make([]string, 0, len(template))
-// Charger les fichiers un à un dans une dataMap.
-// Retourne les valeurs d'une colonne, son nom et le nom de table.
-func (r *FilesReader) Value() ([]interface{}, string, string, error) {
- var err error
+ for column := range template {
+ log.Trace().Str("path", filepath).Any("column", column).Msg("registering column")
- // colIndex = 0 : new file to load
- if r.colIndex == 0 {
- filepath := r.filepathList[r.fileIndex]
+ columns = append(columns, column)
+ }
- // Extract table name from file name
- _, r.tableName, err = ExtractName(filepath)
- if err != nil {
- return nil, "", "", fmt.Errorf("failed to extract table name: %w", err)
- }
+ return &JSONLFileReader{
+ tablename: strings.TrimSuffix(path.Base(filepath), path.Ext(filepath)),
+ source: source,
+ columns: columns,
+ current: -1,
+ decoder: json.NewDecoder(source),
+ basename: basename,
+ }, nil
+}
- // Load file in dataMap
- r.dataMap, err = r.loader.Load(r.filepathList[r.fileIndex])
- if err != nil {
- panic(err)
- }
+func (fr *JSONLFileReader) BaseName() string {
+ return fr.basename
+}
- // Create a map of column name by index
- r.colNameMapIndex = make(map[int]string, 0)
- i := 0
+func (fr *JSONLFileReader) Next() bool {
+ fr.current++
- for k := range r.dataMap {
- r.colNameMapIndex[i] = k
- i++
- }
+ if _, err := fr.source.Seek(0, 0); err != nil {
+ panic(err)
}
- // colIndex = n : current file have been partially processed
- currentColName := r.colNameMapIndex[r.colIndex]
- // return values, colName, tableName
- return r.dataMap[currentColName], currentColName, r.tableName, nil
+ fr.decoder = json.NewDecoder(fr.source)
+
+ log.Trace().Str("base", fr.basename).Any("index", fr.current).Msg("successful jump to next column")
+
+ return fr.current < len(fr.columns)
}
-func (r *FilesReader) isBaseUnique() (string, error) {
- baseName, _, err := ExtractName(r.filepathList[0])
- if err != nil {
- return "", err
+func (fr *JSONLFileReader) Col() (rimo.ColReader, error) { //nolint:ireturn
+ return NewJSONLColReader(fr.tablename, fr.columns[fr.current], fr.decoder), nil
+}
+
+type JSONLColReader struct {
+ table string
+ column string
+ decoder *json.Decoder
+}
+
+func NewJSONLColReader(table, column string, decoder *json.Decoder) *JSONLColReader {
+ return &JSONLColReader{
+ table: table,
+ column: column,
+ decoder: decoder,
}
+}
- for _, path := range r.filepathList {
- baseNameI, _, err := ExtractName(path)
- if err != nil {
- return "", err
- }
+func (cr *JSONLColReader) ColName() string {
+ return cr.column
+}
- if baseName != baseNameI {
- return "", fmt.Errorf("%w : %s and %s", ErrNonUniqueBase, baseName, baseNameI)
- }
+func (cr *JSONLColReader) TableName() string {
+ return cr.table
+}
+
+func (cr *JSONLColReader) Next() bool {
+ return cr.decoder.More()
+}
+
+func (cr *JSONLColReader) Value() (any, error) {
+ row := map[string]any{}
+
+ if err := cr.decoder.Decode(&row); err != nil {
+ return nil, fmt.Errorf("%w: %w", ErrReadFile, err)
}
- return baseName, nil
+ log.Trace().Str("table", cr.table).Str("column", cr.column).Any("value", row[cr.column]).Msg("read value")
+
+ return row[cr.column], nil
}
diff --git a/internal/infra/filesReader_test.go b/internal/infra/filesReader_test.go
deleted file mode 100644
index ed6cdc0..0000000
--- a/internal/infra/filesReader_test.go
+++ /dev/null
@@ -1,83 +0,0 @@
-// Copyright (C) 2023 CGI France
-//
-// This file is part of RIMO.
-//
-// RIMO is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// RIMO is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with RIMO. If not, see <http://www.gnu.org/licenses/>.
-
-package infra_test
-
-import (
- "fmt"
- "path/filepath"
- "testing"
-
- "github.com/cgi-fr/rimo/internal/infra"
- "github.com/stretchr/testify/assert"
-)
-
-func TestReader(t *testing.T) {
- t.Parallel()
-
- inputFile := filepath.Join(dataDir, "data0/data_input.jsonl")
-
- reader, err := infra.FilesReaderFactory([]string{inputFile})
- assert.NoError(t, err)
-
- // Assertions.
-
- actualBaseName := reader.BaseName()
- expectedBaseName := "data"
- assert.Equal(t, expectedBaseName, actualBaseName)
-
- expectedTableName := "input"
- expectedDataMap := map[string][]interface{}{
- "address": {"PSC", "095", "06210"},
- "age": {nil, nil, float64(61)},
- "major": {true, false, true},
- "empty": {nil, nil, nil},
- }
-
- for reader.Next() {
- values, colName, tableName, err := reader.Value()
- if err != nil {
- assert.NoError(t, err)
- }
-
- expectedColData, ok := expectedDataMap[colName]
- if !ok {
- assert.Fail(t, "column name not found : %s", colName)
- }
-
- assert.Equal(t, expectedColData, values)
- assert.Equal(t, expectedTableName, tableName)
- }
-}
-
-func TestReaderMultipleFiles(t *testing.T) {
- t.Parallel()
-
- inputFile := filepath.Join(dataDir, "data0/data_input.jsonl")
- inputFile2 := filepath.Join(dataDir, "data0/data_input2.jsonl")
- reader, err := infra.FilesReaderFactory([]string{inputFile, inputFile2})
- assert.NoError(t, err)
-
- for reader.Next() {
- values, colName, tableName, err := reader.Value()
- if err != nil {
- assert.NoError(t, err)
- }
-
- fmt.Printf("%s.%s: %v\n", tableName, colName, values)
- }
-}
diff --git a/internal/infra/infra_test.go b/internal/infra/infra_test.go
deleted file mode 100644
index ad40f79..0000000
--- a/internal/infra/infra_test.go
+++ /dev/null
@@ -1,93 +0,0 @@
-// Copyright (C) 2023 CGI France
-//
-// This file is part of RIMO.
-//
-// RIMO is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// RIMO is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with RIMO. If not, see <http://www.gnu.org/licenses/>.
-
-package infra_test
-
-import (
- "path/filepath"
- "testing"
-
- "github.com/cgi-fr/rimo/internal/infra"
- "github.com/cgi-fr/rimo/pkg/rimo"
- "github.com/stretchr/testify/require"
-)
-
-const (
- testdataDir = "../../testdata/"
-)
-
-// Test RIMO pipeline with FilesReader, JSONLinesLoader and YAMLWriter.
-func TestPipeline(t *testing.T) {
- t.Parallel()
-
- inputPath := filepath.Join(testdataDir, "data1/data_input.jsonl")
-
- reader, err := infra.FilesReaderFactory([]string{inputPath})
- require.NoError(t, err)
-
- writer := infra.StdoutWriterFactory()
-
- err = rimo.AnalyseBase(reader, writer)
- require.NoError(t, err)
-}
-
-// var (
-// Readers []*rimo.Reader
-// Writers []*rimo.Writer
-// )
-
-// // List of implemented readers and writers.
-// func GetReaders(filepathList []string) []*rimo.Reader {
-// filesReader, err := infra.FilesReaderFactory(filepathList)
-// if err != nil {
-// panic(err)
-// }
-
-// Readers = []*rimo.Reader{filesReader}
-
-// return Readers
-// }
-
-// func GetWriters() []*rimo.Writer {
-// yamlWriter := infra.YAMLWriterFactory("../../testdata/data1/data_output.yaml")
-
-// Writers = []*rimo.Writer{yamlWriter, infra.StdoutWriter{}}
-
-// return Writers
-// }
-
-// func TestInterface(t *testing.T) {
-// t.Parallel()
-
-// Writers = GetWriters()
-// Readers = GetReaders([]string{"../../testdata/data1/data_input.jsonl"})
-// // Assert that all readers and writers implement the Reader and Writer interfaces.
-// for _, reader := range Readers {
-// var _ rimo.Reader = (reader)(nil)
-// }
-// for _, writer := range Writers {
-// var _ rimo.Reader = (writer)(nil)
-// }
-
-// // Assert that all combinations of readers and writers can be used in the pipeline.
-// for _, reader := range Readers {
-// for _, writer := range Writers {
-// err := rimo.AnalyseBase(reader, writer)
-// require.NoError(t, err)
-// }
-// }
-// }
diff --git a/internal/infra/loader_test.go b/internal/infra/loader_test.go
deleted file mode 100644
index d55186d..0000000
--- a/internal/infra/loader_test.go
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (C) 2023 CGI France
-//
-// This file is part of RIMO.
-//
-// RIMO is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// RIMO is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with RIMO. If not, see <http://www.gnu.org/licenses/>.
-
-package infra_test
-
-import (
- "fmt"
- "path/filepath"
- "testing"
-
- "github.com/cgi-fr/rimo/internal/infra"
- "github.com/stretchr/testify/require"
-)
-
-func TestLoaderJSONL(t *testing.T) {
- t.Parallel()
-
- path := filepath.Join(testdataDir, "data1/data_input.jsonl")
-
- LoaderJSONL := infra.JSONLinesLoader{}
-
- data, err := LoaderJSONL.Load(path)
- require.NoError(t, err)
- fmt.Printf("dataMap: %v\n", data)
-}
diff --git a/pkg/metric/analyser.go b/pkg/metric/analyser.go
new file mode 100644
index 0000000..9d1d936
--- /dev/null
+++ b/pkg/metric/analyser.go
@@ -0,0 +1,26 @@
+package metric
+
+import (
+ "github.com/cgi-fr/rimo/pkg/model"
+)
+
+type Analyser[T Accepted] interface {
+ Read(*T)
+ Build(*model.Column)
+}
+
+type Multi[T Accepted] struct {
+ analyser []Analyser[T]
+}
+
+func (m Multi[T]) Read(value *T) {
+ for _, a := range m.analyser {
+ a.Read(value)
+ }
+}
+
+func (m Multi[T]) Build(metric *model.Column) {
+ for _, a := range m.analyser {
+ a.Build(metric)
+ }
+}
diff --git a/pkg/metric/bool.go b/pkg/metric/bool.go
new file mode 100644
index 0000000..47fa462
--- /dev/null
+++ b/pkg/metric/bool.go
@@ -0,0 +1,21 @@
+package metric
+
+type Bool struct {
+ Multi[bool]
+}
+
+func NewBool(sampleSize uint, countDistinct bool) *Bool {
+ mainAnalyser := []Analyser[bool]{
+ NewCounter[bool](), // count total, count null, count empty
+ NewSampler[bool](sampleSize), // store few samples
+ NewTrueRatio(), // calculate true ratio
+ }
+
+ if countDistinct {
+ mainAnalyser = append(mainAnalyser, NewDistinct[bool]())
+ }
+
+ return &Bool{
+ Multi: Multi[bool]{mainAnalyser},
+ }
+}
diff --git a/pkg/metric/build.go b/pkg/metric/build.go
deleted file mode 100644
index 058c539..0000000
--- a/pkg/metric/build.go
+++ /dev/null
@@ -1,118 +0,0 @@
-// Copyright (C) 2023 CGI France
-//
-// This file is part of RIMO.
-//
-// RIMO is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// RIMO is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with RIMO. If not, see <http://www.gnu.org/licenses/>.
-
-package metric
-
-import (
- "encoding/json"
- "errors"
- "fmt"
-
- "github.com/cgi-fr/rimo/pkg/model"
-)
-
-var ErrValueType = errors.New("value type error")
-
-// Return a model.Column.
-func ComputeMetric(colName string, values []interface{}) (model.Column, error) {
- var confidential *bool = nil //nolint
-
- // Create the column.
- col := model.Column{
- Name: colName,
- Type: GetColType(values),
- Concept: "",
- Constraint: []string{},
- Confidential: confidential,
- MainMetric: model.GenericMetric{}, //nolint:exhaustruct
- StringMetric: model.StringMetric{}, //nolint:exhaustruct
- NumericMetric: model.NumericMetric{}, //nolint:exhaustruct
- BoolMetric: model.BoolMetric{}, //nolint:exhaustruct
- }
-
- // Generic metric
- err := SetGenericMetric(values, &col.MainMetric)
- if err != nil {
- return model.Column{}, fmt.Errorf("error computing generic metric in column %v : %w", col.Name, err)
- }
-
- // Type specific metric
- switch col.Type {
- case model.ColType.String:
- err := SetStringMetric(values, &col.StringMetric)
- if err != nil {
- return model.Column{}, fmt.Errorf("error computing string metric in column %v : %w", col.Name, err)
- }
-
- case model.ColType.Numeric:
- err := SetNumericMetric(values, &col.NumericMetric)
- if err != nil {
- return model.Column{}, fmt.Errorf("error computing numeric metric in column %v : %w", col.Name, err)
- }
-
- case model.ColType.Bool:
- err := SetBoolMetric(values, &col.BoolMetric)
- if err != nil {
- return model.Column{}, fmt.Errorf("error computing bool metric in column %v : %w", col.Name, err)
- }
- }
-
- return col, nil
-}
-
-func GetColType(values []interface{}) model.ValueType {
- colType := model.ColType.Undefined
- for i := 0; i < len(values) && colType == model.ColType.Undefined; i++ {
- colType = ColType(values[i])
- }
-
- return colType
-}
-
-// Utils functions.
-
-func GetFrequency(occurrence int, count int) float64 {
- return float64(occurrence) / float64(count)
-}
-
-// To check why not using isNil() ?
-func GetFirstValue(values []interface{}) interface{} {
- for _, value := range values {
- if value != nil {
- return value
- }
- }
-
- return nil
-}
-
-func ColType(value interface{}) model.ValueType {
- switch value.(type) {
- case int:
- return model.ColType.Numeric
- case float64:
- return model.ColType.Numeric
- case json.Number:
- return model.ColType.Numeric
- case string:
- return model.ColType.String
- case bool:
- return model.ColType.Bool
- default:
- return model.ColType.Undefined
- }
-}
diff --git a/pkg/metric/counter.go b/pkg/metric/counter.go
new file mode 100644
index 0000000..ede347a
--- /dev/null
+++ b/pkg/metric/counter.go
@@ -0,0 +1,38 @@
+package metric
+
+import (
+ "github.com/cgi-fr/rimo/pkg/model"
+)
+
+type Counter[T Accepted] struct {
+ countTotal uint
+ countNulls uint
+ countEmpty uint
+ zero T
+}
+
+func NewCounter[T Accepted]() *Counter[T] {
+ return &Counter[T]{
+ countTotal: 0,
+ countNulls: 0,
+ countEmpty: 0,
+ zero: *new(T),
+ }
+}
+
+func (c *Counter[T]) Read(value *T) {
+ c.countTotal++
+
+ switch {
+ case value == nil:
+ c.countNulls++
+ case *value == c.zero:
+ c.countEmpty++
+ }
+}
+
+func (c *Counter[T]) Build(metric *model.Column) {
+ metric.MainMetric.Count = c.countTotal
+ metric.MainMetric.Null = c.countNulls
+ metric.MainMetric.Empty = c.countEmpty
+}
diff --git a/pkg/metric/distinct.go b/pkg/metric/distinct.go
new file mode 100644
index 0000000..c01e7b0
--- /dev/null
+++ b/pkg/metric/distinct.go
@@ -0,0 +1,25 @@
+package metric
+
+import (
+ "github.com/cgi-fr/rimo/pkg/model"
+)
+
+type Distinct[T Accepted] struct {
+ values map[T]int
+}
+
+func NewDistinct[T Accepted]() *Distinct[T] {
+ return &Distinct[T]{
+ values: make(map[T]int, 1024), //nolint:gomnd
+ }
+}
+
+func (a *Distinct[T]) Read(value *T) {
+ if value != nil {
+ a.values[*value] = 0
+ }
+}
+
+func (a *Distinct[T]) Build(metric *model.Column) {
+ metric.MainMetric.Distinct = uint(len(a.values))
+}
diff --git a/pkg/metric/generic.go b/pkg/metric/generic.go
deleted file mode 100644
index ac85bbd..0000000
--- a/pkg/metric/generic.go
+++ /dev/null
@@ -1,124 +0,0 @@
-// Copyright (C) 2023 CGI France
-//
-// This file is part of RIMO.
-//
-// RIMO is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// RIMO is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with RIMO. If not, see <http://www.gnu.org/licenses/>.
-
-package metric
-
-import (
- "errors"
- "fmt"
- "math/rand"
-
- "github.com/cgi-fr/rimo/pkg/model"
- "golang.org/x/exp/constraints"
-)
-
-var ErrEmptySlice = errors.New("slice is empty")
-
-func SetGenericMetric(values []interface{}, metric *model.GenericMetric) error {
- sample, err := Sample(values, model.SampleSize)
- if err != nil {
- return fmt.Errorf("error computing sample: %w", err)
- }
-
- metric.Count = len(values)
- metric.Unique = CountUnique(values)
- metric.Empty = CountEmpty(values)
- metric.Sample = sample
-
- return nil
-}
-
-func CountEmpty[T comparable](values []T) int {
- empty := 0
-
- for _, value := range values {
- if isNil(value) {
- empty++
- }
- }
-
- return empty
-}
-
-// Return a sample of size sampleSize from values.
-func Sample[T comparable](values []T, sampleSize int) ([]T, error) {
- uniqueValues := Unique(values)
-
- if sampleSize >= len(uniqueValues) {
- return uniqueValues, nil
- }
-
- sample := make([]T, sampleSize)
- for i := 0; i < sampleSize; i++ {
- sample[i] = uniqueValues[rand.Intn(len(uniqueValues)-1)] //nolint:gosec
- }
-
- return sample, nil
-}
-
-func CountUnique[T comparable](values []T) int {
- unique := make(map[T]bool)
-
- for _, value := range values {
- if isNil(value) {
- continue
- }
-
- unique[value] = true
- }
-
- return len(unique)
-}
-
-func Unique[T comparable](values []T) []T {
- unique := make(map[T]bool)
-
- for _, value := range values {
- if isNil(value) {
- continue
- }
-
- unique[value] = true
- }
-
- uniqueValues := make([]T, 0, len(unique))
- for value := range unique {
- uniqueValues = append(uniqueValues, value)
- }
-
- return uniqueValues
-}
-
-func isNil[T comparable](v T) bool {
- return v == *new(T)
-}
-
-func min[T constraints.Ordered](a, b T) T {
- if a < b {
- return a
- }
-
- return b
-}
-
-func max[T constraints.Ordered](a, b T) T {
- if a > b {
- return a
- }
-
- return b
-}
diff --git a/pkg/metric/generic_test.go b/pkg/metric/generic_test.go
deleted file mode 100644
index 4af5071..0000000
--- a/pkg/metric/generic_test.go
+++ /dev/null
@@ -1,128 +0,0 @@
-// Copyright (C) 2023 CGI France
-//
-// This file is part of RIMO.
-//
-// RIMO is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// RIMO is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with RIMO. If not, see <http://www.gnu.org/licenses/>.
-
-package metric_test
-
-import (
- "testing"
-
- "github.com/cgi-fr/rimo/pkg/metric"
- "github.com/cgi-fr/rimo/pkg/model"
- "github.com/stretchr/testify/assert"
- "github.com/stretchr/testify/require"
-)
-
-func TestCountEmpty(t *testing.T) {
- t.Parallel()
-
- slice := []interface{}{1, 2, 3, nil}
- expected := 1
- actual := metric.CountEmpty(slice)
-
- assert.Equal(t, expected, actual)
-}
-
-func TestGetColType(t *testing.T) {
- t.Parallel()
-
- t.Run("numeric", func(t *testing.T) {
- t.Parallel()
-
- slice := []interface{}{nil, 2, 3}
- expected := model.ColType.Numeric
-
- actual := metric.GetColType(slice)
- require.Equal(t, expected, actual)
- })
-
- t.Run("string", func(t *testing.T) {
- t.Parallel()
-
- slice := []interface{}{nil, "text", nil}
- expected := model.ColType.String
-
- actual := metric.GetColType(slice)
- require.Equal(t, expected, actual)
- })
-
- t.Run("boolean", func(t *testing.T) {
- t.Parallel()
-
- slice := []interface{}{nil, true, false}
- expected := model.ColType.Bool
-
- actual := metric.GetColType(slice)
- require.Equal(t, expected, actual)
- })
-
- // Treat this case as error would imply to type assert each element of the slice when Loading.
- t.Run("mixed", func(t *testing.T) {
- t.Parallel()
-
- slice := []interface{}{"text", 2, false}
- expected := model.ColType.String
-
- actual := metric.GetColType(slice)
- require.Equal(t, expected, actual)
- })
-
- t.Run("unknown", func(t *testing.T) {
- t.Parallel()
-
- slice := []interface{}{nil, nil, nil}
- expected := model.ColType.Undefined
-
- actual := metric.GetColType(slice)
- require.Equal(t, expected, actual)
- })
-}
-
-// Implementation questions :
-// should Unique() append nil element ?
-// should CountUnique() count nil as a unique value ?
-
-func TestUnique(t *testing.T) {
- t.Parallel()
-
- values := []interface{}{1, 1, 2, 3, nil}
- expected := []interface{}{1, 2, 3}
- actual := metric.Unique(values)
-
- assert.ElementsMatch(t, expected, actual)
-}
-
-func TestCountUnique(t *testing.T) {
- t.Parallel()
-
- values := []interface{}{1, 1, 2, 3, nil}
- expected := 3
- actual := metric.CountUnique(values)
-
- assert.Equal(t, expected, actual)
-}
-
-func TestSample(t *testing.T) {
- t.Parallel()
-
- values := []interface{}{1, 2, 3, nil, 5, 6}
- actualOutput, _ := metric.Sample(values, 5)
-
- assert.Len(t, actualOutput, 5)
-
- actualOutput, _ = metric.Sample(values, 10)
- assert.Len(t, actualOutput, 5)
-}
diff --git a/pkg/metric/mean.go b/pkg/metric/mean.go
new file mode 100644
index 0000000..35137cc
--- /dev/null
+++ b/pkg/metric/mean.go
@@ -0,0 +1,31 @@
+package metric
+
+import "github.com/cgi-fr/rimo/pkg/model"
+
+type Mean struct {
+ count uint
+ mean float64
+}
+
+func NewMean() *Mean {
+ return &Mean{
+ count: 0,
+ mean: 0,
+ }
+}
+
+func (a *Mean) Read(value *float64) {
+ if value == nil {
+ return
+ }
+
+ a.count++
+
+ a.mean += (*value - a.mean) / float64(a.count)
+}
+
+func (a *Mean) Build(metric *model.Column) {
+ metric.NumericMetric = &model.Numeric{
+ Mean: a.mean,
+ }
+}
diff --git a/pkg/metric/metric_test.go b/pkg/metric/metric_test.go
new file mode 100644
index 0000000..3bad918
--- /dev/null
+++ b/pkg/metric/metric_test.go
@@ -0,0 +1,82 @@
+package metric_test
+
+import (
+ "testing"
+
+ "github.com/cgi-fr/rimo/pkg/metric"
+ "github.com/cgi-fr/rimo/pkg/model"
+ "github.com/stretchr/testify/assert"
+)
+
+// Ensure that 1. frequency is correct, 2. order is correct, 3. ties are broken by length.
+func TestStringMetric(t *testing.T) { //nolint:funlen
+ t.Parallel()
+
+ text := []string{"1", "1", "1", "1", "22", "22", "22", "331", "332", "4441", ""}
+
+ min := ""
+ max := "4441"
+
+ expectedMetric := model.Column{ //nolint:exhaustruct
+ MainMetric: model.Generic{
+ Count: 12,
+ Empty: 1,
+ Null: 1,
+ Distinct: 6,
+ Samples: []any{"22"},
+ Min: &min,
+ Max: &max,
+ },
+ StringMetric: &model.String{
+ MinLen: 0,
+ MaxLen: 4,
+ CountLen: 5,
+ Lengths: []model.StringLen{
+ {
+ Length: 1,
+ Freq: 0.3333333333333333,
+ Metrics: model.Generic{
+ Count: 4,
+ Empty: 0,
+ Null: 0,
+ Distinct: 1,
+ Min: &text[0],
+ Max: &text[0],
+ Samples: []any{"1", "1", "1", "1"},
+ },
+ },
+ },
+ },
+ }
+
+ actualMetric := model.Column{} //nolint:exhaustruct
+
+ analyser := metric.NewString(5, true)
+ for index := range text {
+ analyser.Read(&text[index])
+ }
+
+ analyser.Read(nil)
+
+ analyser.Build(&actualMetric)
+
+ // out, err := yaml.Marshal(actualMetric)
+ // assert.NoError(t, err)
+ // fmt.Println(string(out))
+
+ assert.Equal(t, expectedMetric.MainMetric.Count, actualMetric.MainMetric.Count)
+ assert.Equal(t, expectedMetric.MainMetric.Empty, actualMetric.MainMetric.Empty)
+ assert.Equal(t, expectedMetric.MainMetric.Null, actualMetric.MainMetric.Null)
+ assert.Equal(t, expectedMetric.MainMetric.Distinct, actualMetric.MainMetric.Distinct)
+ assert.Equal(t, expectedMetric.MainMetric.Min, actualMetric.MainMetric.Min)
+ assert.Equal(t, expectedMetric.MainMetric.Max, actualMetric.MainMetric.Max)
+ assert.Equal(t, expectedMetric.StringMetric.MinLen, actualMetric.StringMetric.MinLen)
+ assert.Equal(t, expectedMetric.StringMetric.MaxLen, actualMetric.StringMetric.MaxLen)
+ assert.Equal(t, expectedMetric.StringMetric.CountLen, actualMetric.StringMetric.CountLen)
+
+ for i := 0; i < len(expectedMetric.StringMetric.Lengths); i++ {
+ assert.Equal(t, expectedMetric.StringMetric.Lengths[i].Length, actualMetric.StringMetric.Lengths[i].Length)
+ assert.Equal(t, expectedMetric.StringMetric.Lengths[i].Freq, actualMetric.StringMetric.Lengths[i].Freq)
+ assert.Equal(t, expectedMetric.StringMetric.Lengths[i].Metrics.Samples, actualMetric.StringMetric.Lengths[i].Metrics.Samples) //nolint:lll
+ }
+}
diff --git a/pkg/metric/metricbool.go b/pkg/metric/metricbool.go
deleted file mode 100644
index a3a38ed..0000000
--- a/pkg/metric/metricbool.go
+++ /dev/null
@@ -1,51 +0,0 @@
-// Copyright (C) 2023 CGI France
-//
-// This file is part of RIMO.
-//
-// rimo is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// rimo is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with rimo. If not, see <http://www.gnu.org/licenses/>.
-
-package metric
-
-import (
- "fmt"
-
- "github.com/cgi-fr/rimo/pkg/model"
-)
-
-// Bool metric : TrueRatio.
-func SetBoolMetric(values []interface{}, metric *model.BoolMetric) error {
- nullCount := 0
- trueCount := 0
-
- for _, value := range values {
- if value == nil {
- nullCount++
-
- continue
- }
-
- boolValue, ok := value.(bool)
- if !ok {
- return fmt.Errorf("%w : expected numeric found %T: %v", ErrValueType, value, value)
- }
-
- if boolValue {
- trueCount++
- }
- }
-
- metric.TrueRatio = GetFrequency(trueCount, len(values)-nullCount)
-
- return nil
-}
diff --git a/pkg/metric/metricbool_test.go b/pkg/metric/metricbool_test.go
deleted file mode 100644
index 5db4576..0000000
--- a/pkg/metric/metricbool_test.go
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright (C) 2023 CGI France
-//
-// This file is part of RIMO.
-//
-// RIMO is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// RIMO is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with RIMO. If not, see <http://www.gnu.org/licenses/>.
-
-package metric_test
-
-import (
- "testing"
-
- "github.com/cgi-fr/rimo/pkg/metric"
- "github.com/cgi-fr/rimo/pkg/model"
- "github.com/stretchr/testify/assert"
- "github.com/stretchr/testify/require"
-)
-
-func TestBooleanMetric(t *testing.T) {
- t.Parallel()
-
- values := []interface{}{true, true, nil, false}
- expectedMetric := model.BoolMetric{
- TrueRatio: float64(2) / float64(3),
- }
-
- actualMetric := model.BoolMetric{} //nolint:exhaustruct
- err := metric.SetBoolMetric(values, &actualMetric)
- require.NoError(t, err)
-
- assert.Equal(t, expectedMetric, actualMetric)
-}
diff --git a/pkg/metric/metricnumeric.go b/pkg/metric/metricnumeric.go
deleted file mode 100644
index 77f50bd..0000000
--- a/pkg/metric/metricnumeric.go
+++ /dev/null
@@ -1,69 +0,0 @@
-// Copyright (C) 2023 CGI France
-//
-// This file is part of RIMO.
-//
-// RIMO is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// RIMO is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with RIMO. If not, see <http://www.gnu.org/licenses/>.
-
-package metric
-
-import (
- "fmt"
-
- "github.com/cgi-fr/rimo/pkg/model"
-)
-
-func SetNumericMetric(values []interface{}, metric *model.NumericMetric) error {
- nonNullCount := 0
-
- value := GetFirstValue(values)
-
- floatValue, ok := value.(float64)
- if !ok {
- return fmt.Errorf("%w : expected numeric found %T: %v", ErrValueType, value, value)
- }
-
- min := floatValue
- max := floatValue
- sum := 0.0
-
- for _, value := range values {
- floatValue, ok := value.(float64)
- if !ok {
- if value == nil {
- continue
- }
-
- return fmt.Errorf("%w : expected numeric found %T: %v", ErrValueType, value, value)
- }
-
- sum += floatValue
- nonNullCount++
-
- if floatValue > max {
- max = floatValue
- }
-
- if floatValue < min {
- min = floatValue
- }
- }
-
- mean := sum / float64(nonNullCount)
-
- metric.Min = min
- metric.Max = max
- metric.Mean = mean
-
- return nil
-}
diff --git a/pkg/metric/metricnumeric_test.go b/pkg/metric/metricnumeric_test.go
deleted file mode 100644
index 997b506..0000000
--- a/pkg/metric/metricnumeric_test.go
+++ /dev/null
@@ -1,46 +0,0 @@
-// Copyright (C) 2023 CGI France
-//
-// This file is part of RIMO.
-//
-// RIMO is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// RIMO is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with RIMO. If not, see <http://www.gnu.org/licenses/>.
-
-package metric_test
-
-import (
- "testing"
-
- "github.com/cgi-fr/rimo/pkg/metric"
- "github.com/cgi-fr/rimo/pkg/model"
- "github.com/stretchr/testify/assert"
-)
-
-func TestNumericMetric(t *testing.T) {
- t.Parallel()
-
- values := []interface{}{1.0, 2.0, 3.0, nil}
- expectedMetric := model.NumericMetric{
- Min: 1,
- Max: 3,
- Mean: 2,
- }
-
- actualMetric := model.NumericMetric{} //nolint:exhaustruct
-
- err := metric.SetNumericMetric(values, &actualMetric)
- if err != nil {
- t.Fatalf("unexpected error: %v", err)
- }
-
- assert.Equal(t, expectedMetric, actualMetric)
-}
diff --git a/pkg/metric/metricstring.go b/pkg/metric/metricstring.go
deleted file mode 100644
index a7d2ef5..0000000
--- a/pkg/metric/metricstring.go
+++ /dev/null
@@ -1,140 +0,0 @@
-// Copyright (C) 2023 CGI France
-//
-// This file is part of RIMO.
-//
-// RIMO is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// RIMO is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with RIMO. If not, see <http://www.gnu.org/licenses/>.
-
-package metric
-
-import (
- "fmt"
- "math"
- "sort"
-
- "github.com/cgi-fr/rimo/pkg/model"
-)
-
-func SetStringMetric(values []interface{}, metric *model.StringMetric) error {
- // Store strings by length.
- lenMap := make(map[int][]string)
- // Count length occurrence.
- lenCounter := make(map[int]int)
- totalCount := len(values)
-
- metric.MinLen = math.MaxInt
- metric.MaxLen = 0
-
- for _, value := range values {
- if value == nil {
- continue
- }
-
- stringValue, ok := value.(string)
- if !ok {
- return fmt.Errorf("%w : expected string found %T: %v", ErrValueType, value, value)
- }
-
- length := len(stringValue)
- lenMap[length] = append(lenMap[length], stringValue)
- lenCounter[length]++
-
- metric.MinLen = min(metric.MinLen, length)
- metric.MaxLen = max(metric.MaxLen, length)
- }
-
- // Create a list of unique lengths sorted by descending frequency, break ties with ascending length
- sortedLength := uniqueLengthSorted(lenCounter)
-
- // Get size of MostFreqLen and LeastFreqLen
- mostFrequentLenSize, leastFrequentLenSize := getFreqSize(len(sortedLength))
-
- // Get ordered slice of least and most frequent length
- lenMostFreqLen := sortedLength[0:mostFrequentLenSize]
-
- lenLeastFreqLen := make([]int, leastFrequentLenSize)
-
- for i := 0; i < leastFrequentLenSize; i++ {
- index := len(sortedLength) - 1 - i
- lenLeastFreqLen[i] = sortedLength[index]
- }
-
- leastFreqLen, err := buildFreqLen(lenLeastFreqLen, lenMap, lenCounter, totalCount, model.LeastFrequentSampleSize)
- if err != nil {
- return fmt.Errorf("error building least frequent length : %w", err)
- }
-
- metric.LeastFreqLen = leastFreqLen
-
- mostFreqLen, err := buildFreqLen(lenMostFreqLen, lenMap, lenCounter, totalCount, model.MostFrequentSampleSize)
- if err != nil {
- return fmt.Errorf("error building most frequent length : %w", err)
- }
-
- metric.MostFreqLen = mostFreqLen
-
- return nil
-}
-
-func buildFreqLen(freqLen []int, lenMap map[int][]string, lenCounter map[int]int, totalCount int, sampleLen int) ([]model.LenFreq, error) { //nolint
- lenFreqs := make([]model.LenFreq, len(freqLen))
-
- for index, len := range freqLen {
- // Get unique value from lenMap[len]..
- sample, err := Sample(lenMap[len], sampleLen)
- if err != nil {
- return lenFreqs, fmt.Errorf("error getting sample for length %v : %w", len, err)
- }
-
- lenFreqs[index] = model.LenFreq{
- Length: len,
- Freq: GetFrequency(lenCounter[len], totalCount),
- Sample: sample,
- }
- }
-
- return lenFreqs, nil
-}
-
-func getFreqSize(nunique int) (int, int) {
- mostFrequentLenSize := model.MostFrequentLenSize
- leastFrequentLenSize := model.LeastFrequentLenSize
-
- if nunique < model.MostFrequentLenSize+model.LeastFrequentLenSize {
- // Modify MostFrequentLenSize and LeastFrequentLenSize to fit the number of unique length.
- // Should keep ratio of MostFrequentLenSize and LeastFrequentLenSize.
- ratio := float64(model.MostFrequentLenSize) / float64(model.MostFrequentLenSize+model.LeastFrequentLenSize)
- mostFrequentLenSize = int(math.Round(float64(nunique) * ratio))
- leastFrequentLenSize = nunique - mostFrequentLenSize
- }
-
- return mostFrequentLenSize, leastFrequentLenSize
-}
-
-func uniqueLengthSorted(lenCounter map[int]int) []int {
- uniqueLengthSorted := make([]int, 0, len(lenCounter))
- for l := range lenCounter {
- uniqueLengthSorted = append(uniqueLengthSorted, l)
- }
-
- // Sort the string lengths by descending count of occurrence, breaks ties with ascending length
- sort.Slice(uniqueLengthSorted, func(i, j int) bool {
- if lenCounter[uniqueLengthSorted[i]] == lenCounter[uniqueLengthSorted[j]] {
- return uniqueLengthSorted[i] < uniqueLengthSorted[j]
- }
-
- return lenCounter[uniqueLengthSorted[i]] > lenCounter[uniqueLengthSorted[j]]
- })
-
- return uniqueLengthSorted
-}
diff --git a/pkg/metric/metricstring_test.go b/pkg/metric/metricstring_test.go
deleted file mode 100644
index 254a923..0000000
--- a/pkg/metric/metricstring_test.go
+++ /dev/null
@@ -1,60 +0,0 @@
-// Copyright (C) 2023 CGI France
-//
-// This file is part of RIMO.
-//
-// RIMO is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// RIMO is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with RIMO. If not, see .
-
-package metric_test
-
-import (
- "testing"
-
- "github.com/cgi-fr/rimo/pkg/metric"
- "github.com/cgi-fr/rimo/pkg/model"
- "github.com/stretchr/testify/assert"
-)
-
-// Ensure that 1. frequency is correct, 2. order is correct, 3. ties are break by length.
-func TestStringMetric(t *testing.T) {
- t.Parallel()
-
- text := []interface{}{"1", "1", "1", "1", "22", "22", "22", "331", "332", "4441"}
- expectedMetric := model.StringMetric{
- MinLen: 1,
- MaxLen: 4,
- MostFreqLen: []model.LenFreq{{Length: 1, Freq: 0.4, Sample: []string{"1"}}, {Length: 2, Freq: 0.3, Sample: []string{"22"}}}, //nolint:lll
- LeastFreqLen: []model.LenFreq{{Length: 4, Freq: 0.1, Sample: []string{"4441"}}, {Length: 3, Freq: 0.2, Sample: []string{"331", "332"}}}, //nolint:lll
- }
-
- actualMetric := model.StringMetric{} //nolint:exhaustruct
-
- err := metric.SetStringMetric(text, &actualMetric)
- if err != nil {
- t.Fatalf("unexpected error: %v", err)
- }
-
- // t.Logf(valast.String(actualMetric))
-
- for i := 0; i < len(expectedMetric.MostFreqLen); i++ {
- assert.Equal(t, expectedMetric.MostFreqLen[i].Length, actualMetric.MostFreqLen[i].Length)
- assert.Equal(t, expectedMetric.MostFreqLen[i].Freq, actualMetric.MostFreqLen[i].Freq)
- assert.Equal(t, expectedMetric.MostFreqLen[i].Sample, actualMetric.MostFreqLen[i].Sample)
- }
-
- for i := 0; i < len(expectedMetric.LeastFreqLen); i++ {
- assert.Equal(t, expectedMetric.LeastFreqLen[i].Length, actualMetric.LeastFreqLen[i].Length)
- assert.Equal(t, expectedMetric.LeastFreqLen[i].Freq, actualMetric.LeastFreqLen[i].Freq)
- assert.ElementsMatch(t, expectedMetric.LeastFreqLen[i].Sample, actualMetric.LeastFreqLen[i].Sample)
- }
-}
diff --git a/pkg/metric/minmax.go b/pkg/metric/minmax.go
new file mode 100644
index 0000000..9bf0ccb
--- /dev/null
+++ b/pkg/metric/minmax.go
@@ -0,0 +1,41 @@
+package metric
+
+import (
+ "github.com/cgi-fr/rimo/pkg/model"
+ "golang.org/x/exp/constraints"
+)
+
+// MinMax is an analyser that tracks the smallest and largest non-nil values
+// it has read.
+type MinMax[T constraints.Ordered] struct {
+	min *T // nil until a non-nil value has been read
+	max *T // nil until a non-nil value has been read
+}
+
+// NewMinMax returns a MinMax that has not recorded any value yet.
+func NewMinMax[T constraints.Ordered]() *MinMax[T] {
+	return &MinMax[T]{
+		min: nil,
+		max: nil,
+	}
+}
+
+// Read updates the running minimum and maximum with value.
+// A nil value is ignored.
+func (a *MinMax[T]) Read(value *T) {
+	if value == nil {
+		return
+	}
+
+	// Keep a private copy so the recorded bounds cannot change if the caller
+	// later reuses the variable behind value.
+	current := *value
+
+	if a.min == nil || current < *a.min {
+		a.min = &current
+	}
+
+	if a.max == nil || current > *a.max {
+		a.max = &current
+	}
+}
+
+// Build copies the recorded bounds into metric.MainMetric.Min and Max.
+//
+// When no value has been read the fields are left untouched: Min and Max are
+// declared as `any`, and storing a nil *T in them would produce a non-nil
+// interface value that `omitempty` does not drop.
+func (a *MinMax[T]) Build(metric *model.Column) {
+	if a.min != nil {
+		metric.MainMetric.Min = a.min
+	}
+
+	if a.max != nil {
+		metric.MainMetric.Max = a.max
+	}
+}
diff --git a/pkg/metric/numeric.go b/pkg/metric/numeric.go
new file mode 100644
index 0000000..2b34fe0
--- /dev/null
+++ b/pkg/metric/numeric.go
@@ -0,0 +1,22 @@
+package metric
+
+// Numeric is the analyser for numeric columns; it embeds a Multi over
+// float64 values and therefore exposes that type's methods.
+type Numeric struct {
+	Multi[float64]
+}
+
+// NewNumeric assembles the analysers applied to a numeric column.
+// sampleSize is handed to the sampler; a distinct-value analyser is added
+// only when countDistinct is true.
+func NewNumeric(sampleSize uint, countDistinct bool) *Numeric {
+	const maxAnalysers = 5
+
+	analysers := make([]Analyser[float64], 0, maxAnalysers)
+	analysers = append(analysers,
+		NewCounter[float64](),           // count total, count null, count empty
+		NewMinMax[float64](),            // store min and max values
+		NewSampler[float64](sampleSize), // store few samples
+		NewMean(),                       // calculate running mean
+	)
+
+	if countDistinct {
+		analysers = append(analysers, NewDistinct[float64]())
+	}
+
+	return &Numeric{Multi: Multi[float64]{analysers}}
+}
diff --git a/pkg/metric/sampler.go b/pkg/metric/sampler.go
new file mode 100644
index 0000000..db59f59
--- /dev/null
+++ b/pkg/metric/sampler.go
@@ -0,0 +1,45 @@
+package metric
+
+import (
+ "math/rand"
+
+ "github.com/cgi-fr/rimo/pkg/model"
+)
+
+// Sampler is an analyser that keeps at most size of the non-nil values it
+// reads, chosen at random once more than size values have been seen.
+type Sampler[T Accepted] struct {
+	size    uint // maximum number of samples kept
+	count   int  // number of non-nil values read so far
+	samples []T  // samples currently kept
+}
+
+// NewSampler returns an empty Sampler that keeps up to size values.
+func NewSampler[T Accepted](size uint) *Sampler[T] {
+	return &Sampler[T]{size: size, count: 0, samples: make([]T, 0, size)}
+}
+
+// Read offers value to the sampler. A nil value is ignored.
+func (s *Sampler[T]) Read(value *T) {
+	if value == nil {
+		return
+	}
+
+	s.count++
+
+	limit := int(s.size)
+	if len(s.samples) < limit {
+		// Still room: keep the value unconditionally.
+		s.samples = append(s.samples, *value)
+
+		return
+	}
+
+	// Full: draw a slot among all values seen so far and overwrite it only
+	// when the draw lands inside the kept samples.
+	slot := rand.Intn(s.count) //nolint:gosec
+	if slot < limit {
+		s.samples[slot] = *value
+	}
+}
+
+// Build stores the kept samples in metric.MainMetric.Samples.
+func (s *Sampler[T]) Build(metric *model.Column) {
+	collected := make([]any, 0, len(s.samples))
+	for _, sample := range s.samples {
+		collected = append(collected, sample)
+	}
+
+	metric.MainMetric.Samples = collected
+}
diff --git a/pkg/metric/string.go b/pkg/metric/string.go
new file mode 100644
index 0000000..7e5861d
--- /dev/null
+++ b/pkg/metric/string.go
@@ -0,0 +1,96 @@
+package metric
+
+import (
+ "sort"
+
+ "github.com/cgi-fr/rimo/pkg/model"
+ "golang.org/x/exp/maps"
+ "golang.org/x/exp/slices"
+)
+
+// String is the analyser for string columns. It maintains one set of
+// analysers for the whole column and one additional set per distinct value
+// length.
+type String struct {
+	sampleSize uint                  // sample size given to every sampler
+	distinct   bool                  // whether distinct-value analysers are added
+	main       Multi[string]         // analysers fed with every value, nil included
+	byLen      map[int]Multi[string] // analysers per length, fed with non-nil values only
+}
+
+// NewString returns a String analyser. sampleSize is handed to every
+// sampler; distinct-value analysers are added only when countDistinct is true.
+func NewString(sampleSize uint, countDistinct bool) *String {
+	return &String{
+		sampleSize: sampleSize,
+		distinct:   countDistinct,
+		main:       newStringAnalysers(sampleSize, countDistinct),
+		byLen:      make(map[int]Multi[string], 0),
+	}
+}
+
+// newStringAnalysers builds the analyser set shared by the column-wide
+// metrics and the per-length metrics.
+func newStringAnalysers(sampleSize uint, countDistinct bool) Multi[string] {
+	analysers := []Analyser[string]{
+		NewCounter[string](),           // count total, count null, count empty
+		NewMinMax[string](),            // store min and max values
+		NewSampler[string](sampleSize), // store few samples
+	}
+
+	if countDistinct {
+		analysers = append(analysers, NewDistinct[string]())
+	}
+
+	return Multi[string]{analysers}
+}
+
+// Read feeds value to the column-wide analysers and, when it is not nil, to
+// the analysers of its length. The length is len(*value), i.e. a byte count.
+func (a *String) Read(value *string) {
+	a.main.Read(value)
+
+	if value == nil {
+		return
+	}
+
+	length := len(*value)
+
+	analyser, exists := a.byLen[length]
+	if !exists {
+		analyser = newStringAnalysers(a.sampleSize, a.distinct)
+	}
+
+	analyser.Read(value)
+
+	a.byLen[length] = analyser
+}
+
+// Build fills metric.MainMetric from the column-wide analysers and
+// metric.StringMetric from the per-length ones.
+//
+// Lengths are ordered by descending frequency, ties broken by ascending
+// length, so the output does not depend on map iteration order. When no
+// non-nil value was read, MinLen and MaxLen stay 0 and Lengths stays empty.
+func (a *String) Build(metric *model.Column) {
+	a.main.Build(metric)
+
+	metric.StringMetric = &model.String{
+		MinLen:   0,
+		MaxLen:   0,
+		CountLen: len(a.byLen),
+		Lengths:  make([]model.StringLen, 0, len(a.byLen)),
+	}
+
+	// slices.Min and slices.Max panic on an empty slice.
+	if len(a.byLen) == 0 {
+		return
+	}
+
+	lengths := maps.Keys(a.byLen)
+	metric.StringMetric.MinLen = slices.Min(lengths)
+	metric.StringMetric.MaxLen = slices.Max(lengths)
+
+	for length, analyser := range a.byLen {
+		lenMetric := model.Column{} //nolint:exhaustruct
+		analyser.Build(&lenMetric)
+
+		metric.StringMetric.Lengths = append(metric.StringMetric.Lengths, model.StringLen{
+			Length:  length,
+			Freq:    float64(lenMetric.MainMetric.Count) / float64(metric.MainMetric.Count),
+			Metrics: lenMetric.MainMetric,
+		})
+	}
+
+	sort.Slice(metric.StringMetric.Lengths, func(i, j int) bool {
+		left, right := metric.StringMetric.Lengths[i], metric.StringMetric.Lengths[j]
+		if left.Freq != right.Freq {
+			return left.Freq > right.Freq
+		}
+
+		return left.Length < right.Length
+	})
+}
diff --git a/pkg/metric/trueratio.go b/pkg/metric/trueratio.go
new file mode 100644
index 0000000..e9d8271
--- /dev/null
+++ b/pkg/metric/trueratio.go
@@ -0,0 +1,33 @@
+package metric
+
+import "github.com/cgi-fr/rimo/pkg/model"
+
+// TrueRatio is an analyser that computes the share of true values among the
+// non-nil booleans it reads.
+type TrueRatio struct {
+	countTrue uint // number of true values read
+	count     uint // number of non-nil values read
+}
+
+// NewTrueRatio returns a TrueRatio with both counters at zero.
+func NewTrueRatio() *TrueRatio {
+	return &TrueRatio{
+		countTrue: 0,
+		count:     0,
+	}
+}
+
+// Read counts value. A nil value is ignored.
+func (a *TrueRatio) Read(value *bool) {
+	if value == nil {
+		return
+	}
+
+	a.count++
+
+	if *value {
+		a.countTrue++
+	}
+}
+
+// Build sets metric.BoolMetric to the ratio of true values.
+//
+// The ratio is reported as 0 when no non-nil value was read: dividing 0 by 0
+// would yield NaN, which encoding/json refuses to marshal.
+func (a *TrueRatio) Build(metric *model.Column) {
+	ratio := 0.0
+	if a.count > 0 {
+		ratio = float64(a.countTrue) / float64(a.count)
+	}
+
+	metric.BoolMetric = &model.Bool{
+		TrueRatio: ratio,
+	}
+}
diff --git a/pkg/metric/types.go b/pkg/metric/types.go
new file mode 100644
index 0000000..05de82d
--- /dev/null
+++ b/pkg/metric/types.go
@@ -0,0 +1,7 @@
+package metric
+
+import "golang.org/x/exp/constraints"
+
+// Accepted is the type constraint for analysed values: any ordered type, or
+// any type whose underlying type is bool. Sampler uses it as the bound of its
+// type parameter.
+type Accepted interface {
+ constraints.Ordered | ~bool
+}
diff --git a/pkg/model/base.go b/pkg/model/base.go
index 49eed63..2641719 100644
--- a/pkg/model/base.go
+++ b/pkg/model/base.go
@@ -1,50 +1,20 @@
-// Copyright (C) 2023 CGI France
-//
-// This file is part of RIMO.
-//
-// RIMO is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// RIMO is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with RIMO. If not, see .
-
package model
-import (
- "fmt"
- "reflect"
-
- "github.com/hexops/valast"
-)
-
-// RIMO YAML structure.
-type (
- Base struct {
- Name string `json:"database" jsonschema:"required" yaml:"database"`
- // Tables should be map[string][]Column
- Tables []Table `json:"tables" jsonschema:"required" yaml:"tables"`
- }
+// DefaultTableSize is the initial capacity NewBase reserves for Base.Tables.
+const DefaultTableSize = 10
- Table struct {
- Name string `json:"name" jsonschema:"required" yaml:"name"`
- Columns []Column `json:"columns" jsonschema:"required" yaml:"columns"`
- }
-)
+// Base is the root of the serialized model: a named database and its tables.
+type Base struct {
+ // Name is serialized under the "database" key.
+ Name string `json:"database" yaml:"database" jsonschema:"required"`
+ Tables []Table `json:"tables" yaml:"tables" jsonschema:"required"`
+}
-// Should be improved with more detail about difference.
-func SameBase(base1, base2 *Base) (bool, string) {
- if !reflect.DeepEqual(base1, base2) {
- msg := fmt.Sprintf("base is different : %s \n \n %s", valast.String(base1), valast.String(base2))
+// Table is a named list of analysed columns.
+type Table struct {
+ Name string `json:"name" yaml:"name" jsonschema:"required"`
+ Columns []Column `json:"columns" yaml:"columns" jsonschema:"required"`
+}
- return false, msg
+// NewBase returns a Base called name with no table and capacity for
+// DefaultTableSize tables.
+func NewBase(name string) *Base {
+ return &Base{
+ Name: name,
+ Tables: make([]Table, 0, DefaultTableSize),
 }
-
- return true, ""
}
diff --git a/pkg/model/column.go b/pkg/model/column.go
index 223bcb0..8eadc62 100644
--- a/pkg/model/column.go
+++ b/pkg/model/column.go
@@ -1,44 +1,13 @@
-// Copyright (C) 2023 CGI France
-//
-// This file is part of RIMO.
-//
-// RIMO is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// RIMO is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with RIMO. If not, see .
-
package model
-const (
- SampleSize int = 5
- MostFrequentLenSize int = 5
- MostFrequentSampleSize int = 5
- LeastFrequentLenSize int = 5
- LeastFrequentSampleSize int = 5
-)
-
-type (
- Column struct {
- Name string `json:"name" jsonschema:"required" yaml:"name"`
- Type ValueType `json:"type" jsonschema:"required" validate:"oneof=string numeric boolean" yaml:"type"` //nolint:lll
-
- // The 3 following parameter should be part of a Config struct
- Concept string `json:"concept" jsonschema:"required" yaml:"concept"`
- Constraint []string `json:"constraint" jsonschema:"required" yaml:"constraint"`
- Confidential *bool `json:"confidential" jsonschema:"required" yaml:"confidential"`
+// Column is the serialized description of one analysed column: its name and
+// type, the embedded Config, the generic metrics and, through optional
+// pointers, the metrics specific to its type.
+type Column struct {
+ Name string `json:"name" yaml:"name" jsonschema:"required"`
+ // NOTE(review): the validate tag lists "boolean", while AnalyseBool in
+ // pkg/rimo sets Type to "bool" — confirm which spelling is intended.
+ Type string `json:"type" yaml:"type" jsonschema:"required" validate:"oneof=string numeric boolean"`
- MainMetric GenericMetric `json:"mainMetric" jsonschema:"required" yaml:"mainMetric"`
+ Config
- StringMetric StringMetric `json:"stringMetric,omitempty" jsonschema:"required" yaml:"stringMetric,omitempty"`
- NumericMetric NumericMetric `json:"numericMetric,omitempty" jsonschema:"required" yaml:"numericMetric,omitempty"`
- BoolMetric BoolMetric `json:"boolMetric,omitempty" jsonschema:"required" yaml:"boolMetric,omitempty"`
- }
-)
+ // MainMetric is always serialized; the three pointers below are omitted
+ // from the output when nil.
+ MainMetric Generic `json:"mainMetric" yaml:"mainMetric" jsonschema:"required"`
+ StringMetric *String `json:"stringMetric,omitempty" yaml:"stringMetric,omitempty"`
+ NumericMetric *Numeric `json:"numericMetric,omitempty" yaml:"numericMetric,omitempty"`
+ BoolMetric *Bool `json:"boolMetric,omitempty" yaml:"boolMetric,omitempty"`
+}
diff --git a/pkg/model/config.go b/pkg/model/config.go
new file mode 100644
index 0000000..d6742e4
--- /dev/null
+++ b/pkg/model/config.go
@@ -0,0 +1,7 @@
+package model
+
+// Config groups the column attributes that are embedded in Column alongside
+// the computed metrics: concept, constraint and confidential.
+type Config struct {
+ Concept string `json:"concept" yaml:"concept" jsonschema:"required"`
+ Constraint []string `json:"constraint" yaml:"constraint" jsonschema:"required"`
+ // Confidential is a pointer, so an unset value (nil) is distinct from false.
+ Confidential *bool `json:"confidential" yaml:"confidential" jsonschema:"required"`
+}
diff --git a/pkg/model/metric.go b/pkg/model/metric.go
deleted file mode 100644
index c0e8d03..0000000
--- a/pkg/model/metric.go
+++ /dev/null
@@ -1,66 +0,0 @@
-// Copyright (C) 2023 CGI France
-//
-// This file is part of RIMO.
-//
-// RIMO is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// RIMO is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with RIMO. If not, see .
-
-package model
-
-// RIMO YAML metrics.
-type (
- GenericMetric struct {
- Count int `json:"count" jsonschema:"required" yaml:"count"`
- Empty int `json:"empty" jsonschema:"required" yaml:"empty"`
- Unique int `json:"unique" jsonschema:"required" yaml:"unique"`
- Sample []interface{} `json:"sample" jsonschema:"required" yaml:"sample"`
- }
-
- StringMetric struct {
- MinLen int `json:"minLen" jsonschema:"required" yaml:"minLen"`
- MaxLen int `json:"maxLen" jsonschema:"required" yaml:"maxLen"`
- MostFreqLen []LenFreq `json:"mostFrequentLen" jsonschema:"required" yaml:"mostFrequentLen"`
- LeastFreqLen []LenFreq `json:"leastFrequentLen" jsonschema:"required" yaml:"leastFrequentLen"`
- }
-
- LenFreq struct {
- Length int `json:"length" jsonschema:"required" yaml:"length"`
- Freq float64 `json:"freq" jsonschema:"required" yaml:"freq"`
- Sample []string `json:"sample" jsonschema:"required" yaml:"sample"`
- }
-
- NumericMetric struct {
- Min float64 `json:"min" jsonschema:"required" yaml:"min"`
- Max float64 `json:"max" jsonschema:"required" yaml:"max"`
- Mean float64 `json:"mean" jsonschema:"required" yaml:"mean"`
- }
-
- BoolMetric struct {
- TrueRatio float64 `json:"trueRatio" jsonschema:"required" yaml:"trueRatio"`
- }
-)
-
-// Type that a column can be.
-type ValueType string
-
-var ColType = struct { //nolint:gochecknoglobals
- String ValueType
- Numeric ValueType
- Bool ValueType
- Undefined ValueType
-}{
- String: "string",
- Numeric: "numeric",
- Bool: "bool",
- Undefined: "undefined",
-}
diff --git a/pkg/model/metrics.go b/pkg/model/metrics.go
new file mode 100644
index 0000000..5073cc6
--- /dev/null
+++ b/pkg/model/metrics.go
@@ -0,0 +1,32 @@
+package model
+
+// Generic holds the metrics common to every column type. It is also reused
+// for the per-length metrics of string columns (StringLen.Metrics).
+type Generic struct {
+ Count uint `json:"count" yaml:"count" jsonschema:"required"`
+ Empty uint `json:"empty" yaml:"empty" jsonschema:"required"`
+ // Null is serialized under the plural key "nulls".
+ Null uint `json:"nulls" yaml:"nulls" jsonschema:"required"`
+ Distinct uint `json:"distinct,omitempty" yaml:"distinct,omitempty"`
+ // Min and Max are untyped because their concrete type depends on the column.
+ Min any `json:"min,omitempty" yaml:"min,omitempty"`
+ Max any `json:"max,omitempty" yaml:"max,omitempty"`
+ Samples []any `json:"samples" yaml:"samples" jsonschema:"required"`
+}
+
+// String holds the metrics specific to string columns: the shortest and
+// longest observed lengths, the number of distinct lengths, and one entry
+// per length.
+type String struct {
+ MinLen int `json:"minLen" yaml:"minLen"`
+ MaxLen int `json:"maxLen" yaml:"maxLen"`
+ CountLen int `json:"countLen,omitempty" yaml:"countLen,omitempty"`
+ Lengths []StringLen `json:"lengths,omitempty" yaml:"lengths,omitempty"`
+}
+
+// StringLen describes the values of one given length within a string column.
+type StringLen struct {
+ Length int `json:"length" yaml:"length" jsonschema:"required"`
+ // Freq is the count of values of this length divided by the column's count.
+ Freq float64 `json:"freq" yaml:"freq" jsonschema:"required"`
+ // Metrics are the generic metrics restricted to values of this length.
+ Metrics Generic `json:"metrics" yaml:"metrics" jsonschema:"required"`
+}
+
+// Numeric holds the metrics specific to numeric columns.
+type Numeric struct {
+ // Mean is presumably filled by the metric package's mean analyser — TODO confirm.
+ Mean float64 `json:"mean" yaml:"mean" jsonschema:"required"`
+}
+
+// Bool holds the metrics specific to boolean columns.
+type Bool struct {
+ // TrueRatio is the number of true values divided by the number of non-nil values.
+ TrueRatio float64 `json:"trueRatio" yaml:"trueRatio" jsonschema:"required"`
+}
diff --git a/pkg/model/schema.go b/pkg/model/schema.go
new file mode 100644
index 0000000..5258dca
--- /dev/null
+++ b/pkg/model/schema.go
@@ -0,0 +1,17 @@
+package model
+
+import (
+ "encoding/json"
+ "fmt"
+
+ "github.com/invopop/jsonschema"
+)
+
+// GetJSONSchema returns the JSON Schema reflected from the Base type as an
+// indented JSON document. The error wraps the encoding failure, if any.
+func GetJSONSchema() (string, error) {
+	schema := jsonschema.Reflect(&Base{}) //nolint:exhaustruct
+
+	resBytes, err := json.MarshalIndent(schema, "", " ")
+	if err != nil {
+		// This step marshals (encodes) the schema; the message says so.
+		return "", fmt.Errorf("couldn't marshal Base schema in JSON : %w", err)
+	}
+
+	return string(resBytes), nil
+}
diff --git a/pkg/model/utils.go b/pkg/model/utils.go
deleted file mode 100644
index f80a6e2..0000000
--- a/pkg/model/utils.go
+++ /dev/null
@@ -1,123 +0,0 @@
-// Copyright (C) 2023 CGI France
-//
-// This file is part of RIMO.
-//
-// RIMO is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// RIMO is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with RIMO. If not, see .
-
-package model
-
-import (
- "encoding/json"
- "errors"
- "fmt"
- "os"
- "sort"
-
- "github.com/invopop/jsonschema"
- "gopkg.in/yaml.v3"
-)
-
-func GetJSONSchema() (string, error) {
- resBytes, err := json.MarshalIndent(jsonschema.Reflect(&Base{}), "", " ") //nolint:exhaustruct
- if err != nil {
- return "", fmt.Errorf("couldn't unmarshall Base in JSON : %w", err)
- }
-
- return string(resBytes), nil
-}
-
-func NewBase(name string) *Base {
- return &Base{
- Name: name,
- Tables: make([]Table, 0),
- }
-}
-
-var ErrBaseFormat = errors.New("error while decoding yaml file in a Base struct")
-
-// Can be improved.
-func LoadBase(path string) (*Base, error) {
- file, err := os.Open(path)
- if err != nil {
- return nil, fmt.Errorf("error while opening file: %w", err)
- }
-
- decoder := yaml.NewDecoder(file)
-
- var base Base
-
- err = decoder.Decode(&base)
- if err != nil {
- return nil, ErrBaseFormat
- }
-
- file.Close()
-
- return &base, nil
-}
-
-func RemoveSampleFromBase(base *Base) {
- for tableI, table := range base.Tables {
- for columnJ, column := range table.Columns {
- column.MainMetric.Sample = nil
-
- if column.Type == ColType.String {
- for freqLen := range column.StringMetric.MostFreqLen {
- column.StringMetric.MostFreqLen[freqLen].Sample = nil
- }
-
- for freqLen := range column.StringMetric.LeastFreqLen {
- column.StringMetric.LeastFreqLen[freqLen].Sample = nil
- }
- }
-
- base.Tables[tableI].Columns[columnJ] = column
- }
- }
-}
-
-func (base *Base) SortBase() {
- for _, table := range base.Tables {
- sort.Slice(table.Columns, func(i, j int) bool {
- return table.Columns[i].Name < table.Columns[j].Name
- })
- }
-
- sort.Slice(base.Tables, func(i, j int) bool {
- return base.Tables[i].Name < base.Tables[j].Name
- })
-}
-
-func (base *Base) AddColumn(column Column, tableName string) {
- mapTableName := make(map[string]int)
- for index, table := range base.Tables {
- mapTableName[table.Name] = index
- }
-
- if index, ok := mapTableName[tableName]; ok {
- // If the table exists, append the column to the table
- base.Tables[index].Columns = append(base.Tables[index].Columns, column)
- } else {
- // If the table does not exist, create a new table and add it to the base
- table := Table{
- Name: tableName,
- Columns: []Column{column},
- }
- base.Tables = append(base.Tables, table)
- }
-}
-
-// If the table does not exist, create a new table and add it to the base
-// table := Table{Name: tableName, Columns: []Column{column}}
-// base.Tables = append(base.Tables, table)
diff --git a/pkg/model/utils_test.go b/pkg/model/utils_test.go
deleted file mode 100644
index 16c619e..0000000
--- a/pkg/model/utils_test.go
+++ /dev/null
@@ -1,66 +0,0 @@
-// Copyright (C) 2023 CGI France
-//
-// This file is part of RIMO.
-//
-// RIMO is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// RIMO is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with RIMO. If not, see .
-
-package model_test
-
-import (
- "testing"
-
- "github.com/cgi-fr/rimo/pkg/model"
-)
-
-func TestAddColumn(t *testing.T) {
- t.Parallel()
-
- base := model.NewBase("test_base")
-
- column := model.Column{ //nolint:exhaustruct
- Name: "test_column",
- Type: model.ColType.String,
- Concept: "test_concept",
- }
-
- tableName := "test_table"
-
- base.AddColumn(column, tableName)
-
- // fmt.Print(valast.String(base))
-
- if len(base.Tables) != 1 {
- t.Errorf("expected 1 table, got %d", len(base.Tables))
- }
-
- if base.Tables[0].Name != tableName {
- t.Errorf("expected table name %q, got %q", tableName, base.Tables[0].Name)
- }
-
- if len(base.Tables[0].Columns) != 1 {
- t.Errorf("expected 1 column, got %d", len(base.Tables[0].Columns))
- }
-
- if base.Tables[0].Columns[0].Name != column.Name {
- t.Errorf("expected column name %q, got %q", column.Name, base.Tables[0].Columns[0].Name)
- }
-
- if base.Tables[0].Columns[0].Type != column.Type {
- t.Errorf("expected column type %q, got %q", column.Type, base.Tables[0].Columns[0].Type)
- }
-
- if base.Tables[0].Columns[0].Concept != column.Concept {
- t.Errorf("expected column concept %q, got %q", column.Concept, base.Tables[0].Columns[0].Concept)
- }
-}
diff --git a/pkg/rimo/driven.go b/pkg/rimo/driven.go
index 1928b2a..fb79719 100644
--- a/pkg/rimo/driven.go
+++ b/pkg/rimo/driven.go
@@ -17,14 +17,19 @@
package rimo
-import (
- "github.com/cgi-fr/rimo/pkg/model"
-)
+import "github.com/cgi-fr/rimo/pkg/model"
+
+// ColReader iterates over the values of a single column.
+// The driver calls Next before each Value and stops when Next returns false.
+type ColReader interface {
+ // ColName returns the name of the column being read.
+ ColName() string
+ // TableName returns the name of the table the column belongs to.
+ TableName() string
+ // Next reports whether another value is available.
+ Next() bool
+ // Value returns the current value; the driver handles string, numeric,
+ // bool and nil values.
+ Value() (any, error)
+}
+// Reader iterates over the columns of a base; each column is exposed through
+// a ColReader obtained from Col after a successful call to Next.
type Reader interface {
BaseName() string
- Next() bool // itère sur les colonnes.
- Value() ([]interface{}, string, string, error) // colValues, colName, tableName
+ // Next reports whether another column is available.
+ Next() bool
+ // Col returns the reader for the current column.
+ Col() (ColReader, error)
}
type Writer interface {
diff --git a/pkg/rimo/driven_test.go b/pkg/rimo/driven_test.go
deleted file mode 100644
index da635c9..0000000
--- a/pkg/rimo/driven_test.go
+++ /dev/null
@@ -1,108 +0,0 @@
-// Copyright (C) 2023 CGI France
-//
-// This file is part of RIMO.
-//
-// RIMO is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// RIMO is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with RIMO. If not, see .
-
-package rimo_test
-
-import (
- "log"
- "math"
- "testing"
-
- "github.com/cgi-fr/rimo/pkg/model"
- "github.com/cgi-fr/rimo/pkg/rimo"
-)
-
-// TESTS
-
-func TestTestInterface(t *testing.T) {
- t.Parallel()
-
- var _ rimo.Reader = (*TestReader)(nil)
-
- var _ rimo.Writer = (*TestWriter)(nil)
-}
-
-// TestReader implementation
-
-type colInput struct {
- ColName string
- ColValues []interface{}
-}
-
-type TestReader struct {
- baseName string
- data []colInput
- tableNames []string // Next() will progressively change tableName
- // internal
- index int
- currentValues []interface{}
- currentColName string
- currentTableName string
-}
-
-func (r *TestReader) BaseName() string {
- return r.baseName
-}
-
-func (r *TestReader) Next() bool {
- if r.index == len(r.data) {
- log.Println("End of data")
-
- return false
- }
-
- // update tableName
- if len(r.tableNames) == len(r.data) {
- r.currentTableName = r.tableNames[r.index]
- } else {
- // use a percentage to determine the table name to use from the list
- percentageComplete := float64(r.index) / float64(len(r.data))
- expectedTableIndex := percentageComplete * float64(len(r.tableNames))
- roundedTableIndex := math.Floor(expectedTableIndex)
- tableNameIndex := int(roundedTableIndex)
-
- r.currentTableName = r.tableNames[tableNameIndex]
- }
-
- r.currentColName = r.data[r.index].ColName
- r.currentValues = r.data[r.index].ColValues
- r.index++
-
- return true
-}
-
-func (r *TestReader) Value() ([]interface{}, string, string, error) { //nolint:wsl
- // log.Printf("Processing %s column in %s table", r.currentTableName, r.currentColName)
-
- return r.currentValues, r.currentColName, r.currentTableName, nil
-}
-
-// TestWriter implementation
-
-type TestWriter struct {
- base model.Base
-}
-
-func (w *TestWriter) Export(base *model.Base) error {
- w.base = *base
-
- return nil
-}
-
-func (w *TestWriter) Base() *model.Base {
- return &w.base
-}
diff --git a/pkg/rimo/driver.go b/pkg/rimo/driver.go
index e626bbc..6e28979 100644
--- a/pkg/rimo/driver.go
+++ b/pkg/rimo/driver.go
@@ -19,6 +19,7 @@ package rimo
import (
"fmt"
+ "sort"
"github.com/cgi-fr/rimo/pkg/metric"
"github.com/cgi-fr/rimo/pkg/model"
@@ -26,36 +27,103 @@ import (
"github.com/rs/zerolog/log"
)
-func AnalyseBase(reader Reader, writer Writer) error {
- // log.Logger = zerolog.New(os.Stdout).Level(zerolog.DebugLevel)
- baseName := reader.BaseName()
+// Driver carries the options applied when analysing a base.
+type Driver struct {
+ // SampleSize is passed to the metric analysers as their sample size.
+ SampleSize uint
+ // Distinct is passed to the metric analysers as their count-distinct switch.
+ Distinct bool
+}
- // log.Debug().Msgf("Processing [%s base]", baseName)
+//nolint:funlen,cyclop,gocognit
+func (d Driver) AnalyseBase(reader Reader, writer Writer) error {
+ baseName := reader.BaseName()
base := model.NewBase(baseName)
+ tables := map[string]model.Table{}
for reader.Next() { // itère colonne par colonne
- colValues, colName, tableName, err := reader.Value()
+ valreader, err := reader.Col()
if err != nil {
- return fmt.Errorf("failed to get column value : %w", err)
+ return fmt.Errorf("failed to get column reader : %w", err)
}
- column, err := metric.ComputeMetric(colName, colValues)
- if err != nil {
- return fmt.Errorf("failed to compute column : %w", err)
- }
+ nilcount := 0
+
+ for valreader.Next() {
+ val, err := valreader.Value()
+ if err != nil {
+ return fmt.Errorf("failed to read value : %w", err)
+ }
+
+ log.Debug().Msgf("Processing [%s base][%s table][%s column]", baseName, valreader.TableName(), valreader.ColName())
+
+ switch valtyped := val.(type) {
+ case string:
+ col, err := d.AnalyseString(nilcount, valtyped, valreader)
+ if err != nil {
+ return fmt.Errorf("failed to analyse column : %w", err)
+ }
+
+ table, exists := tables[valreader.TableName()]
+ if !exists {
+ table = model.Table{
+ Name: valreader.TableName(),
+ Columns: []model.Column{},
+ }
+ }
+
+ table.Columns = append(table.Columns, col)
- log.Debug().Msgf("Processing [%s base][%s table][%s column]", baseName, tableName, column.Name)
- // log.Debug().Msg(valast.String(column))
+ tables[valreader.TableName()] = table
+ case float64, float32, int, int8, int16, int32, int64, uint, uint8, uint16, uint32, uint64:
+ col, err := d.AnalyseNumeric(nilcount, valtyped, valreader)
+ if err != nil {
+ return fmt.Errorf("failed to analyse column : %w", err)
+ }
- base.AddColumn(column, tableName)
+ table, exists := tables[valreader.TableName()]
+ if !exists {
+ table = model.Table{
+ Name: valreader.TableName(),
+ Columns: []model.Column{},
+ }
+ }
+
+ table.Columns = append(table.Columns, col)
+
+ tables[valreader.TableName()] = table
+ case bool:
+ col, err := d.AnalyseBool(nilcount, valtyped, valreader)
+ if err != nil {
+ return fmt.Errorf("failed to analyse column : %w", err)
+ }
+
+ table, exists := tables[valreader.TableName()]
+ if !exists {
+ table = model.Table{
+ Name: valreader.TableName(),
+ Columns: []model.Column{},
+ }
+ }
+
+ table.Columns = append(table.Columns, col)
+
+ tables[valreader.TableName()] = table
+ case nil:
+ nilcount++
+ }
+ }
}
- base.SortBase()
+ for _, table := range tables {
+ sort.SliceStable(table.Columns, func(i, j int) bool {
+ return table.Columns[i].Name < table.Columns[j].Name
+ })
+
+ base.Tables = append(base.Tables, table)
+ }
- // log.Debug().Msg("---------- Finish processing base :")
- // log.Debug().Msg(valast.String(*base))
- // log.Debug().Msg("----------")
+ sort.SliceStable(base.Tables, func(i, j int) bool {
+ return base.Tables[i].Name < base.Tables[j].Name
+ })
err := writer.Export(base)
if err != nil {
@@ -64,3 +132,164 @@ func AnalyseBase(reader Reader, writer Writer) error {
return nil
}
+
+// AnalyseString computes the metrics of a string column.
+//
+// nilcount is the number of nil values already consumed from reader before
+// firstValue; they are replayed into the analyser first, followed by
+// firstValue and then every remaining value of reader.
+//
+// It returns an error when reader fails, or an error wrapping
+// ErrInvalidValueType when a value is neither a string nor nil.
+func (d Driver) AnalyseString(nilcount int, firstValue string, reader ColReader) (model.Column, error) {
+	column := model.Column{
+		Name:          reader.ColName(),
+		Type:          "string",
+		Config:        model.Config{},   //nolint:exhaustruct
+		MainMetric:    model.Generic{},  //nolint:exhaustruct
+		StringMetric:  &model.String{},  //nolint:exhaustruct
+		NumericMetric: nil,
+		BoolMetric:    nil,
+	}
+
+	analyser := metric.NewString(d.SampleSize, d.Distinct)
+
+	for i := 0; i < nilcount; i++ {
+		analyser.Read(nil)
+	}
+
+	analyser.Read(&firstValue)
+
+	for reader.Next() {
+		val, err := reader.Value()
+		if err != nil {
+			return column, fmt.Errorf("failed to read value : %w", err)
+		}
+
+		switch valtyped := val.(type) {
+		case string:
+			analyser.Read(&valtyped)
+		case nil:
+			analyser.Read(nil)
+		default:
+			// err is nil on this path, so wrap the sentinel and report the
+			// offending type, as GetFloat64 does.
+			return column, fmt.Errorf("%w : %T", ErrInvalidValueType, val)
+		}
+	}
+
+	analyser.Build(&column)
+
+	return column, nil
+}
+
+// AnalyseNumeric computes the metrics of a numeric column.
+//
+// The analyser first receives nilcount nil values, then firstValue, then
+// every remaining value of reader; each value goes through GetFloat64. Any
+// read or conversion failure is returned wrapped.
+func (d Driver) AnalyseNumeric(nilcount int, firstValue any, reader ColReader) (model.Column, error) {
+	column := model.Column{
+		Name:          reader.ColName(),
+		Type:          "numeric",
+		Config:        model.Config{},   //nolint:exhaustruct
+		MainMetric:    model.Generic{},  //nolint:exhaustruct
+		StringMetric:  nil,
+		NumericMetric: &model.Numeric{}, //nolint:exhaustruct
+		BoolMetric:    nil,
+	}
+
+	analyser := metric.NewNumeric(d.SampleSize, d.Distinct)
+
+	// Replay the nil values seen before the first typed value.
+	for remaining := nilcount; remaining > 0; remaining-- {
+		analyser.Read(nil)
+	}
+
+	first, err := GetFloat64(firstValue)
+	if err != nil {
+		return column, fmt.Errorf("failed to read value : %w", err)
+	}
+
+	analyser.Read(first)
+
+	for reader.Next() {
+		raw, readErr := reader.Value()
+		if readErr != nil {
+			return column, fmt.Errorf("failed to read value : %w", readErr)
+		}
+
+		number, convErr := GetFloat64(raw)
+		if convErr != nil {
+			return column, fmt.Errorf("failed to read value : %w", convErr)
+		}
+
+		analyser.Read(number)
+	}
+
+	analyser.Build(&column)
+
+	return column, nil
+}
+
+// AnalyseBool computes the metrics of a boolean column.
+//
+// nilcount is the number of nil values already consumed from reader before
+// firstValue; they are replayed into the analyser first, followed by
+// firstValue and then every remaining value of reader.
+//
+// It returns an error when reader fails, or an error wrapping
+// ErrInvalidValueType when a value is neither a bool nor nil.
+func (d Driver) AnalyseBool(nilcount int, firstValue bool, reader ColReader) (model.Column, error) {
+	column := model.Column{
+		Name:          reader.ColName(),
+		Type:          "bool",
+		Config:        model.Config{},  //nolint:exhaustruct
+		MainMetric:    model.Generic{}, //nolint:exhaustruct
+		StringMetric:  nil,
+		NumericMetric: nil,
+		BoolMetric:    &model.Bool{},   //nolint:exhaustruct
+	}
+
+	analyser := metric.NewBool(d.SampleSize, d.Distinct)
+
+	for i := 0; i < nilcount; i++ {
+		analyser.Read(nil)
+	}
+
+	analyser.Read(&firstValue)
+
+	for reader.Next() {
+		val, err := reader.Value()
+		if err != nil {
+			return column, fmt.Errorf("failed to read value : %w", err)
+		}
+
+		switch valtyped := val.(type) {
+		case bool:
+			analyser.Read(&valtyped)
+		case nil:
+			analyser.Read(nil)
+		default:
+			// err is nil on this path, so wrap the sentinel and report the
+			// offending type, as GetFloat64 does.
+			return column, fmt.Errorf("%w : %T", ErrInvalidValueType, val)
+		}
+	}
+
+	analyser.Build(&column)
+
+	return column, nil
+}
+
+// GetFloat64 converts value to a float64 and returns a pointer to the result.
+//
+// Every built-in float, signed and unsigned integer type is accepted. A nil
+// value yields a nil pointer and a nil error. Any other type yields an error
+// wrapping ErrInvalidValueType.
+//
+//nolint:cyclop
+func GetFloat64(value any) (*float64, error) {
+	if value == nil {
+		return nil, nil //nolint:nilnil
+	}
+
+	var number float64
+
+	switch typed := value.(type) {
+	case float64:
+		number = typed
+	case float32:
+		number = float64(typed)
+	case int:
+		number = float64(typed)
+	case int8:
+		number = float64(typed)
+	case int16:
+		number = float64(typed)
+	case int32:
+		number = float64(typed)
+	case int64:
+		number = float64(typed)
+	case uint:
+		number = float64(typed)
+	case uint8:
+		number = float64(typed)
+	case uint16:
+		number = float64(typed)
+	case uint32:
+		number = float64(typed)
+	case uint64:
+		number = float64(typed)
+	default:
+		return nil, fmt.Errorf("%w : %T", ErrInvalidValueType, value)
+	}
+
+	return &number, nil
+}
diff --git a/pkg/rimo/driver_test.go b/pkg/rimo/driver_test.go
index 186e18f..f8dd16f 100644
--- a/pkg/rimo/driver_test.go
+++ b/pkg/rimo/driver_test.go
@@ -1,20 +1,3 @@
-// Copyright (C) 2023 CGI France
-//
-// This file is part of RIMO.
-//
-// RIMO is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// RIMO is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with RIMO. If not, see .
-
package rimo_test
import (
@@ -24,15 +7,11 @@ import (
"time"
"github.com/cgi-fr/rimo/internal/infra"
- "github.com/cgi-fr/rimo/pkg/model"
"github.com/cgi-fr/rimo/pkg/rimo"
-
- "github.com/hexops/valast"
- "github.com/stretchr/testify/assert"
+ "github.com/rs/zerolog"
"github.com/stretchr/testify/require"
)
-// Run Analyse pipeline with FilesReader and TestWriter and compare with expected result.
const (
dataDir = "../../testdata/"
inputName = "data_input.jsonl"
@@ -40,129 +19,28 @@ const (
expectedName = "data_expected.yaml"
)
-type testCase struct {
- name string
- inputPath string
- expectedPath string
-}
-
-func getTestCase(dataFolder string) testCase {
- return testCase{
- name: filepath.Base(dataFolder),
- inputPath: filepath.Join(dataFolder, inputName),
- expectedPath: filepath.Join(dataFolder, expectedName),
- }
-}
-
-// PIPELINE TESTS
-
-// Note : numeric value should be converted to float64.
-func TestManualPipeline(t *testing.T) {
- t.Parallel()
-
- // Set up TestReader
- baseName := "databaseName"
- tableNames := []string{"tableTest"}
- testInput := []colInput{
- {
- ColName: "string",
- ColValues: []interface{}{"val1", "val2", "val3"},
- },
- {
- ColName: "col2",
- ColValues: []interface{}{true, false, nil},
- },
- {
- ColName: "col9",
- ColValues: []interface{}{float64(31), float64(29), float64(42)},
- },
- {
- ColName: "empty",
- ColValues: []interface{}{nil, nil, nil},
- },
- }
-
- testReader := TestReader{ //nolint:exhaustruct
- baseName: baseName,
- tableNames: tableNames,
- data: testInput,
- index: 0,
- }
-
- testWriter := TestWriter{} //nolint:exhaustruct
-
- err := rimo.AnalyseBase(&testReader, &testWriter)
- if err != nil {
- t.Errorf("Error: %v", err)
- }
-
- t.Logf("Base returned : %s", valast.String(*testWriter.Base()))
-}
-
-// Ensure that the pipeline produce the same base as expected.
-func TestPipeline(t *testing.T) {
- t.Parallel()
-
- testCases := []testCase{}
- testCases = append(testCases, getTestCase("../../testdata/data1/"))
- // testCases = append(testCases, getTestCase("../../testdata/data2/"))
-
- for _, testCase := range testCases {
- testCase := testCase // capture range variable
- t.Run(testCase.name, func(t *testing.T) {
- t.Parallel()
-
- // Actual base
-
- reader, err := infra.FilesReaderFactory([]string{testCase.inputPath})
- assert.NoError(t, err)
-
- writer := &TestWriter{} //nolint:exhaustruct
-
- err = rimo.AnalyseBase(reader, writer)
- assert.NoError(t, err)
-
- actualBase := writer.Base()
-
- // Expected base
- expectedBase, err := model.LoadBase(testCase.expectedPath)
- if err != nil {
- t.Errorf("Error: %v", err)
- }
-
- // Remove sample
- model.RemoveSampleFromBase(expectedBase)
- model.RemoveSampleFromBase(actualBase)
-
- fmt.Printf("Actual base : %s\n", valast.String(*actualBase))
- // Compare
- equal, diff := model.SameBase(expectedBase, actualBase)
- if !equal {
- t.Errorf("Base are not equal:\n%s", diff)
- }
- })
- }
-}
-
// Benchmark (same as previous analyse_test.go benchmark).
func BenchmarkAnalyseInterface(b *testing.B) {
- for _, numLines := range []int{100, 1000, 10000, 100000} {
- inputPath := filepath.Join(dataDir, fmt.Sprintf("benchmark/mixed/%d_input.jsonl", numLines))
- inputList := []string{inputPath}
+ zerolog.SetGlobalLevel(zerolog.WarnLevel)
+
+ for _, numLines := range []int{100, 1000, 10000} {
+ inputPath := filepath.Join(dataDir, fmt.Sprintf("benchmark/mixed/%d", numLines))
outputPath := filepath.Join(dataDir, fmt.Sprintf("benchmark/mixed/%dinterface_output.yaml", numLines))
b.Run(fmt.Sprintf("numLines=%d", numLines), func(b *testing.B) {
startTime := time.Now()
- reader, err := infra.FilesReaderFactory(inputList)
+ reader, err := infra.NewJSONLFolderReader(inputPath)
require.NoError(b, err)
writer, err := infra.YAMLWriterFactory(outputPath)
require.NoError(b, err)
+ driver := rimo.Driver{SampleSize: 5, Distinct: true}
+
b.ResetTimer()
for n := 0; n < b.N; n++ {
- err := rimo.AnalyseBase(reader, writer)
+ err := driver.AnalyseBase(reader, writer)
require.NoError(b, err)
}
b.StopTimer()
diff --git a/pkg/rimo/error.go b/pkg/rimo/error.go
new file mode 100644
index 0000000..083daa5
--- /dev/null
+++ b/pkg/rimo/error.go
@@ -0,0 +1,7 @@
+package rimo
+
+import "errors"
+
+// ErrInvalidValueType is the sentinel wrapped into errors that report a value
+// whose dynamic type is not supported (see GetFloat64).
+var ErrInvalidValueType = errors.New("invalid value type")
diff --git a/schema/v1/rimo.schema.json b/schema/v1/rimo.schema.json
index be205df..187194b 100644
--- a/schema/v1/rimo.schema.json
+++ b/schema/v1/rimo.schema.json
@@ -22,7 +22,7 @@
"tables"
]
},
- "BoolMetric": {
+ "Bool": {
"properties": {
"trueRatio": {
"type": "number"
@@ -55,16 +55,16 @@
"type": "boolean"
},
"mainMetric": {
- "$ref": "#/$defs/GenericMetric"
+ "$ref": "#/$defs/Generic"
},
"stringMetric": {
- "$ref": "#/$defs/StringMetric"
+ "$ref": "#/$defs/String"
},
"numericMetric": {
- "$ref": "#/$defs/NumericMetric"
+ "$ref": "#/$defs/Numeric"
},
"boolMetric": {
- "$ref": "#/$defs/BoolMetric"
+ "$ref": "#/$defs/Bool"
}
},
"additionalProperties": false,
@@ -78,7 +78,7 @@
"mainMetric"
]
},
- "GenericMetric": {
+ "Generic": {
"properties": {
"count": {
"type": "integer"
@@ -86,10 +86,15 @@
"empty": {
"type": "integer"
},
- "unique": {
+ "nulls": {
"type": "integer"
},
- "sample": {
+ "distinct": {
+ "type": "integer"
+ },
+ "min": true,
+ "max": true,
+ "samples": {
"items": true,
"type": "array"
}
@@ -99,41 +104,12 @@
"required": [
"count",
"empty",
- "unique",
- "sample"
+ "nulls",
+ "samples"
]
},
- "LenFreq": {
+ "Numeric": {
"properties": {
- "length": {
- "type": "integer"
- },
- "freq": {
- "type": "number"
- },
- "sample": {
- "items": {
- "type": "string"
- },
- "type": "array"
- }
- },
- "additionalProperties": false,
- "type": "object",
- "required": [
- "length",
- "freq",
- "sample"
- ]
- },
- "NumericMetric": {
- "properties": {
- "min": {
- "type": "number"
- },
- "max": {
- "type": "number"
- },
"mean": {
"type": "number"
}
@@ -141,12 +117,10 @@
"additionalProperties": false,
"type": "object",
"required": [
- "min",
- "max",
"mean"
]
},
- "StringMetric": {
+ "String": {
"properties": {
"minLen": {
"type": "integer"
@@ -154,15 +128,12 @@
"maxLen": {
"type": "integer"
},
- "mostFrequentLen": {
- "items": {
- "$ref": "#/$defs/LenFreq"
- },
- "type": "array"
+ "countLen": {
+ "type": "integer"
},
- "leastFrequentLen": {
+ "lengths": {
"items": {
- "$ref": "#/$defs/LenFreq"
+ "$ref": "#/$defs/StringLen"
},
"type": "array"
}
@@ -171,9 +142,27 @@
"type": "object",
"required": [
"minLen",
- "maxLen",
- "mostFrequentLen",
- "leastFrequentLen"
+ "maxLen"
+ ]
+ },
+ "StringLen": {
+ "properties": {
+ "length": {
+ "type": "integer"
+ },
+ "freq": {
+ "type": "number"
+ },
+ "metrics": {
+ "$ref": "#/$defs/Generic"
+ }
+ },
+ "additionalProperties": false,
+ "type": "object",
+ "required": [
+ "length",
+ "freq",
+ "metrics"
]
},
"Table": {
diff --git a/test/suites/cli/metrics.yml b/test/suites/cli/metrics.yml
new file mode 100644
index 0000000..108eeac
--- /dev/null
+++ b/test/suites/cli/metrics.yml
@@ -0,0 +1,244 @@
+name: test metrics
+testcases:
+ - name: main metric
+ steps:
+ - script: rimo analyse -d ../testdata/main ../testdata/main/output
+ assertions:
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].name' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "data"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[0].name' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "bool"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[0].type' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "bool"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[0].mainMetric.count' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "10"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[0].mainMetric.empty' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "4"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[0].mainMetric.nulls' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "1"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[0].mainMetric.distinct' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "2"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[0].mainMetric.samples|length' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "5"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[0].boolMetric.trueRatio' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "0.5555555555555556"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[1].name' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "numeric"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[1].type' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "numeric"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[1].mainMetric.count' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "10"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[1].mainMetric.empty' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "3"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[1].mainMetric.nulls' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "1"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[1].mainMetric.distinct' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "7"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[1].mainMetric.min' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "-235"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[1].mainMetric.max' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "100"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[1].mainMetric.samples|length' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "5"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[1].numericMetric.mean' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "-13.539833097777777"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].name' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "string"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].type' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "string"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].mainMetric.count' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "10"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].mainMetric.empty' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "1"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].mainMetric.nulls' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "1"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].mainMetric.distinct' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "9"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].mainMetric.min' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual ""
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].mainMetric.max' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "教育漢字"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].mainMetric.samples|length' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "5"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].stringMetric.minLen' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "0"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].stringMetric.maxLen' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "12"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].stringMetric.countLen' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "3"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].stringMetric.lengths|length' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "3"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].stringMetric.lengths[0].length' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "12"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].stringMetric.lengths[0].freq' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "0.5"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].stringMetric.lengths[0].metrics.count' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "5"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].stringMetric.lengths[0].metrics.empty' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "0"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].stringMetric.lengths[0].metrics.nulls' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "0"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].stringMetric.lengths[0].metrics.distinct' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "5"
+ - result.code ShouldEqual 0
+ - script: yq -o json '.tables[0].columns[2].stringMetric.lengths[0].metrics.min' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual '"hello world "'
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].stringMetric.lengths[0].metrics.max' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "教育漢字"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].stringMetric.lengths[0].metrics.samples|length' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "5"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].stringMetric.lengths[1].length' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "1"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].stringMetric.lengths[1].freq' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "0.3"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].stringMetric.lengths[1].metrics.count' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "3"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].stringMetric.lengths[1].metrics.empty' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "0"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].stringMetric.lengths[1].metrics.nulls' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "0"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].stringMetric.lengths[1].metrics.distinct' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "3"
+ - result.code ShouldEqual 0
+ - script: yq -o json '.tables[0].columns[2].stringMetric.lengths[1].metrics.min' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual '" "'
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].stringMetric.lengths[1].metrics.max' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "_"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].stringMetric.lengths[1].metrics.samples|length' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "3"
+ - result.code ShouldEqual 0
+
+ - script: yq '.tables[0].columns[2].stringMetric.lengths[2].length' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "0"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].stringMetric.lengths[2].freq' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "0.1"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].stringMetric.lengths[2].metrics.count' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "1"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].stringMetric.lengths[2].metrics.empty' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "1"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].stringMetric.lengths[2].metrics.nulls' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "0"
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].stringMetric.lengths[2].metrics.distinct' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "1"
+ - result.code ShouldEqual 0
+ - script: yq -o json '.tables[0].columns[2].stringMetric.lengths[2].metrics.min' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual '""'
+ - result.code ShouldEqual 0
+ - script: yq -o json '.tables[0].columns[2].stringMetric.lengths[2].metrics.max' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual '""'
+ - result.code ShouldEqual 0
+ - script: yq '.tables[0].columns[2].stringMetric.lengths[2].metrics.samples|length' ../testdata/main/output/main.yaml
+ assertions:
+ - result.systemout ShouldEqual "1"
+ - result.code ShouldEqual 0
diff --git a/test/suites/testdata/main/data.jsonl b/test/suites/testdata/main/data.jsonl
new file mode 100644
index 0000000..260040e
--- /dev/null
+++ b/test/suites/testdata/main/data.jsonl
@@ -0,0 +1,10 @@
+{"string":"","bool":true,"numeric":0}
+{"string":" ","bool":false,"numeric":1}
+{"string":"_","bool":true,"numeric":3.1415}
+{"string":"new\nline ","bool":false,"numeric":1.0E+2}
+{"string":"hello world ","bool":true,"numeric":21.2e-7}
+{"string":"tabs\t ","bool":false,"numeric":-235}
+{"string":"教育漢字","bool":true,"numeric":-0}
+{"string":"\\","bool":false,"numeric":0.0}
+{"string":"\u20ac ","bool":true,"numeric":9.0}
+{"string":null,"bool":null,"numeric":null}
diff --git a/test/suites/testdata/main/output/.gitkeep b/test/suites/testdata/main/output/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/testdata/benchmark/buildBenchData.sh b/testdata/benchmark/buildBenchData.sh
index fa046ce..60b2138 100755
--- a/testdata/benchmark/buildBenchData.sh
+++ b/testdata/benchmark/buildBenchData.sh
@@ -3,53 +3,57 @@
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "${SCRIPT_DIR}/mixed/"
-if [ ! -f 100_input.jsonl ]; then
- pimo --empty-input --repeat=100 > 100_input.jsonl
+mkdir -p 100 1000 10000 100000 # -p: do not fail on reruns when the directories already exist
+if [ ! -f 100/input.jsonl ]; then
+ pimo --empty-input --repeat=100 > 100/input.jsonl
fi
-if [ ! -f 1000_input.jsonl ]; then
- pimo --empty-input --repeat=1000 > 1000_input.jsonl
+if [ ! -f 1000/input.jsonl ]; then
+ pimo --empty-input --repeat=1000 > 1000/input.jsonl
fi
-if [ ! -f 10000_input.jsonl ]; then
- pimo --empty-input --repeat=10000 > 10000_input.jsonl
+if [ ! -f 10000/input.jsonl ]; then
+ pimo --empty-input --repeat=10000 > 10000/input.jsonl
fi
-if [ ! -f 100000_input.jsonl ]; then
- pimo --empty-input --repeat=100000 > 100000_input.jsonl
+if [ ! -f 100000/input.jsonl ]; then
+ pimo --empty-input --repeat=100000 > 100000/input.jsonl
fi
echo "data for mixed : OK"
cd "${SCRIPT_DIR}/bool/"
-if [ ! -f 100_input.jsonl ]; then
- pimo --empty-input --repeat=100 > 100_input.jsonl
+mkdir -p 100 1000 10000 100000 # -p: do not fail on reruns when the directories already exist
+if [ ! -f 100/input.jsonl ]; then
+ pimo --empty-input --repeat=100 > 100/input.jsonl
fi
-if [ ! -f 1000_input.jsonl ]; then
- pimo --empty-input --repeat=1000 > 1000_input.jsonl
+if [ ! -f 1000/input.jsonl ]; then
+ pimo --empty-input --repeat=1000 > 1000/input.jsonl
fi
-if [ ! -f 10000_input.jsonl ]; then
- pimo --empty-input --repeat=10000 > 10000_input.jsonl
+if [ ! -f 10000/input.jsonl ]; then
+ pimo --empty-input --repeat=10000 > 10000/input.jsonl
fi
echo "data for mixed : OK"
cd "${SCRIPT_DIR}/numeric/"
-if [ ! -f 100_input.jsonl ]; then
- pimo --empty-input --repeat=100 > 100_input.jsonl
+mkdir -p 100 1000 10000 100000 # -p: do not fail on reruns when the directories already exist
+if [ ! -f 100/input.jsonl ]; then
+ pimo --empty-input --repeat=100 > 100/input.jsonl
fi
-if [ ! -f 1000_input.jsonl ]; then
- pimo --empty-input --repeat=1000 > 1000_input.jsonl
+if [ ! -f 1000/input.jsonl ]; then
+ pimo --empty-input --repeat=1000 > 1000/input.jsonl
fi
-if [ ! -f 10000_input.jsonl ]; then
- pimo --empty-input --repeat=10000 > 10000_input.jsonl
+if [ ! -f 10000/input.jsonl ]; then
+ pimo --empty-input --repeat=10000 > 10000/input.jsonl
fi
echo "data for numeric : OK"
cd "${SCRIPT_DIR}/text/"
-if [ ! -f 100_input.jsonl ]; then
- pimo --empty-input --repeat=100 > 100_input.jsonl
+mkdir -p 100 1000 10000 100000 # -p: do not fail on reruns when the directories already exist
+if [ ! -f 100/input.jsonl ]; then
+ pimo --empty-input --repeat=100 > 100/input.jsonl
fi
-if [ ! -f 1000_input.jsonl ]; then
- pimo --empty-input --repeat=1000 > 1000_input.jsonl
+if [ ! -f 1000/input.jsonl ]; then
+ pimo --empty-input --repeat=1000 > 1000/input.jsonl
fi
-if [ ! -f 10000_input.jsonl ]; then
- pimo --empty-input --repeat=10000 > 10000_input.jsonl
+if [ ! -f 10000/input.jsonl ]; then
+ pimo --empty-input --repeat=10000 > 10000/input.jsonl
fi
echo "data generated for text : OK"