From 58c3e27806bef7d33dc04c5c16507a11a8c3e14a Mon Sep 17 00:00:00 2001 From: Adrien Aury <44274230+adrienaury@users.noreply.github.com> Date: Fri, 8 Dec 2023 16:33:54 +0100 Subject: [PATCH] refactor: stateless (#33) * refactor: wip! counter and sampler * refactor: wip! metrics string * refactor: wip! string len freq * refactor: wip! string sort by freq * refactor: wip! modelv2 * refactor: build column * refactor: driver * refactor: infra file reader * refactor: fix driver * refactor: reader v2 * refactor: reader v2 * refactor: lint + sample-size flag * refactor: numeric metric * refactor: stable output * refactor: bool metric * refactor: cleanup * refactor: update schema * refactor: count distinct * refactor: bool samples * refactor: put benchmark back * refactor: disable 100000 lines bench * refactor: add logs * refactor: bench set global level warn * refactor: test int * chore: add yq in ci * docs: update readme --- .devcontainer/Dockerfile | 4 +- .devcontainer/Dockerfile.ci | 4 +- CHANGELOG.md | 7 + build.yml | 6 + cmd/rimo/main.go | 128 +++++------ go.mod | 14 +- go.sum | 22 +- internal/infra/fileWriter_test.go | 84 ------- internal/infra/filesReader.go | 263 +++++++++++----------- internal/infra/filesReader_test.go | 83 ------- internal/infra/infra_test.go | 93 -------- internal/infra/loader_test.go | 39 ---- pkg/metric/analyser.go | 26 +++ pkg/metric/bool.go | 21 ++ pkg/metric/build.go | 118 ---------- pkg/metric/counter.go | 38 ++++ pkg/metric/distinct.go | 25 ++ pkg/metric/generic.go | 124 ---------- pkg/metric/generic_test.go | 128 ----------- pkg/metric/mean.go | 31 +++ pkg/metric/metric_test.go | 82 +++++++ pkg/metric/metricbool.go | 51 ----- pkg/metric/metricbool_test.go | 42 ---- pkg/metric/metricnumeric.go | 69 ------ pkg/metric/metricnumeric_test.go | 46 ---- pkg/metric/metricstring.go | 140 ------------ pkg/metric/metricstring_test.go | 60 ----- pkg/metric/minmax.go | 41 ++++ pkg/metric/numeric.go | 22 ++ pkg/metric/sampler.go | 45 ++++ pkg/metric/string.go | 96 ++++++++ pkg/metric/trueratio.go | 33 +++ pkg/metric/types.go | 7 + pkg/model/base.go | 56 ++--- pkg/model/column.go | 49 +--- pkg/model/config.go | 7 + pkg/model/metric.go | 66 ------ pkg/model/metrics.go | 32 +++ pkg/model/schema.go | 17 ++ pkg/model/utils.go | 123 ---------- pkg/model/utils_test.go | 66 ------ pkg/rimo/driven.go | 15 +- pkg/rimo/driven_test.go | 108 --------- pkg/rimo/driver.go | 263 ++++++++++++++++++++-- pkg/rimo/driver_test.go | 140 +----------- pkg/rimo/error.go | 5 + schema/v1/rimo.schema.json | 95 ++++---- test/suites/cli/metrics.yml | 244 ++++++++++++++++++++ test/suites/testdata/main/data.jsonl | 10 + test/suites/testdata/main/output/.gitkeep | 0 testdata/benchmark/buildBenchData.sh | 56 ++--- 51 files changed, 1374 insertions(+), 1970 deletions(-) delete mode 100644 internal/infra/fileWriter_test.go delete mode 100644 internal/infra/filesReader_test.go delete mode 100644 internal/infra/infra_test.go delete mode 100644 internal/infra/loader_test.go create mode 100644 pkg/metric/analyser.go create mode 100644 pkg/metric/bool.go delete mode 100644 pkg/metric/build.go create mode 100644 pkg/metric/counter.go create mode 100644 pkg/metric/distinct.go delete mode 100644 pkg/metric/generic.go delete mode 100644 pkg/metric/generic_test.go create mode 100644 pkg/metric/mean.go create mode 100644 pkg/metric/metric_test.go delete mode 100644 pkg/metric/metricbool.go delete mode 100644 pkg/metric/metricbool_test.go delete mode 100644 pkg/metric/metricnumeric.go delete mode 100644 pkg/metric/metricnumeric_test.go delete mode 100644 pkg/metric/metricstring.go delete mode 100644 pkg/metric/metricstring_test.go create mode 100644 pkg/metric/minmax.go create mode 100644 pkg/metric/numeric.go create mode 100644 pkg/metric/sampler.go create mode 100644 pkg/metric/string.go create mode 100644 pkg/metric/trueratio.go create mode 100644 pkg/metric/types.go create mode 100644 pkg/model/config.go delete mode 100644 pkg/model/metric.go create mode 100644 pkg/model/metrics.go create mode 100644 pkg/model/schema.go delete mode 100644 pkg/model/utils.go delete mode 100644 pkg/model/utils_test.go delete mode 100644 pkg/rimo/driven_test.go create mode 100644 pkg/rimo/error.go create mode 100644 test/suites/cli/metrics.yml create mode 100644 test/suites/testdata/main/data.jsonl create mode 100644 test/suites/testdata/main/output/.gitkeep diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index ba5566f..24f3fe4 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -10,9 +10,11 @@ RUN apk add --update --progress --no-cache make gomplate ARG VERSION_GOLICENSE=0.2.0 ARG VERSION_MILLER=6.2.0 +ARG VERSION_YQ=4.40.4 RUN wget -nv -O- https://github.com/mitchellh/golicense/releases/download/v${VERSION_GOLICENSE}/golicense_${VERSION_GOLICENSE}_linux_x86_64.tar.gz | tar xz -C /usr/bin golicense \ && wget -nv -O- https://github.com/johnkerl/miller/releases/download/v${VERSION_MILLER}/miller-${VERSION_MILLER}-linux-amd64.tar.gz | tar xz --strip-components 1 -C /usr/bin miller-${VERSION_MILLER}-linux-amd64/mlr \ - && chmod +x /usr/bin/golicense /usr/bin/mlr + && wget -nv -O /usr/bin/yq https://github.com/mikefarah/yq/releases/download/v${VERSION_YQ}/yq_linux_amd64 \ + && chmod +x /usr/bin/golicense /usr/bin/mlr /usr/bin/yq COPY --from=pimo /usr/bin/pimo /usr/bin/pimo diff --git a/.devcontainer/Dockerfile.ci b/.devcontainer/Dockerfile.ci index 0e547cd..f679260 100644 --- a/.devcontainer/Dockerfile.ci +++ b/.devcontainer/Dockerfile.ci @@ -10,8 +10,10 @@ RUN apk add --update --progress --no-cache make gomplate ARG VERSION_GOLICENSE=0.2.0 ARG VERSION_MILLER=6.2.0 +ARG VERSION_YQ=4.40.4 RUN wget -nv -O- https://github.com/mitchellh/golicense/releases/download/v${VERSION_GOLICENSE}/golicense_${VERSION_GOLICENSE}_linux_x86_64.tar.gz | tar xz -C /usr/bin golicense \ && wget -nv -O- https://github.com/johnkerl/miller/releases/download/v${VERSION_MILLER}/miller-${VERSION_MILLER}-linux-amd64.tar.gz | tar xz --strip-components 1 -C /usr/bin miller-${VERSION_MILLER}-linux-amd64/mlr \ - && chmod +x /usr/bin/golicense /usr/bin/mlr + && wget -nv -O /usr/bin/yq https://github.com/mikefarah/yq/releases/download/v${VERSION_YQ}/yq_linux_amd64 \ + && chmod +x /usr/bin/golicense /usr/bin/mlr /usr/bin/yq COPY --from=pimo /usr/bin/pimo /usr/bin/pimo diff --git a/CHANGELOG.md b/CHANGELOG.md index 06380a3..792ec12 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,13 @@ Types of changes - `Fixed` for any bug fixes. - `Security` in case of vulnerabilities. +## [0.3.0] + +- `Added` moved `min` and `max` to the main metric. +- `Added` `countNulls` to the main metric. +- `Added` all main metrics to the lengths section in string metrics. +- `Removed` `leastFrequentLen` and `mostFrequentLen` all lengths are listed with the most frequent length in first position + ## [0.2.0] - `Added` new string metrics `minLen` and `maxLen` diff --git a/build.yml b/build.yml index 394b39f..af94db6 100644 --- a/build.yml +++ b/build.yml @@ -255,6 +255,12 @@ targets: - ldflags = ldflags + " -s -w" # Omit the DWARF symbol table. Omit the symbol table and debug information. - call: compile + test-int-debug: + doc: "Run all integration tests" + depends: ["info"] + steps: + - $: venom run test/suites/* + test-int: doc: "Run all integration tests" depends: ["info", "refresh", "lint", "test", "benchmark", "release"] diff --git a/cmd/rimo/main.go b/cmd/rimo/main.go index e75290a..57bdaac 100644 --- a/cmd/rimo/main.go +++ b/cmd/rimo/main.go @@ -21,27 +21,39 @@ import ( "fmt" "os" "path/filepath" + "runtime" + "strings" "github.com/cgi-fr/rimo/internal/infra" "github.com/cgi-fr/rimo/pkg/model" "github.com/cgi-fr/rimo/pkg/rimo" + "github.com/mattn/go-isatty" "github.com/rs/zerolog" "github.com/rs/zerolog/log" "github.com/spf13/cobra" ) -// Provisioned by ldflags. +const DefaultSampleSize = uint(5) + +//nolint:gochecknoglobals var ( - name string //nolint: gochecknoglobals - version string //nolint: gochecknoglobals - commit string //nolint: gochecknoglobals - buildDate string //nolint: gochecknoglobals - builtBy string //nolint: gochecknoglobals + name string // provisioned by ldflags + version string // provisioned by ldflags + commit string // provisioned by ldflags + buildDate string // provisioned by ldflags + builtBy string // provisioned by ldflags + + verbosity string + jsonlog bool + debug bool + colormode string + + sampleSize uint + distinct bool //nolint: gochecknoglobals ) func main() { //nolint:funlen - log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr}) //nolint: exhaustruct - + cobra.OnInitialize(initLog) log.Info().Msgf("%v %v (commit=%v date=%v by=%v)", name, version, commit, buildDate, builtBy) rootCmd := &cobra.Command{ //nolint:exhaustruct @@ -54,6 +66,12 @@ func main() { //nolint:funlen There is NO WARRANTY, to the extent permitted by law.`, version, commit, buildDate, builtBy), } + rootCmd.PersistentFlags().StringVarP(&verbosity, "verbosity", "v", "warn", + "set level of log verbosity : none (0), error (1), warn (2), info (3), debug (4), trace (5)") + rootCmd.PersistentFlags().BoolVar(&debug, "debug", false, "add debug information to logs (very slow)") + rootCmd.PersistentFlags().BoolVar(&jsonlog, "log-json", false, "output logs in JSON format") + rootCmd.PersistentFlags().StringVar(&colormode, "color", "auto", "use colors in log outputs : yes, no or auto") + rimoSchemaCmd := &cobra.Command{ //nolint:exhaustruct Use: "jsonschema", Short: "Return rimo jsonschema", @@ -77,32 +95,21 @@ func main() { //nolint:funlen outputDir := args[1] // Reader - - inputList, err := BuildFilepathList(inputDir, ".jsonl") - if err != nil { - log.Fatal().Msgf("error listing files: %v", err) - } - - reader, err := infra.FilesReaderFactory(inputList) + reader, err := infra.NewJSONLFolderReader(inputDir) if err != nil { log.Fatal().Msgf("error creating reader: %v", err) } - // Writer - // (could be relocated to infra.FilesReader) - baseName, _, err := infra.ExtractName(inputList[0]) - if err != nil { - log.Fatal().Msgf("error extracting base name: %v", err) - } - - outputPath := filepath.Join(outputDir, fmt.Sprintf("%s.yaml", baseName)) + outputPath := filepath.Join(outputDir, fmt.Sprintf("%s.yaml", reader.BaseName())) writer, err := infra.YAMLWriterFactory(outputPath) if err != nil { log.Fatal().Msgf("error creating writer: %v", err) } - err = rimo.AnalyseBase(reader, writer) + driver := rimo.Driver{SampleSize: sampleSize, Distinct: distinct} + + err = driver.AnalyseBase(reader, writer) if err != nil { log.Fatal().Msgf("error generating rimo.yaml: %v", err) } @@ -111,6 +118,9 @@ func main() { //nolint:funlen }, } + rimoAnalyseCmd.Flags().UintVar(&sampleSize, "sample-size", DefaultSampleSize, "number of sample value to collect") + rimoAnalyseCmd.Flags().BoolVarP(&distinct, "distinct", "d", false, "count distinct values") + rootCmd.AddCommand(rimoAnalyseCmd) rootCmd.AddCommand(rimoSchemaCmd) @@ -120,54 +130,44 @@ func main() { //nolint:funlen } } -func FilesList(path string, extension string) ([]string, error) { - pattern := filepath.Join(path, "*"+extension) +func initLog() { + color := false - files, err := filepath.Glob(pattern) - if err != nil { - return nil, fmt.Errorf("error listing files: %w", err) + switch strings.ToLower(colormode) { + case "auto": + if isatty.IsTerminal(os.Stdout.Fd()) && runtime.GOOS != "windows" { + color = true + } + case "yes", "true", "1", "on", "enable": + color = true } - return files, nil -} - -var ErrNoFile = fmt.Errorf("no file found") - -func BuildFilepathList(path string, extension string) ([]string, error) { - err := ValidateDirPath(path) - if err != nil { - return nil, fmt.Errorf("failed to validate input directory: %w", err) - } - - pattern := filepath.Join(path, "*"+extension) - - files, err := filepath.Glob(pattern) - if err != nil { - return nil, fmt.Errorf("error listing files: %w", err) + if jsonlog { + log.Logger = zerolog.New(os.Stderr) + } else { + log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr, NoColor: !color}) //nolint:exhaustruct } - if len(files) == 0 { - return nil, fmt.Errorf("%w : no %s files found in %s", ErrNoFile, extension, path) + if debug { + log.Logger = log.Logger.With().Caller().Logger() } - return files, nil + setVerbosity() } -func ValidateDirPath(path string) error { - fileInfo, err := os.Stat(path) - if os.IsNotExist(err) { - return fmt.Errorf("%w: %s", infra.ErrDirDoesNotExist, path) - } else if err != nil { - return fmt.Errorf("failed to get directory info: %w", err) +func setVerbosity() { + switch verbosity { + case "trace", "5": + zerolog.SetGlobalLevel(zerolog.TraceLevel) + case "debug", "4": + zerolog.SetGlobalLevel(zerolog.DebugLevel) + case "info", "3": + zerolog.SetGlobalLevel(zerolog.InfoLevel) + case "warn", "2": + zerolog.SetGlobalLevel(zerolog.WarnLevel) + case "error", "1": + zerolog.SetGlobalLevel(zerolog.ErrorLevel) + default: + zerolog.SetGlobalLevel(zerolog.Disabled) } - - if !fileInfo.IsDir() { - return fmt.Errorf("%w: %s", infra.ErrPathIsNotDir, path) - } - - if fileInfo.Mode().Perm()&infra.WriteDirPerm != infra.WriteDirPerm { - return fmt.Errorf("%w: %s", infra.ErrWriteDirPermission, path) - } - - return nil } diff --git a/go.mod b/go.mod index cddbdd1..a472ce0 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module github.com/cgi-fr/rimo go 1.20 require ( - github.com/hexops/valast v1.4.4 + github.com/goccy/go-json v0.10.2 github.com/rs/zerolog v1.30.0 github.com/spf13/cobra v1.7.0 github.com/stretchr/testify v1.8.4 @@ -11,20 +11,20 @@ require ( gopkg.in/yaml.v3 v3.0.1 ) -require gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect +require ( + github.com/kr/pretty v0.3.1 // indirect + github.com/rogpeppe/go-internal v1.10.0 // indirect + gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect +) require ( github.com/davecgh/go-spew v1.1.1 // indirect - github.com/google/go-cmp v0.5.9 // indirect github.com/iancoleman/orderedmap v0.3.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/invopop/jsonschema v0.7.0 // direct github.com/mattn/go-colorable v0.1.13 // indirect - github.com/mattn/go-isatty v0.0.19 // indirect + github.com/mattn/go-isatty v0.0.19 github.com/pmezard/go-difflib v1.0.0 // indirect github.com/spf13/pflag v1.0.5 // indirect - golang.org/x/mod v0.13.0 // indirect golang.org/x/sys v0.13.0 // indirect - golang.org/x/tools v0.14.0 // indirect - mvdan.cc/gofumpt v0.5.0 // indirect ) diff --git a/go.sum b/go.sum index 4893f5a..8577e51 100644 --- a/go.sum +++ b/go.sum @@ -1,16 +1,12 @@ github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/frankban/quicktest v1.14.4 h1:g2rn0vABPOOXmZUj+vbmUp0lPoXEMuhTpIluN0XL9UY= +github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU= +github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= -github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= -github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= -github.com/hexops/autogold v0.8.1 h1:wvyd/bAJ+Dy+DcE09BoLk6r4Fa5R5W+O+GUzmR985WM= -github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM= -github.com/hexops/valast v1.4.4 h1:rETyycw+/L2ZVJHHNxEBgh8KUn+87WugH9MxcEv9PGs= -github.com/hexops/valast v1.4.4/go.mod h1:Jcy1pNH7LNraVaAZDLyv21hHg2WBv9Nf9FL6fGxU7o4= github.com/iancoleman/orderedmap v0.0.0-20190318233801-ac98e3ecb4b0/go.mod h1:N0Wam8K1arqPXNWjMo21EXnBPOPp36vB07FNRdD2geA= github.com/iancoleman/orderedmap v0.3.0 h1:5cbR2grmZR/DiVt+VJopEhtVs9YGInGIxAoMJn+Ichc= github.com/iancoleman/orderedmap v0.3.0/go.mod h1:XuLcCUkdL5owUCQeF2Ue9uuw1EptkJDkXXS7VoV7XGE= @@ -19,7 +15,9 @@ github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLf github.com/invopop/jsonschema v0.7.0 h1:2vgQcBz1n256N+FpX3Jq7Y17AjYt46Ig3zIWyy770So= github.com/invopop/jsonschema v0.7.0/go.mod h1:O9uiLokuu0+MGFlyiaqtWxwqJm41/+8Nj0lD7A36YH0= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/mattn/go-colorable v0.1.12/go.mod h1:u5H1YNBxpqRaxsYJYSkiCWKzEfiAb1Gb520KVy5xxl4= github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= @@ -27,10 +25,13 @@ github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27k github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA= github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= +github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg= github.com/rs/zerolog v1.30.0 h1:SymVODrcRsaRaSInD9yQtKbtWqwsfoPcRff/oRXLj4c= github.com/rs/zerolog v1.30.0/go.mod h1:/tk+P47gFdPXq4QYjvCmT5/Gsug2nagsFWBWhAiSi1w= @@ -45,21 +46,14 @@ github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcU github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= golang.org/x/exp v0.0.0-20231006140011-7918f672742d h1:jtJma62tbqLibJ5sFQz8bKtEM8rJBtfilJ2qTU199MI= golang.org/x/exp v0.0.0-20231006140011-7918f672742d/go.mod h1:ldy0pHrwJyGW56pPQzzkH36rKxoZW1tw7ZJpeKx+hdo= -golang.org/x/mod v0.13.0 h1:I/DsJXRlw/8l/0c24sM9yb0T4z9liZTduXvdAWYiysY= -golang.org/x/mod v0.13.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= -golang.org/x/sync v0.4.0 h1:zxkM55ReGkDlKSM+Fu41A+zmbZuaPVbGMzvvdUPznYQ= golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210927094055-39ccf1dd6fa6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/tools v0.14.0 h1:jvNa2pY0M4r62jkRQ6RwEZZyPcymeL9XZMLBbV7U2nc= -golang.org/x/tools v0.14.0/go.mod h1:uYBEerGOWcJyEORxN+Ek8+TT266gXkNlHdJBwexUsBg= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -mvdan.cc/gofumpt v0.5.0 h1:0EQ+Z56k8tXjj/6TQD25BFNKQXpCvT0rnansIc7Ug5E= -mvdan.cc/gofumpt v0.5.0/go.mod h1:HBeVDtMKRZpXyxFciAirzdKklDlGu8aAy1wEbH5Y9js= diff --git a/internal/infra/fileWriter_test.go b/internal/infra/fileWriter_test.go deleted file mode 100644 index cd34651..0000000 --- a/internal/infra/fileWriter_test.go +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package infra_test - -import ( - "os" - "path/filepath" - "testing" - - "github.com/cgi-fr/rimo/internal/infra" - "github.com/cgi-fr/rimo/pkg/model" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -const ( - dataDir = "../../testdata/" -) - -func TestWriterYAML(t *testing.T) { - t.Parallel() - - base := model.Base{ - Name: "databaseName", - Tables: []model.Table{ - { - Name: "tableName", - Columns: []model.Column{}, - }, - }, - } - - // Create a temporary directory for the test - tempDir, err := os.MkdirTemp(dataDir, "export_test") - require.NoError(t, err) - - defer os.RemoveAll(tempDir) - - // Create a temporary file for the output - outputFile := filepath.Join(tempDir, "output.yaml") - - // Create the writer - writer, err := infra.YAMLWriterFactory(outputFile) - require.NoError(t, err) - - err = writer.Export(&base) - require.NoError(t, err) - - // Read the output file and check its contents - file, err := os.Open(outputFile) - require.NoError(t, err) - - defer file.Close() - - stat, err := file.Stat() - require.NoError(t, err) - - outputData := make([]byte, stat.Size()) - _, err = file.Read(outputData) - require.NoError(t, err) - - expectedData := `database: databaseName -tables: - - name: tableName - columns: [] -` - - assert.Equal(t, expectedData, string(outputData)) -} diff --git a/internal/infra/filesReader.go b/internal/infra/filesReader.go index 8430ae9..9503edd 100644 --- a/internal/infra/filesReader.go +++ b/internal/infra/filesReader.go @@ -1,173 +1,180 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - package infra import ( "errors" "fmt" + "os" + "path" + "path/filepath" + "strings" + + "github.com/cgi-fr/rimo/pkg/rimo" + "github.com/goccy/go-json" + "github.com/rs/zerolog/log" ) -// Errors declaration. -var ( - ErrInvalidFilePath = errors.New("failed to validate path") - ErrNoFilePath = errors.New("no file path provided") - ErrNonUniqueBase = errors.New("base name is not unique") -) +var ErrReadFile = errors.New("error while reading file") + +type JSONLFolderReader struct { + basename string + readers []*JSONLFileReader + current int +} + +func NewJSONLFolderReader(folderpath string) (*JSONLFolderReader, error) { + log.Trace().Str("path", folderpath).Msg("reading folder") + + basename := path.Base(folderpath) + + pattern := filepath.Join(folderpath, "*.jsonl") -// FilesReader can read multiple type of file and feed data to rimo. -// FilesReader is responsible of : -// - BaseName() return the name of the base -// - Next() return true if there is a next value to read -// - Value() return the value of the current column, the name of the column and the name of the table -// Interface itself with a Loader interface. Which currently only supports YAML files. -// Loader and FilesReader can be initialized with LoaderFactory and FilesReaderFactory. -type FilesReader struct { - filepathList []string - loader JSONLinesLoader // responsible of loading a file format - baseName string - // variable for looping over columns - fileIndex int - colNameMapIndex map[int]string // map of column name by index - colIndex int // value of current column index - // given by Value() - dataMap map[string][]interface{} - tableName string // filled by FilesReader -} - -// Constructor for FilesReader. -func FilesReaderFactory(filepathList []string) (*FilesReader, error) { - var err error - - // Process inputDirList - if len(filepathList) == 0 { - return nil, ErrNoFilePath + files, err := filepath.Glob(pattern) + if err != nil { + return nil, fmt.Errorf("error listing files: %w", err) } - for _, path := range filepathList { - err := ValidateFilePath(path) + readers := make([]*JSONLFileReader, len(files)) + + for index, filepath := range files { + log.Trace().Str("path", filepath).Msg("scanning file") + + readers[index], err = NewJSONLFileReader(basename, filepath) if err != nil { - return nil, ErrInvalidFilePath + return nil, fmt.Errorf("error opening files: %w", err) } } - // Initialize FilesReader - var filesReader FilesReader - filesReader.filepathList = filepathList - filesReader.fileIndex = -1 + return &JSONLFolderReader{ + basename: basename, + readers: readers, + current: 0, + }, nil +} - filesReader.baseName, err = filesReader.isBaseUnique() - if err != nil { - return nil, fmt.Errorf("base is not unique: %w", err) - } +func (r *JSONLFolderReader) BaseName() string { + return r.basename +} + +func (r *JSONLFolderReader) Next() bool { + if r.current < len(r.readers) && !r.readers[r.current].Next() { + r.current++ - // Use of JSONLinesLoader - filesReader.loader = JSONLinesLoader{} + return r.Next() + } - return &filesReader, nil + return r.current < len(r.readers) } -// Reader interface implementation +func (r *JSONLFolderReader) Col() (rimo.ColReader, error) { //nolint:ireturn + return r.readers[r.current].Col() +} -func (r *FilesReader) BaseName() string { - return r.baseName +type JSONLFileReader struct { + tablename string + source *os.File + columns []string + current int + decoder *json.Decoder + basename string } -func (r *FilesReader) Next() bool { - // First call to Next() - if r.fileIndex == -1 { - r.fileIndex = 0 - r.colIndex = 0 +func NewJSONLFileReader(basename string, filepath string) (*JSONLFileReader, error) { + log.Trace().Str("path", filepath).Msg("opening file") - return true + source, err := os.Open(filepath) + if err != nil { + return nil, fmt.Errorf("%w", err) } - // Current file contain column left to process. - if r.colIndex < len(r.dataMap) { - r.colIndex++ + template := map[string]any{} + + log.Trace().Str("path", filepath).Msg("decoding line template") + + decoder := json.NewDecoder(source) + if err := decoder.Decode(&template); err != nil { + return nil, fmt.Errorf("%w: %w", ErrReadFile, err) } - // Current file contain no columns left to process. - if r.colIndex == len(r.dataMap) { - // Current file is last file. - if r.fileIndex == len(r.filepathList)-1 { - return false - } - // There is a next file. - r.fileIndex++ - r.colIndex = 0 + log.Trace().Str("path", filepath).Any("template", template).Msg("decoded line template") + + if _, err := source.Seek(0, 0); err != nil { + return nil, fmt.Errorf("%w: %w", ErrReadFile, err) } - return true -} + columns := make([]string, 0, len(template)) -// Charger les fichiers un à un dans une dataMap. -// Retourne les valeurs d'une colonne, son nom et le nom de table. -func (r *FilesReader) Value() ([]interface{}, string, string, error) { - var err error + for column := range template { + log.Trace().Str("path", filepath).Any("column", column).Msg("registering column") - // colIndex = 0 : new file to load - if r.colIndex == 0 { - filepath := r.filepathList[r.fileIndex] + columns = append(columns, column) + } - // Extract table name from file name - _, r.tableName, err = ExtractName(filepath) - if err != nil { - return nil, "", "", fmt.Errorf("failed to extract table name: %w", err) - } + return &JSONLFileReader{ + tablename: strings.TrimSuffix(path.Base(filepath), path.Ext(filepath)), + source: source, + columns: columns, + current: -1, + decoder: json.NewDecoder(source), + basename: basename, + }, nil +} - // Load file in dataMap - r.dataMap, err = r.loader.Load(r.filepathList[r.fileIndex]) - if err != nil { - panic(err) - } +func (fr *JSONLFileReader) BaseName() string { + return fr.basename +} - // Create a map of column name by index - r.colNameMapIndex = make(map[int]string, 0) - i := 0 +func (fr *JSONLFileReader) Next() bool { + fr.current++ - for k := range r.dataMap { - r.colNameMapIndex[i] = k - i++ - } + if _, err := fr.source.Seek(0, 0); err != nil { + panic(err) } - // colIndex = n : current file have been partially processed - currentColName := r.colNameMapIndex[r.colIndex] - // return values, colName, tableName - return r.dataMap[currentColName], currentColName, r.tableName, nil + fr.decoder = json.NewDecoder(fr.source) + + log.Trace().Str("base", fr.basename).Any("index", fr.current).Msg("successful jump to next column") + + return fr.current < len(fr.columns) } -func (r *FilesReader) isBaseUnique() (string, error) { - baseName, _, err := ExtractName(r.filepathList[0]) - if err != nil { - return "", err +func (fr *JSONLFileReader) Col() (rimo.ColReader, error) { //nolint:ireturn + return NewJSONLColReader(fr.tablename, fr.columns[fr.current], fr.decoder), nil +} + +type JSONLColReader struct { + table string + column string + decoder *json.Decoder +} + +func NewJSONLColReader(table, column string, decoder *json.Decoder) *JSONLColReader { + return &JSONLColReader{ + table: table, + column: column, + decoder: decoder, } +} - for _, path := range r.filepathList { - baseNameI, _, err := ExtractName(path) - if err != nil { - return "", err - } +func (cr *JSONLColReader) ColName() string { + return cr.column +} - if baseName != baseNameI { - return "", fmt.Errorf("%w : %s and %s", ErrNonUniqueBase, baseName, baseNameI) - } +func (cr *JSONLColReader) TableName() string { + return cr.table +} + +func (cr *JSONLColReader) Next() bool { + return cr.decoder.More() +} + +func (cr *JSONLColReader) Value() (any, error) { + row := map[string]any{} + + if err := cr.decoder.Decode(&row); err != nil { + return nil, fmt.Errorf("%w: %w", ErrReadFile, err) } - return baseName, nil + log.Trace().Str("table", cr.table).Str("column", cr.column).Any("value", row[cr.column]).Msg("read value") + + return row[cr.column], nil } diff --git a/internal/infra/filesReader_test.go b/internal/infra/filesReader_test.go deleted file mode 100644 index ed6cdc0..0000000 --- a/internal/infra/filesReader_test.go +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package infra_test - -import ( - "fmt" - "path/filepath" - "testing" - - "github.com/cgi-fr/rimo/internal/infra" - "github.com/stretchr/testify/assert" -) - -func TestReader(t *testing.T) { - t.Parallel() - - inputFile := filepath.Join(dataDir, "data0/data_input.jsonl") - - reader, err := infra.FilesReaderFactory([]string{inputFile}) - assert.NoError(t, err) - - // Assertions. - - actualBaseName := reader.BaseName() - expectedBaseName := "data" - assert.Equal(t, expectedBaseName, actualBaseName) - - expectedTableName := "input" - expectedDataMap := map[string][]interface{}{ - "address": {"PSC", "095", "06210"}, - "age": {nil, nil, float64(61)}, - "major": {true, false, true}, - "empty": {nil, nil, nil}, - } - - for reader.Next() { - values, colName, tableName, err := reader.Value() - if err != nil { - assert.NoError(t, err) - } - - expectedColData, ok := expectedDataMap[colName] - if !ok { - assert.Fail(t, "column name not found : %s", colName) - } - - assert.Equal(t, expectedColData, values) - assert.Equal(t, expectedTableName, tableName) - } -} - -func TestReaderMultipleFiles(t *testing.T) { - t.Parallel() - - inputFile := filepath.Join(dataDir, "data0/data_input.jsonl") - inputFile2 := filepath.Join(dataDir, "data0/data_input2.jsonl") - reader, err := infra.FilesReaderFactory([]string{inputFile, inputFile2}) - assert.NoError(t, err) - - for reader.Next() { - values, colName, tableName, err := reader.Value() - if err != nil { - assert.NoError(t, err) - } - - fmt.Printf("%s.%s: %v\n", tableName, colName, values) - } -} diff --git a/internal/infra/infra_test.go b/internal/infra/infra_test.go deleted file mode 100644 index ad40f79..0000000 --- a/internal/infra/infra_test.go +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package infra_test - -import ( - "path/filepath" - "testing" - - "github.com/cgi-fr/rimo/internal/infra" - "github.com/cgi-fr/rimo/pkg/rimo" - "github.com/stretchr/testify/require" -) - -const ( - testdataDir = "../../testdata/" -) - -// Test RIMO pipeline with FilesReader, JSONLinesLoader and YAMLWriter. -func TestPipeline(t *testing.T) { - t.Parallel() - - inputPath := filepath.Join(testdataDir, "data1/data_input.jsonl") - - reader, err := infra.FilesReaderFactory([]string{inputPath}) - require.NoError(t, err) - - writer := infra.StdoutWriterFactory() - - err = rimo.AnalyseBase(reader, writer) - require.NoError(t, err) -} - -// var ( -// Readers []*rimo.Reader -// Writers []*rimo.Writer -// ) - -// // List of implemented readers and writers. -// func GetReaders(filepathList []string) []*rimo.Reader { -// filesReader, err := infra.FilesReaderFactory(filepathList) -// if err != nil { -// panic(err) -// } - -// Readers = []*rimo.Reader{filesReader} - -// return Readers -// } - -// func GetWriters() []*rimo.Writer { -// yamlWriter := infra.YAMLWriterFactory("../../testdata/data1/data_output.yaml") - -// Writers = []*rimo.Writer{yamlWriter, infra.StdoutWriter{}} - -// return Writers -// } - -// func TestInterface(t *testing.T) { -// t.Parallel() - -// Writers = GetWriters() -// Readers = GetReaders([]string{"../../testdata/data1/data_input.jsonl"}) -// // Assert that all readers and writers implement the Reader and Writer interfaces. -// for _, reader := range Readers { -// var _ rimo.Reader = (reader)(nil) -// } -// for _, writer := range Writers { -// var _ rimo.Reader = (writer)(nil) -// } - -// // Assert that all combinations of readers and writers can be used in the pipeline. -// for _, reader := range Readers { -// for _, writer := range Writers { -// err := rimo.AnalyseBase(reader, writer) -// require.NoError(t, err) -// } -// } -// } diff --git a/internal/infra/loader_test.go b/internal/infra/loader_test.go deleted file mode 100644 index d55186d..0000000 --- a/internal/infra/loader_test.go +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package infra_test - -import ( - "fmt" - "path/filepath" - "testing" - - "github.com/cgi-fr/rimo/internal/infra" - "github.com/stretchr/testify/require" -) - -func TestLoaderJSONL(t *testing.T) { - t.Parallel() - - path := filepath.Join(testdataDir, "data1/data_input.jsonl") - - LoaderJSONL := infra.JSONLinesLoader{} - - data, err := LoaderJSONL.Load(path) - require.NoError(t, err) - fmt.Printf("dataMap: %v\n", data) -} diff --git a/pkg/metric/analyser.go b/pkg/metric/analyser.go new file mode 100644 index 0000000..9d1d936 --- /dev/null +++ b/pkg/metric/analyser.go @@ -0,0 +1,26 @@ +package metric + +import ( + "github.com/cgi-fr/rimo/pkg/model" +) + +type Analyser[T Accepted] interface { + Read(*T) + Build(*model.Column) +} + +type Multi[T Accepted] struct { + analyser []Analyser[T] +} + +func (m Multi[T]) Read(value *T) { + for _, a := range m.analyser { + a.Read(value) + } +} + +func (m Multi[T]) Build(metric *model.Column) { + for _, a := range m.analyser { + a.Build(metric) + } +} diff --git a/pkg/metric/bool.go b/pkg/metric/bool.go new file mode 100644 index 0000000..47fa462 --- /dev/null +++ b/pkg/metric/bool.go @@ -0,0 +1,21 @@ +package metric + +type Bool struct { + Multi[bool] +} + +func NewBool(sampleSize uint, countDistinct bool) *Bool { + mainAnalyser := []Analyser[bool]{ + NewCounter[bool](), // count total, count null, count empty + NewSampler[bool](sampleSize), // store few samples + NewTrueRatio(), // calculate true ratio + } + + if countDistinct { + mainAnalyser = append(mainAnalyser, NewDistinct[bool]()) + } + + return &Bool{ + Multi: Multi[bool]{mainAnalyser}, + } +} diff --git a/pkg/metric/build.go b/pkg/metric/build.go deleted file mode 100644 index 058c539..0000000 --- a/pkg/metric/build.go +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package metric - -import ( - "encoding/json" - "errors" - "fmt" - - "github.com/cgi-fr/rimo/pkg/model" -) - -var ErrValueType = errors.New("value type error") - -// Return a model.Column. -func ComputeMetric(colName string, values []interface{}) (model.Column, error) { - var confidential *bool = nil //nolint - - // Create the column. - col := model.Column{ - Name: colName, - Type: GetColType(values), - Concept: "", - Constraint: []string{}, - Confidential: confidential, - MainMetric: model.GenericMetric{}, //nolint:exhaustruct - StringMetric: model.StringMetric{}, //nolint:exhaustruct - NumericMetric: model.NumericMetric{}, //nolint:exhaustruct - BoolMetric: model.BoolMetric{}, //nolint:exhaustruct - } - - // Generic metric - err := SetGenericMetric(values, &col.MainMetric) - if err != nil { - return model.Column{}, fmt.Errorf("error computing generic metric in column %v : %w", col.Name, err) - } - - // Type specific metric - switch col.Type { - case model.ColType.String: - err := SetStringMetric(values, &col.StringMetric) - if err != nil { - return model.Column{}, fmt.Errorf("error computing string metric in column %v : %w", col.Name, err) - } - - case model.ColType.Numeric: - err := SetNumericMetric(values, &col.NumericMetric) - if err != nil { - return model.Column{}, fmt.Errorf("error computing numeric metric in column %v : %w", col.Name, err) - } - - case model.ColType.Bool: - err := SetBoolMetric(values, &col.BoolMetric) - if err != nil { - return model.Column{}, fmt.Errorf("error computing bool metric in column %v : %w", col.Name, err) - } - } - - return col, nil -} - -func GetColType(values []interface{}) model.ValueType { - colType := model.ColType.Undefined - for i := 0; i < len(values) && colType == model.ColType.Undefined; i++ { - colType = ColType(values[i]) - } - - return colType -} - -// Utils functions. - -func GetFrequency(occurrence int, count int) float64 { - return float64(occurrence) / float64(count) -} - -// To check why not using isNil() ? -func GetFirstValue(values []interface{}) interface{} { - for _, value := range values { - if value != nil { - return value - } - } - - return nil -} - -func ColType(value interface{}) model.ValueType { - switch value.(type) { - case int: - return model.ColType.Numeric - case float64: - return model.ColType.Numeric - case json.Number: - return model.ColType.Numeric - case string: - return model.ColType.String - case bool: - return model.ColType.Bool - default: - return model.ColType.Undefined - } -} diff --git a/pkg/metric/counter.go b/pkg/metric/counter.go new file mode 100644 index 0000000..ede347a --- /dev/null +++ b/pkg/metric/counter.go @@ -0,0 +1,38 @@ +package metric + +import ( + "github.com/cgi-fr/rimo/pkg/model" +) + +type Counter[T Accepted] struct { + countTotal uint + countNulls uint + countEmpty uint + zero T +} + +func NewCounter[T Accepted]() *Counter[T] { + return &Counter[T]{ + countTotal: 0, + countNulls: 0, + countEmpty: 0, + zero: *new(T), + } +} + +func (c *Counter[T]) Read(value *T) { + c.countTotal++ + + switch { + case value == nil: + c.countNulls++ + case *value == c.zero: + c.countEmpty++ + } +} + +func (c *Counter[T]) Build(metric *model.Column) { + metric.MainMetric.Count = c.countTotal + metric.MainMetric.Null = c.countNulls + metric.MainMetric.Empty = c.countEmpty +} diff --git a/pkg/metric/distinct.go b/pkg/metric/distinct.go new file mode 100644 index 0000000..c01e7b0 --- /dev/null +++ b/pkg/metric/distinct.go @@ -0,0 +1,25 @@ +package metric + +import ( + "github.com/cgi-fr/rimo/pkg/model" +) + +type Distinct[T Accepted] struct { + values map[T]int +} + +func NewDistinct[T Accepted]() *Distinct[T] { + return &Distinct[T]{ + values: make(map[T]int, 1024), //nolint:gomnd + } +} + +func (a *Distinct[T]) Read(value *T) { + if value != nil { + a.values[*value] = 0 + } +} + +func (a *Distinct[T]) Build(metric *model.Column) { + metric.MainMetric.Distinct = uint(len(a.values)) +} diff --git a/pkg/metric/generic.go b/pkg/metric/generic.go deleted file mode 100644 index ac85bbd..0000000 --- a/pkg/metric/generic.go +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package metric - -import ( - "errors" - "fmt" - "math/rand" - - "github.com/cgi-fr/rimo/pkg/model" - "golang.org/x/exp/constraints" -) - -var ErrEmptySlice = errors.New("slice is empty") - -func SetGenericMetric(values []interface{}, metric *model.GenericMetric) error { - sample, err := Sample(values, model.SampleSize) - if err != nil { - return fmt.Errorf("error computing sample: %w", err) - } - - metric.Count = len(values) - metric.Unique = CountUnique(values) - metric.Empty = CountEmpty(values) - metric.Sample = sample - - return nil -} - -func CountEmpty[T comparable](values []T) int { - empty := 0 - - for _, value := range values { - if isNil(value) { - empty++ - } - } - - return empty -} - -// Return a sample of size sampleSize from values. -func Sample[T comparable](values []T, sampleSize int) ([]T, error) { - uniqueValues := Unique(values) - - if sampleSize >= len(uniqueValues) { - return uniqueValues, nil - } - - sample := make([]T, sampleSize) - for i := 0; i < sampleSize; i++ { - sample[i] = uniqueValues[rand.Intn(len(uniqueValues)-1)] //nolint:gosec - } - - return sample, nil -} - -func CountUnique[T comparable](values []T) int { - unique := make(map[T]bool) - - for _, value := range values { - if isNil(value) { - continue - } - - unique[value] = true - } - - return len(unique) -} - -func Unique[T comparable](values []T) []T { - unique := make(map[T]bool) - - for _, value := range values { - if isNil(value) { - continue - } - - unique[value] = true - } - - uniqueValues := make([]T, 0, len(unique)) - for value := range unique { - uniqueValues = append(uniqueValues, value) - } - - return uniqueValues -} - -func isNil[T comparable](v T) bool { - return v == *new(T) -} - -func min[T constraints.Ordered](a, b T) T { - if a < b { - return a - } - - return b -} - -func max[T constraints.Ordered](a, b T) T { - if a > b { - return a - } - - return b -} diff --git a/pkg/metric/generic_test.go b/pkg/metric/generic_test.go deleted file mode 100644 index 4af5071..0000000 --- a/pkg/metric/generic_test.go +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package metric_test - -import ( - "testing" - - "github.com/cgi-fr/rimo/pkg/metric" - "github.com/cgi-fr/rimo/pkg/model" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestCountEmpty(t *testing.T) { - t.Parallel() - - slice := []interface{}{1, 2, 3, nil} - expected := 1 - actual := metric.CountEmpty(slice) - - assert.Equal(t, expected, actual) -} - -func TestGetColType(t *testing.T) { - t.Parallel() - - t.Run("numeric", func(t *testing.T) { - t.Parallel() - - slice := []interface{}{nil, 2, 3} - expected := model.ColType.Numeric - - actual := metric.GetColType(slice) - require.Equal(t, expected, actual) - }) - - t.Run("string", func(t *testing.T) { - t.Parallel() - - slice := []interface{}{nil, "text", nil} - expected := model.ColType.String - - actual := metric.GetColType(slice) - require.Equal(t, expected, actual) - }) - - t.Run("boolean", func(t *testing.T) { - t.Parallel() - - slice := []interface{}{nil, true, false} - expected := model.ColType.Bool - - actual := metric.GetColType(slice) - require.Equal(t, expected, actual) - }) - - // Treat this case as error would imply to type assert each element of the slice when Loading. - t.Run("mixed", func(t *testing.T) { - t.Parallel() - - slice := []interface{}{"text", 2, false} - expected := model.ColType.String - - actual := metric.GetColType(slice) - require.Equal(t, expected, actual) - }) - - t.Run("unknown", func(t *testing.T) { - t.Parallel() - - slice := []interface{}{nil, nil, nil} - expected := model.ColType.Undefined - - actual := metric.GetColType(slice) - require.Equal(t, expected, actual) - }) -} - -// Implementation questions : -// should Unique() append nil element ? -// should CountUnique() count nil as a unique value ? - -func TestUnique(t *testing.T) { - t.Parallel() - - values := []interface{}{1, 1, 2, 3, nil} - expected := []interface{}{1, 2, 3} - actual := metric.Unique(values) - - assert.ElementsMatch(t, expected, actual) -} - -func TestCountUnique(t *testing.T) { - t.Parallel() - - values := []interface{}{1, 1, 2, 3, nil} - expected := 3 - actual := metric.CountUnique(values) - - assert.Equal(t, expected, actual) -} - -func TestSample(t *testing.T) { - t.Parallel() - - values := []interface{}{1, 2, 3, nil, 5, 6} - actualOutput, _ := metric.Sample(values, 5) - - assert.Len(t, actualOutput, 5) - - actualOutput, _ = metric.Sample(values, 10) - assert.Len(t, actualOutput, 5) -} diff --git a/pkg/metric/mean.go b/pkg/metric/mean.go new file mode 100644 index 0000000..35137cc --- /dev/null +++ b/pkg/metric/mean.go @@ -0,0 +1,31 @@ +package metric + +import "github.com/cgi-fr/rimo/pkg/model" + +type Mean struct { + count uint + mean float64 +} + +func NewMean() *Mean { + return &Mean{ + count: 0, + mean: 0, + } +} + +func (a *Mean) Read(value *float64) { + if value == nil { + return + } + + a.count++ + + a.mean += (*value - a.mean) / float64(a.count) +} + +func (a *Mean) Build(metric *model.Column) { + metric.NumericMetric = &model.Numeric{ + Mean: a.mean, + } +} diff --git a/pkg/metric/metric_test.go b/pkg/metric/metric_test.go new file mode 100644 index 0000000..3bad918 --- /dev/null +++ b/pkg/metric/metric_test.go @@ -0,0 +1,82 @@ +package metric_test + +import ( + "testing" + + "github.com/cgi-fr/rimo/pkg/metric" + "github.com/cgi-fr/rimo/pkg/model" + "github.com/stretchr/testify/assert" +) + +// Ensure that 1. frequency is correct, 2. order is correct, 3. ties are break by length. +func TestStringMetric(t *testing.T) { //nolint:funlen + t.Parallel() + + text := []string{"1", "1", "1", "1", "22", "22", "22", "331", "332", "4441", ""} + + min := "" + max := "4441" + + expectedMetric := model.Column{ //nolint:exhaustruct + MainMetric: model.Generic{ + Count: 12, + Empty: 1, + Null: 1, + Distinct: 6, + Samples: []any{"22"}, + Min: &min, + Max: &max, + }, + StringMetric: &model.String{ + MinLen: 0, + MaxLen: 4, + CountLen: 5, + Lengths: []model.StringLen{ + { + Length: 1, + Freq: 0.3333333333333333, + Metrics: model.Generic{ + Count: 4, + Empty: 0, + Null: 0, + Distinct: 1, + Min: &text[0], + Max: &text[0], + Samples: []any{"1", "1", "1", "1"}, + }, + }, + }, + }, + } + + actualMetric := model.Column{} //nolint:exhaustruct + + analyser := metric.NewString(5, true) + for index := range text { + analyser.Read(&text[index]) + } + + analyser.Read(nil) + + analyser.Build(&actualMetric) + + // out, err := yaml.Marshal(actualMetric) + // assert.NoError(t, err) + // fmt.Println(string(out)) + + assert.Equal(t, expectedMetric.MainMetric.Count, actualMetric.MainMetric.Count) + assert.Equal(t, expectedMetric.MainMetric.Empty, actualMetric.MainMetric.Empty) + assert.Equal(t, expectedMetric.MainMetric.Null, actualMetric.MainMetric.Null) + assert.Equal(t, expectedMetric.MainMetric.Distinct, actualMetric.MainMetric.Distinct) + assert.Equal(t, expectedMetric.MainMetric.Min, actualMetric.MainMetric.Min) + assert.Equal(t, expectedMetric.MainMetric.Max, actualMetric.MainMetric.Max) + assert.Equal(t, expectedMetric.StringMetric.MinLen, actualMetric.StringMetric.MinLen) + assert.Equal(t, expectedMetric.StringMetric.MaxLen, actualMetric.StringMetric.MaxLen) + assert.Equal(t, expectedMetric.StringMetric.CountLen, actualMetric.StringMetric.CountLen) + + for i := 0; i < len(expectedMetric.StringMetric.Lengths); i++ { + assert.Equal(t, expectedMetric.StringMetric.Lengths[i].Length, actualMetric.StringMetric.Lengths[i].Length) + assert.Equal(t, expectedMetric.StringMetric.Lengths[i].Freq, actualMetric.StringMetric.Lengths[i].Freq) + assert.Equal(t, expectedMetric.StringMetric.Lengths[i].Metrics.Samples, actualMetric.StringMetric.Lengths[i].Metrics.Samples) //nolint:lll + } +} diff --git a/pkg/metric/metricbool.go b/pkg/metric/metricbool.go deleted file mode 100644 index a3a38ed..0000000 --- a/pkg/metric/metricbool.go +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// rimo is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// rimo is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with rimo. If not, see . - -package metric - -import ( - "fmt" - - "github.com/cgi-fr/rimo/pkg/model" -) - -// Bool metric : TrueRatio. -func SetBoolMetric(values []interface{}, metric *model.BoolMetric) error { - nullCount := 0 - trueCount := 0 - - for _, value := range values { - if value == nil { - nullCount++ - - continue - } - - boolValue, ok := value.(bool) - if !ok { - return fmt.Errorf("%w : expected numeric found %T: %v", ErrValueType, value, value) - } - - if boolValue { - trueCount++ - } - } - - metric.TrueRatio = GetFrequency(trueCount, len(values)-nullCount) - - return nil -} diff --git a/pkg/metric/metricbool_test.go b/pkg/metric/metricbool_test.go deleted file mode 100644 index 5db4576..0000000 --- a/pkg/metric/metricbool_test.go +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package metric_test - -import ( - "testing" - - "github.com/cgi-fr/rimo/pkg/metric" - "github.com/cgi-fr/rimo/pkg/model" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestBooleanMetric(t *testing.T) { - t.Parallel() - - values := []interface{}{true, true, nil, false} - expectedMetric := model.BoolMetric{ - TrueRatio: float64(2) / float64(3), - } - - actualMetric := model.BoolMetric{} //nolint:exhaustruct - err := metric.SetBoolMetric(values, &actualMetric) - require.NoError(t, err) - - assert.Equal(t, expectedMetric, actualMetric) -} diff --git a/pkg/metric/metricnumeric.go b/pkg/metric/metricnumeric.go deleted file mode 100644 index 77f50bd..0000000 --- a/pkg/metric/metricnumeric.go +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package metric - -import ( - "fmt" - - "github.com/cgi-fr/rimo/pkg/model" -) - -func SetNumericMetric(values []interface{}, metric *model.NumericMetric) error { - nonNullCount := 0 - - value := GetFirstValue(values) - - floatValue, ok := value.(float64) - if !ok { - return fmt.Errorf("%w : expected numeric found %T: %v", ErrValueType, value, value) - } - - min := floatValue - max := floatValue - sum := 0.0 - - for _, value := range values { - floatValue, ok := value.(float64) - if !ok { - if value == nil { - continue - } - - return fmt.Errorf("%w : expected numeric found %T: %v", ErrValueType, value, value) - } - - sum += floatValue - nonNullCount++ - - if floatValue > max { - max = floatValue - } - - if floatValue < min { - min = floatValue - } - } - - mean := sum / float64(nonNullCount) - - metric.Min = min - metric.Max = max - metric.Mean = mean - - return nil -} diff --git a/pkg/metric/metricnumeric_test.go b/pkg/metric/metricnumeric_test.go deleted file mode 100644 index 997b506..0000000 --- a/pkg/metric/metricnumeric_test.go +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package metric_test - -import ( - "testing" - - "github.com/cgi-fr/rimo/pkg/metric" - "github.com/cgi-fr/rimo/pkg/model" - "github.com/stretchr/testify/assert" -) - -func TestNumericMetric(t *testing.T) { - t.Parallel() - - values := []interface{}{1.0, 2.0, 3.0, nil} - expectedMetric := model.NumericMetric{ - Min: 1, - Max: 3, - Mean: 2, - } - - actualMetric := model.NumericMetric{} //nolint:exhaustruct - - err := metric.SetNumericMetric(values, &actualMetric) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - - assert.Equal(t, expectedMetric, actualMetric) -} diff --git a/pkg/metric/metricstring.go b/pkg/metric/metricstring.go deleted file mode 100644 index a7d2ef5..0000000 --- a/pkg/metric/metricstring.go +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package metric - -import ( - "fmt" - "math" - "sort" - - "github.com/cgi-fr/rimo/pkg/model" -) - -func SetStringMetric(values []interface{}, metric *model.StringMetric) error { - // Store strings by length. - lenMap := make(map[int][]string) - // Count length occurrence. - lenCounter := make(map[int]int) - totalCount := len(values) - - metric.MinLen = math.MaxInt - metric.MaxLen = 0 - - for _, value := range values { - if value == nil { - continue - } - - stringValue, ok := value.(string) - if !ok { - return fmt.Errorf("%w : expected string found %T: %v", ErrValueType, value, value) - } - - length := len(stringValue) - lenMap[length] = append(lenMap[length], stringValue) - lenCounter[length]++ - - metric.MinLen = min(metric.MinLen, length) - metric.MaxLen = max(metric.MaxLen, length) - } - - // Create a list of unique lengths sorted by descending frequency, break ties with ascending length - sortedLength := uniqueLengthSorted(lenCounter) - - // Get size of MostFreqLen and LeastFreqLen - mostFrequentLenSize, leastFrequentLenSize := getFreqSize(len(sortedLength)) - - // Get ordered slice of least and most frequent length - lenMostFreqLen := sortedLength[0:mostFrequentLenSize] - - lenLeastFreqLen := make([]int, leastFrequentLenSize) - - for i := 0; i < leastFrequentLenSize; i++ { - index := len(sortedLength) - 1 - i - lenLeastFreqLen[i] = sortedLength[index] - } - - leastFreqLen, err := buildFreqLen(lenLeastFreqLen, lenMap, lenCounter, totalCount, model.LeastFrequentSampleSize) - if err != nil { - return fmt.Errorf("error building least frequent length : %w", err) - } - - metric.LeastFreqLen = leastFreqLen - - mostFreqLen, err := buildFreqLen(lenMostFreqLen, lenMap, lenCounter, totalCount, model.MostFrequentSampleSize) - if err != nil { - return fmt.Errorf("error building most frequent length : %w", err) - } - - metric.MostFreqLen = mostFreqLen - - return nil -} - -func buildFreqLen(freqLen []int, lenMap map[int][]string, lenCounter map[int]int, totalCount int, sampleLen int) ([]model.LenFreq, error) { //nolint - lenFreqs := make([]model.LenFreq, len(freqLen)) - - for index, len := range freqLen { - // Get unique value from lenMap[len].. - sample, err := Sample(lenMap[len], sampleLen) - if err != nil { - return lenFreqs, fmt.Errorf("error getting sample for length %v : %w", len, err) - } - - lenFreqs[index] = model.LenFreq{ - Length: len, - Freq: GetFrequency(lenCounter[len], totalCount), - Sample: sample, - } - } - - return lenFreqs, nil -} - -func getFreqSize(nunique int) (int, int) { - mostFrequentLenSize := model.MostFrequentLenSize - leastFrequentLenSize := model.LeastFrequentLenSize - - if nunique < model.MostFrequentLenSize+model.LeastFrequentLenSize { - // Modify MostFrequentLenSize and LeastFrequentLenSize to fit the number of unique length. - // Should keep ratio of MostFrequentLenSize and LeastFrequentLenSize. - ratio := float64(model.MostFrequentLenSize) / float64(model.MostFrequentLenSize+model.LeastFrequentLenSize) - mostFrequentLenSize = int(math.Round(float64(nunique) * ratio)) - leastFrequentLenSize = nunique - mostFrequentLenSize - } - - return mostFrequentLenSize, leastFrequentLenSize -} - -func uniqueLengthSorted(lenCounter map[int]int) []int { - uniqueLengthSorted := make([]int, 0, len(lenCounter)) - for l := range lenCounter { - uniqueLengthSorted = append(uniqueLengthSorted, l) - } - - // Sort the string lengths by descending count of occurrence, breaks ties with ascending length - sort.Slice(uniqueLengthSorted, func(i, j int) bool { - if lenCounter[uniqueLengthSorted[i]] == lenCounter[uniqueLengthSorted[j]] { - return uniqueLengthSorted[i] < uniqueLengthSorted[j] - } - - return lenCounter[uniqueLengthSorted[i]] > lenCounter[uniqueLengthSorted[j]] - }) - - return uniqueLengthSorted -} diff --git a/pkg/metric/metricstring_test.go b/pkg/metric/metricstring_test.go deleted file mode 100644 index 254a923..0000000 --- a/pkg/metric/metricstring_test.go +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package metric_test - -import ( - "testing" - - "github.com/cgi-fr/rimo/pkg/metric" - "github.com/cgi-fr/rimo/pkg/model" - "github.com/stretchr/testify/assert" -) - -// Ensure that 1. frequency is correct, 2. order is correct, 3. ties are break by length. -func TestStringMetric(t *testing.T) { - t.Parallel() - - text := []interface{}{"1", "1", "1", "1", "22", "22", "22", "331", "332", "4441"} - expectedMetric := model.StringMetric{ - MinLen: 1, - MaxLen: 4, - MostFreqLen: []model.LenFreq{{Length: 1, Freq: 0.4, Sample: []string{"1"}}, {Length: 2, Freq: 0.3, Sample: []string{"22"}}}, //nolint:lll - LeastFreqLen: []model.LenFreq{{Length: 4, Freq: 0.1, Sample: []string{"4441"}}, {Length: 3, Freq: 0.2, Sample: []string{"331", "332"}}}, //nolint:lll - } - - actualMetric := model.StringMetric{} //nolint:exhaustruct - - err := metric.SetStringMetric(text, &actualMetric) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - - // t.Logf(valast.String(actualMetric)) - - for i := 0; i < len(expectedMetric.MostFreqLen); i++ { - assert.Equal(t, expectedMetric.MostFreqLen[i].Length, actualMetric.MostFreqLen[i].Length) - assert.Equal(t, expectedMetric.MostFreqLen[i].Freq, actualMetric.MostFreqLen[i].Freq) - assert.Equal(t, expectedMetric.MostFreqLen[i].Sample, actualMetric.MostFreqLen[i].Sample) - } - - for i := 0; i < len(expectedMetric.LeastFreqLen); i++ { - assert.Equal(t, expectedMetric.LeastFreqLen[i].Length, actualMetric.LeastFreqLen[i].Length) - assert.Equal(t, expectedMetric.LeastFreqLen[i].Freq, actualMetric.LeastFreqLen[i].Freq) - assert.ElementsMatch(t, expectedMetric.LeastFreqLen[i].Sample, actualMetric.LeastFreqLen[i].Sample) - } -} diff --git a/pkg/metric/minmax.go b/pkg/metric/minmax.go new file mode 100644 index 0000000..9bf0ccb --- /dev/null +++ b/pkg/metric/minmax.go @@ -0,0 +1,41 @@ +package metric + +import ( + "github.com/cgi-fr/rimo/pkg/model" + "golang.org/x/exp/constraints" +) + +type MinMax[T constraints.Ordered] struct { + min *T + max *T +} + +func NewMinMax[T constraints.Ordered]() *MinMax[T] { + return &MinMax[T]{ + min: nil, + max: nil, + } +} + +func (a *MinMax[T]) Read(value *T) { + if value != nil { + if a.min == nil { + a.min = value + } + + if a.max == nil { + a.max = value + } + + if *value < *a.min { + a.min = value + } else if *value > *a.max { + a.max = value + } + } +} + +func (a *MinMax[T]) Build(metric *model.Column) { + metric.MainMetric.Min = a.min + metric.MainMetric.Max = a.max +} diff --git a/pkg/metric/numeric.go b/pkg/metric/numeric.go new file mode 100644 index 0000000..2b34fe0 --- /dev/null +++ b/pkg/metric/numeric.go @@ -0,0 +1,22 @@ +package metric + +type Numeric struct { + Multi[float64] +} + +func NewNumeric(sampleSize uint, countDistinct bool) *Numeric { + mainAnalyser := []Analyser[float64]{ + NewCounter[float64](), // count total, count null, count empty + NewMinMax[float64](), // store min and max values + NewSampler[float64](sampleSize), // store few samples + NewMean(), // calculate running mean + } + + if countDistinct { + mainAnalyser = append(mainAnalyser, NewDistinct[float64]()) + } + + return &Numeric{ + Multi: Multi[float64]{mainAnalyser}, + } +} diff --git a/pkg/metric/sampler.go b/pkg/metric/sampler.go new file mode 100644 index 0000000..db59f59 --- /dev/null +++ b/pkg/metric/sampler.go @@ -0,0 +1,45 @@ +package metric + +import ( + "math/rand" + + "github.com/cgi-fr/rimo/pkg/model" +) + +type Sampler[T Accepted] struct { + size uint + count int + samples []T +} + +func NewSampler[T Accepted](size uint) *Sampler[T] { + return &Sampler[T]{ + size: size, + count: 0, + samples: make([]T, 0, size), + } +} + +func (s *Sampler[T]) Read(value *T) { + if value != nil { + s.count++ + + if len(s.samples) < int(s.size) { + s.samples = append(s.samples, *value) + + return + } + + index := rand.Intn(s.count) //nolint:gosec + if index < int(s.size) { + s.samples[index] = *value + } + } +} + +func (s *Sampler[T]) Build(metric *model.Column) { + metric.MainMetric.Samples = make([]any, len(s.samples)) + for i, s := range s.samples { + metric.MainMetric.Samples[i] = s + } +} diff --git a/pkg/metric/string.go b/pkg/metric/string.go new file mode 100644 index 0000000..7e5861d --- /dev/null +++ b/pkg/metric/string.go @@ -0,0 +1,96 @@ +package metric + +import ( + "sort" + + "github.com/cgi-fr/rimo/pkg/model" + "golang.org/x/exp/maps" + "golang.org/x/exp/slices" +) + +type String struct { + sampleSize uint + distinct bool + main Multi[string] + byLen map[int]Multi[string] +} + +func NewString(sampleSize uint, countDistinct bool) *String { + mainAnalyser := []Analyser[string]{ + NewCounter[string](), // count total, count null, count empty + NewMinMax[string](), // store min and max values + NewSampler[string](sampleSize), // store few samples + } + + if countDistinct { + mainAnalyser = append(mainAnalyser, NewDistinct[string]()) + } + + return &String{ + sampleSize: sampleSize, + distinct: countDistinct, + main: Multi[string]{mainAnalyser}, + byLen: make(map[int]Multi[string], 0), + } +} + +func (a *String) Read(value *string) { + a.main.Read(value) + + if value != nil { + length := len(*value) + + analyser, exists := a.byLen[length] + if !exists { + analyser = Multi[string]{ + []Analyser[string]{ + NewCounter[string](), // count total, count null, count empty + NewMinMax[string](), // store min and max values + NewSampler[string](a.sampleSize), // store few samples + }, + } + + if a.distinct { + analyser.analyser = append(analyser.analyser, NewDistinct[string]()) + } + } + + analyser.Read(value) + + a.byLen[length] = analyser + } +} + +func (a *String) Build(metric *model.Column) { + a.main.Build(metric) + + metric.StringMetric = &model.String{ + MinLen: slices.Min(maps.Keys(a.byLen)), + MaxLen: slices.Max(maps.Keys(a.byLen)), + CountLen: len(a.byLen), + Lengths: make([]model.StringLen, 0, len(a.byLen)), + } + + for length, analyser := range a.byLen { + lenMetric := model.Column{} //nolint:exhaustruct + analyser.Build(&lenMetric) + + strlen := model.StringLen{ + Length: length, + Freq: float64(lenMetric.MainMetric.Count) / float64(metric.MainMetric.Count), + Metrics: model.Generic{}, //nolint:exhaustruct + } + strlen.Metrics.Count = lenMetric.MainMetric.Count + strlen.Metrics.Empty = lenMetric.MainMetric.Empty + strlen.Metrics.Null = lenMetric.MainMetric.Null + strlen.Metrics.Distinct = lenMetric.MainMetric.Distinct + strlen.Metrics.Max = lenMetric.MainMetric.Max + strlen.Metrics.Min = lenMetric.MainMetric.Min + strlen.Metrics.Samples = lenMetric.MainMetric.Samples + metric.StringMetric.Lengths = append(metric.StringMetric.Lengths, strlen) + } + + sort.Slice(metric.StringMetric.Lengths, func(i, j int) bool { + return metric.StringMetric.Lengths[i].Freq > metric.StringMetric.Lengths[j].Freq + }) +} diff --git a/pkg/metric/trueratio.go b/pkg/metric/trueratio.go new file mode 100644 index 0000000..e9d8271 --- /dev/null +++ b/pkg/metric/trueratio.go @@ -0,0 +1,33 @@ +package metric + +import "github.com/cgi-fr/rimo/pkg/model" + +type TrueRatio struct { + countTrue uint + count uint +} + +func NewTrueRatio() *TrueRatio { + return &TrueRatio{ + countTrue: 0, + count: 0, + } +} + +func (a *TrueRatio) Read(value *bool) { + if value == nil { + return + } + + a.count++ + + if *value { + a.countTrue++ + } +} + +func (a *TrueRatio) Build(metric *model.Column) { + metric.BoolMetric = &model.Bool{ + TrueRatio: float64(a.countTrue) / float64(a.count), + } +} diff --git a/pkg/metric/types.go b/pkg/metric/types.go new file mode 100644 index 0000000..05de82d --- /dev/null +++ b/pkg/metric/types.go @@ -0,0 +1,7 @@ +package metric + +import "golang.org/x/exp/constraints" + +type Accepted interface { + constraints.Ordered | ~bool +} diff --git a/pkg/model/base.go b/pkg/model/base.go index 49eed63..2641719 100644 --- a/pkg/model/base.go +++ b/pkg/model/base.go @@ -1,50 +1,20 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - package model -import ( - "fmt" - "reflect" - - "github.com/hexops/valast" -) - -// RIMO YAML structure. -type ( - Base struct { - Name string `json:"database" jsonschema:"required" yaml:"database"` - // Tables should be map[string][]Column - Tables []Table `json:"tables" jsonschema:"required" yaml:"tables"` - } +const DefaultTableSize = 10 - Table struct { - Name string `json:"name" jsonschema:"required" yaml:"name"` - Columns []Column `json:"columns" jsonschema:"required" yaml:"columns"` - } -) +type Base struct { + Name string `json:"database" yaml:"database" jsonschema:"required"` + Tables []Table `json:"tables" yaml:"tables" jsonschema:"required"` +} -// Should be improved with more detail about difference. -func SameBase(base1, base2 *Base) (bool, string) { - if !reflect.DeepEqual(base1, base2) { - msg := fmt.Sprintf("base is different : %s \n \n %s", valast.String(base1), valast.String(base2)) +type Table struct { + Name string `json:"name" yaml:"name" jsonschema:"required"` + Columns []Column `json:"columns" yaml:"columns" jsonschema:"required"` +} - return false, msg +func NewBase(name string) *Base { + return &Base{ + Name: name, + Tables: make([]Table, 0, DefaultTableSize), } - - return true, "" } diff --git a/pkg/model/column.go b/pkg/model/column.go index 223bcb0..8eadc62 100644 --- a/pkg/model/column.go +++ b/pkg/model/column.go @@ -1,44 +1,13 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - package model -const ( - SampleSize int = 5 - MostFrequentLenSize int = 5 - MostFrequentSampleSize int = 5 - LeastFrequentLenSize int = 5 - LeastFrequentSampleSize int = 5 -) - -type ( - Column struct { - Name string `json:"name" jsonschema:"required" yaml:"name"` - Type ValueType `json:"type" jsonschema:"required" validate:"oneof=string numeric boolean" yaml:"type"` //nolint:lll - - // The 3 following parameter should be part of a Config struct - Concept string `json:"concept" jsonschema:"required" yaml:"concept"` - Constraint []string `json:"constraint" jsonschema:"required" yaml:"constraint"` - Confidential *bool `json:"confidential" jsonschema:"required" yaml:"confidential"` +type Column struct { + Name string `json:"name" yaml:"name" jsonschema:"required"` + Type string `json:"type" yaml:"type" jsonschema:"required" validate:"oneof=string numeric boolean"` - MainMetric GenericMetric `json:"mainMetric" jsonschema:"required" yaml:"mainMetric"` + Config - StringMetric StringMetric `json:"stringMetric,omitempty" jsonschema:"required" yaml:"stringMetric,omitempty"` - NumericMetric NumericMetric `json:"numericMetric,omitempty" jsonschema:"required" yaml:"numericMetric,omitempty"` - BoolMetric BoolMetric `json:"boolMetric,omitempty" jsonschema:"required" yaml:"boolMetric,omitempty"` - } -) + MainMetric Generic `json:"mainMetric" yaml:"mainMetric" jsonschema:"required"` + StringMetric *String `json:"stringMetric,omitempty" yaml:"stringMetric,omitempty"` + NumericMetric *Numeric `json:"numericMetric,omitempty" yaml:"numericMetric,omitempty"` + BoolMetric *Bool `json:"boolMetric,omitempty" yaml:"boolMetric,omitempty"` +} diff --git a/pkg/model/config.go b/pkg/model/config.go new file mode 100644 index 0000000..d6742e4 --- /dev/null +++ b/pkg/model/config.go @@ -0,0 +1,7 @@ +package model + +type Config struct { + Concept string `json:"concept" yaml:"concept" jsonschema:"required"` + Constraint []string `json:"constraint" yaml:"constraint" jsonschema:"required"` + Confidential *bool `json:"confidential" yaml:"confidential" jsonschema:"required"` +} diff --git a/pkg/model/metric.go b/pkg/model/metric.go deleted file mode 100644 index c0e8d03..0000000 --- a/pkg/model/metric.go +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package model - -// RIMO YAML metrics. -type ( - GenericMetric struct { - Count int `json:"count" jsonschema:"required" yaml:"count"` - Empty int `json:"empty" jsonschema:"required" yaml:"empty"` - Unique int `json:"unique" jsonschema:"required" yaml:"unique"` - Sample []interface{} `json:"sample" jsonschema:"required" yaml:"sample"` - } - - StringMetric struct { - MinLen int `json:"minLen" jsonschema:"required" yaml:"minLen"` - MaxLen int `json:"maxLen" jsonschema:"required" yaml:"maxLen"` - MostFreqLen []LenFreq `json:"mostFrequentLen" jsonschema:"required" yaml:"mostFrequentLen"` - LeastFreqLen []LenFreq `json:"leastFrequentLen" jsonschema:"required" yaml:"leastFrequentLen"` - } - - LenFreq struct { - Length int `json:"length" jsonschema:"required" yaml:"length"` - Freq float64 `json:"freq" jsonschema:"required" yaml:"freq"` - Sample []string `json:"sample" jsonschema:"required" yaml:"sample"` - } - - NumericMetric struct { - Min float64 `json:"min" jsonschema:"required" yaml:"min"` - Max float64 `json:"max" jsonschema:"required" yaml:"max"` - Mean float64 `json:"mean" jsonschema:"required" yaml:"mean"` - } - - BoolMetric struct { - TrueRatio float64 `json:"trueRatio" jsonschema:"required" yaml:"trueRatio"` - } -) - -// Type that a column can be. -type ValueType string - -var ColType = struct { //nolint:gochecknoglobals - String ValueType - Numeric ValueType - Bool ValueType - Undefined ValueType -}{ - String: "string", - Numeric: "numeric", - Bool: "bool", - Undefined: "undefined", -} diff --git a/pkg/model/metrics.go b/pkg/model/metrics.go new file mode 100644 index 0000000..5073cc6 --- /dev/null +++ b/pkg/model/metrics.go @@ -0,0 +1,32 @@ +package model + +type Generic struct { + Count uint `json:"count" yaml:"count" jsonschema:"required"` + Empty uint `json:"empty" yaml:"empty" jsonschema:"required"` + Null uint `json:"nulls" yaml:"nulls" jsonschema:"required"` + Distinct uint `json:"distinct,omitempty" yaml:"distinct,omitempty"` + Min any `json:"min,omitempty" yaml:"min,omitempty"` + Max any `json:"max,omitempty" yaml:"max,omitempty"` + Samples []any `json:"samples" yaml:"samples" jsonschema:"required"` +} + +type String struct { + MinLen int `json:"minLen" yaml:"minLen"` + MaxLen int `json:"maxLen" yaml:"maxLen"` + CountLen int `json:"countLen,omitempty" yaml:"countLen,omitempty"` + Lengths []StringLen `json:"lengths,omitempty" yaml:"lengths,omitempty"` +} + +type StringLen struct { + Length int `json:"length" yaml:"length" jsonschema:"required"` + Freq float64 `json:"freq" yaml:"freq" jsonschema:"required"` + Metrics Generic `json:"metrics" yaml:"metrics" jsonschema:"required"` +} + +type Numeric struct { + Mean float64 `json:"mean" yaml:"mean" jsonschema:"required"` +} + +type Bool struct { + TrueRatio float64 `json:"trueRatio" yaml:"trueRatio" jsonschema:"required"` +} diff --git a/pkg/model/schema.go b/pkg/model/schema.go new file mode 100644 index 0000000..5258dca --- /dev/null +++ b/pkg/model/schema.go @@ -0,0 +1,17 @@ +package model + +import ( + "encoding/json" + "fmt" + + "github.com/invopop/jsonschema" +) + +func GetJSONSchema() (string, error) { + resBytes, err := json.MarshalIndent(jsonschema.Reflect(&Base{}), "", " ") //nolint:exhaustruct + if err != nil { + return "", fmt.Errorf("couldn't unmarshall Base in JSON : %w", err) + } + + return string(resBytes), nil +} diff --git a/pkg/model/utils.go b/pkg/model/utils.go deleted file mode 100644 index f80a6e2..0000000 --- a/pkg/model/utils.go +++ /dev/null @@ -1,123 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package model - -import ( - "encoding/json" - "errors" - "fmt" - "os" - "sort" - - "github.com/invopop/jsonschema" - "gopkg.in/yaml.v3" -) - -func GetJSONSchema() (string, error) { - resBytes, err := json.MarshalIndent(jsonschema.Reflect(&Base{}), "", " ") //nolint:exhaustruct - if err != nil { - return "", fmt.Errorf("couldn't unmarshall Base in JSON : %w", err) - } - - return string(resBytes), nil -} - -func NewBase(name string) *Base { - return &Base{ - Name: name, - Tables: make([]Table, 0), - } -} - -var ErrBaseFormat = errors.New("error while decoding yaml file in a Base struct") - -// Can be improved. -func LoadBase(path string) (*Base, error) { - file, err := os.Open(path) - if err != nil { - return nil, fmt.Errorf("error while opening file: %w", err) - } - - decoder := yaml.NewDecoder(file) - - var base Base - - err = decoder.Decode(&base) - if err != nil { - return nil, ErrBaseFormat - } - - file.Close() - - return &base, nil -} - -func RemoveSampleFromBase(base *Base) { - for tableI, table := range base.Tables { - for columnJ, column := range table.Columns { - column.MainMetric.Sample = nil - - if column.Type == ColType.String { - for freqLen := range column.StringMetric.MostFreqLen { - column.StringMetric.MostFreqLen[freqLen].Sample = nil - } - - for freqLen := range column.StringMetric.LeastFreqLen { - column.StringMetric.LeastFreqLen[freqLen].Sample = nil - } - } - - base.Tables[tableI].Columns[columnJ] = column - } - } -} - -func (base *Base) SortBase() { - for _, table := range base.Tables { - sort.Slice(table.Columns, func(i, j int) bool { - return table.Columns[i].Name < table.Columns[j].Name - }) - } - - sort.Slice(base.Tables, func(i, j int) bool { - return base.Tables[i].Name < base.Tables[j].Name - }) -} - -func (base *Base) AddColumn(column Column, tableName string) { - mapTableName := make(map[string]int) - for index, table := range base.Tables { - mapTableName[table.Name] = index - } - - if index, ok := mapTableName[tableName]; ok { - // If the table exists, append the column to the table - base.Tables[index].Columns = append(base.Tables[index].Columns, column) - } else { - // If the table does not exist, create a new table and add it to the base - table := Table{ - Name: tableName, - Columns: []Column{column}, - } - base.Tables = append(base.Tables, table) - } -} - -// If the table does not exist, create a new table and add it to the base -// table := Table{Name: tableName, Columns: []Column{column}} -// base.Tables = append(base.Tables, table) diff --git a/pkg/model/utils_test.go b/pkg/model/utils_test.go deleted file mode 100644 index 16c619e..0000000 --- a/pkg/model/utils_test.go +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package model_test - -import ( - "testing" - - "github.com/cgi-fr/rimo/pkg/model" -) - -func TestAddColumn(t *testing.T) { - t.Parallel() - - base := model.NewBase("test_base") - - column := model.Column{ //nolint:exhaustruct - Name: "test_column", - Type: model.ColType.String, - Concept: "test_concept", - } - - tableName := "test_table" - - base.AddColumn(column, tableName) - - // fmt.Print(valast.String(base)) - - if len(base.Tables) != 1 { - t.Errorf("expected 1 table, got %d", len(base.Tables)) - } - - if base.Tables[0].Name != tableName { - t.Errorf("expected table name %q, got %q", tableName, base.Tables[0].Name) - } - - if len(base.Tables[0].Columns) != 1 { - t.Errorf("expected 1 column, got %d", len(base.Tables[0].Columns)) - } - - if base.Tables[0].Columns[0].Name != column.Name { - t.Errorf("expected column name %q, got %q", column.Name, base.Tables[0].Columns[0].Name) - } - - if base.Tables[0].Columns[0].Type != column.Type { - t.Errorf("expected column type %q, got %q", column.Type, base.Tables[0].Columns[0].Type) - } - - if base.Tables[0].Columns[0].Concept != column.Concept { - t.Errorf("expected column concept %q, got %q", column.Concept, base.Tables[0].Columns[0].Concept) - } -} diff --git a/pkg/rimo/driven.go b/pkg/rimo/driven.go index 1928b2a..fb79719 100644 --- a/pkg/rimo/driven.go +++ b/pkg/rimo/driven.go @@ -17,14 +17,19 @@ package rimo -import ( - "github.com/cgi-fr/rimo/pkg/model" -) +import "github.com/cgi-fr/rimo/pkg/model" + +type ColReader interface { + ColName() string + TableName() string + Next() bool + Value() (any, error) +} type Reader interface { BaseName() string - Next() bool // itère sur les colonnes. - Value() ([]interface{}, string, string, error) // colValues, colName, tableName + Next() bool + Col() (ColReader, error) } type Writer interface { diff --git a/pkg/rimo/driven_test.go b/pkg/rimo/driven_test.go deleted file mode 100644 index da635c9..0000000 --- a/pkg/rimo/driven_test.go +++ /dev/null @@ -1,108 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package rimo_test - -import ( - "log" - "math" - "testing" - - "github.com/cgi-fr/rimo/pkg/model" - "github.com/cgi-fr/rimo/pkg/rimo" -) - -// TESTS - -func TestTestInterface(t *testing.T) { - t.Parallel() - - var _ rimo.Reader = (*TestReader)(nil) - - var _ rimo.Writer = (*TestWriter)(nil) -} - -// TestReader implementation - -type colInput struct { - ColName string - ColValues []interface{} -} - -type TestReader struct { - baseName string - data []colInput - tableNames []string // Next() will progressively change tableName - // internal - index int - currentValues []interface{} - currentColName string - currentTableName string -} - -func (r *TestReader) BaseName() string { - return r.baseName -} - -func (r *TestReader) Next() bool { - if r.index == len(r.data) { - log.Println("End of data") - - return false - } - - // update tableName - if len(r.tableNames) == len(r.data) { - r.currentTableName = r.tableNames[r.index] - } else { - // use a percentage to determine the table name to use from the list - percentageComplete := float64(r.index) / float64(len(r.data)) - expectedTableIndex := percentageComplete * float64(len(r.tableNames)) - roundedTableIndex := math.Floor(expectedTableIndex) - tableNameIndex := int(roundedTableIndex) - - r.currentTableName = r.tableNames[tableNameIndex] - } - - r.currentColName = r.data[r.index].ColName - r.currentValues = r.data[r.index].ColValues - r.index++ - - return true -} - -func (r *TestReader) Value() ([]interface{}, string, string, error) { //nolint:wsl - // log.Printf("Processing %s column in %s table", r.currentTableName, r.currentColName) - - return r.currentValues, r.currentColName, r.currentTableName, nil -} - -// TestWriter implementation - -type TestWriter struct { - base model.Base -} - -func (w *TestWriter) Export(base *model.Base) error { - w.base = *base - - return nil -} - -func (w *TestWriter) Base() *model.Base { - return &w.base -} diff --git a/pkg/rimo/driver.go b/pkg/rimo/driver.go index e626bbc..6e28979 100644 --- a/pkg/rimo/driver.go +++ b/pkg/rimo/driver.go @@ -19,6 +19,7 @@ package rimo import ( "fmt" + "sort" "github.com/cgi-fr/rimo/pkg/metric" "github.com/cgi-fr/rimo/pkg/model" @@ -26,36 +27,103 @@ import ( "github.com/rs/zerolog/log" ) -func AnalyseBase(reader Reader, writer Writer) error { - // log.Logger = zerolog.New(os.Stdout).Level(zerolog.DebugLevel) - baseName := reader.BaseName() +type Driver struct { + SampleSize uint + Distinct bool +} - // log.Debug().Msgf("Processing [%s base]", baseName) +//nolint:funlen,cyclop,gocognit +func (d Driver) AnalyseBase(reader Reader, writer Writer) error { + baseName := reader.BaseName() base := model.NewBase(baseName) + tables := map[string]model.Table{} for reader.Next() { // itère colonne par colonne - colValues, colName, tableName, err := reader.Value() + valreader, err := reader.Col() if err != nil { - return fmt.Errorf("failed to get column value : %w", err) + return fmt.Errorf("failed to get column reader : %w", err) } - column, err := metric.ComputeMetric(colName, colValues) - if err != nil { - return fmt.Errorf("failed to compute column : %w", err) - } + nilcount := 0 + + for valreader.Next() { + val, err := valreader.Value() + if err != nil { + return fmt.Errorf("failed to read value : %w", err) + } + + log.Debug().Msgf("Processing [%s base][%s table][%s column]", baseName, valreader.TableName(), valreader.ColName()) + + switch valtyped := val.(type) { + case string: + col, err := d.AnalyseString(nilcount, valtyped, valreader) + if err != nil { + return fmt.Errorf("failed to analyse column : %w", err) + } + + table, exists := tables[valreader.TableName()] + if !exists { + table = model.Table{ + Name: valreader.TableName(), + Columns: []model.Column{}, + } + } + + table.Columns = append(table.Columns, col) - log.Debug().Msgf("Processing [%s base][%s table][%s column]", baseName, tableName, column.Name) - // log.Debug().Msg(valast.String(column)) + tables[valreader.TableName()] = table + case float64, float32, int, int8, int16, int32, int64, uint, uint8, uint16, uint32, uint64: + col, err := d.AnalyseNumeric(nilcount, valtyped, valreader) + if err != nil { + return fmt.Errorf("failed to analyse column : %w", err) + } - base.AddColumn(column, tableName) + table, exists := tables[valreader.TableName()] + if !exists { + table = model.Table{ + Name: valreader.TableName(), + Columns: []model.Column{}, + } + } + + table.Columns = append(table.Columns, col) + + tables[valreader.TableName()] = table + case bool: + col, err := d.AnalyseBool(nilcount, valtyped, valreader) + if err != nil { + return fmt.Errorf("failed to analyse column : %w", err) + } + + table, exists := tables[valreader.TableName()] + if !exists { + table = model.Table{ + Name: valreader.TableName(), + Columns: []model.Column{}, + } + } + + table.Columns = append(table.Columns, col) + + tables[valreader.TableName()] = table + case nil: + nilcount++ + } + } } - base.SortBase() + for _, table := range tables { + sort.SliceStable(table.Columns, func(i, j int) bool { + return table.Columns[i].Name < table.Columns[j].Name + }) + + base.Tables = append(base.Tables, table) + } - // log.Debug().Msg("---------- Finish processing base :") - // log.Debug().Msg(valast.String(*base)) - // log.Debug().Msg("----------") + sort.SliceStable(base.Tables, func(i, j int) bool { + return base.Tables[i].Name < base.Tables[j].Name + }) err := writer.Export(base) if err != nil { @@ -64,3 +132,164 @@ func AnalyseBase(reader Reader, writer Writer) error { return nil } + +func (d Driver) AnalyseString(nilcount int, firstValue string, reader ColReader) (model.Column, error) { + column := model.Column{ + Name: reader.ColName(), + Type: "string", + Config: model.Config{}, //nolint:exhaustruct + MainMetric: model.Generic{}, //nolint:exhaustruct + StringMetric: &model.String{}, //nolint:exhaustruct + NumericMetric: nil, + BoolMetric: nil, + } + + analyser := metric.NewString(d.SampleSize, d.Distinct) + + for i := 0; i < nilcount; i++ { + analyser.Read(nil) + } + + analyser.Read(&firstValue) + + for reader.Next() { + val, err := reader.Value() + if err != nil { + return column, fmt.Errorf("failed to read value : %w", err) + } + + switch valtyped := val.(type) { + case string: + analyser.Read(&valtyped) + case nil: + analyser.Read(nil) + default: + return column, fmt.Errorf("invalue value type : %w", err) + } + } + + analyser.Build(&column) + + return column, nil +} + +func (d Driver) AnalyseNumeric(nilcount int, firstValue any, reader ColReader) (model.Column, error) { + column := model.Column{ + Name: reader.ColName(), + Type: "numeric", + Config: model.Config{}, //nolint:exhaustruct + MainMetric: model.Generic{}, //nolint:exhaustruct + StringMetric: nil, + NumericMetric: &model.Numeric{}, //nolint:exhaustruct + BoolMetric: nil, + } + + analyser := metric.NewNumeric(d.SampleSize, d.Distinct) + + for i := 0; i < nilcount; i++ { + analyser.Read(nil) + } + + valtyped, err := GetFloat64(firstValue) + if err != nil { + return column, fmt.Errorf("failed to read value : %w", err) + } + + analyser.Read(valtyped) + + for reader.Next() { + val, err := reader.Value() + if err != nil { + return column, fmt.Errorf("failed to read value : %w", err) + } + + valtyped, err := GetFloat64(val) + if err != nil { + return column, fmt.Errorf("failed to read value : %w", err) + } + + analyser.Read(valtyped) + } + + analyser.Build(&column) + + return column, nil +} + +func (d Driver) AnalyseBool(nilcount int, firstValue bool, reader ColReader) (model.Column, error) { + column := model.Column{ + Name: reader.ColName(), + Type: "bool", + Config: model.Config{}, //nolint:exhaustruct + MainMetric: model.Generic{}, //nolint:exhaustruct + StringMetric: nil, + NumericMetric: nil, + BoolMetric: &model.Bool{}, //nolint:exhaustruct + } + + analyser := metric.NewBool(d.SampleSize, d.Distinct) + + for i := 0; i < nilcount; i++ { + analyser.Read(nil) + } + + analyser.Read(&firstValue) + + for reader.Next() { + val, err := reader.Value() + if err != nil { + return column, fmt.Errorf("failed to read value : %w", err) + } + + switch valtyped := val.(type) { + case bool: + analyser.Read(&valtyped) + case nil: + analyser.Read(nil) + default: + return column, fmt.Errorf("invalue value type : %w", err) + } + } + + analyser.Build(&column) + + return column, nil +} + +//nolint:cyclop +func GetFloat64(value any) (*float64, error) { + var converted float64 + + switch valtyped := value.(type) { + case float64: + converted = valtyped + case float32: + converted = float64(valtyped) + case int: + converted = float64(valtyped) + case int8: + converted = float64(valtyped) + case int16: + converted = float64(valtyped) + case int32: + converted = float64(valtyped) + case int64: + converted = float64(valtyped) + case uint: + converted = float64(valtyped) + case uint8: + converted = float64(valtyped) + case uint16: + converted = float64(valtyped) + case uint32: + converted = float64(valtyped) + case uint64: + converted = float64(valtyped) + case nil: + return nil, nil //nolint:nilnil + default: + return nil, fmt.Errorf("%w : %T", ErrInvalidValueType, value) + } + + return &converted, nil +} diff --git a/pkg/rimo/driver_test.go b/pkg/rimo/driver_test.go index 186e18f..f8dd16f 100644 --- a/pkg/rimo/driver_test.go +++ b/pkg/rimo/driver_test.go @@ -1,20 +1,3 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - package rimo_test import ( @@ -24,15 +7,11 @@ import ( "time" "github.com/cgi-fr/rimo/internal/infra" - "github.com/cgi-fr/rimo/pkg/model" "github.com/cgi-fr/rimo/pkg/rimo" - - "github.com/hexops/valast" - "github.com/stretchr/testify/assert" + "github.com/rs/zerolog" "github.com/stretchr/testify/require" ) -// Run Analyse pipeline with FilesReader and TestWriter and compare with expected result. const ( dataDir = "../../testdata/" inputName = "data_input.jsonl" @@ -40,129 +19,28 @@ const ( expectedName = "data_expected.yaml" ) -type testCase struct { - name string - inputPath string - expectedPath string -} - -func getTestCase(dataFolder string) testCase { - return testCase{ - name: filepath.Base(dataFolder), - inputPath: filepath.Join(dataFolder, inputName), - expectedPath: filepath.Join(dataFolder, expectedName), - } -} - -// PIPELINE TESTS - -// Note : numeric value should be converted to float64. -func TestManualPipeline(t *testing.T) { - t.Parallel() - - // Set up TestReader - baseName := "databaseName" - tableNames := []string{"tableTest"} - testInput := []colInput{ - { - ColName: "string", - ColValues: []interface{}{"val1", "val2", "val3"}, - }, - { - ColName: "col2", - ColValues: []interface{}{true, false, nil}, - }, - { - ColName: "col9", - ColValues: []interface{}{float64(31), float64(29), float64(42)}, - }, - { - ColName: "empty", - ColValues: []interface{}{nil, nil, nil}, - }, - } - - testReader := TestReader{ //nolint:exhaustruct - baseName: baseName, - tableNames: tableNames, - data: testInput, - index: 0, - } - - testWriter := TestWriter{} //nolint:exhaustruct - - err := rimo.AnalyseBase(&testReader, &testWriter) - if err != nil { - t.Errorf("Error: %v", err) - } - - t.Logf("Base returned : %s", valast.String(*testWriter.Base())) -} - -// Ensure that the pipeline produce the same base as expected. -func TestPipeline(t *testing.T) { - t.Parallel() - - testCases := []testCase{} - testCases = append(testCases, getTestCase("../../testdata/data1/")) - // testCases = append(testCases, getTestCase("../../testdata/data2/")) - - for _, testCase := range testCases { - testCase := testCase // capture range variable - t.Run(testCase.name, func(t *testing.T) { - t.Parallel() - - // Actual base - - reader, err := infra.FilesReaderFactory([]string{testCase.inputPath}) - assert.NoError(t, err) - - writer := &TestWriter{} //nolint:exhaustruct - - err = rimo.AnalyseBase(reader, writer) - assert.NoError(t, err) - - actualBase := writer.Base() - - // Expected base - expectedBase, err := model.LoadBase(testCase.expectedPath) - if err != nil { - t.Errorf("Error: %v", err) - } - - // Remove sample - model.RemoveSampleFromBase(expectedBase) - model.RemoveSampleFromBase(actualBase) - - fmt.Printf("Actual base : %s\n", valast.String(*actualBase)) - // Compare - equal, diff := model.SameBase(expectedBase, actualBase) - if !equal { - t.Errorf("Base are not equal:\n%s", diff) - } - }) - } -} - // Benchmark (same as previous analyse_test.go benchmark). func BenchmarkAnalyseInterface(b *testing.B) { - for _, numLines := range []int{100, 1000, 10000, 100000} { - inputPath := filepath.Join(dataDir, fmt.Sprintf("benchmark/mixed/%d_input.jsonl", numLines)) - inputList := []string{inputPath} + zerolog.SetGlobalLevel(zerolog.WarnLevel) + + for _, numLines := range []int{100, 1000, 10000} { + inputPath := filepath.Join(dataDir, fmt.Sprintf("benchmark/mixed/%d", numLines)) outputPath := filepath.Join(dataDir, fmt.Sprintf("benchmark/mixed/%dinterface_output.yaml", numLines)) b.Run(fmt.Sprintf("numLines=%d", numLines), func(b *testing.B) { startTime := time.Now() - reader, err := infra.FilesReaderFactory(inputList) + reader, err := infra.NewJSONLFolderReader(inputPath) require.NoError(b, err) writer, err := infra.YAMLWriterFactory(outputPath) require.NoError(b, err) + driver := rimo.Driver{SampleSize: 5, Distinct: true} + b.ResetTimer() for n := 0; n < b.N; n++ { - err := rimo.AnalyseBase(reader, writer) + err := driver.AnalyseBase(reader, writer) require.NoError(b, err) } b.StopTimer() diff --git a/pkg/rimo/error.go b/pkg/rimo/error.go new file mode 100644 index 0000000..083daa5 --- /dev/null +++ b/pkg/rimo/error.go @@ -0,0 +1,5 @@ +package rimo + +import "errors" + +var ErrInvalidValueType = errors.New("invalue value type") diff --git a/schema/v1/rimo.schema.json b/schema/v1/rimo.schema.json index be205df..187194b 100644 --- a/schema/v1/rimo.schema.json +++ b/schema/v1/rimo.schema.json @@ -22,7 +22,7 @@ "tables" ] }, - "BoolMetric": { + "Bool": { "properties": { "trueRatio": { "type": "number" @@ -55,16 +55,16 @@ "type": "boolean" }, "mainMetric": { - "$ref": "#/$defs/GenericMetric" + "$ref": "#/$defs/Generic" }, "stringMetric": { - "$ref": "#/$defs/StringMetric" + "$ref": "#/$defs/String" }, "numericMetric": { - "$ref": "#/$defs/NumericMetric" + "$ref": "#/$defs/Numeric" }, "boolMetric": { - "$ref": "#/$defs/BoolMetric" + "$ref": "#/$defs/Bool" } }, "additionalProperties": false, @@ -78,7 +78,7 @@ "mainMetric" ] }, - "GenericMetric": { + "Generic": { "properties": { "count": { "type": "integer" @@ -86,10 +86,15 @@ "empty": { "type": "integer" }, - "unique": { + "nulls": { "type": "integer" }, - "sample": { + "distinct": { + "type": "integer" + }, + "min": true, + "max": true, + "samples": { "items": true, "type": "array" } @@ -99,41 +104,12 @@ "required": [ "count", "empty", - "unique", - "sample" + "nulls", + "samples" ] }, - "LenFreq": { + "Numeric": { "properties": { - "length": { - "type": "integer" - }, - "freq": { - "type": "number" - }, - "sample": { - "items": { - "type": "string" - }, - "type": "array" - } - }, - "additionalProperties": false, - "type": "object", - "required": [ - "length", - "freq", - "sample" - ] - }, - "NumericMetric": { - "properties": { - "min": { - "type": "number" - }, - "max": { - "type": "number" - }, "mean": { "type": "number" } @@ -141,12 +117,10 @@ "additionalProperties": false, "type": "object", "required": [ - "min", - "max", "mean" ] }, - "StringMetric": { + "String": { "properties": { "minLen": { "type": "integer" @@ -154,15 +128,12 @@ "maxLen": { "type": "integer" }, - "mostFrequentLen": { - "items": { - "$ref": "#/$defs/LenFreq" - }, - "type": "array" + "countLen": { + "type": "integer" }, - "leastFrequentLen": { + "lengths": { "items": { - "$ref": "#/$defs/LenFreq" + "$ref": "#/$defs/StringLen" }, "type": "array" } @@ -171,9 +142,27 @@ "type": "object", "required": [ "minLen", - "maxLen", - "mostFrequentLen", - "leastFrequentLen" + "maxLen" + ] + }, + "StringLen": { + "properties": { + "length": { + "type": "integer" + }, + "freq": { + "type": "number" + }, + "metrics": { + "$ref": "#/$defs/Generic" + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "length", + "freq", + "metrics" ] }, "Table": { diff --git a/test/suites/cli/metrics.yml b/test/suites/cli/metrics.yml new file mode 100644 index 0000000..108eeac --- /dev/null +++ b/test/suites/cli/metrics.yml @@ -0,0 +1,244 @@ +name: test metrics +testcases: + - name: main metric + steps: + - script: rimo analyse -d ../testdata/main ../testdata/main/output + assertions: + - result.code ShouldEqual 0 + - script: yq '.tables[0].name' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "data" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[0].name' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "bool" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[0].type' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "bool" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[0].mainMetric.count' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "10" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[0].mainMetric.empty' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "4" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[0].mainMetric.nulls' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "1" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[0].mainMetric.distinct' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "2" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[0].mainMetric.samples|length' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "5" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[0].boolMetric.trueRatio' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "0.5555555555555556" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[1].name' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "numeric" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[1].type' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "numeric" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[1].mainMetric.count' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "10" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[1].mainMetric.empty' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "3" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[1].mainMetric.nulls' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "1" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[1].mainMetric.distinct' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "7" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[1].mainMetric.min' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "-235" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[1].mainMetric.max' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "100" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[1].mainMetric.samples|length' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "5" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[1].numericMetric.mean' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "-13.539833097777777" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].name' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "string" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].type' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "string" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].mainMetric.count' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "10" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].mainMetric.empty' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "1" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].mainMetric.nulls' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "1" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].mainMetric.distinct' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "9" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].mainMetric.min' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].mainMetric.max' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "教育漢字" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].mainMetric.samples|length' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "5" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.minLen' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "0" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.maxLen' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "12" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.countLen' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "3" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths|length' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "3" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[0].length' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "12" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[0].freq' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "0.5" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[0].metrics.count' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "5" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[0].metrics.empty' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "0" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[0].metrics.nulls' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "0" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[0].metrics.distinct' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "5" + - result.code ShouldEqual 0 + - script: yq -o json '.tables[0].columns[2].stringMetric.lengths[0].metrics.min' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual '"hello world "' + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[0].metrics.max' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "教育漢字" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[0].metrics.samples|length' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "5" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[1].length' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "1" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[1].freq' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "0.3" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[1].metrics.count' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "3" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[1].metrics.empty' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "0" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[1].metrics.nulls' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "0" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[1].metrics.distinct' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "3" + - result.code ShouldEqual 0 + - script: yq -o json '.tables[0].columns[2].stringMetric.lengths[1].metrics.min' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual '" "' + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[1].metrics.max' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "_" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[1].metrics.samples|length' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "3" + - result.code ShouldEqual 0 + + - script: yq '.tables[0].columns[2].stringMetric.lengths[2].length' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "0" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[2].freq' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "0.1" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[2].metrics.count' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "1" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[2].metrics.empty' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "1" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[2].metrics.nulls' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "0" + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[2].metrics.distinct' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "1" + - result.code ShouldEqual 0 + - script: yq -o json '.tables[0].columns[2].stringMetric.lengths[2].metrics.min' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual '""' + - result.code ShouldEqual 0 + - script: yq -o json '.tables[0].columns[2].stringMetric.lengths[2].metrics.max' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual '""' + - result.code ShouldEqual 0 + - script: yq '.tables[0].columns[2].stringMetric.lengths[2].metrics.samples|length' ../testdata/main/output/main.yaml + assertions: + - result.systemout ShouldEqual "1" + - result.code ShouldEqual 0 diff --git a/test/suites/testdata/main/data.jsonl b/test/suites/testdata/main/data.jsonl new file mode 100644 index 0000000..260040e --- /dev/null +++ b/test/suites/testdata/main/data.jsonl @@ -0,0 +1,10 @@ +{"string":"","bool":true,"numeric":0} +{"string":" ","bool":false,"numeric":1} +{"string":"_","bool":true,"numeric":3.1415} +{"string":"new\nline ","bool":false,"numeric":1.0E+2} +{"string":"hello world ","bool":true,"numeric":21.2e-7} +{"string":"tabs\t ","bool":false,"numeric":-235} +{"string":"教育漢字","bool":true,"numeric":-0} +{"string":"\\","bool":false,"numeric":0.0} +{"string":"\u20ac ","bool":true,"numeric":9.0} +{"string":null,"bool":null,"numeric":null} diff --git a/test/suites/testdata/main/output/.gitkeep b/test/suites/testdata/main/output/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/testdata/benchmark/buildBenchData.sh b/testdata/benchmark/buildBenchData.sh index fa046ce..60b2138 100755 --- a/testdata/benchmark/buildBenchData.sh +++ b/testdata/benchmark/buildBenchData.sh @@ -3,53 +3,57 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) cd "${SCRIPT_DIR}/mixed/" -if [ ! -f 100_input.jsonl ]; then - pimo --empty-input --repeat=100 > 100_input.jsonl +mkdir 100 1000 10000 100000 +if [ ! -f 100/input.jsonl ]; then + pimo --empty-input --repeat=100 > 100/input.jsonl fi -if [ ! -f 1000_input.jsonl ]; then - pimo --empty-input --repeat=1000 > 1000_input.jsonl +if [ ! -f 1000/input.jsonl ]; then + pimo --empty-input --repeat=1000 > 1000/input.jsonl fi -if [ ! -f 10000_input.jsonl ]; then - pimo --empty-input --repeat=10000 > 10000_input.jsonl +if [ ! -f 10000/input.jsonl ]; then + pimo --empty-input --repeat=10000 > 10000/input.jsonl fi -if [ ! -f 100000_input.jsonl ]; then - pimo --empty-input --repeat=100000 > 100000_input.jsonl +if [ ! -f 100000/input.jsonl ]; then + pimo --empty-input --repeat=100000 > 100000/input.jsonl fi echo "data for mixed : OK" cd "${SCRIPT_DIR}/bool/" -if [ ! -f 100_input.jsonl ]; then - pimo --empty-input --repeat=100 > 100_input.jsonl +mkdir 100 1000 10000 100000 +if [ ! -f 100/input.jsonl ]; then + pimo --empty-input --repeat=100 > 100/input.jsonl fi -if [ ! -f 1000_input.jsonl ]; then - pimo --empty-input --repeat=1000 > 1000_input.jsonl +if [ ! -f 1000/input.jsonl ]; then + pimo --empty-input --repeat=1000 > 1000/input.jsonl fi -if [ ! -f 10000_input.jsonl ]; then - pimo --empty-input --repeat=10000 > 10000_input.jsonl +if [ ! -f 10000/input.jsonl ]; then + pimo --empty-input --repeat=10000 > 10000/input.jsonl fi echo "data for mixed : OK" cd "${SCRIPT_DIR}/numeric/" -if [ ! -f 100_input.jsonl ]; then - pimo --empty-input --repeat=100 > 100_input.jsonl +mkdir 100 1000 10000 100000 +if [ ! -f 100/input.jsonl ]; then + pimo --empty-input --repeat=100 > 100/input.jsonl fi -if [ ! -f 1000_input.jsonl ]; then - pimo --empty-input --repeat=1000 > 1000_input.jsonl +if [ ! -f 1000/input.jsonl ]; then + pimo --empty-input --repeat=1000 > 1000/input.jsonl fi -if [ ! -f 10000_input.jsonl ]; then - pimo --empty-input --repeat=10000 > 10000_input.jsonl +if [ ! -f 10000/input.jsonl ]; then + pimo --empty-input --repeat=10000 > 10000/input.jsonl fi echo "data for numeric : OK" cd "${SCRIPT_DIR}/text/" -if [ ! -f 100_input.jsonl ]; then - pimo --empty-input --repeat=100 > 100_input.jsonl +mkdir 100 1000 10000 100000 +if [ ! -f 100/input.jsonl ]; then + pimo --empty-input --repeat=100 > 100/input.jsonl fi -if [ ! -f 1000_input.jsonl ]; then - pimo --empty-input --repeat=1000 > 1000_input.jsonl +if [ ! -f 1000/input.jsonl ]; then + pimo --empty-input --repeat=1000 > 1000/input.jsonl fi -if [ ! -f 10000_input.jsonl ]; then - pimo --empty-input --repeat=10000 > 10000_input.jsonl +if [ ! -f 10000/input.jsonl ]; then + pimo --empty-input --repeat=10000 > 10000/input.jsonl fi echo "data generated for text : OK"