diff --git a/cmd/rimo/main.go b/cmd/rimo/main.go index 0b562c9..e75290a 100644 --- a/cmd/rimo/main.go +++ b/cmd/rimo/main.go @@ -22,9 +22,9 @@ import ( "os" "path/filepath" - "github.com/cgi-fr/rimo/pkg/analyse" - "github.com/cgi-fr/rimo/pkg/io" + "github.com/cgi-fr/rimo/internal/infra" "github.com/cgi-fr/rimo/pkg/model" + "github.com/cgi-fr/rimo/pkg/rimo" "github.com/rs/zerolog" "github.com/rs/zerolog/log" "github.com/spf13/cobra" @@ -67,6 +67,7 @@ func main() { //nolint:funlen }, } + // Make use of interface instead of analyse/pkg rimoAnalyseCmd := &cobra.Command{ //nolint:exhaustruct Use: "analyse [inputDir] [outputDir]", Short: "Generate a rimo.yaml from a directory of .jsonl files", @@ -75,21 +76,33 @@ func main() { //nolint:funlen inputDir := args[0] outputDir := args[1] - // List .jsonl files in input directory - if err := io.ValidateDirPath(inputDir); err != nil { - log.Fatal().Msgf("error validating input directory: %v", err) - } + // Reader - inputList, err := FilesList(inputDir, ".jsonl") + inputList, err := BuildFilepathList(inputDir, ".jsonl") if err != nil { log.Fatal().Msgf("error listing files: %v", err) } - if len(inputList) == 0 { - log.Fatal().Msgf("no .jsonl files found in %s", inputDir) + reader, err := infra.FilesReaderFactory(inputList) + if err != nil { + log.Fatal().Msgf("error creating reader: %v", err) + } + + // Writer + // (could be relocated to infra.FilesReader) + baseName, _, err := infra.ExtractName(inputList[0]) + if err != nil { + log.Fatal().Msgf("error extracting base name: %v", err) + } + + outputPath := filepath.Join(outputDir, fmt.Sprintf("%s.yaml", baseName)) + + writer, err := infra.YAMLWriterFactory(outputPath) + if err != nil { + log.Fatal().Msgf("error creating writer: %v", err) } - err = analyse.Orchestrator(inputList, outputDir) + err = rimo.AnalyseBase(reader, writer) if err != nil { log.Fatal().Msgf("error generating rimo.yaml: %v", err) } @@ -117,3 +130,44 @@ func FilesList(path string, extension string) ([]string, 
error) {
 	return files, nil
 }
+
+// ErrNoFile is wrapped by BuildFilepathList when no file in the directory
+// matches the requested extension.
+var ErrNoFile = fmt.Errorf("no file found")
+
+// BuildFilepathList validates path with ValidateDirPath, then returns every
+// entry matching "<path>/*<extension>" as reported by filepath.Glob.
+// It returns an error wrapping ErrNoFile when the pattern matches nothing.
+func BuildFilepathList(path string, extension string) ([]string, error) {
+	err := ValidateDirPath(path)
+	if err != nil {
+		return nil, fmt.Errorf("failed to validate input directory: %w", err)
+	}
+
+	pattern := filepath.Join(path, "*"+extension)
+
+	files, err := filepath.Glob(pattern)
+	if err != nil {
+		return nil, fmt.Errorf("error listing files: %w", err)
+	}
+
+	if len(files) == 0 {
+		return nil, fmt.Errorf("%w : no %s files found in %s", ErrNoFile, extension, path)
+	}
+
+	return files, nil
+}
+
+// ValidateDirPath checks that path exists, is a directory and has the
+// infra.WriteDirPerm permission bit set. Each failure wraps the matching
+// infra sentinel error.
+//
+// NOTE(review): BuildFilepathList calls this on the input directory, which is
+// only read from; confirm that requiring the write bit there is intended.
+func ValidateDirPath(path string) error {
+	fileInfo, err := os.Stat(path)
+	if os.IsNotExist(err) {
+		return fmt.Errorf("%w: %s", infra.ErrDirDoesNotExist, path)
+	} else if err != nil {
+		return fmt.Errorf("failed to get directory info: %w", err)
+	}
+
+	if !fileInfo.IsDir() {
+		return fmt.Errorf("%w: %s", infra.ErrPathIsNotDir, path)
+	}
+
+	if fileInfo.Mode().Perm()&infra.WriteDirPerm != infra.WriteDirPerm {
+		return fmt.Errorf("%w: %s", infra.ErrWriteDirPermission, path)
+	}
+
+	return nil
+}
diff --git a/internal/infra/fileWriter.go b/internal/infra/fileWriter.go
new file mode 100644
index 0000000..d652811
--- /dev/null
+++ b/internal/infra/fileWriter.go
@@ -0,0 +1,81 @@
+// Copyright (C) 2023 CGI France
+//
+// This file is part of RIMO.
+//
+// RIMO is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RIMO is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RIMO. If not, see .
+
+package infra
+
+import (
+	"fmt"
+	"os"
+
+	"github.com/cgi-fr/rimo/pkg/model"
+	"gopkg.in/yaml.v3"
+)
+
+// Terminal writer.
+
+// StdoutWriter exports a base by printing it on standard output.
+type StdoutWriter struct{}
+
+// StdoutWriterFactory returns a ready-to-use StdoutWriter.
+func StdoutWriterFactory() *StdoutWriter {
+	writer := StdoutWriter{}
+
+	return &writer
+}
+
+// Export prints base with the %v verb followed by a newline. It never fails.
+func (w *StdoutWriter) Export(base *model.Base) error {
+	fmt.Printf("%v\n", base)
+
+	return nil
+}
+
+// YAML writer.
+
+// YAMLWriter exports a base as a YAML file located at outputPath.
+type YAMLWriter struct {
+	outputPath string
+}
+
+// YAMLWriterFactory checks filepath with ValidateOutputPath and returns a
+// YAMLWriter targeting it. The file itself is only created by Export.
+func YAMLWriterFactory(filepath string) (*YAMLWriter, error) {
+	err := ValidateOutputPath(filepath)
+	if err != nil {
+		return nil, fmt.Errorf("failed to validate file path: %w", err)
+	}
+
+	writer := YAMLWriter{
+		outputPath: filepath,
+	}
+
+	return &writer, nil
+}
+
+// Export writes base as YAML to outputPath, creating or truncating the file.
+// Errors raised while flushing the encoder or closing the file are reported
+// instead of being dropped, so a truncated output is never returned as a success.
+func (w *YAMLWriter) Export(base *model.Base) (err error) {
+	outputFile, err := os.Create(w.outputPath)
+	if err != nil {
+		return fmt.Errorf("failed to create output file: %w", err)
+	}
+
+	// Always close the file; surface the close error only when nothing failed before it.
+	defer func() {
+		if closeErr := outputFile.Close(); closeErr != nil && err == nil {
+			err = fmt.Errorf("failed to close output file: %w", closeErr)
+		}
+	}()
+
+	// Encode Base to YAML.
+	encoder := yaml.NewEncoder(outputFile)
+
+	if err = encoder.Encode(base); err != nil {
+		return fmt.Errorf("failed to encode Base to YAML: %w", err)
+	}
+
+	// Close writes any data still buffered by the encoder.
+	if err = encoder.Close(); err != nil {
+		return fmt.Errorf("failed to flush YAML encoder: %w", err)
+	}
+
+	return nil
+}
diff --git a/pkg/io/export_test.go b/internal/infra/fileWriter_test.go
similarity index 88%
rename from pkg/io/export_test.go
rename to internal/infra/fileWriter_test.go
index e07f353..cd34651 100644
--- a/pkg/io/export_test.go
+++ b/internal/infra/fileWriter_test.go
@@ -15,14 +15,14 @@
 // You should have received a copy of the GNU General Public License
 // along with RIMO. If not, see .
-package io_test +package infra_test import ( "os" "path/filepath" "testing" - "github.com/cgi-fr/rimo/pkg/io" + "github.com/cgi-fr/rimo/internal/infra" "github.com/cgi-fr/rimo/pkg/model" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -32,7 +32,7 @@ const ( dataDir = "../../testdata/" ) -func TestExport(t *testing.T) { +func TestWriterYAML(t *testing.T) { t.Parallel() base := model.Base{ @@ -54,8 +54,11 @@ func TestExport(t *testing.T) { // Create a temporary file for the output outputFile := filepath.Join(tempDir, "output.yaml") - // Export the base to the output file - err = io.Export(base, outputFile) + // Create the writer + writer, err := infra.YAMLWriterFactory(outputFile) + require.NoError(t, err) + + err = writer.Export(&base) require.NoError(t, err) // Read the output file and check its contents diff --git a/internal/infra/filesReader.go b/internal/infra/filesReader.go new file mode 100644 index 0000000..8430ae9 --- /dev/null +++ b/internal/infra/filesReader.go @@ -0,0 +1,173 @@ +// Copyright (C) 2023 CGI France +// +// This file is part of RIMO. +// +// RIMO is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// RIMO is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with RIMO. If not, see . + +package infra + +import ( + "errors" + "fmt" +) + +// Errors declaration. 
+var (
+	ErrInvalidFilePath = errors.New("failed to validate path")
+	ErrNoFilePath      = errors.New("no file path provided")
+	ErrNonUniqueBase   = errors.New("base name is not unique")
+)
+
+// FilesReader feeds rimo with the content of a list of files, one column at a time:
+//   - BaseName() returns the base name shared by every file;
+//   - Next() moves to the next column and reports whether there is one;
+//   - Value() returns the values of the current column, the column name and the table name.
+//
+// Files are decoded by a JSONLinesLoader, so only JSON Lines input is supported.
+// Build a FilesReader with FilesReaderFactory.
+type FilesReader struct {
+	filepathList []string
+	loader       JSONLinesLoader // responsible for loading a file format
+	baseName     string
+	// variables for looping over columns
+	fileIndex       int
+	colNameMapIndex map[int]string // map of column name by index
+	colIndex        int            // value of current column index
+	// given by Value()
+	dataMap   map[string][]interface{}
+	tableName string // filled by FilesReader
+}
+
+// FilesReaderFactory builds a FilesReader over filepathList.
+// It returns ErrNoFilePath when the list is empty, an error wrapping
+// ErrInvalidFilePath when ValidateFilePath rejects one of the paths, and an
+// error when the base name cannot be extracted or differs between files.
+func FilesReaderFactory(filepathList []string) (*FilesReader, error) {
+	var err error
+
+	// Process filepathList
+	if len(filepathList) == 0 {
+		return nil, ErrNoFilePath
+	}
+
+	for _, path := range filepathList {
+		// Keep the cause (which path, which check) in the message instead of dropping it.
+		if err := ValidateFilePath(path); err != nil {
+			return nil, fmt.Errorf("%w: %v", ErrInvalidFilePath, err)
+		}
+	}
+
+	// Initialize FilesReader
+	var filesReader FilesReader
+	filesReader.filepathList = filepathList
+	filesReader.fileIndex = -1 // no file opened yet, see Next()
+
+	filesReader.baseName, err = filesReader.isBaseUnique()
+	if err != nil {
+		return nil, fmt.Errorf("base is not unique: %w", err)
+	}
+
+	// Use of JSONLinesLoader
+	filesReader.loader = JSONLinesLoader{}
+
+	return &filesReader, nil
+}
+
+// Reader interface implementation
+
+// BaseName returns the base name extracted from the file names.
+func (r *FilesReader) BaseName() string {
+	return r.baseName
+}
+
+// Next moves the cursor to the next column and reports whether Value() can be called.
+// The column count of the current file comes from the dataMap loaded by Value(),
+// so Value() is expected to be called after each successful Next().
+func (r *FilesReader) Next() bool {
+	// First call to Next()
+	if r.fileIndex == -1 {
+		r.fileIndex = 0
+		r.colIndex = 0
+
+		return true
+	}
+
+	// Current file contains columns left to process.
+	if r.colIndex < len(r.dataMap) {
+		r.colIndex++
+	}
+
+	// Current file contains no columns left to process.
+	if r.colIndex == len(r.dataMap) {
+		// Current file is last file.
+		if r.fileIndex == len(r.filepathList)-1 {
+			return false
+		}
+		// There is a next file.
+		r.fileIndex++
+		r.colIndex = 0
+	}
+
+	return true
+}
+
+// Value loads the files one at a time into dataMap.
+// It returns the values of the current column, its name and the table name.
+// A file that cannot be parsed or loaded is reported through the returned error.
+func (r *FilesReader) Value() ([]interface{}, string, string, error) {
+	var err error
+
+	// colIndex = 0 : new file to load
+	if r.colIndex == 0 {
+		filepath := r.filepathList[r.fileIndex]
+
+		// Extract table name from file name
+		_, r.tableName, err = ExtractName(filepath)
+		if err != nil {
+			return nil, "", "", fmt.Errorf("failed to extract table name: %w", err)
+		}
+
+		// Load file in dataMap
+		r.dataMap, err = r.loader.Load(filepath)
+		if err != nil {
+			return nil, "", "", fmt.Errorf("failed to load %s: %w", filepath, err)
+		}
+
+		// Create a map of column name by index.
+		// Map iteration order is unspecified, so the order in which columns
+		// are returned is not stable from one run to the next.
+		r.colNameMapIndex = make(map[int]string, len(r.dataMap))
+		i := 0
+
+		for k := range r.dataMap {
+			r.colNameMapIndex[i] = k
+			i++
+		}
+	}
+
+	// colIndex = n : current file has been partially processed
+	currentColName := r.colNameMapIndex[r.colIndex]
+	// return values, colName, tableName
+	return r.dataMap[currentColName], currentColName, r.tableName, nil
+}
+
+// isBaseUnique returns the base name of the first file, or an error wrapping
+// ErrNonUniqueBase when another file yields a different base name.
+func (r *FilesReader) isBaseUnique() (string, error) {
+	baseName, _, err := ExtractName(r.filepathList[0])
+	if err != nil {
+		return "", err
+	}
+
+	for _, path := range r.filepathList {
+		baseNameI, _, err := ExtractName(path)
+		if err != nil {
+			return "", err
+		}
+
+		if baseName != baseNameI {
+			return "", fmt.Errorf("%w : %s and %s", ErrNonUniqueBase, baseName, baseNameI)
+		}
+	}
+
+	return baseName, nil
+}
diff --git a/internal/infra/filesReader_test.go b/internal/infra/filesReader_test.go
new file mode 100644
index 0000000..ed6cdc0
--- /dev/null
+++ b/internal/infra/filesReader_test.go
@@ -0,0 +1,83 @@
+// Copyright (C) 2023 CGI France
+//
+// This file is part of RIMO.
+//
+// RIMO is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RIMO is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RIMO. If not, see .
+
+package infra_test
+
+import (
+	"fmt"
+	"path/filepath"
+	"testing"
+
+	"github.com/cgi-fr/rimo/internal/infra"
+	"github.com/stretchr/testify/assert"
+)
+
+// TestReader reads a single file through FilesReader and checks the base name,
+// the table name and the values of every expected column.
+func TestReader(t *testing.T) {
+	t.Parallel()
+
+	inputFile := filepath.Join(dataDir, "data0/data_input.jsonl")
+
+	reader, err := infra.FilesReaderFactory([]string{inputFile})
+	if !assert.NoError(t, err) {
+		return // reader is nil on error, nothing else can be checked
+	}
+
+	// Assertions.
+
+	actualBaseName := reader.BaseName()
+	expectedBaseName := "data"
+	assert.Equal(t, expectedBaseName, actualBaseName)
+
+	expectedTableName := "input"
+	expectedDataMap := map[string][]interface{}{
+		"address": {"PSC", "095", "06210"},
+		"age":     {nil, nil, float64(61)},
+		"major":   {true, false, true},
+		"empty":   {nil, nil, nil},
+	}
+
+	// Count matched columns so that a reader yielding nothing cannot pass silently.
+	seenColumns := 0
+
+	for reader.Next() {
+		values, colName, tableName, err := reader.Value()
+		if !assert.NoError(t, err) {
+			return
+		}
+
+		expectedColData, ok := expectedDataMap[colName]
+		if !ok {
+			assert.Failf(t, "column name not found", "unexpected column: %s", colName)
+
+			continue
+		}
+
+		seenColumns++
+
+		assert.Equal(t, expectedColData, values)
+		assert.Equal(t, expectedTableName, tableName)
+	}
+
+	assert.Equal(t, len(expectedDataMap), seenColumns)
+}
+
+// TestReaderMultipleFiles iterates over two files of the same base and prints
+// every column; it only checks that no error is returned.
+func TestReaderMultipleFiles(t *testing.T) {
+	t.Parallel()
+
+	inputFile := filepath.Join(dataDir, "data0/data_input.jsonl")
+	inputFile2 := filepath.Join(dataDir, "data0/data_input2.jsonl")
+
+	reader, err := infra.FilesReaderFactory([]string{inputFile, inputFile2})
+	if !assert.NoError(t, err) {
+		return // reader is nil on error
+	}
+
+	for reader.Next() {
+		values, colName, tableName, err := reader.Value()
+		if !assert.NoError(t, err) {
+			return
+		}
+
+		fmt.Printf("%s.%s: %v\n", tableName, colName, values)
+	}
+}
diff --git a/internal/infra/infra_test.go b/internal/infra/infra_test.go
new file mode 100644
index 0000000..ad40f79
--- /dev/null
+++ b/internal/infra/infra_test.go
@@ -0,0 +1,93 @@
+// Copyright (C) 2023 CGI France
+//
+// This file is part of RIMO.
+//
+// RIMO is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RIMO is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RIMO. If not, see .
+
+package infra_test
+
+import (
+	"path/filepath"
+	"testing"
+
+	"github.com/cgi-fr/rimo/internal/infra"
+	"github.com/cgi-fr/rimo/pkg/rimo"
+	"github.com/stretchr/testify/require"
+)
+
+const (
+	testdataDir = "../../testdata/"
+)
+
+// TestPipeline runs rimo.AnalyseBase with a FilesReader built on a single
+// .jsonl file and a StdoutWriter; it only checks that no error is returned.
+func TestPipeline(t *testing.T) {
+	t.Parallel()
+
+	inputPath := filepath.Join(testdataDir, "data1/data_input.jsonl")
+
+	reader, err := infra.FilesReaderFactory([]string{inputPath})
+	require.NoError(t, err)
+
+	writer := infra.StdoutWriterFactory()
+
+	err = rimo.AnalyseBase(reader, writer)
+	require.NoError(t, err)
+}
+
+// var (
+// 	Readers []*rimo.Reader
+// 	Writers []*rimo.Writer
+// )
+
+// // List of implemented readers and writers.
+// func GetReaders(filepathList []string) []*rimo.Reader { +// filesReader, err := infra.FilesReaderFactory(filepathList) +// if err != nil { +// panic(err) +// } + +// Readers = []*rimo.Reader{filesReader} + +// return Readers +// } + +// func GetWriters() []*rimo.Writer { +// yamlWriter := infra.YAMLWriterFactory("../../testdata/data1/data_output.yaml") + +// Writers = []*rimo.Writer{yamlWriter, infra.StdoutWriter{}} + +// return Writers +// } + +// func TestInterface(t *testing.T) { +// t.Parallel() + +// Writers = GetWriters() +// Readers = GetReaders([]string{"../../testdata/data1/data_input.jsonl"}) +// // Assert that all readers and writers implement the Reader and Writer interfaces. +// for _, reader := range Readers { +// var _ rimo.Reader = (reader)(nil) +// } +// for _, writer := range Writers { +// var _ rimo.Reader = (writer)(nil) +// } + +// // Assert that all combinations of readers and writers can be used in the pipeline. +// for _, reader := range Readers { +// for _, writer := range Writers { +// err := rimo.AnalyseBase(reader, writer) +// require.NoError(t, err) +// } +// } +// } diff --git a/pkg/io/load.go b/internal/infra/loader.go similarity index 82% rename from pkg/io/load.go rename to internal/infra/loader.go index 7890ed5..f15ea38 100644 --- a/pkg/io/load.go +++ b/internal/infra/loader.go @@ -15,7 +15,7 @@ // You should have received a copy of the GNU General Public License // along with RIMO. If not, see . -package io +package infra import ( "bufio" @@ -39,8 +39,14 @@ var ( type DataMap map[string][]interface{} -// Load .jsonl and return DataMap. -func Load(inputPath string) (DataMap, error) { +// JSONLinesLoader loads JSON lines files with this format : { "col_name1" : value1, "col_name2" : value1, ... }. 
+// It may be interesting performance wise to use this format : +// "col_name1" : [value1, value2, ...], +// "col_name2" : [value1, value2, ...], + +type JSONLinesLoader struct{} + +func (l *JSONLinesLoader) Load(inputPath string) (DataMap, error) { file, err := os.Open(inputPath) if err != nil { return nil, fmt.Errorf("couldn't load %s : %w", inputPath, err) @@ -49,7 +55,7 @@ func Load(inputPath string) (DataMap, error) { scanner := bufio.NewScanner(file) - data, err := LoadJSONLines(scanner) + data, err := l.LoadJSONLines(scanner) if err != nil { return nil, err } @@ -58,7 +64,7 @@ func Load(inputPath string) (DataMap, error) { } // Reads JSON lines structure: { "col_name1" : value1, "col_name2" : value1, ... }. -func LoadJSONLines(scanner *bufio.Scanner) (DataMap, error) { +func (l *JSONLinesLoader) LoadJSONLines(scanner *bufio.Scanner) (DataMap, error) { var data map[string][]interface{} = DataMap{} lineNumber := 0 diff --git a/pkg/io/load_test.go b/internal/infra/loader_test.go similarity index 74% rename from pkg/io/load_test.go rename to internal/infra/loader_test.go index 2cd17e3..d55186d 100644 --- a/pkg/io/load_test.go +++ b/internal/infra/loader_test.go @@ -15,21 +15,25 @@ // You should have received a copy of the GNU General Public License // along with RIMO. If not, see . 
-package io_test +package infra_test import ( + "fmt" "path/filepath" "testing" - "github.com/cgi-fr/rimo/pkg/io" + "github.com/cgi-fr/rimo/internal/infra" "github.com/stretchr/testify/require" ) -func TestLoad(t *testing.T) { +func TestLoaderJSONL(t *testing.T) { t.Parallel() - path := filepath.Join(dataDir, "data1/data_input.jsonl") + path := filepath.Join(testdataDir, "data1/data_input.jsonl") - _, err := io.Load(path) + LoaderJSONL := infra.JSONLinesLoader{} + + data, err := LoaderJSONL.Load(path) require.NoError(t, err) + fmt.Printf("dataMap: %v\n", data) } diff --git a/internal/infra/utils.go b/internal/infra/utils.go new file mode 100644 index 0000000..cfb6b67 --- /dev/null +++ b/internal/infra/utils.go @@ -0,0 +1,110 @@ +// Copyright (C) 2023 CGI France +// +// This file is part of RIMO. +// +// RIMO is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// RIMO is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with RIMO. If not, see . 
+
+package infra
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+)
+
+// Sentinel errors wrapped by the path validation helpers.
+var (
+	ErrFileDoesNotExist   = fmt.Errorf("file does not exist")
+	ErrDirDoesNotExist    = fmt.Errorf("directory does not exist")
+	ErrPathIsNotDir       = fmt.Errorf("path is not a directory")
+	ErrNotRegularFile     = fmt.Errorf("path is not a regular file")
+	ErrReadPermission     = fmt.Errorf("user does not have read permission for file")
+	ErrWriteDirPermission = fmt.Errorf("user does not have write permission for directory")
+)
+
+// Permission bits tested by the validation helpers (owner read, owner write).
+const (
+	ReadPerm     os.FileMode = 0o400
+	WriteDirPerm os.FileMode = 0o200
+)
+
+// ValidateFilePath checks that path exists, is a regular file and has the
+// ReadPerm bit set. Each failure wraps the matching sentinel error.
+func ValidateFilePath(path string) error {
+	fileInfo, err := os.Stat(path)
+	if os.IsNotExist(err) {
+		return fmt.Errorf("%w: %s", ErrFileDoesNotExist, path)
+	} else if err != nil {
+		return fmt.Errorf("%w: failed to get file info %s", err, path)
+	}
+
+	if !fileInfo.Mode().IsRegular() {
+		return fmt.Errorf("%w: %s", ErrNotRegularFile, path)
+	}
+
+	if fileInfo.Mode().Perm()&ReadPerm != ReadPerm {
+		return fmt.Errorf("%w: %s", ErrReadPermission, path)
+	}
+
+	return nil
+}
+
+// ValidateOutputPath takes a file path but only checks its directory part:
+// the path must carry an extension, and its parent directory must exist and
+// have the WriteDirPerm bit set. The file itself is never opened or created.
+//
+// NOTE(review): a path without extension is rejected with ErrPathIsNotDir,
+// although the check fires when the path looks like a directory rather than a file.
+// NOTE(review): the parent is not checked with IsDir(), unlike the directory
+// validation done in cmd/rimo; confirm whether that is intended.
+func ValidateOutputPath(path string) error {
+	// Reject paths without an extension: they are treated as not designating a file.
+	if filepath.Ext(path) == "" {
+		return fmt.Errorf("%w: %s", ErrPathIsNotDir, path)
+	}
+	// Get directory out of filepath
+	dirPath := filepath.Dir(path)
+
+	// Check if directory exists
+	fileInfo, err := os.Stat(dirPath)
+	if os.IsNotExist(err) {
+		return fmt.Errorf("%w: %s", ErrDirDoesNotExist, dirPath)
+	} else if err != nil {
+		return fmt.Errorf("failed to get directory info: %w", err)
+	}
+
+	// Check directory permissions
+	if fileInfo.Mode().Perm()&WriteDirPerm != WriteDirPerm {
+		return fmt.Errorf("%w: %s", ErrWriteDirPermission, dirPath)
+	}
+
+	return nil
+}
+
+// Helpers used by filesReader.go.
+
+// ErrNonExtractibleValue is wrapped by ExtractName when a file name does not
+// follow the BASE_TABLE pattern.
+var ErrNonExtractibleValue = errors.New("couldn't extract base or table name from path")
+
+// ExtractName returns the base name and the table name encoded in the file
+// name of path. The name, once its extension is removed, must contain exactly
+// one underscore with a non-empty part on each side of it.
+func ExtractName(path string) (string, string, error) {
+	// path format : /path/to/jsonl/BASE_TABLE.jsonl
+	fileName := strings.TrimSuffix(filepath.Base(path), filepath.Ext(filepath.Base(path)))
+
+	// Exactly two parts are required, so names with more than one underscore are rejected.
+	parts := strings.Split(fileName, "_")
+	if len(parts) != 2 { //nolint:gomnd
+		return "", "", fmt.Errorf("%w : %s", ErrNonExtractibleValue, path)
+	}
+
+	baseName := parts[0]
+	if baseName == "" {
+		return "", "", fmt.Errorf("%w : base name is empty from %s", ErrNonExtractibleValue, path)
+	}
+
+	tableName := parts[1]
+	if tableName == "" {
+		return "", "", fmt.Errorf("%w : table name is empty from %s", ErrNonExtractibleValue, path)
+	}
+
+	return baseName, tableName, nil
+}
diff --git a/pkg/analyse/analyse.go b/pkg/analyse/analyse.go
deleted file mode 100644
index 90ea7a0..0000000
--- a/pkg/analyse/analyse.go
+++ /dev/null
@@ -1,186 +0,0 @@
-// Copyright (C) 2023 CGI France
-//
-// This file is part of RIMO.
-//
-// RIMO is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package analyse - -import ( - "errors" - "fmt" - "path/filepath" - "strings" - - "github.com/cgi-fr/rimo/pkg/io" - "github.com/cgi-fr/rimo/pkg/metric" - "github.com/cgi-fr/rimo/pkg/model" -) - -var ErrWrongParameter = errors.New("wrong parameter") - -// Handle execution pipeline of rimo analyse. -func Orchestrator(inputList []string, outputPath string) error { - // Process input - err := ProcessInput(inputList, outputPath) - if err != nil { - return err - } - - // Compute model.base - base, err := Build(inputList) - if err != nil { - return err - } - - // Export rimo.yaml - outputPath = filepath.Join(outputPath, base.Name+".yaml") - - err = io.Export(base, outputPath) - if err != nil { - return fmt.Errorf("%w : cannot export to %s", err, outputPath) - } - - return nil -} - -func ProcessInput(inputList []string, outputPath string) error { - // verify output dirPath - err := io.ValidateDirPath(outputPath) - if err != nil { - return fmt.Errorf("failed to validate output path: %w", err) - } - - // validate input filepath - for i := range inputList { - err := io.ValidateFilePath(inputList[i]) - if err != nil { - return fmt.Errorf("failed to validate input file: %w", err) - } - } - - // verify that input files relates to the same base - err = BaseIsUnique(inputList) - if err != nil { - return fmt.Errorf("failed to validate input file: %w", err) - } - - return nil -} - -// Return a model.Base from inputList. 
-func Build(inputList []string) (model.Base, error) { - baseName, _, err := ExtractName(inputList[0]) - if err != nil { - return model.Base{}, fmt.Errorf("failed to extract base name for %s: %w", inputList[0], err) - } - - base := model.Base{ - Name: baseName, - Tables: []model.Table{}, - } - - for _, inputPath := range inputList { - _, tableName, err := ExtractName(inputPath) - if err != nil { - return model.Base{}, fmt.Errorf("failed to extract table name for %s: %w", inputPath, err) - } - - columns, err := Analyse(inputPath) - if err != nil { - return model.Base{}, fmt.Errorf("failed to analyse %s: %w", inputPath, err) - } - - // Add columns to base - table := model.Table{ - Name: tableName, - Columns: columns, - } - base.Tables = append(base.Tables, table) - } - - base.SortBase() - - return base, nil -} - -// Return a list of column from a jsonl file. -func Analyse(path string) ([]model.Column, error) { - // Load file in a dataMap. - data, err := io.Load(path) - if err != nil { - return nil, fmt.Errorf("failed to load jsonl file: %w", err) - } - - columns := []model.Column{} - - for colName, values := range data { - column, err := metric.ComputeMetric(colName, values) - if err != nil { - return nil, fmt.Errorf("failed to compute metric: %w", err) - } - - columns = append(columns, column) - } - - return columns, nil -} - -// Error definitions. 
- -var ErrNonExtractibleValue = errors.New("couldn't extract base or table name from path") - -func ExtractName(path string) (string, string, error) { - // path format : /path/to/jsonl/BASE_TABLE.jsonl - fileName := strings.TrimSuffix(filepath.Base(path), filepath.Ext(filepath.Base(path))) - - parts := strings.Split(fileName, "_") - if len(parts) != 2 { //nolint:gomnd - return "", "", fmt.Errorf("%w : %s", ErrNonExtractibleValue, path) - } - - baseName := parts[0] - if baseName == "" { - return "", "", fmt.Errorf("%w : base name is empty from %s", ErrNonExtractibleValue, path) - } - - tableName := parts[1] - if tableName == "" { - return "", "", fmt.Errorf("%w : table name is empty from %s", ErrNonExtractibleValue, path) - } - - return baseName, tableName, nil -} - -var ErrNonUniqueBase = errors.New("base name is not unique") - -func BaseIsUnique(pathList []string) error { - baseName, _, err := ExtractName(pathList[0]) - if err != nil { - return err - } - - for _, path := range pathList { - baseNameI, _, err := ExtractName(path) - if err != nil { - return err - } - - if baseName != baseNameI { - return fmt.Errorf("%w : %s and %s", ErrNonUniqueBase, baseName, baseNameI) - } - } - - return nil -} diff --git a/pkg/analyse/analyse_test.go b/pkg/analyse/analyse_test.go deleted file mode 100644 index b3a6aa2..0000000 --- a/pkg/analyse/analyse_test.go +++ /dev/null @@ -1,296 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. 
-// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package analyse_test - -import ( - "bytes" - "errors" - "fmt" - "os" - "path/filepath" - "reflect" - "strings" - "testing" - "time" - - "gopkg.in/yaml.v3" - - "github.com/cgi-fr/rimo/pkg/analyse" - "github.com/cgi-fr/rimo/pkg/io" - "github.com/cgi-fr/rimo/pkg/model" - "github.com/hexops/valast" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -const ( - dataDir = "../../testdata/" - inputName = "data_input.jsonl" - outputName = "data_output.yaml" - expectedName = "data_expected.yaml" -) - -type testCase struct { - name string - inputPath string - outputPath string - expectedPath string -} - -func getTestCase(dataFolder string) testCase { - return testCase{ - name: filepath.Base(dataFolder), - inputPath: filepath.Join(dataFolder, inputName), - outputPath: filepath.Join(dataFolder, outputName), - expectedPath: filepath.Join(dataFolder, expectedName), - } -} - -// Execute Analyse pipeline and compare with expected result. 
-func TestAnalyse(t *testing.T) { - t.Parallel() - - testCases := []testCase{} - testCases = append(testCases, getTestCase("../../testdata/data1/")) - testCases = append(testCases, getTestCase("../../testdata/data2/")) - - for _, testCase := range testCases { - testCase := testCase // capture range variable - t.Run(testCase.name, func(t *testing.T) { - t.Parallel() - - runAnalyse(t, testCase.inputPath, testCase.outputPath) - compareFileOutput(t, testCase.outputPath, testCase.expectedPath) - compareObjectOutput(t, testCase.outputPath, testCase.expectedPath) - }) - } -} - -func runAnalyse(t *testing.T, inputPath string, outputPath string) { - t.Helper() - - inputList := []string{inputPath} - - base, err := analyse.Build(inputList) - require.NoError(t, err) - - if outputPath != "" { - err = io.Export(base, outputPath) - require.NoError(t, err) - } -} - -func compareFileOutput(t *testing.T, outputPath string, testPath string) { - t.Helper() - - actualOutput := getText(t, outputPath) - expectedOutput := getText(t, testPath) - - // Call removeSampleFromStrings - actualOutput = removeSampleFromStrings(actualOutput) - expectedOutput = removeSampleFromStrings(expectedOutput) - - // Compare the expected output and actual output - assert.Equal(t, expectedOutput, actualOutput) -} - -func compareObjectOutput(t *testing.T, outputPath string, testPath string) { - t.Helper() - - actualOutputBase := loadYAML(t, outputPath) - expectedOutputBase := loadYAML(t, testPath) - - // Remove sample fields from both model.Base. - actualOutputBase = removeSampleFromBase(actualOutputBase) - expectedOutputBase = removeSampleFromBase(expectedOutputBase) - - // Compare the expected output and actual output except all sample fields. - equal, diff := EqualBase(expectedOutputBase, actualOutputBase) - if !equal { - t.Errorf("base are not similar : %s", diff) - } -} - -// Benchmark Analyse pipeline. 
- -func BenchmarkAnalyse(b *testing.B) { - for _, numLines := range []int{100, 1000, 10000, 100000} { - inputPath := filepath.Join(dataDir, fmt.Sprintf("benchmark/mixed/%d_input.jsonl", numLines)) - inputList := []string{inputPath} - outputPath := filepath.Join(dataDir, "benchmark/mixed/") - - b.Run(fmt.Sprintf("numLines=%d", numLines), func(b *testing.B) { - startTime := time.Now() - - b.ResetTimer() - for n := 0; n < b.N; n++ { - err := analyse.Orchestrator(inputList, outputPath) - require.NoError(b, err) - } - b.StopTimer() - - elapsed := time.Since(startTime) - linesPerSecond := float64(numLines*b.N) / elapsed.Seconds() - b.ReportMetric(linesPerSecond, "lines/s") - }) - } -} - -func TestExtractName(t *testing.T) { - t.Parallel() - - path := "path/to/dir/basename_tablename.jsonl" - expectedBase, expectedName := "basename", "tablename" - actualBase, actualName, err := analyse.ExtractName(path) - assert.NoError(t, err) - - assert.Equal(t, expectedBase, actualBase) - assert.Equal(t, expectedName, actualName) - - path = "basename_tablename.jsonl" - expectedBase, expectedName = "basename", "tablename" - actualBase, actualName, err = analyse.ExtractName(path) - assert.NoError(t, err) - - assert.Equal(t, expectedBase, actualBase) - assert.Equal(t, expectedName, actualName) - - invalidPath := "" - - _, _, err = analyse.ExtractName(invalidPath) - if !errors.Is(err, analyse.ErrNonExtractibleValue) { - t.Errorf("expected error %v, but got %v", analyse.ErrNonExtractibleValue, err) - } -} - -func TestBaseIsUnique(t *testing.T) { - t.Parallel() - - inputList := []string{ - "/data/somewhere/BASE_test.jsonl", - "/data/somewhere/BASE3221_test.jsonl", - } - - err := analyse.BaseIsUnique(inputList) - assert.ErrorIs(t, err, analyse.ErrNonUniqueBase) -} - -// Helper functions - -func loadYAML(t *testing.T, path string) model.Base { - t.Helper() - - // Load output file - file, err := os.Open(path) - require.NoError(t, err) - - decoder := yaml.NewDecoder(file) - - var base model.Base 
- err = decoder.Decode(&base) - - if err != nil { - t.Errorf("error while decoding yaml file: %v", err) - } - - file.Close() - - return base -} - -func getText(t *testing.T, outputPath string) string { - t.Helper() - - file, err := os.Open(outputPath) - require.NoError(t, err) - - var output string - - buf := new(bytes.Buffer) - _, err = buf.ReadFrom(file) - require.NoError(t, err) - file.Close() - - output = buf.String() - - return output -} - -func removeSampleFromBase(base model.Base) model.Base { - for tableI, table := range base.Tables { - for columnJ, column := range table.Columns { - column.MainMetric.Sample = nil - - if column.Type == model.ValueType.String { - for freqLen := range column.StringMetric.MostFreqLen { - column.StringMetric.MostFreqLen[freqLen].Sample = nil - } - - for freqLen := range column.StringMetric.LeastFreqLen { - column.StringMetric.LeastFreqLen[freqLen].Sample = nil - } - } - - base.Tables[tableI].Columns[columnJ] = column - } - } - - return base -} - -func removeSampleFromStrings(rimoString string) string { - // Split at every new line - lines := strings.Split(rimoString, "\n") - - // Filter out sample by skipping sampleSize + 1 lines when a line contain "sample" or "leastFrequentSample:" - var filteredLines []string - - var skipLine int - - sampleSizeSkip := model.SampleSize + 1 - - for _, line := range lines { - // sample of stringMetric.MostFreqLen and stringMetric.LeastFreqLen may be of different length, skipping when nex - if skipLine > 0 && strings.Contains(line, " - length:") || strings.Contains(line, " - name:") { - skipLine = 0 - } - - switch { - case skipLine > 0: - skipLine-- - case strings.Contains(line, "sample:"): - skipLine = sampleSizeSkip - default: - filteredLines = append(filteredLines, line) - } - } - - // Join the filtered lines back into a string - rimoString = strings.Join(filteredLines, "\n") - - return rimoString -} - -// DeepEqual two model.Base. 
-func EqualBase(base1, base2 model.Base) (bool, string) { - if !reflect.DeepEqual(base1, base2) { - return false, fmt.Sprintf("base is different : %s \n \n %s", valast.String(base1), valast.String(base2)) - } - - return true, "" -} diff --git a/pkg/io/utils.go b/pkg/io/utils.go deleted file mode 100644 index 9b8fbef..0000000 --- a/pkg/io/utils.go +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . 
- -package io - -import ( - "fmt" - "os" -) - -var ( - ErrFileDoesNotExist = fmt.Errorf("file does not exist") - ErrDirDoesNotExist = fmt.Errorf("directory does not exist") - ErrPathIsNotDir = fmt.Errorf("path is not a directory") - ErrNotRegularFile = fmt.Errorf("path is not a regular file") - ErrReadPermission = fmt.Errorf("user does not have read permission for file") - ErrWriteDirPermission = fmt.Errorf("user does not have write permission for directory") -) - -const ( - ReadPerm os.FileMode = 0o400 - WriteDirPerm os.FileMode = 0o200 -) - -func ValidateFilePath(path string) error { - fileInfo, err := os.Stat(path) - if err != nil { - if os.IsNotExist(err) { - return fmt.Errorf("%w : %s", ErrFileDoesNotExist, path) - } - - return fmt.Errorf("%w : failed to get file info %s", err, path) - } - - if !fileInfo.Mode().IsRegular() { - return fmt.Errorf("%w : %s", ErrNotRegularFile, path) - } - - if fileInfo.Mode().Perm()&ReadPerm != ReadPerm { - return fmt.Errorf("%w : %s", ErrReadPermission, path) - } - - return nil -} - -func ValidateDirPath(path string) error { - fileInfo, err := os.Stat(path) - if err != nil { - if os.IsNotExist(err) { - return fmt.Errorf("%w : %s", ErrDirDoesNotExist, path) - } - - return fmt.Errorf("failed to get directory info: %w", err) - } - - if !fileInfo.IsDir() { - return fmt.Errorf("%w : %s", ErrPathIsNotDir, path) - } - - if fileInfo.Mode().Perm()&WriteDirPerm != WriteDirPerm { - return fmt.Errorf("%w : %s", ErrWriteDirPermission, path) - } - - return nil -} diff --git a/pkg/metric/build.go b/pkg/metric/build.go index b80c6c5..058c539 100644 --- a/pkg/metric/build.go +++ b/pkg/metric/build.go @@ -34,7 +34,7 @@ func ComputeMetric(colName string, values []interface{}) (model.Column, error) { // Create the column. 
col := model.Column{ Name: colName, - Type: ColType(values), + Type: GetColType(values), Concept: "", Constraint: []string{}, Confidential: confidential, @@ -52,19 +52,19 @@ func ComputeMetric(colName string, values []interface{}) (model.Column, error) { // Type specific metric switch col.Type { - case model.ValueType.String: + case model.ColType.String: err := SetStringMetric(values, &col.StringMetric) if err != nil { return model.Column{}, fmt.Errorf("error computing string metric in column %v : %w", col.Name, err) } - case model.ValueType.Numeric: + case model.ColType.Numeric: err := SetNumericMetric(values, &col.NumericMetric) if err != nil { return model.Column{}, fmt.Errorf("error computing numeric metric in column %v : %w", col.Name, err) } - case model.ValueType.Bool: + case model.ColType.Bool: err := SetBoolMetric(values, &col.BoolMetric) if err != nil { return model.Column{}, fmt.Errorf("error computing bool metric in column %v : %w", col.Name, err) @@ -74,10 +74,10 @@ func ComputeMetric(colName string, values []interface{}) (model.Column, error) { return col, nil } -func ColType(values []interface{}) model.RIMOType { - colType := model.ValueType.Undefined - for i := 0; i < len(values) && colType == model.ValueType.Undefined; i++ { - colType = ValueType(values[i]) +func GetColType(values []interface{}) model.ValueType { + colType := model.ColType.Undefined + for i := 0; i < len(values) && colType == model.ColType.Undefined; i++ { + colType = ColType(values[i]) } return colType @@ -89,6 +89,7 @@ func GetFrequency(occurrence int, count int) float64 { return float64(occurrence) / float64(count) } +// To check why not using isNil() ? 
func GetFirstValue(values []interface{}) interface{} { for _, value := range values { if value != nil { @@ -99,19 +100,19 @@ func GetFirstValue(values []interface{}) interface{} { return nil } -func ValueType(value interface{}) model.RIMOType { +func ColType(value interface{}) model.ValueType { switch value.(type) { case int: - return model.ValueType.Numeric + return model.ColType.Numeric case float64: - return model.ValueType.Numeric + return model.ColType.Numeric case json.Number: - return model.ValueType.Numeric + return model.ColType.Numeric case string: - return model.ValueType.String + return model.ColType.String case bool: - return model.ValueType.Bool + return model.ColType.Bool default: - return model.ValueType.Undefined + return model.ColType.Undefined } } diff --git a/pkg/metric/build_test.go b/pkg/metric/build_test.go deleted file mode 100644 index 5fa9249..0000000 --- a/pkg/metric/build_test.go +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package metric_test - -import ( - "fmt" - "testing" - "time" - - "github.com/cgi-fr/rimo/pkg/analyse" - "github.com/cgi-fr/rimo/pkg/model" - "github.com/stretchr/testify/require" -) - -const ( - dataDir = "../../testdata/benchmark" -) - -var result model.Base //nolint:gochecknoglobals // used in benchmark to avoid misleading compiler optimisation. 
- -func BenchmarkMetric(b *testing.B) { - listNumValues := []int{100, 1000, 10000} - listType := []string{"numeric", "text", "bool"} - - for _, dataType := range listType { - for _, numValues := range listNumValues { - inputList := []string{fmt.Sprintf("%s/%s/%d_input.jsonl", dataDir, dataType, numValues)} - - b.Run(fmt.Sprintf("type= %s, numValues=%d", dataType, numValues), func(b *testing.B) { - startTime := time.Now() - - base := model.Base{} //nolint:exhaustruct - var err error - - for n := 0; n < b.N; n++ { - base, err = analyse.Build(inputList) - require.NoError(b, err) - } - - result = base - - elapsed := time.Since(startTime) - valuesPerSecond := float64(numValues*b.N) / elapsed.Seconds() - b.ReportMetric(valuesPerSecond, "lines/s") - }) - } - } -} diff --git a/pkg/metric/generic_test.go b/pkg/metric/generic_test.go index 5210e7d..4af5071 100644 --- a/pkg/metric/generic_test.go +++ b/pkg/metric/generic_test.go @@ -36,16 +36,16 @@ func TestCountEmpty(t *testing.T) { assert.Equal(t, expected, actual) } -func TestColType(t *testing.T) { +func TestGetColType(t *testing.T) { t.Parallel() t.Run("numeric", func(t *testing.T) { t.Parallel() slice := []interface{}{nil, 2, 3} - expected := model.ValueType.Numeric + expected := model.ColType.Numeric - actual := metric.ColType(slice) + actual := metric.GetColType(slice) require.Equal(t, expected, actual) }) @@ -53,9 +53,9 @@ func TestColType(t *testing.T) { t.Parallel() slice := []interface{}{nil, "text", nil} - expected := model.ValueType.String + expected := model.ColType.String - actual := metric.ColType(slice) + actual := metric.GetColType(slice) require.Equal(t, expected, actual) }) @@ -63,9 +63,9 @@ func TestColType(t *testing.T) { t.Parallel() slice := []interface{}{nil, true, false} - expected := model.ValueType.Bool + expected := model.ColType.Bool - actual := metric.ColType(slice) + actual := metric.GetColType(slice) require.Equal(t, expected, actual) }) @@ -74,9 +74,9 @@ func TestColType(t *testing.T) { 
t.Parallel() slice := []interface{}{"text", 2, false} - expected := model.ValueType.String + expected := model.ColType.String - actual := metric.ColType(slice) + actual := metric.GetColType(slice) require.Equal(t, expected, actual) }) @@ -84,9 +84,9 @@ func TestColType(t *testing.T) { t.Parallel() slice := []interface{}{nil, nil, nil} - expected := model.ValueType.Undefined + expected := model.ColType.Undefined - actual := metric.ColType(slice) + actual := metric.GetColType(slice) require.Equal(t, expected, actual) }) } diff --git a/pkg/metric/metricbool.go b/pkg/metric/metricbool.go index fb38bfe..a3a38ed 100644 --- a/pkg/metric/metricbool.go +++ b/pkg/metric/metricbool.go @@ -2,18 +2,18 @@ // // This file is part of RIMO. // -// RIMO is free software: you can redistribute it and/or modify +// rimo is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // -// RIMO is distributed in the hope that it will be useful, +// rimo is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . +// along with rimo. If not, see . package metric diff --git a/pkg/model/base.go b/pkg/model/base.go new file mode 100644 index 0000000..49eed63 --- /dev/null +++ b/pkg/model/base.go @@ -0,0 +1,50 @@ +// Copyright (C) 2023 CGI France +// +// This file is part of RIMO. +// +// RIMO is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. 
+// +// RIMO is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with RIMO. If not, see . + +package model + +import ( + "fmt" + "reflect" + + "github.com/hexops/valast" +) + +// RIMO YAML structure. +type ( + Base struct { + Name string `json:"database" jsonschema:"required" yaml:"database"` + // Tables should be map[string][]Column + Tables []Table `json:"tables" jsonschema:"required" yaml:"tables"` + } + + Table struct { + Name string `json:"name" jsonschema:"required" yaml:"name"` + Columns []Column `json:"columns" jsonschema:"required" yaml:"columns"` + } +) + +// Should be improved with more detail about difference. +func SameBase(base1, base2 *Base) (bool, string) { + if !reflect.DeepEqual(base1, base2) { + msg := fmt.Sprintf("base is different : %s \n \n %s", valast.String(base1), valast.String(base2)) + + return false, msg + } + + return true, "" +} diff --git a/pkg/model/column.go b/pkg/model/column.go new file mode 100644 index 0000000..223bcb0 --- /dev/null +++ b/pkg/model/column.go @@ -0,0 +1,44 @@ +// Copyright (C) 2023 CGI France +// +// This file is part of RIMO. +// +// RIMO is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// RIMO is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with RIMO. If not, see . 
+ +package model + +const ( + SampleSize int = 5 + MostFrequentLenSize int = 5 + MostFrequentSampleSize int = 5 + LeastFrequentLenSize int = 5 + LeastFrequentSampleSize int = 5 +) + +type ( + Column struct { + Name string `json:"name" jsonschema:"required" yaml:"name"` + Type ValueType `json:"type" jsonschema:"required" validate:"oneof=string numeric boolean" yaml:"type"` //nolint:lll + + // The 3 following parameter should be part of a Config struct + Concept string `json:"concept" jsonschema:"required" yaml:"concept"` + Constraint []string `json:"constraint" jsonschema:"required" yaml:"constraint"` + Confidential *bool `json:"confidential" jsonschema:"required" yaml:"confidential"` + + MainMetric GenericMetric `json:"mainMetric" jsonschema:"required" yaml:"mainMetric"` + + StringMetric StringMetric `json:"stringMetric,omitempty" jsonschema:"required" yaml:"stringMetric,omitempty"` + NumericMetric NumericMetric `json:"numericMetric,omitempty" jsonschema:"required" yaml:"numericMetric,omitempty"` + BoolMetric BoolMetric `json:"boolMetric,omitempty" jsonschema:"required" yaml:"boolMetric,omitempty"` + } +) diff --git a/pkg/model/metric.go b/pkg/model/metric.go new file mode 100644 index 0000000..b9c8f5d --- /dev/null +++ b/pkg/model/metric.go @@ -0,0 +1,64 @@ +// Copyright (C) 2023 CGI France +// +// This file is part of RIMO. +// +// RIMO is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// RIMO is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with RIMO. If not, see . + +package model + +// RIMO YAML metrics. 
+type ( + GenericMetric struct { + Count int `json:"count" jsonschema:"required" yaml:"count"` + Empty int `json:"empty" jsonschema:"required" yaml:"empty"` + Unique int `json:"unique" jsonschema:"required" yaml:"unique"` + Sample []interface{} `json:"sample" jsonschema:"required" yaml:"sample"` + } + + StringMetric struct { + MostFreqLen []LenFreq `json:"mostFrequentLen" jsonschema:"required" yaml:"mostFrequentLen"` + LeastFreqLen []LenFreq `json:"leastFrequentLen" jsonschema:"required" yaml:"leastFrequentLen"` + } + + LenFreq struct { + Length int `json:"length" jsonschema:"required" yaml:"length"` + Freq float64 `json:"freq" jsonschema:"required" yaml:"freq"` + Sample []string `json:"sample" jsonschema:"required" yaml:"sample"` + } + + NumericMetric struct { + Min float64 `json:"min" jsonschema:"required" yaml:"min"` + Max float64 `json:"max" jsonschema:"required" yaml:"max"` + Mean float64 `json:"mean" jsonschema:"required" yaml:"mean"` + } + + BoolMetric struct { + TrueRatio float64 `json:"trueRatio" jsonschema:"required" yaml:"trueRatio"` + } +) + +// Type that a column can be. +type ValueType string + +var ColType = struct { //nolint:gochecknoglobals + String ValueType + Numeric ValueType + Bool ValueType + Undefined ValueType +}{ + String: "string", + Numeric: "numeric", + Bool: "bool", + Undefined: "undefined", +} diff --git a/pkg/model/model.go b/pkg/model/model.go deleted file mode 100644 index 0872006..0000000 --- a/pkg/model/model.go +++ /dev/null @@ -1,125 +0,0 @@ -// Copyright (C) 2023 CGI France -// -// This file is part of RIMO. -// -// RIMO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. 
-// -// RIMO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RIMO. If not, see . - -package model - -import ( - "encoding/json" - "fmt" - "sort" - - "github.com/invopop/jsonschema" -) - -type RIMOType string - -const ( - SampleSize int = 5 - MostFrequentLenSize int = 5 - MostFrequentSampleSize int = 5 - LeastFrequentLenSize int = 5 - LeastFrequentSampleSize int = 5 -) - -var ValueType = struct { //nolint:gochecknoglobals - String RIMOType - Numeric RIMOType - Bool RIMOType - Undefined RIMOType -}{ - String: "string", - Numeric: "numeric", - Bool: "bool", - Undefined: "undefined", -} - -// RIMO YAML structure. -type ( - Base struct { - Name string `json:"database" jsonschema:"required" yaml:"database"` - Tables []Table `json:"tables" jsonschema:"required" yaml:"tables"` - } - - Table struct { - Name string `json:"name" jsonschema:"required" yaml:"name"` - Columns []Column `json:"columns" jsonschema:"required" yaml:"columns"` - } - - Column struct { - Name string `json:"name" jsonschema:"required" yaml:"name"` - Type RIMOType `json:"type" jsonschema:"required" validate:"oneof=string numeric boolean" yaml:"type"` //nolint:lll - Concept string `json:"concept" jsonschema:"required" yaml:"concept"` - Constraint []string `json:"constraint" jsonschema:"required" yaml:"constraint"` - Confidential *bool `json:"confidential" jsonschema:"required" yaml:"confidential"` - MainMetric GenericMetric `json:"mainMetric" jsonschema:"required" yaml:"mainMetric"` - - StringMetric StringMetric `json:"stringMetric,omitempty" jsonschema:"required" yaml:"stringMetric,omitempty"` - NumericMetric NumericMetric `json:"numericMetric,omitempty" jsonschema:"required" yaml:"numericMetric,omitempty"` - BoolMetric BoolMetric 
`json:"boolMetric,omitempty" jsonschema:"required" yaml:"boolMetric,omitempty"` - } -) - -// RIMO YAML metrics. -type ( - GenericMetric struct { - Count int `json:"count" jsonschema:"required" yaml:"count"` - Empty int `json:"empty" jsonschema:"required" yaml:"empty"` - Unique int `json:"unique" jsonschema:"required" yaml:"unique"` - Sample []interface{} `json:"sample" jsonschema:"required" yaml:"sample"` - } - StringMetric struct { - MostFreqLen []LenFreq `json:"mostFrequentLen" jsonschema:"required" yaml:"mostFrequentLen"` - LeastFreqLen []LenFreq `json:"leastFrequentLen" jsonschema:"required" yaml:"leastFrequentLen"` - } - - LenFreq struct { - Length int `json:"length" jsonschema:"required" yaml:"length"` - Freq float64 `json:"freq" jsonschema:"required" yaml:"freq"` - Sample []string `json:"sample" jsonschema:"required" yaml:"sample"` - } - - NumericMetric struct { - Min float64 `json:"min" jsonschema:"required" yaml:"min"` - Max float64 `json:"max" jsonschema:"required" yaml:"max"` - Mean float64 `json:"mean" jsonschema:"required" yaml:"mean"` - } - - BoolMetric struct { - TrueRatio float64 `json:"trueRatio" jsonschema:"required" yaml:"trueRatio"` - } -) - -func (base *Base) SortBase() { - for _, table := range base.Tables { - sort.Slice(table.Columns, func(i, j int) bool { - return table.Columns[i].Name < table.Columns[j].Name - }) - } - - sort.Slice(base.Tables, func(i, j int) bool { - return base.Tables[i].Name < base.Tables[j].Name - }) -} - -func GetJSONSchema() (string, error) { - resBytes, err := json.MarshalIndent(jsonschema.Reflect(&Base{}), "", " ") //nolint:exhaustruct - if err != nil { - return "", fmt.Errorf("couldn't unmarshall Base in JSON : %w", err) - } - - return string(resBytes), nil -} diff --git a/pkg/model/utils.go b/pkg/model/utils.go new file mode 100644 index 0000000..f80a6e2 --- /dev/null +++ b/pkg/model/utils.go @@ -0,0 +1,123 @@ +// Copyright (C) 2023 CGI France +// +// This file is part of RIMO. 
+// +// RIMO is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// RIMO is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with RIMO. If not, see . + +package model + +import ( + "encoding/json" + "errors" + "fmt" + "os" + "sort" + + "github.com/invopop/jsonschema" + "gopkg.in/yaml.v3" +) + +func GetJSONSchema() (string, error) { + resBytes, err := json.MarshalIndent(jsonschema.Reflect(&Base{}), "", " ") //nolint:exhaustruct + if err != nil { + return "", fmt.Errorf("couldn't unmarshall Base in JSON : %w", err) + } + + return string(resBytes), nil +} + +func NewBase(name string) *Base { + return &Base{ + Name: name, + Tables: make([]Table, 0), + } +} + +var ErrBaseFormat = errors.New("error while decoding yaml file in a Base struct") + +// Can be improved. 
+func LoadBase(path string) (*Base, error) { + file, err := os.Open(path) + if err != nil { + return nil, fmt.Errorf("error while opening file: %w", err) + } + + decoder := yaml.NewDecoder(file) + + var base Base + + err = decoder.Decode(&base) + if err != nil { + return nil, ErrBaseFormat + } + + file.Close() + + return &base, nil +} + +func RemoveSampleFromBase(base *Base) { + for tableI, table := range base.Tables { + for columnJ, column := range table.Columns { + column.MainMetric.Sample = nil + + if column.Type == ColType.String { + for freqLen := range column.StringMetric.MostFreqLen { + column.StringMetric.MostFreqLen[freqLen].Sample = nil + } + + for freqLen := range column.StringMetric.LeastFreqLen { + column.StringMetric.LeastFreqLen[freqLen].Sample = nil + } + } + + base.Tables[tableI].Columns[columnJ] = column + } + } +} + +func (base *Base) SortBase() { + for _, table := range base.Tables { + sort.Slice(table.Columns, func(i, j int) bool { + return table.Columns[i].Name < table.Columns[j].Name + }) + } + + sort.Slice(base.Tables, func(i, j int) bool { + return base.Tables[i].Name < base.Tables[j].Name + }) +} + +func (base *Base) AddColumn(column Column, tableName string) { + mapTableName := make(map[string]int) + for index, table := range base.Tables { + mapTableName[table.Name] = index + } + + if index, ok := mapTableName[tableName]; ok { + // If the table exists, append the column to the table + base.Tables[index].Columns = append(base.Tables[index].Columns, column) + } else { + // If the table does not exist, create a new table and add it to the base + table := Table{ + Name: tableName, + Columns: []Column{column}, + } + base.Tables = append(base.Tables, table) + } +} + +// If the table does not exist, create a new table and add it to the base +// table := Table{Name: tableName, Columns: []Column{column}} +// base.Tables = append(base.Tables, table) diff --git a/pkg/model/utils_test.go b/pkg/model/utils_test.go new file mode 100644 index 
0000000..16c619e --- /dev/null +++ b/pkg/model/utils_test.go @@ -0,0 +1,66 @@ +// Copyright (C) 2023 CGI France +// +// This file is part of RIMO. +// +// RIMO is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// RIMO is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with RIMO. If not, see . + +package model_test + +import ( + "testing" + + "github.com/cgi-fr/rimo/pkg/model" +) + +func TestAddColumn(t *testing.T) { + t.Parallel() + + base := model.NewBase("test_base") + + column := model.Column{ //nolint:exhaustruct + Name: "test_column", + Type: model.ColType.String, + Concept: "test_concept", + } + + tableName := "test_table" + + base.AddColumn(column, tableName) + + // fmt.Print(valast.String(base)) + + if len(base.Tables) != 1 { + t.Errorf("expected 1 table, got %d", len(base.Tables)) + } + + if base.Tables[0].Name != tableName { + t.Errorf("expected table name %q, got %q", tableName, base.Tables[0].Name) + } + + if len(base.Tables[0].Columns) != 1 { + t.Errorf("expected 1 column, got %d", len(base.Tables[0].Columns)) + } + + if base.Tables[0].Columns[0].Name != column.Name { + t.Errorf("expected column name %q, got %q", column.Name, base.Tables[0].Columns[0].Name) + } + + if base.Tables[0].Columns[0].Type != column.Type { + t.Errorf("expected column type %q, got %q", column.Type, base.Tables[0].Columns[0].Type) + } + + if base.Tables[0].Columns[0].Concept != column.Concept { + t.Errorf("expected column concept %q, got %q", column.Concept, base.Tables[0].Columns[0].Concept) + } +} diff --git a/pkg/io/export.go 
b/pkg/rimo/driven.go similarity index 60% rename from pkg/io/export.go rename to pkg/rimo/driven.go index 764ee2b..1928b2a 100644 --- a/pkg/io/export.go +++ b/pkg/rimo/driven.go @@ -15,32 +15,18 @@ // You should have received a copy of the GNU General Public License // along with RIMO. If not, see . -package io +package rimo import ( - "fmt" - "os" - "github.com/cgi-fr/rimo/pkg/model" - "gopkg.in/yaml.v3" ) -func Export(base model.Base, outputPath string) error { - // Create output file. - outputFile, err := os.Create(outputPath) - if err != nil { - return fmt.Errorf("failed to create output file: %w", err) - } - defer outputFile.Close() - - // Encode Base to YAML. - encoder := yaml.NewEncoder(outputFile) - defer encoder.Close() - - err = encoder.Encode(base) - if err != nil { - return fmt.Errorf("failed to encode Base to YAML: %w", err) - } +type Reader interface { + BaseName() string + Next() bool // itère sur les colonnes. + Value() ([]interface{}, string, string, error) // colValues, colName, tableName +} - return nil +type Writer interface { + Export(base *model.Base) error } diff --git a/pkg/rimo/driven_test.go b/pkg/rimo/driven_test.go new file mode 100644 index 0000000..da635c9 --- /dev/null +++ b/pkg/rimo/driven_test.go @@ -0,0 +1,108 @@ +// Copyright (C) 2023 CGI France +// +// This file is part of RIMO. +// +// RIMO is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// RIMO is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with RIMO. If not, see . 
+ +package rimo_test + +import ( + "log" + "math" + "testing" + + "github.com/cgi-fr/rimo/pkg/model" + "github.com/cgi-fr/rimo/pkg/rimo" +) + +// TESTS + +func TestTestInterface(t *testing.T) { + t.Parallel() + + var _ rimo.Reader = (*TestReader)(nil) + + var _ rimo.Writer = (*TestWriter)(nil) +} + +// TestReader implementation + +type colInput struct { + ColName string + ColValues []interface{} +} + +type TestReader struct { + baseName string + data []colInput + tableNames []string // Next() will progressively change tableName + // internal + index int + currentValues []interface{} + currentColName string + currentTableName string +} + +func (r *TestReader) BaseName() string { + return r.baseName +} + +func (r *TestReader) Next() bool { + if r.index == len(r.data) { + log.Println("End of data") + + return false + } + + // update tableName + if len(r.tableNames) == len(r.data) { + r.currentTableName = r.tableNames[r.index] + } else { + // use a percentage to determine the table name to use from the list + percentageComplete := float64(r.index) / float64(len(r.data)) + expectedTableIndex := percentageComplete * float64(len(r.tableNames)) + roundedTableIndex := math.Floor(expectedTableIndex) + tableNameIndex := int(roundedTableIndex) + + r.currentTableName = r.tableNames[tableNameIndex] + } + + r.currentColName = r.data[r.index].ColName + r.currentValues = r.data[r.index].ColValues + r.index++ + + return true +} + +func (r *TestReader) Value() ([]interface{}, string, string, error) { //nolint:wsl + // log.Printf("Processing %s column in %s table", r.currentTableName, r.currentColName) + + return r.currentValues, r.currentColName, r.currentTableName, nil +} + +// TestWriter implementation + +type TestWriter struct { + base model.Base +} + +func (w *TestWriter) Export(base *model.Base) error { + w.base = *base + + return nil +} + +func (w *TestWriter) Base() *model.Base { + return &w.base +} diff --git a/pkg/rimo/driver.go b/pkg/rimo/driver.go new file mode 100644 
index 0000000..e626bbc --- /dev/null +++ b/pkg/rimo/driver.go @@ -0,0 +1,66 @@ +// Copyright (C) 2023 CGI France +// +// This file is part of RIMO. +// +// RIMO is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// RIMO is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with RIMO. If not, see . + +package rimo + +import ( + "fmt" + + "github.com/cgi-fr/rimo/pkg/metric" + "github.com/cgi-fr/rimo/pkg/model" + + "github.com/rs/zerolog/log" +) + +func AnalyseBase(reader Reader, writer Writer) error { + // log.Logger = zerolog.New(os.Stdout).Level(zerolog.DebugLevel) + baseName := reader.BaseName() + + // log.Debug().Msgf("Processing [%s base]", baseName) + + base := model.NewBase(baseName) + + for reader.Next() { // itère colonne par colonne + colValues, colName, tableName, err := reader.Value() + if err != nil { + return fmt.Errorf("failed to get column value : %w", err) + } + + column, err := metric.ComputeMetric(colName, colValues) + if err != nil { + return fmt.Errorf("failed to compute column : %w", err) + } + + log.Debug().Msgf("Processing [%s base][%s table][%s column]", baseName, tableName, column.Name) + // log.Debug().Msg(valast.String(column)) + + base.AddColumn(column, tableName) + } + + base.SortBase() + + // log.Debug().Msg("---------- Finish processing base :") + // log.Debug().Msg(valast.String(*base)) + // log.Debug().Msg("----------") + + err := writer.Export(base) + if err != nil { + return fmt.Errorf("failed to export base : %w", err) + } + + return nil +} diff --git a/pkg/rimo/driver_test.go 
// Run Analyse pipeline with FilesReader and TestWriter and compare with expected result.
const (
	dataDir      = "../../testdata/"
	inputName    = "data_input.jsonl"
	outputName   = "interface_data_output.yaml"
	expectedName = "data_expected.yaml"
)

// testCase describes one fixture folder: a display name plus the paths of its
// input file and of the file holding the expected result.
type testCase struct {
	name         string
	inputPath    string
	expectedPath string
}

// getTestCase builds the testCase of a fixture folder. The case is named
// after the last element of dataFolder, and both paths are obtained by
// joining dataFolder with the conventional input and expected file names.
func getTestCase(dataFolder string) testCase {
	var fixture testCase

	fixture.name = filepath.Base(dataFolder)
	fixture.inputPath = filepath.Join(dataFolder, inputName)
	fixture.expectedPath = filepath.Join(dataFolder, expectedName)

	return fixture
}

// PIPELINE TESTS

// Note : numeric value should be converted to float64.
+func TestManualPipeline(t *testing.T) { + t.Parallel() + + // Set up TestReader + baseName := "databaseName" + tableNames := []string{"tableTest"} + testInput := []colInput{ + { + ColName: "string", + ColValues: []interface{}{"val1", "val2", "val3"}, + }, + { + ColName: "col2", + ColValues: []interface{}{true, false, nil}, + }, + { + ColName: "col9", + ColValues: []interface{}{float64(31), float64(29), float64(42)}, + }, + { + ColName: "empty", + ColValues: []interface{}{nil, nil, nil}, + }, + } + + testReader := TestReader{ //nolint:exhaustruct + baseName: baseName, + tableNames: tableNames, + data: testInput, + index: 0, + } + + testWriter := TestWriter{} //nolint:exhaustruct + + err := rimo.AnalyseBase(&testReader, &testWriter) + if err != nil { + t.Errorf("Error: %v", err) + } + + t.Logf("Base returned : %s", valast.String(*testWriter.Base())) +} + +// Ensure that the pipeline produce the same base as expected. +func TestPipeline(t *testing.T) { + t.Parallel() + + testCases := []testCase{} + testCases = append(testCases, getTestCase("../../testdata/data1/")) + // testCases = append(testCases, getTestCase("../../testdata/data2/")) + + for _, testCase := range testCases { + testCase := testCase // capture range variable + t.Run(testCase.name, func(t *testing.T) { + t.Parallel() + + // Actual base + + reader, err := infra.FilesReaderFactory([]string{testCase.inputPath}) + assert.NoError(t, err) + + writer := &TestWriter{} //nolint:exhaustruct + + err = rimo.AnalyseBase(reader, writer) + assert.NoError(t, err) + + actualBase := writer.Base() + + // Expected base + expectedBase, err := model.LoadBase(testCase.expectedPath) + if err != nil { + t.Errorf("Error: %v", err) + } + + // Remove sample + model.RemoveSampleFromBase(expectedBase) + model.RemoveSampleFromBase(actualBase) + + fmt.Printf("Actual base : %s\n", valast.String(*actualBase)) + // Compare + equal, diff := model.SameBase(expectedBase, actualBase) + if !equal { + t.Errorf("Base are not equal:\n%s", 
diff) + } + }) + } +} + +// Benchmark (same as previous analyse_test.go benchmark). +func BenchmarkAnalyseInterface(b *testing.B) { + for _, numLines := range []int{100, 1000, 10000, 100000} { + inputPath := filepath.Join(dataDir, fmt.Sprintf("benchmark/mixed/%d_input.jsonl", numLines)) + inputList := []string{inputPath} + outputPath := filepath.Join(dataDir, fmt.Sprintf("benchmark/mixed/%dinterface_output.yaml", numLines)) + + b.Run(fmt.Sprintf("numLines=%d", numLines), func(b *testing.B) { + startTime := time.Now() + + reader, err := infra.FilesReaderFactory(inputList) + require.NoError(b, err) + + writer, err := infra.YAMLWriterFactory(outputPath) + require.NoError(b, err) + + b.ResetTimer() + for n := 0; n < b.N; n++ { + err := rimo.AnalyseBase(reader, writer) + require.NoError(b, err) + } + b.StopTimer() + + elapsed := time.Since(startTime) + linesPerSecond := float64(numLines*b.N) / elapsed.Seconds() + b.ReportMetric(linesPerSecond, "lines/s") + }) + } +} diff --git a/test/suites/testdata/data1/output/data.yaml b/test/suites/testdata/data1/output/data.yaml deleted file mode 100644 index 1bb32f7..0000000 --- a/test/suites/testdata/data1/output/data.yaml +++ /dev/null @@ -1,144 +0,0 @@ -database: data -tables: - - name: input - columns: - - name: address - type: string - concept: "" - constraint: [] - confidential: null - mainMetric: - count: 10 - empty: 0 - unique: 10 - sample: - - PSC 4713, Box 9649 APO AA 43433 - - 9038 Frye Ramp South Cheryltown, CT 54262 - - 25545 Cole Court Newtonfurt, KY 13882 - - 06210 David Court South Kimberly, IL 10236 - - 536 Robinson Estates Austinside, NV 69535 - stringMetric: - mostFrequentLen: - - length: 42 - freq: 0.3 - sample: - - 06210 David Court South Kimberly, IL 10236 - - 0301 Amy Grove Apt. 
325 Janefort, MA 84102 - - 095 Jennifer Turnpike Castrobury, NY 98111 - - length: 41 - freq: 0.2 - sample: - - 536 Robinson Estates Austinside, NV 69535 - - 9038 Frye Ramp South Cheryltown, CT 54262 - - length: 31 - freq: 0.1 - sample: - - PSC 4713, Box 9649 APO AA 43433 - - length: 37 - freq: 0.1 - sample: - - 25545 Cole Court Newtonfurt, KY 13882 - leastFrequentLen: - - length: 52 - freq: 0.1 - sample: - - 275 Stone Ridges Suite 885 East Aliciafurt, MH 15407 - - length: 45 - freq: 0.1 - sample: - - 2035 Simmons Islands Heatherchester, IN 46152 - - length: 43 - freq: 0.1 - sample: - - 38432 Moreno Turnpike Garrettland, TN 72939 - - name: age - type: numeric - concept: "" - constraint: [] - confidential: null - mainMetric: - count: 10 - empty: 0 - unique: 9 - sample: - - 35 - - 73 - - 73 - - 80 - - 73 - numericMetric: - min: 29 - max: 95 - mean: 57.7 - - name: date - type: string - concept: "" - constraint: [] - confidential: null - mainMetric: - count: 10 - empty: 0 - unique: 10 - sample: - - "2003-10-11" - - "2022-04-23" - - "2001-08-23" - - "2001-08-23" - - "2003-10-11" - stringMetric: - mostFrequentLen: - - length: 10 - freq: 1 - sample: - - "2022-04-23" - - "2004-07-04" - - "2004-07-04" - - "2005-05-10" - - "2014-07-24" - leastFrequentLen: [] - - name: phone - type: string - concept: "" - constraint: [] - confidential: null - mainMetric: - count: 10 - empty: 0 - unique: 10 - sample: - - 001-845-854-2110 - - +1-407-997-8293x68130 - - (517)819-3454 - - 001-845-854-2110 - - 260-587-0590 - stringMetric: - mostFrequentLen: - - length: 16 - freq: 0.4 - sample: - - 001-845-854-2110 - - 001-533-758-7269 - - 001-958-985-3039 - - 001-866-271-0116 - - length: 12 - freq: 0.2 - sample: - - 828-755-3826 - - 260-587-0590 - - length: 10 - freq: 0.1 - sample: - - "7795418893" - leastFrequentLen: - - length: 21 - freq: 0.1 - sample: - - +1-407-997-8293x68130 - - length: 18 - freq: 0.1 - sample: - - (330)616-7639x7810 - - length: 13 - freq: 0.1 - sample: - - (517)819-3454 diff 
--git a/testdata/data0/data_expected.yaml b/testdata/data0/data_expected.yaml new file mode 100644 index 0000000..78ced89 --- /dev/null +++ b/testdata/data0/data_expected.yaml @@ -0,0 +1,115 @@ +database: data +tables: + - name: input + columns: + - name: address + type: string + concept: "" + constraint: [] + confidential: null + mainMetric: + count: 3 + empty: 0 + unique: 3 + sample: + - PSC + - "095" + - "06210" + stringMetric: + mostFrequentLen: + - length: 3 + freq: 0.6666666666666666 + sample: + - PSC + - "095" + leastFrequentLen: + - length: 5 + freq: 0.3333333333333333 + sample: + - "06210" + - name: age + type: numeric + concept: "" + constraint: [] + confidential: null + mainMetric: + count: 3 + empty: 2 + unique: 1 + sample: + - 61 + numericMetric: + min: 61 + max: 61 + mean: 61 + - name: empty + type: undefined + concept: "" + constraint: [] + confidential: null + mainMetric: + count: 3 + empty: 3 + unique: 0 + sample: [] + - name: major + type: bool + concept: "" + constraint: [] + confidential: null + mainMetric: + count: 3 + empty: 0 + unique: 2 + sample: + - true + - false + boolMetric: + trueRatio: 0.6666666666666666 + - name: input2 + columns: + - name: string + type: string + concept: "" + constraint: [] + confidential: null + mainMetric: + count: 4 + empty: 0 + unique: 4 + sample: + - Hello World + - Hello World2 + - Hello World3 + - Hello World5 + stringMetric: + mostFrequentLen: + - length: 12 + freq: 0.75 + sample: + - Hello World2 + - Hello World3 + - Hello World5 + leastFrequentLen: + - length: 11 + freq: 0.25 + sample: + - Hello World + - name: time + type: string + concept: "" + constraint: [] + confidential: null + mainMetric: + count: 4 + empty: 0 + unique: 1 + sample: + - "20:03" + stringMetric: + mostFrequentLen: + - length: 5 + freq: 1 + sample: + - "20:03" + leastFrequentLen: [] diff --git a/testdata/data0/data_input.jsonl b/testdata/data0/data_input.jsonl new file mode 100644 index 0000000..9157492 --- /dev/null +++ 
b/testdata/data0/data_input.jsonl @@ -0,0 +1,3 @@ +{"address": "PSC", "age": null, "major": true, "empty": null} +{"address": "095", "age": null, "major": false, "empty": null} +{"address": "06210", "age": 61, "major": true, "empty": null} diff --git a/testdata/data0/data_input2.jsonl b/testdata/data0/data_input2.jsonl new file mode 100644 index 0000000..6af2498 --- /dev/null +++ b/testdata/data0/data_input2.jsonl @@ -0,0 +1,4 @@ +{"string" : "Hello World", "time" : "20:03"} +{"string" : "Hello World2", "time" : "20:03"} +{"string" : "Hello World3", "time" : "20:03"} +{"string" : "Hello World5", "time" : "20:03"}