diff --git a/cmd/rimo/main.go b/cmd/rimo/main.go
index 0b562c9..e75290a 100644
--- a/cmd/rimo/main.go
+++ b/cmd/rimo/main.go
@@ -22,9 +22,9 @@ import (
"os"
"path/filepath"
- "github.com/cgi-fr/rimo/pkg/analyse"
- "github.com/cgi-fr/rimo/pkg/io"
+ "github.com/cgi-fr/rimo/internal/infra"
"github.com/cgi-fr/rimo/pkg/model"
+ "github.com/cgi-fr/rimo/pkg/rimo"
"github.com/rs/zerolog"
"github.com/rs/zerolog/log"
"github.com/spf13/cobra"
@@ -67,6 +67,7 @@ func main() { //nolint:funlen
},
}
+ // Make use of interface instead of analyse/pkg
rimoAnalyseCmd := &cobra.Command{ //nolint:exhaustruct
Use: "analyse [inputDir] [outputDir]",
Short: "Generate a rimo.yaml from a directory of .jsonl files",
@@ -75,21 +76,33 @@ func main() { //nolint:funlen
inputDir := args[0]
outputDir := args[1]
- // List .jsonl files in input directory
- if err := io.ValidateDirPath(inputDir); err != nil {
- log.Fatal().Msgf("error validating input directory: %v", err)
- }
+ // Reader
- inputList, err := FilesList(inputDir, ".jsonl")
+ inputList, err := BuildFilepathList(inputDir, ".jsonl")
if err != nil {
log.Fatal().Msgf("error listing files: %v", err)
}
- if len(inputList) == 0 {
- log.Fatal().Msgf("no .jsonl files found in %s", inputDir)
+ reader, err := infra.FilesReaderFactory(inputList)
+ if err != nil {
+ log.Fatal().Msgf("error creating reader: %v", err)
+ }
+
+ // Writer
+ // (could be relocated to infra.FilesReader)
+ baseName, _, err := infra.ExtractName(inputList[0])
+ if err != nil {
+ log.Fatal().Msgf("error extracting base name: %v", err)
+ }
+
+ outputPath := filepath.Join(outputDir, fmt.Sprintf("%s.yaml", baseName))
+
+ writer, err := infra.YAMLWriterFactory(outputPath)
+ if err != nil {
+ log.Fatal().Msgf("error creating writer: %v", err)
}
- err = analyse.Orchestrator(inputList, outputDir)
+ err = rimo.AnalyseBase(reader, writer)
if err != nil {
log.Fatal().Msgf("error generating rimo.yaml: %v", err)
}
@@ -117,3 +130,44 @@ func FilesList(path string, extension string) ([]string, error) {
return files, nil
}
+
+var ErrNoFile = fmt.Errorf("no file found")
+
+// BuildFilepathList lists the entries of the directory path that match the
+// glob pattern "*"+extension.
+//
+// The directory is first checked with ValidateDirPath. An error wrapping
+// ErrNoFile is returned when the pattern matches nothing.
+func BuildFilepathList(path string, extension string) ([]string, error) {
+	if err := ValidateDirPath(path); err != nil {
+		return nil, fmt.Errorf("failed to validate input directory: %w", err)
+	}
+
+	matches, err := filepath.Glob(filepath.Join(path, "*"+extension))
+
+	switch {
+	case err != nil:
+		return nil, fmt.Errorf("error listing files: %w", err)
+	case len(matches) == 0:
+		return nil, fmt.Errorf("%w : no %s files found in %s", ErrNoFile, extension, path)
+	}
+
+	return matches, nil
+}
+
+// ValidateDirPath checks that path designates a usable directory.
+//
+// It returns an error wrapping infra.ErrDirDoesNotExist when path does not
+// exist, infra.ErrPathIsNotDir when it is not a directory, and
+// infra.ErrWriteDirPermission when its permission bits do not include
+// infra.WriteDirPerm. Any other os.Stat failure is wrapped and returned.
+func ValidateDirPath(path string) error {
+	info, statErr := os.Stat(path)
+
+	switch {
+	case os.IsNotExist(statErr):
+		return fmt.Errorf("%w: %s", infra.ErrDirDoesNotExist, path)
+	case statErr != nil:
+		return fmt.Errorf("failed to get directory info: %w", statErr)
+	case !info.IsDir():
+		return fmt.Errorf("%w: %s", infra.ErrPathIsNotDir, path)
+	case info.Mode().Perm()&infra.WriteDirPerm != infra.WriteDirPerm:
+		return fmt.Errorf("%w: %s", infra.ErrWriteDirPermission, path)
+	}
+
+	return nil
+}
diff --git a/internal/infra/fileWriter.go b/internal/infra/fileWriter.go
new file mode 100644
index 0000000..d652811
--- /dev/null
+++ b/internal/infra/fileWriter.go
@@ -0,0 +1,81 @@
+// Copyright (C) 2023 CGI France
+//
+// This file is part of RIMO.
+//
+// RIMO is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RIMO is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RIMO. If not, see <https://www.gnu.org/licenses/>.
+
+package infra
+
+import (
+ "fmt"
+ "os"
+
+ "github.com/cgi-fr/rimo/pkg/model"
+ "gopkg.in/yaml.v3"
+)
+
+// Terminal writer interface
+
+// StdoutWriter is a writer without any state or configuration.
+type StdoutWriter struct{}
+
+// StdoutWriterFactory returns a pointer to a new StdoutWriter.
+func StdoutWriterFactory() *StdoutWriter {
+	return &StdoutWriter{}
+}
+
+// Export prints base to standard output in its default format, followed by a
+// newline. It always returns nil.
+func (w *StdoutWriter) Export(base *model.Base) error {
+	fmt.Println(base)
+
+	return nil
+}
+
+// YAML writer interface
+
+// YAMLWriter holds the path of the YAML file to produce.
+type YAMLWriter struct {
+	outputPath string
+}
+
+// YAMLWriterFactory returns a YAMLWriter targeting filepath, provided the path
+// is accepted by ValidateOutputPath; otherwise the validation error is wrapped
+// and returned.
+func YAMLWriterFactory(filepath string) (*YAMLWriter, error) {
+	if err := ValidateOutputPath(filepath); err != nil {
+		return nil, fmt.Errorf("failed to validate file path: %w", err)
+	}
+
+	return &YAMLWriter{outputPath: filepath}, nil
+}
+
+// Export writes base as YAML to the file at w.outputPath. The file is created
+// with os.Create, so an existing file is truncated.
+//
+// Failures to create the file, encode base, flush the encoder or close the
+// file are all returned, so a failed flush or close is never reported as a
+// successful export.
+func (w *YAMLWriter) Export(base *model.Base) error {
+	outputFile, err := os.Create(w.outputPath)
+	if err != nil {
+		return fmt.Errorf("failed to create output file: %w", err)
+	}
+
+	// Encode Base to YAML.
+	encoder := yaml.NewEncoder(outputFile)
+
+	if err := encoder.Encode(base); err != nil {
+		// Best-effort cleanup: the encoding error is the one worth reporting.
+		_ = encoder.Close()
+		_ = outputFile.Close()
+
+		return fmt.Errorf("failed to encode Base to YAML: %w", err)
+	}
+
+	// Closing the encoder writes any remaining buffered data.
+	if err := encoder.Close(); err != nil {
+		_ = outputFile.Close()
+
+		return fmt.Errorf("failed to flush YAML encoder: %w", err)
+	}
+
+	if err := outputFile.Close(); err != nil {
+		return fmt.Errorf("failed to close output file: %w", err)
+	}
+
+	return nil
+}
diff --git a/pkg/io/export_test.go b/internal/infra/fileWriter_test.go
similarity index 88%
rename from pkg/io/export_test.go
rename to internal/infra/fileWriter_test.go
index e07f353..cd34651 100644
--- a/pkg/io/export_test.go
+++ b/internal/infra/fileWriter_test.go
@@ -15,14 +15,14 @@
// You should have received a copy of the GNU General Public License
// along with RIMO. If not, see .
-package io_test
+package infra_test
import (
"os"
"path/filepath"
"testing"
- "github.com/cgi-fr/rimo/pkg/io"
+ "github.com/cgi-fr/rimo/internal/infra"
"github.com/cgi-fr/rimo/pkg/model"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
@@ -32,7 +32,7 @@ const (
dataDir = "../../testdata/"
)
-func TestExport(t *testing.T) {
+func TestWriterYAML(t *testing.T) {
t.Parallel()
base := model.Base{
@@ -54,8 +54,11 @@ func TestExport(t *testing.T) {
// Create a temporary file for the output
outputFile := filepath.Join(tempDir, "output.yaml")
- // Export the base to the output file
- err = io.Export(base, outputFile)
+ // Create the writer
+ writer, err := infra.YAMLWriterFactory(outputFile)
+ require.NoError(t, err)
+
+ err = writer.Export(&base)
require.NoError(t, err)
// Read the output file and check its contents
diff --git a/internal/infra/filesReader.go b/internal/infra/filesReader.go
new file mode 100644
index 0000000..8430ae9
--- /dev/null
+++ b/internal/infra/filesReader.go
@@ -0,0 +1,173 @@
+// Copyright (C) 2023 CGI France
+//
+// This file is part of RIMO.
+//
+// RIMO is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RIMO is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RIMO. If not, see <https://www.gnu.org/licenses/>.
+
+package infra
+
+import (
+ "errors"
+ "fmt"
+)
+
+// Errors declaration.
+var (
+ ErrInvalidFilePath = errors.New("failed to validate path")
+ ErrNoFilePath = errors.New("no file path provided")
+ ErrNonUniqueBase = errors.New("base name is not unique")
+)
+
+// FilesReader iterates over the columns of a list of files and feeds them to rimo.
+// It exposes:
+//   - BaseName(): the name of the base
+//   - Next(): moves to the next column and reports whether there is one to read
+//   - Value(): the values of the current column, the column name and the table name
+//
+// Files are loaded through a JSONLinesLoader, so JSON Lines is the only file
+// format currently supported. Use FilesReaderFactory to build a FilesReader.
+type FilesReader struct {
+ filepathList []string
+ loader JSONLinesLoader // loads one file into a map of column name -> values
+ baseName string
+ // Iteration state, updated by Next().
+ fileIndex int // index of the current file in filepathList; -1 until the first Next() call
+ colNameMapIndex map[int]string // column name by column index, rebuilt each time a file is loaded
+ colIndex int // index of the current column in the current file
+ // Data of the current file, filled and served by Value().
+ dataMap map[string][]interface{}
+ tableName string // table name extracted from the current file name
+}
+
+// FilesReaderFactory builds a FilesReader over filepathList.
+//
+// It returns ErrNoFilePath when the list is empty, an error wrapping
+// ErrInvalidFilePath when a path is rejected by ValidateFilePath (the message
+// carries the underlying reason), and a wrapped error when a base name cannot
+// be extracted or the files do not all share the same base name.
+func FilesReaderFactory(filepathList []string) (*FilesReader, error) {
+	if len(filepathList) == 0 {
+		return nil, ErrNoFilePath
+	}
+
+	for _, path := range filepathList {
+		if err := ValidateFilePath(path); err != nil {
+			// Keep ErrInvalidFilePath matchable with errors.Is while telling the
+			// caller which path was rejected and why.
+			return nil, fmt.Errorf("%w: %v", ErrInvalidFilePath, err)
+		}
+	}
+
+	// Initialize FilesReader
+	var filesReader FilesReader
+	filesReader.filepathList = filepathList
+	filesReader.fileIndex = -1 // no file selected until the first call to Next()
+	filesReader.loader = JSONLinesLoader{}
+
+	baseName, err := filesReader.isBaseUnique()
+	if err != nil {
+		return nil, fmt.Errorf("base is not unique: %w", err)
+	}
+
+	filesReader.baseName = baseName
+
+	return &filesReader, nil
+}
+
+// Reader interface implementation
+
+// BaseName returns the base name stored in the reader.
+func (r *FilesReader) BaseName() string {
+ return r.baseName
+}
+
+// Next moves the reader to the next column and reports whether there is one.
+//
+// The first call selects the first column of the first file. Later calls step
+// through the columns of the file currently held in dataMap, then move on to
+// the first column of the next file. It returns false once the columns of the
+// last file are exhausted.
+func (r *FilesReader) Next() bool {
+	// First call: nothing has been selected yet.
+	if r.fileIndex == -1 {
+		r.fileIndex, r.colIndex = 0, 0
+
+		return true
+	}
+
+	columnCount := len(r.dataMap)
+
+	// Step forward while the current file still has columns.
+	if r.colIndex < columnCount {
+		r.colIndex++
+	}
+
+	// Still inside the current file.
+	if r.colIndex != columnCount {
+		return true
+	}
+
+	// Current file is exhausted and it was the last one.
+	if r.fileIndex == len(r.filepathList)-1 {
+		return false
+	}
+
+	// Switch to the first column of the next file.
+	r.fileIndex, r.colIndex = r.fileIndex+1, 0
+
+	return true
+}
+
+// Value returns the values of the current column, the column name and the name
+// of the table it belongs to. It is meant to be called after Next() has
+// returned true.
+//
+// Files are loaded one at a time: when the current column is the first of a
+// file, the table name is extracted from the file name and the file is loaded
+// into dataMap. Columns are indexed in map iteration order. A failure to
+// extract the table name or to load the file is returned as an error.
+//
+// NOTE: a file that yields no column produces an empty column name and nil
+// values.
+func (r *FilesReader) Value() ([]interface{}, string, string, error) {
+	// colIndex == 0 : the current file has not been loaded yet.
+	if r.colIndex == 0 {
+		var err error
+
+		path := r.filepathList[r.fileIndex]
+
+		// Extract table name from file name
+		_, r.tableName, err = ExtractName(path)
+		if err != nil {
+			return nil, "", "", fmt.Errorf("failed to extract table name: %w", err)
+		}
+
+		// Load file in dataMap. The error is returned rather than panicking so
+		// the caller decides how to handle an unreadable file.
+		r.dataMap, err = r.loader.Load(path)
+		if err != nil {
+			return nil, "", "", fmt.Errorf("failed to load file: %w", err)
+		}
+
+		// Give every column a stable index for the lifetime of this file.
+		r.colNameMapIndex = make(map[int]string, len(r.dataMap))
+		i := 0
+
+		for name := range r.dataMap {
+			r.colNameMapIndex[i] = name
+			i++
+		}
+	}
+
+	// colIndex == n : the current file has been partially processed.
+	currentColName := r.colNameMapIndex[r.colIndex]
+
+	return r.dataMap[currentColName], currentColName, r.tableName, nil
+}
+
+// isBaseUnique extracts the base name of every path in filepathList and
+// returns it when they all agree. It returns an error wrapping
+// ErrNonUniqueBase on the first mismatch, or the ExtractName error when a name
+// cannot be extracted.
+func (r *FilesReader) isBaseUnique() (string, error) {
+	reference, _, err := ExtractName(r.filepathList[0])
+	if err != nil {
+		return "", err
+	}
+
+	// The first path is the reference; only the others need comparing.
+	for _, candidatePath := range r.filepathList[1:] {
+		candidate, _, err := ExtractName(candidatePath)
+		if err != nil {
+			return "", err
+		}
+
+		if candidate != reference {
+			return "", fmt.Errorf("%w : %s and %s", ErrNonUniqueBase, reference, candidate)
+		}
+	}
+
+	return reference, nil
+}
diff --git a/internal/infra/filesReader_test.go b/internal/infra/filesReader_test.go
new file mode 100644
index 0000000..ed6cdc0
--- /dev/null
+++ b/internal/infra/filesReader_test.go
@@ -0,0 +1,83 @@
+// Copyright (C) 2023 CGI France
+//
+// This file is part of RIMO.
+//
+// RIMO is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RIMO is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RIMO. If not, see <https://www.gnu.org/licenses/>.
+
+package infra_test
+
+import (
+ "fmt"
+ "path/filepath"
+ "testing"
+
+ "github.com/cgi-fr/rimo/internal/infra"
+ "github.com/stretchr/testify/assert"
+)
+
+// TestReader checks the base name, table name and column values that
+// FilesReader exposes for a single JSON Lines file.
+func TestReader(t *testing.T) {
+	t.Parallel()
+
+	inputFile := filepath.Join(dataDir, "data0/data_input.jsonl")
+
+	reader, err := infra.FilesReaderFactory([]string{inputFile})
+	if !assert.NoError(t, err) {
+		return // reader is nil on failure; using it would panic
+	}
+
+	// Assertions.
+
+	actualBaseName := reader.BaseName()
+	expectedBaseName := "data"
+	assert.Equal(t, expectedBaseName, actualBaseName)
+
+	expectedTableName := "input"
+	expectedDataMap := map[string][]interface{}{
+		"address": {"PSC", "095", "06210"},
+		"age":     {nil, nil, float64(61)},
+		"major":   {true, false, true},
+		"empty":   {nil, nil, nil},
+	}
+
+	checkedColumns := 0
+
+	for reader.Next() {
+		values, colName, tableName, err := reader.Value()
+		if !assert.NoError(t, err) {
+			return
+		}
+
+		expectedColData, ok := expectedDataMap[colName]
+		if !ok {
+			// Failf formats its message; Fail would print the verb literally.
+			assert.Failf(t, "unexpected column", "column name not found : %s", colName)
+
+			continue
+		}
+
+		assert.Equal(t, expectedColData, values)
+		assert.Equal(t, expectedTableName, tableName)
+
+		checkedColumns++
+	}
+
+	// Guard against a reader that yields nothing, which would otherwise pass.
+	assert.NotZero(t, checkedColumns, "reader returned no column")
+}
+
+// TestReaderMultipleFiles reads every column of two files and prints them; it
+// only asserts that no error is returned along the way.
+func TestReaderMultipleFiles(t *testing.T) {
+	t.Parallel()
+
+	inputFile := filepath.Join(dataDir, "data0/data_input.jsonl")
+	inputFile2 := filepath.Join(dataDir, "data0/data_input2.jsonl")
+
+	reader, err := infra.FilesReaderFactory([]string{inputFile, inputFile2})
+	if !assert.NoError(t, err) {
+		return // reader is nil on failure; using it would panic
+	}
+
+	for reader.Next() {
+		values, colName, tableName, err := reader.Value()
+		if !assert.NoError(t, err) {
+			return
+		}
+
+		fmt.Printf("%s.%s: %v\n", tableName, colName, values)
+	}
+}
diff --git a/internal/infra/infra_test.go b/internal/infra/infra_test.go
new file mode 100644
index 0000000..ad40f79
--- /dev/null
+++ b/internal/infra/infra_test.go
@@ -0,0 +1,93 @@
+// Copyright (C) 2023 CGI France
+//
+// This file is part of RIMO.
+//
+// RIMO is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RIMO is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RIMO. If not, see <https://www.gnu.org/licenses/>.
+
+package infra_test
+
+import (
+ "path/filepath"
+ "testing"
+
+ "github.com/cgi-fr/rimo/internal/infra"
+ "github.com/cgi-fr/rimo/pkg/rimo"
+ "github.com/stretchr/testify/require"
+)
+
+const (
+ testdataDir = "../../testdata/"
+)
+
+// TestPipeline runs rimo.AnalyseBase with a FilesReader as input and a
+// StdoutWriter as output.
+func TestPipeline(t *testing.T) {
+	t.Parallel()
+
+	jsonlPaths := []string{filepath.Join(testdataDir, "data1/data_input.jsonl")}
+
+	reader, err := infra.FilesReaderFactory(jsonlPaths)
+	require.NoError(t, err)
+
+	require.NoError(t, rimo.AnalyseBase(reader, infra.StdoutWriterFactory()))
+}
+
+// var (
+// Readers []*rimo.Reader
+// Writers []*rimo.Writer
+// )
+
+// // List of implemented readers and writers.
+// func GetReaders(filepathList []string) []*rimo.Reader {
+// filesReader, err := infra.FilesReaderFactory(filepathList)
+// if err != nil {
+// panic(err)
+// }
+
+// Readers = []*rimo.Reader{filesReader}
+
+// return Readers
+// }
+
+// func GetWriters() []*rimo.Writer {
+// yamlWriter := infra.YAMLWriterFactory("../../testdata/data1/data_output.yaml")
+
+// Writers = []*rimo.Writer{yamlWriter, infra.StdoutWriter{}}
+
+// return Writers
+// }
+
+// func TestInterface(t *testing.T) {
+// t.Parallel()
+
+// Writers = GetWriters()
+// Readers = GetReaders([]string{"../../testdata/data1/data_input.jsonl"})
+// // Assert that all readers and writers implement the Reader and Writer interfaces.
+// for _, reader := range Readers {
+// var _ rimo.Reader = (reader)(nil)
+// }
+// for _, writer := range Writers {
+// var _ rimo.Reader = (writer)(nil)
+// }
+
+// // Assert that all combinations of readers and writers can be used in the pipeline.
+// for _, reader := range Readers {
+// for _, writer := range Writers {
+// err := rimo.AnalyseBase(reader, writer)
+// require.NoError(t, err)
+// }
+// }
+// }
diff --git a/pkg/io/load.go b/internal/infra/loader.go
similarity index 82%
rename from pkg/io/load.go
rename to internal/infra/loader.go
index 7890ed5..f15ea38 100644
--- a/pkg/io/load.go
+++ b/internal/infra/loader.go
@@ -15,7 +15,7 @@
// You should have received a copy of the GNU General Public License
// along with RIMO. If not, see .
-package io
+package infra
import (
"bufio"
@@ -39,8 +39,14 @@ var (
type DataMap map[string][]interface{}
-// Load .jsonl and return DataMap.
-func Load(inputPath string) (DataMap, error) {
+// JSONLinesLoader loads JSON lines files with this format : { "col_name1" : value1, "col_name2" : value1, ... }.
+// It may be interesting performance wise to use this format :
+// "col_name1" : [value1, value2, ...],
+// "col_name2" : [value1, value2, ...],
+
+type JSONLinesLoader struct{}
+
+func (l *JSONLinesLoader) Load(inputPath string) (DataMap, error) {
file, err := os.Open(inputPath)
if err != nil {
return nil, fmt.Errorf("couldn't load %s : %w", inputPath, err)
@@ -49,7 +55,7 @@ func Load(inputPath string) (DataMap, error) {
scanner := bufio.NewScanner(file)
- data, err := LoadJSONLines(scanner)
+ data, err := l.LoadJSONLines(scanner)
if err != nil {
return nil, err
}
@@ -58,7 +64,7 @@ func Load(inputPath string) (DataMap, error) {
}
// Reads JSON lines structure: { "col_name1" : value1, "col_name2" : value1, ... }.
-func LoadJSONLines(scanner *bufio.Scanner) (DataMap, error) {
+func (l *JSONLinesLoader) LoadJSONLines(scanner *bufio.Scanner) (DataMap, error) {
var data map[string][]interface{} = DataMap{}
lineNumber := 0
diff --git a/pkg/io/load_test.go b/internal/infra/loader_test.go
similarity index 74%
rename from pkg/io/load_test.go
rename to internal/infra/loader_test.go
index 2cd17e3..d55186d 100644
--- a/pkg/io/load_test.go
+++ b/internal/infra/loader_test.go
@@ -15,21 +15,25 @@
// You should have received a copy of the GNU General Public License
// along with RIMO. If not, see .
-package io_test
+package infra_test
import (
+ "fmt"
"path/filepath"
"testing"
- "github.com/cgi-fr/rimo/pkg/io"
+ "github.com/cgi-fr/rimo/internal/infra"
"github.com/stretchr/testify/require"
)
-func TestLoad(t *testing.T) {
+func TestLoaderJSONL(t *testing.T) {
t.Parallel()
- path := filepath.Join(dataDir, "data1/data_input.jsonl")
+ path := filepath.Join(testdataDir, "data1/data_input.jsonl")
- _, err := io.Load(path)
+ LoaderJSONL := infra.JSONLinesLoader{}
+
+ data, err := LoaderJSONL.Load(path)
require.NoError(t, err)
+ fmt.Printf("dataMap: %v\n", data)
}
diff --git a/internal/infra/utils.go b/internal/infra/utils.go
new file mode 100644
index 0000000..cfb6b67
--- /dev/null
+++ b/internal/infra/utils.go
@@ -0,0 +1,110 @@
+// Copyright (C) 2023 CGI France
+//
+// This file is part of RIMO.
+//
+// RIMO is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RIMO is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RIMO. If not, see <https://www.gnu.org/licenses/>.
+
+package infra
+
+import (
+ "errors"
+ "fmt"
+ "os"
+ "path/filepath"
+ "strings"
+)
+
+// Sentinel errors wrapped by the path validation helpers; match them with
+// errors.Is.
+var (
+	ErrFileDoesNotExist   = errors.New("file does not exist")
+	ErrDirDoesNotExist    = errors.New("directory does not exist")
+	ErrPathIsNotDir       = errors.New("path is not a directory")
+	ErrNotRegularFile     = errors.New("path is not a regular file")
+	ErrReadPermission     = errors.New("user does not have read permission for file")
+	ErrWriteDirPermission = errors.New("user does not have write permission for directory")
+)
+
+// Permission bits tested against os.FileInfo.Mode().Perm() by the path
+// validation helpers.
+const (
+ ReadPerm os.FileMode = 0o400 // owner read bit
+ WriteDirPerm os.FileMode = 0o200 // owner write bit
+)
+
+// ValidateFilePath checks that path designates a readable regular file.
+//
+// It returns an error wrapping ErrFileDoesNotExist when path does not exist,
+// ErrNotRegularFile when it is not a regular file, and ErrReadPermission when
+// its permission bits do not include ReadPerm. Any other os.Stat failure is
+// wrapped and returned.
+func ValidateFilePath(path string) error {
+	info, statErr := os.Stat(path)
+
+	switch {
+	case os.IsNotExist(statErr):
+		return fmt.Errorf("%w: %s", ErrFileDoesNotExist, path)
+	case statErr != nil:
+		return fmt.Errorf("%w: failed to get file info %s", statErr, path)
+	case !info.Mode().IsRegular():
+		return fmt.Errorf("%w: %s", ErrNotRegularFile, path)
+	case info.Mode().Perm()&ReadPerm != ReadPerm:
+		return fmt.Errorf("%w: %s", ErrReadPermission, path)
+	}
+
+	return nil
+}
+
+// ValidateOutputPath checks that path can be used as an output file path. The
+// file itself does not have to exist: only the shape of path and its parent
+// directory are checked.
+//
+// It returns an error wrapping:
+//   - ErrPathIsNotDir when path has no file extension, or when the parent of
+//     path exists but is not a directory,
+//   - ErrDirDoesNotExist when the parent directory does not exist,
+//   - ErrWriteDirPermission when the parent's permission bits do not include
+//     WriteDirPerm.
+//
+// Any other os.Stat failure is wrapped and returned.
+func ValidateOutputPath(path string) error {
+	// A path without extension is rejected: a file path is expected here.
+	if filepath.Ext(path) == "" {
+		return fmt.Errorf("%w: %s", ErrPathIsNotDir, path)
+	}
+	// Get directory out of filepath
+	dirPath := filepath.Dir(path)
+
+	// Check if directory exists
+	fileInfo, err := os.Stat(dirPath)
+	if os.IsNotExist(err) {
+		return fmt.Errorf("%w: %s", ErrDirDoesNotExist, dirPath)
+	} else if err != nil {
+		return fmt.Errorf("failed to get directory info: %w", err)
+	}
+
+	// The parent must be a directory, otherwise the file cannot be created in it.
+	if !fileInfo.IsDir() {
+		return fmt.Errorf("%w: %s", ErrPathIsNotDir, dirPath)
+	}
+
+	// Check directory permissions
+	if fileInfo.Mode().Perm()&WriteDirPerm != WriteDirPerm {
+		return fmt.Errorf("%w: %s", ErrWriteDirPermission, dirPath)
+	}
+
+	return nil
+}
+
+// filesReader.go UTILS
+
+// ErrNonExtractibleValue is wrapped by ExtractName when a file name cannot be
+// split into a base name and a table name.
+var ErrNonExtractibleValue = errors.New("couldn't extract base or table name from path")
+
+// ExtractName splits the file name of path into a base name and a table name.
+// The expected path format is /path/to/jsonl/BASE_TABLE.jsonl: the extension is
+// dropped and the remainder must contain exactly one underscore, with a
+// non-empty name on each side. Otherwise an error wrapping
+// ErrNonExtractibleValue is returned.
+func ExtractName(path string) (string, string, error) {
+	file := filepath.Base(path)
+	stem := strings.TrimSuffix(file, filepath.Ext(file))
+
+	// Exactly one underscore must separate the two names.
+	if strings.Count(stem, "_") != 1 {
+		return "", "", fmt.Errorf("%w : %s", ErrNonExtractibleValue, path)
+	}
+
+	separator := strings.Index(stem, "_")
+	baseName, tableName := stem[:separator], stem[separator+1:]
+
+	switch {
+	case baseName == "":
+		return "", "", fmt.Errorf("%w : base name is empty from %s", ErrNonExtractibleValue, path)
+	case tableName == "":
+		return "", "", fmt.Errorf("%w : table name is empty from %s", ErrNonExtractibleValue, path)
+	}
+
+	return baseName, tableName, nil
+}
diff --git a/pkg/analyse/analyse.go b/pkg/analyse/analyse.go
deleted file mode 100644
index 90ea7a0..0000000
--- a/pkg/analyse/analyse.go
+++ /dev/null
@@ -1,186 +0,0 @@
-// Copyright (C) 2023 CGI France
-//
-// This file is part of RIMO.
-//
-// RIMO is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// RIMO is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with RIMO. If not, see .
-
-package analyse
-
-import (
- "errors"
- "fmt"
- "path/filepath"
- "strings"
-
- "github.com/cgi-fr/rimo/pkg/io"
- "github.com/cgi-fr/rimo/pkg/metric"
- "github.com/cgi-fr/rimo/pkg/model"
-)
-
-var ErrWrongParameter = errors.New("wrong parameter")
-
-// Handle execution pipeline of rimo analyse.
-func Orchestrator(inputList []string, outputPath string) error {
- // Process input
- err := ProcessInput(inputList, outputPath)
- if err != nil {
- return err
- }
-
- // Compute model.base
- base, err := Build(inputList)
- if err != nil {
- return err
- }
-
- // Export rimo.yaml
- outputPath = filepath.Join(outputPath, base.Name+".yaml")
-
- err = io.Export(base, outputPath)
- if err != nil {
- return fmt.Errorf("%w : cannot export to %s", err, outputPath)
- }
-
- return nil
-}
-
-func ProcessInput(inputList []string, outputPath string) error {
- // verify output dirPath
- err := io.ValidateDirPath(outputPath)
- if err != nil {
- return fmt.Errorf("failed to validate output path: %w", err)
- }
-
- // validate input filepath
- for i := range inputList {
- err := io.ValidateFilePath(inputList[i])
- if err != nil {
- return fmt.Errorf("failed to validate input file: %w", err)
- }
- }
-
- // verify that input files relates to the same base
- err = BaseIsUnique(inputList)
- if err != nil {
- return fmt.Errorf("failed to validate input file: %w", err)
- }
-
- return nil
-}
-
-// Return a model.Base from inputList.
-func Build(inputList []string) (model.Base, error) {
- baseName, _, err := ExtractName(inputList[0])
- if err != nil {
- return model.Base{}, fmt.Errorf("failed to extract base name for %s: %w", inputList[0], err)
- }
-
- base := model.Base{
- Name: baseName,
- Tables: []model.Table{},
- }
-
- for _, inputPath := range inputList {
- _, tableName, err := ExtractName(inputPath)
- if err != nil {
- return model.Base{}, fmt.Errorf("failed to extract table name for %s: %w", inputPath, err)
- }
-
- columns, err := Analyse(inputPath)
- if err != nil {
- return model.Base{}, fmt.Errorf("failed to analyse %s: %w", inputPath, err)
- }
-
- // Add columns to base
- table := model.Table{
- Name: tableName,
- Columns: columns,
- }
- base.Tables = append(base.Tables, table)
- }
-
- base.SortBase()
-
- return base, nil
-}
-
-// Return a list of column from a jsonl file.
-func Analyse(path string) ([]model.Column, error) {
- // Load file in a dataMap.
- data, err := io.Load(path)
- if err != nil {
- return nil, fmt.Errorf("failed to load jsonl file: %w", err)
- }
-
- columns := []model.Column{}
-
- for colName, values := range data {
- column, err := metric.ComputeMetric(colName, values)
- if err != nil {
- return nil, fmt.Errorf("failed to compute metric: %w", err)
- }
-
- columns = append(columns, column)
- }
-
- return columns, nil
-}
-
-// Error definitions.
-
-var ErrNonExtractibleValue = errors.New("couldn't extract base or table name from path")
-
-func ExtractName(path string) (string, string, error) {
- // path format : /path/to/jsonl/BASE_TABLE.jsonl
- fileName := strings.TrimSuffix(filepath.Base(path), filepath.Ext(filepath.Base(path)))
-
- parts := strings.Split(fileName, "_")
- if len(parts) != 2 { //nolint:gomnd
- return "", "", fmt.Errorf("%w : %s", ErrNonExtractibleValue, path)
- }
-
- baseName := parts[0]
- if baseName == "" {
- return "", "", fmt.Errorf("%w : base name is empty from %s", ErrNonExtractibleValue, path)
- }
-
- tableName := parts[1]
- if tableName == "" {
- return "", "", fmt.Errorf("%w : table name is empty from %s", ErrNonExtractibleValue, path)
- }
-
- return baseName, tableName, nil
-}
-
-var ErrNonUniqueBase = errors.New("base name is not unique")
-
-func BaseIsUnique(pathList []string) error {
- baseName, _, err := ExtractName(pathList[0])
- if err != nil {
- return err
- }
-
- for _, path := range pathList {
- baseNameI, _, err := ExtractName(path)
- if err != nil {
- return err
- }
-
- if baseName != baseNameI {
- return fmt.Errorf("%w : %s and %s", ErrNonUniqueBase, baseName, baseNameI)
- }
- }
-
- return nil
-}
diff --git a/pkg/analyse/analyse_test.go b/pkg/analyse/analyse_test.go
deleted file mode 100644
index b3a6aa2..0000000
--- a/pkg/analyse/analyse_test.go
+++ /dev/null
@@ -1,296 +0,0 @@
-// Copyright (C) 2023 CGI France
-//
-// This file is part of RIMO.
-//
-// RIMO is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// RIMO is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with RIMO. If not, see .
-
-package analyse_test
-
-import (
- "bytes"
- "errors"
- "fmt"
- "os"
- "path/filepath"
- "reflect"
- "strings"
- "testing"
- "time"
-
- "gopkg.in/yaml.v3"
-
- "github.com/cgi-fr/rimo/pkg/analyse"
- "github.com/cgi-fr/rimo/pkg/io"
- "github.com/cgi-fr/rimo/pkg/model"
- "github.com/hexops/valast"
- "github.com/stretchr/testify/assert"
- "github.com/stretchr/testify/require"
-)
-
-const (
- dataDir = "../../testdata/"
- inputName = "data_input.jsonl"
- outputName = "data_output.yaml"
- expectedName = "data_expected.yaml"
-)
-
-type testCase struct {
- name string
- inputPath string
- outputPath string
- expectedPath string
-}
-
-func getTestCase(dataFolder string) testCase {
- return testCase{
- name: filepath.Base(dataFolder),
- inputPath: filepath.Join(dataFolder, inputName),
- outputPath: filepath.Join(dataFolder, outputName),
- expectedPath: filepath.Join(dataFolder, expectedName),
- }
-}
-
-// Execute Analyse pipeline and compare with expected result.
-func TestAnalyse(t *testing.T) {
- t.Parallel()
-
- testCases := []testCase{}
- testCases = append(testCases, getTestCase("../../testdata/data1/"))
- testCases = append(testCases, getTestCase("../../testdata/data2/"))
-
- for _, testCase := range testCases {
- testCase := testCase // capture range variable
- t.Run(testCase.name, func(t *testing.T) {
- t.Parallel()
-
- runAnalyse(t, testCase.inputPath, testCase.outputPath)
- compareFileOutput(t, testCase.outputPath, testCase.expectedPath)
- compareObjectOutput(t, testCase.outputPath, testCase.expectedPath)
- })
- }
-}
-
-func runAnalyse(t *testing.T, inputPath string, outputPath string) {
- t.Helper()
-
- inputList := []string{inputPath}
-
- base, err := analyse.Build(inputList)
- require.NoError(t, err)
-
- if outputPath != "" {
- err = io.Export(base, outputPath)
- require.NoError(t, err)
- }
-}
-
-func compareFileOutput(t *testing.T, outputPath string, testPath string) {
- t.Helper()
-
- actualOutput := getText(t, outputPath)
- expectedOutput := getText(t, testPath)
-
- // Call removeSampleFromStrings
- actualOutput = removeSampleFromStrings(actualOutput)
- expectedOutput = removeSampleFromStrings(expectedOutput)
-
- // Compare the expected output and actual output
- assert.Equal(t, expectedOutput, actualOutput)
-}
-
-func compareObjectOutput(t *testing.T, outputPath string, testPath string) {
- t.Helper()
-
- actualOutputBase := loadYAML(t, outputPath)
- expectedOutputBase := loadYAML(t, testPath)
-
- // Remove sample fields from both model.Base.
- actualOutputBase = removeSampleFromBase(actualOutputBase)
- expectedOutputBase = removeSampleFromBase(expectedOutputBase)
-
- // Compare the expected output and actual output except all sample fields.
- equal, diff := EqualBase(expectedOutputBase, actualOutputBase)
- if !equal {
- t.Errorf("base are not similar : %s", diff)
- }
-}
-
-// Benchmark Analyse pipeline.
-
-func BenchmarkAnalyse(b *testing.B) {
- for _, numLines := range []int{100, 1000, 10000, 100000} {
- inputPath := filepath.Join(dataDir, fmt.Sprintf("benchmark/mixed/%d_input.jsonl", numLines))
- inputList := []string{inputPath}
- outputPath := filepath.Join(dataDir, "benchmark/mixed/")
-
- b.Run(fmt.Sprintf("numLines=%d", numLines), func(b *testing.B) {
- startTime := time.Now()
-
- b.ResetTimer()
- for n := 0; n < b.N; n++ {
- err := analyse.Orchestrator(inputList, outputPath)
- require.NoError(b, err)
- }
- b.StopTimer()
-
- elapsed := time.Since(startTime)
- linesPerSecond := float64(numLines*b.N) / elapsed.Seconds()
- b.ReportMetric(linesPerSecond, "lines/s")
- })
- }
-}
-
-func TestExtractName(t *testing.T) {
- t.Parallel()
-
- path := "path/to/dir/basename_tablename.jsonl"
- expectedBase, expectedName := "basename", "tablename"
- actualBase, actualName, err := analyse.ExtractName(path)
- assert.NoError(t, err)
-
- assert.Equal(t, expectedBase, actualBase)
- assert.Equal(t, expectedName, actualName)
-
- path = "basename_tablename.jsonl"
- expectedBase, expectedName = "basename", "tablename"
- actualBase, actualName, err = analyse.ExtractName(path)
- assert.NoError(t, err)
-
- assert.Equal(t, expectedBase, actualBase)
- assert.Equal(t, expectedName, actualName)
-
- invalidPath := ""
-
- _, _, err = analyse.ExtractName(invalidPath)
- if !errors.Is(err, analyse.ErrNonExtractibleValue) {
- t.Errorf("expected error %v, but got %v", analyse.ErrNonExtractibleValue, err)
- }
-}
-
-func TestBaseIsUnique(t *testing.T) {
- t.Parallel()
-
- inputList := []string{
- "/data/somewhere/BASE_test.jsonl",
- "/data/somewhere/BASE3221_test.jsonl",
- }
-
- err := analyse.BaseIsUnique(inputList)
- assert.ErrorIs(t, err, analyse.ErrNonUniqueBase)
-}
-
-// Helper functions
-
-func loadYAML(t *testing.T, path string) model.Base {
- t.Helper()
-
- // Load output file
- file, err := os.Open(path)
- require.NoError(t, err)
-
- decoder := yaml.NewDecoder(file)
-
- var base model.Base
- err = decoder.Decode(&base)
-
- if err != nil {
- t.Errorf("error while decoding yaml file: %v", err)
- }
-
- file.Close()
-
- return base
-}
-
-func getText(t *testing.T, outputPath string) string {
- t.Helper()
-
- file, err := os.Open(outputPath)
- require.NoError(t, err)
-
- var output string
-
- buf := new(bytes.Buffer)
- _, err = buf.ReadFrom(file)
- require.NoError(t, err)
- file.Close()
-
- output = buf.String()
-
- return output
-}
-
-func removeSampleFromBase(base model.Base) model.Base {
- for tableI, table := range base.Tables {
- for columnJ, column := range table.Columns {
- column.MainMetric.Sample = nil
-
- if column.Type == model.ValueType.String {
- for freqLen := range column.StringMetric.MostFreqLen {
- column.StringMetric.MostFreqLen[freqLen].Sample = nil
- }
-
- for freqLen := range column.StringMetric.LeastFreqLen {
- column.StringMetric.LeastFreqLen[freqLen].Sample = nil
- }
- }
-
- base.Tables[tableI].Columns[columnJ] = column
- }
- }
-
- return base
-}
-
-func removeSampleFromStrings(rimoString string) string {
- // Split at every new line
- lines := strings.Split(rimoString, "\n")
-
- // Filter out sample by skipping sampleSize + 1 lines when a line contain "sample" or "leastFrequentSample:"
- var filteredLines []string
-
- var skipLine int
-
- sampleSizeSkip := model.SampleSize + 1
-
- for _, line := range lines {
- // sample of stringMetric.MostFreqLen and stringMetric.LeastFreqLen may be of different length, skipping when nex
- if skipLine > 0 && strings.Contains(line, " - length:") || strings.Contains(line, " - name:") {
- skipLine = 0
- }
-
- switch {
- case skipLine > 0:
- skipLine--
- case strings.Contains(line, "sample:"):
- skipLine = sampleSizeSkip
- default:
- filteredLines = append(filteredLines, line)
- }
- }
-
- // Join the filtered lines back into a string
- rimoString = strings.Join(filteredLines, "\n")
-
- return rimoString
-}
-
-// DeepEqual two model.Base.
-func EqualBase(base1, base2 model.Base) (bool, string) {
- if !reflect.DeepEqual(base1, base2) {
- return false, fmt.Sprintf("base is different : %s \n \n %s", valast.String(base1), valast.String(base2))
- }
-
- return true, ""
-}
diff --git a/pkg/io/utils.go b/pkg/io/utils.go
deleted file mode 100644
index 9b8fbef..0000000
--- a/pkg/io/utils.go
+++ /dev/null
@@ -1,79 +0,0 @@
-// Copyright (C) 2023 CGI France
-//
-// This file is part of RIMO.
-//
-// RIMO is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// RIMO is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with RIMO. If not, see .
-
-package io
-
-import (
- "fmt"
- "os"
-)
-
-var (
- ErrFileDoesNotExist = fmt.Errorf("file does not exist")
- ErrDirDoesNotExist = fmt.Errorf("directory does not exist")
- ErrPathIsNotDir = fmt.Errorf("path is not a directory")
- ErrNotRegularFile = fmt.Errorf("path is not a regular file")
- ErrReadPermission = fmt.Errorf("user does not have read permission for file")
- ErrWriteDirPermission = fmt.Errorf("user does not have write permission for directory")
-)
-
-const (
- ReadPerm os.FileMode = 0o400
- WriteDirPerm os.FileMode = 0o200
-)
-
-func ValidateFilePath(path string) error {
- fileInfo, err := os.Stat(path)
- if err != nil {
- if os.IsNotExist(err) {
- return fmt.Errorf("%w : %s", ErrFileDoesNotExist, path)
- }
-
- return fmt.Errorf("%w : failed to get file info %s", err, path)
- }
-
- if !fileInfo.Mode().IsRegular() {
- return fmt.Errorf("%w : %s", ErrNotRegularFile, path)
- }
-
- if fileInfo.Mode().Perm()&ReadPerm != ReadPerm {
- return fmt.Errorf("%w : %s", ErrReadPermission, path)
- }
-
- return nil
-}
-
-func ValidateDirPath(path string) error {
- fileInfo, err := os.Stat(path)
- if err != nil {
- if os.IsNotExist(err) {
- return fmt.Errorf("%w : %s", ErrDirDoesNotExist, path)
- }
-
- return fmt.Errorf("failed to get directory info: %w", err)
- }
-
- if !fileInfo.IsDir() {
- return fmt.Errorf("%w : %s", ErrPathIsNotDir, path)
- }
-
- if fileInfo.Mode().Perm()&WriteDirPerm != WriteDirPerm {
- return fmt.Errorf("%w : %s", ErrWriteDirPermission, path)
- }
-
- return nil
-}
diff --git a/pkg/metric/build.go b/pkg/metric/build.go
index b80c6c5..058c539 100644
--- a/pkg/metric/build.go
+++ b/pkg/metric/build.go
@@ -34,7 +34,7 @@ func ComputeMetric(colName string, values []interface{}) (model.Column, error) {
// Create the column.
col := model.Column{
Name: colName,
- Type: ColType(values),
+ Type: GetColType(values),
Concept: "",
Constraint: []string{},
Confidential: confidential,
@@ -52,19 +52,19 @@ func ComputeMetric(colName string, values []interface{}) (model.Column, error) {
// Type specific metric
switch col.Type {
- case model.ValueType.String:
+ case model.ColType.String:
err := SetStringMetric(values, &col.StringMetric)
if err != nil {
return model.Column{}, fmt.Errorf("error computing string metric in column %v : %w", col.Name, err)
}
- case model.ValueType.Numeric:
+ case model.ColType.Numeric:
err := SetNumericMetric(values, &col.NumericMetric)
if err != nil {
return model.Column{}, fmt.Errorf("error computing numeric metric in column %v : %w", col.Name, err)
}
- case model.ValueType.Bool:
+ case model.ColType.Bool:
err := SetBoolMetric(values, &col.BoolMetric)
if err != nil {
return model.Column{}, fmt.Errorf("error computing bool metric in column %v : %w", col.Name, err)
@@ -74,10 +74,10 @@ func ComputeMetric(colName string, values []interface{}) (model.Column, error) {
return col, nil
}
-func ColType(values []interface{}) model.RIMOType {
- colType := model.ValueType.Undefined
- for i := 0; i < len(values) && colType == model.ValueType.Undefined; i++ {
- colType = ValueType(values[i])
+func GetColType(values []interface{}) model.ValueType {
+ colType := model.ColType.Undefined
+ for i := 0; i < len(values) && colType == model.ColType.Undefined; i++ {
+ colType = ColType(values[i])
}
return colType
@@ -89,6 +89,7 @@ func GetFrequency(occurrence int, count int) float64 {
return float64(occurrence) / float64(count)
}
+// To check why not using isNil() ?
func GetFirstValue(values []interface{}) interface{} {
for _, value := range values {
if value != nil {
@@ -99,19 +100,19 @@ func GetFirstValue(values []interface{}) interface{} {
return nil
}
-func ValueType(value interface{}) model.RIMOType {
+func ColType(value interface{}) model.ValueType {
switch value.(type) {
case int:
- return model.ValueType.Numeric
+ return model.ColType.Numeric
case float64:
- return model.ValueType.Numeric
+ return model.ColType.Numeric
case json.Number:
- return model.ValueType.Numeric
+ return model.ColType.Numeric
case string:
- return model.ValueType.String
+ return model.ColType.String
case bool:
- return model.ValueType.Bool
+ return model.ColType.Bool
default:
- return model.ValueType.Undefined
+ return model.ColType.Undefined
}
}
diff --git a/pkg/metric/build_test.go b/pkg/metric/build_test.go
deleted file mode 100644
index 5fa9249..0000000
--- a/pkg/metric/build_test.go
+++ /dev/null
@@ -1,63 +0,0 @@
-// Copyright (C) 2023 CGI France
-//
-// This file is part of RIMO.
-//
-// RIMO is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// RIMO is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with RIMO. If not, see .
-
-package metric_test
-
-import (
- "fmt"
- "testing"
- "time"
-
- "github.com/cgi-fr/rimo/pkg/analyse"
- "github.com/cgi-fr/rimo/pkg/model"
- "github.com/stretchr/testify/require"
-)
-
-const (
- dataDir = "../../testdata/benchmark"
-)
-
-var result model.Base //nolint:gochecknoglobals // used in benchmark to avoid misleading compiler optimisation.
-
-func BenchmarkMetric(b *testing.B) {
- listNumValues := []int{100, 1000, 10000}
- listType := []string{"numeric", "text", "bool"}
-
- for _, dataType := range listType {
- for _, numValues := range listNumValues {
- inputList := []string{fmt.Sprintf("%s/%s/%d_input.jsonl", dataDir, dataType, numValues)}
-
- b.Run(fmt.Sprintf("type= %s, numValues=%d", dataType, numValues), func(b *testing.B) {
- startTime := time.Now()
-
- base := model.Base{} //nolint:exhaustruct
- var err error
-
- for n := 0; n < b.N; n++ {
- base, err = analyse.Build(inputList)
- require.NoError(b, err)
- }
-
- result = base
-
- elapsed := time.Since(startTime)
- valuesPerSecond := float64(numValues*b.N) / elapsed.Seconds()
- b.ReportMetric(valuesPerSecond, "lines/s")
- })
- }
- }
-}
diff --git a/pkg/metric/generic_test.go b/pkg/metric/generic_test.go
index 5210e7d..4af5071 100644
--- a/pkg/metric/generic_test.go
+++ b/pkg/metric/generic_test.go
@@ -36,16 +36,16 @@ func TestCountEmpty(t *testing.T) {
assert.Equal(t, expected, actual)
}
-func TestColType(t *testing.T) {
+func TestGetColType(t *testing.T) {
t.Parallel()
t.Run("numeric", func(t *testing.T) {
t.Parallel()
slice := []interface{}{nil, 2, 3}
- expected := model.ValueType.Numeric
+ expected := model.ColType.Numeric
- actual := metric.ColType(slice)
+ actual := metric.GetColType(slice)
require.Equal(t, expected, actual)
})
@@ -53,9 +53,9 @@ func TestColType(t *testing.T) {
t.Parallel()
slice := []interface{}{nil, "text", nil}
- expected := model.ValueType.String
+ expected := model.ColType.String
- actual := metric.ColType(slice)
+ actual := metric.GetColType(slice)
require.Equal(t, expected, actual)
})
@@ -63,9 +63,9 @@ func TestColType(t *testing.T) {
t.Parallel()
slice := []interface{}{nil, true, false}
- expected := model.ValueType.Bool
+ expected := model.ColType.Bool
- actual := metric.ColType(slice)
+ actual := metric.GetColType(slice)
require.Equal(t, expected, actual)
})
@@ -74,9 +74,9 @@ func TestColType(t *testing.T) {
t.Parallel()
slice := []interface{}{"text", 2, false}
- expected := model.ValueType.String
+ expected := model.ColType.String
- actual := metric.ColType(slice)
+ actual := metric.GetColType(slice)
require.Equal(t, expected, actual)
})
@@ -84,9 +84,9 @@ func TestColType(t *testing.T) {
t.Parallel()
slice := []interface{}{nil, nil, nil}
- expected := model.ValueType.Undefined
+ expected := model.ColType.Undefined
- actual := metric.ColType(slice)
+ actual := metric.GetColType(slice)
require.Equal(t, expected, actual)
})
}
diff --git a/pkg/metric/metricbool.go b/pkg/metric/metricbool.go
index fb38bfe..a3a38ed 100644
--- a/pkg/metric/metricbool.go
+++ b/pkg/metric/metricbool.go
@@ -2,18 +2,18 @@
//
// This file is part of RIMO.
//
-// RIMO is free software: you can redistribute it and/or modify
+// rimo is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
-// RIMO is distributed in the hope that it will be useful,
+// rimo is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
-// along with RIMO. If not, see .
+// along with rimo. If not, see <https://www.gnu.org/licenses/>.
package metric
diff --git a/pkg/model/base.go b/pkg/model/base.go
new file mode 100644
index 0000000..49eed63
--- /dev/null
+++ b/pkg/model/base.go
@@ -0,0 +1,50 @@
+// Copyright (C) 2023 CGI France
+//
+// This file is part of RIMO.
+//
+// RIMO is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RIMO is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RIMO. If not, see <https://www.gnu.org/licenses/>.
+
+package model
+
+import (
+ "fmt"
+ "reflect"
+
+ "github.com/hexops/valast"
+)
+
+// RIMO YAML structure.
+type (
+ Base struct {
+ Name string `json:"database" jsonschema:"required" yaml:"database"`
+ // Tables should be map[string][]Column
+ Tables []Table `json:"tables" jsonschema:"required" yaml:"tables"`
+ }
+
+ Table struct {
+ Name string `json:"name" jsonschema:"required" yaml:"name"`
+ Columns []Column `json:"columns" jsonschema:"required" yaml:"columns"`
+ }
+)
+
+// Should be improved with more detail about difference.
+func SameBase(base1, base2 *Base) (bool, string) {
+ if !reflect.DeepEqual(base1, base2) {
+ msg := fmt.Sprintf("base is different : %s \n \n %s", valast.String(base1), valast.String(base2))
+
+ return false, msg
+ }
+
+ return true, ""
+}
diff --git a/pkg/model/column.go b/pkg/model/column.go
new file mode 100644
index 0000000..223bcb0
--- /dev/null
+++ b/pkg/model/column.go
@@ -0,0 +1,44 @@
+// Copyright (C) 2023 CGI France
+//
+// This file is part of RIMO.
+//
+// RIMO is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RIMO is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RIMO. If not, see <https://www.gnu.org/licenses/>.
+
+package model
+
+const (
+ SampleSize int = 5
+ MostFrequentLenSize int = 5
+ MostFrequentSampleSize int = 5
+ LeastFrequentLenSize int = 5
+ LeastFrequentSampleSize int = 5
+)
+
+type (
+	// Column describes one analysed column: identity, user-supplied configuration and computed metrics.
+	Column struct {
+		Name string    `json:"name" jsonschema:"required" yaml:"name"`
+		Type ValueType `json:"type" jsonschema:"required" validate:"oneof=string numeric bool" yaml:"type"` //nolint:lll
+		// The 3 following parameter should be part of a Config struct
+		Concept      string   `json:"concept" jsonschema:"required" yaml:"concept"`
+		Constraint   []string `json:"constraint" jsonschema:"required" yaml:"constraint"`
+		Confidential *bool    `json:"confidential" jsonschema:"required" yaml:"confidential"`
+
+		MainMetric GenericMetric `json:"mainMetric" jsonschema:"required" yaml:"mainMetric"`
+
+		StringMetric  StringMetric  `json:"stringMetric,omitempty" jsonschema:"required" yaml:"stringMetric,omitempty"`
+		NumericMetric NumericMetric `json:"numericMetric,omitempty" jsonschema:"required" yaml:"numericMetric,omitempty"`
+		BoolMetric    BoolMetric    `json:"boolMetric,omitempty" jsonschema:"required" yaml:"boolMetric,omitempty"`
+	}
+)
diff --git a/pkg/model/metric.go b/pkg/model/metric.go
new file mode 100644
index 0000000..b9c8f5d
--- /dev/null
+++ b/pkg/model/metric.go
@@ -0,0 +1,64 @@
+// Copyright (C) 2023 CGI France
+//
+// This file is part of RIMO.
+//
+// RIMO is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RIMO is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RIMO. If not, see <https://www.gnu.org/licenses/>.
+
+package model
+
+// RIMO YAML metrics.
+type (
+ GenericMetric struct {
+ Count int `json:"count" jsonschema:"required" yaml:"count"`
+ Empty int `json:"empty" jsonschema:"required" yaml:"empty"`
+ Unique int `json:"unique" jsonschema:"required" yaml:"unique"`
+ Sample []interface{} `json:"sample" jsonschema:"required" yaml:"sample"`
+ }
+
+ StringMetric struct {
+ MostFreqLen []LenFreq `json:"mostFrequentLen" jsonschema:"required" yaml:"mostFrequentLen"`
+ LeastFreqLen []LenFreq `json:"leastFrequentLen" jsonschema:"required" yaml:"leastFrequentLen"`
+ }
+
+ LenFreq struct {
+ Length int `json:"length" jsonschema:"required" yaml:"length"`
+ Freq float64 `json:"freq" jsonschema:"required" yaml:"freq"`
+ Sample []string `json:"sample" jsonschema:"required" yaml:"sample"`
+ }
+
+ NumericMetric struct {
+ Min float64 `json:"min" jsonschema:"required" yaml:"min"`
+ Max float64 `json:"max" jsonschema:"required" yaml:"max"`
+ Mean float64 `json:"mean" jsonschema:"required" yaml:"mean"`
+ }
+
+ BoolMetric struct {
+ TrueRatio float64 `json:"trueRatio" jsonschema:"required" yaml:"trueRatio"`
+ }
+)
+
+// Type that a column can be.
+type ValueType string
+
+var ColType = struct { //nolint:gochecknoglobals
+ String ValueType
+ Numeric ValueType
+ Bool ValueType
+ Undefined ValueType
+}{
+ String: "string",
+ Numeric: "numeric",
+ Bool: "bool",
+ Undefined: "undefined",
+}
diff --git a/pkg/model/model.go b/pkg/model/model.go
deleted file mode 100644
index 0872006..0000000
--- a/pkg/model/model.go
+++ /dev/null
@@ -1,125 +0,0 @@
-// Copyright (C) 2023 CGI France
-//
-// This file is part of RIMO.
-//
-// RIMO is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// RIMO is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with RIMO. If not, see .
-
-package model
-
-import (
- "encoding/json"
- "fmt"
- "sort"
-
- "github.com/invopop/jsonschema"
-)
-
-type RIMOType string
-
-const (
- SampleSize int = 5
- MostFrequentLenSize int = 5
- MostFrequentSampleSize int = 5
- LeastFrequentLenSize int = 5
- LeastFrequentSampleSize int = 5
-)
-
-var ValueType = struct { //nolint:gochecknoglobals
- String RIMOType
- Numeric RIMOType
- Bool RIMOType
- Undefined RIMOType
-}{
- String: "string",
- Numeric: "numeric",
- Bool: "bool",
- Undefined: "undefined",
-}
-
-// RIMO YAML structure.
-type (
- Base struct {
- Name string `json:"database" jsonschema:"required" yaml:"database"`
- Tables []Table `json:"tables" jsonschema:"required" yaml:"tables"`
- }
-
- Table struct {
- Name string `json:"name" jsonschema:"required" yaml:"name"`
- Columns []Column `json:"columns" jsonschema:"required" yaml:"columns"`
- }
-
- Column struct {
- Name string `json:"name" jsonschema:"required" yaml:"name"`
- Type RIMOType `json:"type" jsonschema:"required" validate:"oneof=string numeric boolean" yaml:"type"` //nolint:lll
- Concept string `json:"concept" jsonschema:"required" yaml:"concept"`
- Constraint []string `json:"constraint" jsonschema:"required" yaml:"constraint"`
- Confidential *bool `json:"confidential" jsonschema:"required" yaml:"confidential"`
- MainMetric GenericMetric `json:"mainMetric" jsonschema:"required" yaml:"mainMetric"`
-
- StringMetric StringMetric `json:"stringMetric,omitempty" jsonschema:"required" yaml:"stringMetric,omitempty"`
- NumericMetric NumericMetric `json:"numericMetric,omitempty" jsonschema:"required" yaml:"numericMetric,omitempty"`
- BoolMetric BoolMetric `json:"boolMetric,omitempty" jsonschema:"required" yaml:"boolMetric,omitempty"`
- }
-)
-
-// RIMO YAML metrics.
-type (
- GenericMetric struct {
- Count int `json:"count" jsonschema:"required" yaml:"count"`
- Empty int `json:"empty" jsonschema:"required" yaml:"empty"`
- Unique int `json:"unique" jsonschema:"required" yaml:"unique"`
- Sample []interface{} `json:"sample" jsonschema:"required" yaml:"sample"`
- }
- StringMetric struct {
- MostFreqLen []LenFreq `json:"mostFrequentLen" jsonschema:"required" yaml:"mostFrequentLen"`
- LeastFreqLen []LenFreq `json:"leastFrequentLen" jsonschema:"required" yaml:"leastFrequentLen"`
- }
-
- LenFreq struct {
- Length int `json:"length" jsonschema:"required" yaml:"length"`
- Freq float64 `json:"freq" jsonschema:"required" yaml:"freq"`
- Sample []string `json:"sample" jsonschema:"required" yaml:"sample"`
- }
-
- NumericMetric struct {
- Min float64 `json:"min" jsonschema:"required" yaml:"min"`
- Max float64 `json:"max" jsonschema:"required" yaml:"max"`
- Mean float64 `json:"mean" jsonschema:"required" yaml:"mean"`
- }
-
- BoolMetric struct {
- TrueRatio float64 `json:"trueRatio" jsonschema:"required" yaml:"trueRatio"`
- }
-)
-
-func (base *Base) SortBase() {
- for _, table := range base.Tables {
- sort.Slice(table.Columns, func(i, j int) bool {
- return table.Columns[i].Name < table.Columns[j].Name
- })
- }
-
- sort.Slice(base.Tables, func(i, j int) bool {
- return base.Tables[i].Name < base.Tables[j].Name
- })
-}
-
-func GetJSONSchema() (string, error) {
- resBytes, err := json.MarshalIndent(jsonschema.Reflect(&Base{}), "", " ") //nolint:exhaustruct
- if err != nil {
- return "", fmt.Errorf("couldn't unmarshall Base in JSON : %w", err)
- }
-
- return string(resBytes), nil
-}
diff --git a/pkg/model/utils.go b/pkg/model/utils.go
new file mode 100644
index 0000000..f80a6e2
--- /dev/null
+++ b/pkg/model/utils.go
@@ -0,0 +1,123 @@
+// Copyright (C) 2023 CGI France
+//
+// This file is part of RIMO.
+//
+// RIMO is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RIMO is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RIMO. If not, see <https://www.gnu.org/licenses/>.
+
+package model
+
+import (
+ "encoding/json"
+ "errors"
+ "fmt"
+ "os"
+ "sort"
+
+ "github.com/invopop/jsonschema"
+ "gopkg.in/yaml.v3"
+)
+
+func GetJSONSchema() (string, error) {
+ resBytes, err := json.MarshalIndent(jsonschema.Reflect(&Base{}), "", " ") //nolint:exhaustruct
+ if err != nil {
+ return "", fmt.Errorf("couldn't unmarshall Base in JSON : %w", err)
+ }
+
+ return string(resBytes), nil
+}
+
+func NewBase(name string) *Base {
+ return &Base{
+ Name: name,
+ Tables: make([]Table, 0),
+ }
+}
+
+var ErrBaseFormat = errors.New("error while decoding yaml file in a Base struct")
+
+// LoadBase decodes the YAML file at path into a Base, returning ErrBaseFormat when decoding fails.
+func LoadBase(path string) (*Base, error) {
+	file, err := os.Open(path)
+	if err != nil {
+		return nil, fmt.Errorf("error while opening file: %w", err)
+	}
+
+	defer file.Close()
+
+	decoder := yaml.NewDecoder(file)
+
+	var base Base
+
+	err = decoder.Decode(&base)
+	if err != nil {
+		return nil, ErrBaseFormat
+	}
+
+	return &base, nil
+}
+
+func RemoveSampleFromBase(base *Base) {
+ for tableI, table := range base.Tables {
+ for columnJ, column := range table.Columns {
+ column.MainMetric.Sample = nil
+
+ if column.Type == ColType.String {
+ for freqLen := range column.StringMetric.MostFreqLen {
+ column.StringMetric.MostFreqLen[freqLen].Sample = nil
+ }
+
+ for freqLen := range column.StringMetric.LeastFreqLen {
+ column.StringMetric.LeastFreqLen[freqLen].Sample = nil
+ }
+ }
+
+ base.Tables[tableI].Columns[columnJ] = column
+ }
+ }
+}
+
+func (base *Base) SortBase() {
+ for _, table := range base.Tables {
+ sort.Slice(table.Columns, func(i, j int) bool {
+ return table.Columns[i].Name < table.Columns[j].Name
+ })
+ }
+
+ sort.Slice(base.Tables, func(i, j int) bool {
+ return base.Tables[i].Name < base.Tables[j].Name
+ })
+}
+
+func (base *Base) AddColumn(column Column, tableName string) {
+ mapTableName := make(map[string]int)
+ for index, table := range base.Tables {
+ mapTableName[table.Name] = index
+ }
+
+ if index, ok := mapTableName[tableName]; ok {
+ // If the table exists, append the column to the table
+ base.Tables[index].Columns = append(base.Tables[index].Columns, column)
+ } else {
+ // If the table does not exist, create a new table and add it to the base
+ table := Table{
+ Name: tableName,
+ Columns: []Column{column},
+ }
+ base.Tables = append(base.Tables, table)
+ }
+}
+
+// If the table does not exist, create a new table and add it to the base
+// table := Table{Name: tableName, Columns: []Column{column}}
+// base.Tables = append(base.Tables, table)
diff --git a/pkg/model/utils_test.go b/pkg/model/utils_test.go
new file mode 100644
index 0000000..16c619e
--- /dev/null
+++ b/pkg/model/utils_test.go
@@ -0,0 +1,66 @@
+// Copyright (C) 2023 CGI France
+//
+// This file is part of RIMO.
+//
+// RIMO is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RIMO is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RIMO. If not, see <https://www.gnu.org/licenses/>.
+
+package model_test
+
+import (
+ "testing"
+
+ "github.com/cgi-fr/rimo/pkg/model"
+)
+
+func TestAddColumn(t *testing.T) {
+ t.Parallel()
+
+ base := model.NewBase("test_base")
+
+ column := model.Column{ //nolint:exhaustruct
+ Name: "test_column",
+ Type: model.ColType.String,
+ Concept: "test_concept",
+ }
+
+ tableName := "test_table"
+
+ base.AddColumn(column, tableName)
+
+ // fmt.Print(valast.String(base))
+
+ if len(base.Tables) != 1 {
+ t.Errorf("expected 1 table, got %d", len(base.Tables))
+ }
+
+ if base.Tables[0].Name != tableName {
+ t.Errorf("expected table name %q, got %q", tableName, base.Tables[0].Name)
+ }
+
+ if len(base.Tables[0].Columns) != 1 {
+ t.Errorf("expected 1 column, got %d", len(base.Tables[0].Columns))
+ }
+
+ if base.Tables[0].Columns[0].Name != column.Name {
+ t.Errorf("expected column name %q, got %q", column.Name, base.Tables[0].Columns[0].Name)
+ }
+
+ if base.Tables[0].Columns[0].Type != column.Type {
+ t.Errorf("expected column type %q, got %q", column.Type, base.Tables[0].Columns[0].Type)
+ }
+
+ if base.Tables[0].Columns[0].Concept != column.Concept {
+ t.Errorf("expected column concept %q, got %q", column.Concept, base.Tables[0].Columns[0].Concept)
+ }
+}
diff --git a/pkg/io/export.go b/pkg/rimo/driven.go
similarity index 60%
rename from pkg/io/export.go
rename to pkg/rimo/driven.go
index 764ee2b..1928b2a 100644
--- a/pkg/io/export.go
+++ b/pkg/rimo/driven.go
@@ -15,32 +15,18 @@
// You should have received a copy of the GNU General Public License
// along with RIMO. If not, see .
-package io
+package rimo
import (
- "fmt"
- "os"
-
"github.com/cgi-fr/rimo/pkg/model"
- "gopkg.in/yaml.v3"
)
-func Export(base model.Base, outputPath string) error {
- // Create output file.
- outputFile, err := os.Create(outputPath)
- if err != nil {
- return fmt.Errorf("failed to create output file: %w", err)
- }
- defer outputFile.Close()
-
- // Encode Base to YAML.
- encoder := yaml.NewEncoder(outputFile)
- defer encoder.Close()
-
- err = encoder.Encode(base)
- if err != nil {
- return fmt.Errorf("failed to encode Base to YAML: %w", err)
- }
+type Reader interface {
+ BaseName() string
+ Next() bool // iterates over the columns.
+ Value() ([]interface{}, string, string, error) // colValues, colName, tableName
+}
- return nil
+type Writer interface {
+ Export(base *model.Base) error
}
diff --git a/pkg/rimo/driven_test.go b/pkg/rimo/driven_test.go
new file mode 100644
index 0000000..da635c9
--- /dev/null
+++ b/pkg/rimo/driven_test.go
@@ -0,0 +1,108 @@
+// Copyright (C) 2023 CGI France
+//
+// This file is part of RIMO.
+//
+// RIMO is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RIMO is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RIMO. If not, see <https://www.gnu.org/licenses/>.
+
+package rimo_test
+
+import (
+ "log"
+ "math"
+ "testing"
+
+ "github.com/cgi-fr/rimo/pkg/model"
+ "github.com/cgi-fr/rimo/pkg/rimo"
+)
+
+// TESTS
+
+func TestTestInterface(t *testing.T) {
+ t.Parallel()
+
+ var _ rimo.Reader = (*TestReader)(nil)
+
+ var _ rimo.Writer = (*TestWriter)(nil)
+}
+
+// TestReader implementation
+
+type colInput struct {
+ ColName string
+ ColValues []interface{}
+}
+
+type TestReader struct {
+ baseName string
+ data []colInput
+ tableNames []string // Next() will progressively change tableName
+ // internal
+ index int
+ currentValues []interface{}
+ currentColName string
+ currentTableName string
+}
+
+func (r *TestReader) BaseName() string {
+ return r.baseName
+}
+
+func (r *TestReader) Next() bool {
+ if r.index == len(r.data) {
+ log.Println("End of data")
+
+ return false
+ }
+
+ // update tableName
+ if len(r.tableNames) == len(r.data) {
+ r.currentTableName = r.tableNames[r.index]
+ } else {
+ // use a percentage to determine the table name to use from the list
+ percentageComplete := float64(r.index) / float64(len(r.data))
+ expectedTableIndex := percentageComplete * float64(len(r.tableNames))
+ roundedTableIndex := math.Floor(expectedTableIndex)
+ tableNameIndex := int(roundedTableIndex)
+
+ r.currentTableName = r.tableNames[tableNameIndex]
+ }
+
+ r.currentColName = r.data[r.index].ColName
+ r.currentValues = r.data[r.index].ColValues
+ r.index++
+
+ return true
+}
+
+func (r *TestReader) Value() ([]interface{}, string, string, error) { //nolint:wsl
+ // log.Printf("Processing %s column in %s table", r.currentTableName, r.currentColName)
+
+ return r.currentValues, r.currentColName, r.currentTableName, nil
+}
+
+// TestWriter implementation
+
+type TestWriter struct {
+ base model.Base
+}
+
+func (w *TestWriter) Export(base *model.Base) error {
+ w.base = *base
+
+ return nil
+}
+
+func (w *TestWriter) Base() *model.Base {
+ return &w.base
+}
diff --git a/pkg/rimo/driver.go b/pkg/rimo/driver.go
new file mode 100644
index 0000000..e626bbc
--- /dev/null
+++ b/pkg/rimo/driver.go
@@ -0,0 +1,66 @@
+// Copyright (C) 2023 CGI France
+//
+// This file is part of RIMO.
+//
+// RIMO is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RIMO is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RIMO. If not, see <https://www.gnu.org/licenses/>.
+
+package rimo
+
+import (
+ "fmt"
+
+ "github.com/cgi-fr/rimo/pkg/metric"
+ "github.com/cgi-fr/rimo/pkg/model"
+
+ "github.com/rs/zerolog/log"
+)
+
+func AnalyseBase(reader Reader, writer Writer) error {
+ // log.Logger = zerolog.New(os.Stdout).Level(zerolog.DebugLevel)
+ baseName := reader.BaseName()
+
+ // log.Debug().Msgf("Processing [%s base]", baseName)
+
+ base := model.NewBase(baseName)
+
+ for reader.Next() { // iterates column by column
+ colValues, colName, tableName, err := reader.Value()
+ if err != nil {
+ return fmt.Errorf("failed to get column value : %w", err)
+ }
+
+ column, err := metric.ComputeMetric(colName, colValues)
+ if err != nil {
+ return fmt.Errorf("failed to compute column : %w", err)
+ }
+
+ log.Debug().Msgf("Processing [%s base][%s table][%s column]", baseName, tableName, column.Name)
+ // log.Debug().Msg(valast.String(column))
+
+ base.AddColumn(column, tableName)
+ }
+
+ base.SortBase()
+
+ // log.Debug().Msg("---------- Finish processing base :")
+ // log.Debug().Msg(valast.String(*base))
+ // log.Debug().Msg("----------")
+
+ err := writer.Export(base)
+ if err != nil {
+ return fmt.Errorf("failed to export base : %w", err)
+ }
+
+ return nil
+}
diff --git a/pkg/rimo/driver_test.go b/pkg/rimo/driver_test.go
new file mode 100644
index 0000000..186e18f
--- /dev/null
+++ b/pkg/rimo/driver_test.go
@@ -0,0 +1,175 @@
+// Copyright (C) 2023 CGI France
+//
+// This file is part of RIMO.
+//
+// RIMO is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RIMO is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RIMO. If not, see <https://www.gnu.org/licenses/>.
+
+package rimo_test
+
+import (
+ "fmt"
+ "path/filepath"
+ "testing"
+ "time"
+
+ "github.com/cgi-fr/rimo/internal/infra"
+ "github.com/cgi-fr/rimo/pkg/model"
+ "github.com/cgi-fr/rimo/pkg/rimo"
+
+ "github.com/hexops/valast"
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+)
+
+// Fixture locations for the tests below, which run the Analyse pipeline with FilesReader and TestWriter and compare with the expected result.
+const (
+ dataDir = "../../testdata/" // root of the test fixtures, relative to this package; used by BenchmarkAnalyseInterface
+ inputName = "data_input.jsonl" // input file name that getTestCase appends to a data folder
+ outputName = "interface_data_output.yaml" // NOTE(review): not referenced anywhere in this file
+ expectedName = "data_expected.yaml" // expected-result file name that getTestCase appends to a data folder
+)
+
+type testCase struct { // one fixture scenario, built by getTestCase and consumed by TestPipeline
+ name string // sub-test name passed to t.Run; getTestCase sets it to the base name of the data folder
+ inputPath string // path of the input file given to infra.FilesReaderFactory
+ expectedPath string // path of the expected base loaded with model.LoadBase
+}
+
+// getTestCase builds the testCase for the fixtures stored in dataFolder, named after the folder's base name.
+func getTestCase(dataFolder string) testCase {
+	input := filepath.Join(dataFolder, inputName)
+	expected := filepath.Join(dataFolder, expectedName)
+
+	return testCase{name: filepath.Base(dataFolder), inputPath: input, expectedPath: expected}
+}
+
+// PIPELINE TESTS
+
+// TestManualPipeline runs rimo.AnalyseBase on hand-written columns and logs the
+// resulting base. Note: numeric values should be converted to float64.
+func TestManualPipeline(t *testing.T) {
+	t.Parallel()
+
+	// Set up TestReader
+	baseName := "databaseName"
+	tableNames := []string{"tableTest"}
+	testInput := []colInput{
+		{
+			ColName:   "string",
+			ColValues: []interface{}{"val1", "val2", "val3"},
+		},
+		{
+			ColName:   "col2",
+			ColValues: []interface{}{true, false, nil},
+		},
+		{
+			ColName:   "col9",
+			ColValues: []interface{}{float64(31), float64(29), float64(42)},
+		},
+		{
+			ColName:   "empty",
+			ColValues: []interface{}{nil, nil, nil},
+		},
+	}
+
+	testReader := TestReader{ //nolint:exhaustruct
+		baseName:   baseName,
+		tableNames: tableNames,
+		data:       testInput,
+		index:      0,
+	}
+
+	testWriter := TestWriter{} //nolint:exhaustruct
+
+	// Stop on failure: the base dereferenced below is presumably nil when Export was never reached.
+	err := rimo.AnalyseBase(&testReader, &testWriter)
+	require.NoError(t, err)
+
+	t.Logf("Base returned : %s", valast.String(*testWriter.Base()))
+}
+
+// TestPipeline ensures that the pipeline produces the same base as expected:
+// each data folder's input is analysed through a FilesReader and a TestWriter,
+// then compared (samples removed) with the base loaded from its expected file.
+func TestPipeline(t *testing.T) {
+	t.Parallel()
+
+	testCases := []testCase{
+		getTestCase(filepath.Join(dataDir, "data1")),
+		// getTestCase(filepath.Join(dataDir, "data2")),
+	}
+
+	for _, testCase := range testCases {
+		testCase := testCase // capture range variable
+		t.Run(testCase.name, func(t *testing.T) {
+			t.Parallel()
+
+			// Actual base. require (not assert) is used wherever a failure would
+			// leave a value that the following steps cannot safely use.
+			reader, err := infra.FilesReaderFactory([]string{testCase.inputPath})
+			require.NoError(t, err)
+
+			writer := &TestWriter{} //nolint:exhaustruct
+
+			err = rimo.AnalyseBase(reader, writer)
+			require.NoError(t, err)
+
+			actualBase := writer.Base()
+			require.NotNil(t, actualBase)
+
+			// Expected base
+			expectedBase, err := model.LoadBase(testCase.expectedPath)
+			require.NoError(t, err)
+
+			// Remove samples from both sides before comparing.
+			model.RemoveSampleFromBase(expectedBase)
+			model.RemoveSampleFromBase(actualBase)
+
+			t.Logf("Actual base : %s", valast.String(*actualBase))
+
+			equal, diff := model.SameBase(expectedBase, actualBase)
+			assert.True(t, equal, "Base are not equal:\n%s", diff)
+		})
+	}
+}
+
+// Benchmark (same as previous analyse_test.go benchmark).
+func BenchmarkAnalyseInterface(b *testing.B) {
+ for _, numLines := range []int{100, 1000, 10000, 100000} {
+ inputPath := filepath.Join(dataDir, fmt.Sprintf("benchmark/mixed/%d_input.jsonl", numLines))
+ inputList := []string{inputPath}
+ outputPath := filepath.Join(dataDir, fmt.Sprintf("benchmark/mixed/%dinterface_output.yaml", numLines))
+
+ b.Run(fmt.Sprintf("numLines=%d", numLines), func(b *testing.B) {
+ startTime := time.Now()
+
+ reader, err := infra.FilesReaderFactory(inputList)
+ require.NoError(b, err)
+
+ writer, err := infra.YAMLWriterFactory(outputPath)
+ require.NoError(b, err)
+
+ b.ResetTimer()
+ for n := 0; n < b.N; n++ {
+ err := rimo.AnalyseBase(reader, writer)
+ require.NoError(b, err)
+ }
+ b.StopTimer()
+
+ elapsed := time.Since(startTime)
+ linesPerSecond := float64(numLines*b.N) / elapsed.Seconds()
+ b.ReportMetric(linesPerSecond, "lines/s")
+ })
+ }
+}
diff --git a/test/suites/testdata/data1/output/data.yaml b/test/suites/testdata/data1/output/data.yaml
deleted file mode 100644
index 1bb32f7..0000000
--- a/test/suites/testdata/data1/output/data.yaml
+++ /dev/null
@@ -1,144 +0,0 @@
-database: data
-tables:
- - name: input
- columns:
- - name: address
- type: string
- concept: ""
- constraint: []
- confidential: null
- mainMetric:
- count: 10
- empty: 0
- unique: 10
- sample:
- - PSC 4713, Box 9649 APO AA 43433
- - 9038 Frye Ramp South Cheryltown, CT 54262
- - 25545 Cole Court Newtonfurt, KY 13882
- - 06210 David Court South Kimberly, IL 10236
- - 536 Robinson Estates Austinside, NV 69535
- stringMetric:
- mostFrequentLen:
- - length: 42
- freq: 0.3
- sample:
- - 06210 David Court South Kimberly, IL 10236
- - 0301 Amy Grove Apt. 325 Janefort, MA 84102
- - 095 Jennifer Turnpike Castrobury, NY 98111
- - length: 41
- freq: 0.2
- sample:
- - 536 Robinson Estates Austinside, NV 69535
- - 9038 Frye Ramp South Cheryltown, CT 54262
- - length: 31
- freq: 0.1
- sample:
- - PSC 4713, Box 9649 APO AA 43433
- - length: 37
- freq: 0.1
- sample:
- - 25545 Cole Court Newtonfurt, KY 13882
- leastFrequentLen:
- - length: 52
- freq: 0.1
- sample:
- - 275 Stone Ridges Suite 885 East Aliciafurt, MH 15407
- - length: 45
- freq: 0.1
- sample:
- - 2035 Simmons Islands Heatherchester, IN 46152
- - length: 43
- freq: 0.1
- sample:
- - 38432 Moreno Turnpike Garrettland, TN 72939
- - name: age
- type: numeric
- concept: ""
- constraint: []
- confidential: null
- mainMetric:
- count: 10
- empty: 0
- unique: 9
- sample:
- - 35
- - 73
- - 73
- - 80
- - 73
- numericMetric:
- min: 29
- max: 95
- mean: 57.7
- - name: date
- type: string
- concept: ""
- constraint: []
- confidential: null
- mainMetric:
- count: 10
- empty: 0
- unique: 10
- sample:
- - "2003-10-11"
- - "2022-04-23"
- - "2001-08-23"
- - "2001-08-23"
- - "2003-10-11"
- stringMetric:
- mostFrequentLen:
- - length: 10
- freq: 1
- sample:
- - "2022-04-23"
- - "2004-07-04"
- - "2004-07-04"
- - "2005-05-10"
- - "2014-07-24"
- leastFrequentLen: []
- - name: phone
- type: string
- concept: ""
- constraint: []
- confidential: null
- mainMetric:
- count: 10
- empty: 0
- unique: 10
- sample:
- - 001-845-854-2110
- - +1-407-997-8293x68130
- - (517)819-3454
- - 001-845-854-2110
- - 260-587-0590
- stringMetric:
- mostFrequentLen:
- - length: 16
- freq: 0.4
- sample:
- - 001-845-854-2110
- - 001-533-758-7269
- - 001-958-985-3039
- - 001-866-271-0116
- - length: 12
- freq: 0.2
- sample:
- - 828-755-3826
- - 260-587-0590
- - length: 10
- freq: 0.1
- sample:
- - "7795418893"
- leastFrequentLen:
- - length: 21
- freq: 0.1
- sample:
- - +1-407-997-8293x68130
- - length: 18
- freq: 0.1
- sample:
- - (330)616-7639x7810
- - length: 13
- freq: 0.1
- sample:
- - (517)819-3454
diff --git a/testdata/data0/data_expected.yaml b/testdata/data0/data_expected.yaml
new file mode 100644
index 0000000..78ced89
--- /dev/null
+++ b/testdata/data0/data_expected.yaml
@@ -0,0 +1,115 @@
+database: data
+tables:
+ - name: input
+ columns:
+ - name: address
+ type: string
+ concept: ""
+ constraint: []
+ confidential: null
+ mainMetric:
+ count: 3
+ empty: 0
+ unique: 3
+ sample:
+ - PSC
+ - "095"
+ - "06210"
+ stringMetric:
+ mostFrequentLen:
+ - length: 3
+ freq: 0.6666666666666666
+ sample:
+ - PSC
+ - "095"
+ leastFrequentLen:
+ - length: 5
+ freq: 0.3333333333333333
+ sample:
+ - "06210"
+ - name: age
+ type: numeric
+ concept: ""
+ constraint: []
+ confidential: null
+ mainMetric:
+ count: 3
+ empty: 2
+ unique: 1
+ sample:
+ - 61
+ numericMetric:
+ min: 61
+ max: 61
+ mean: 61
+ - name: empty
+ type: undefined
+ concept: ""
+ constraint: []
+ confidential: null
+ mainMetric:
+ count: 3
+ empty: 3
+ unique: 0
+ sample: []
+ - name: major
+ type: bool
+ concept: ""
+ constraint: []
+ confidential: null
+ mainMetric:
+ count: 3
+ empty: 0
+ unique: 2
+ sample:
+ - true
+ - false
+ boolMetric:
+ trueRatio: 0.6666666666666666
+ - name: input2
+ columns:
+ - name: string
+ type: string
+ concept: ""
+ constraint: []
+ confidential: null
+ mainMetric:
+ count: 4
+ empty: 0
+ unique: 4
+ sample:
+ - Hello World
+ - Hello World2
+ - Hello World3
+ - Hello World5
+ stringMetric:
+ mostFrequentLen:
+ - length: 12
+ freq: 0.75
+ sample:
+ - Hello World2
+ - Hello World3
+ - Hello World5
+ leastFrequentLen:
+ - length: 11
+ freq: 0.25
+ sample:
+ - Hello World
+ - name: time
+ type: string
+ concept: ""
+ constraint: []
+ confidential: null
+ mainMetric:
+ count: 4
+ empty: 0
+ unique: 1
+ sample:
+ - "20:03"
+ stringMetric:
+ mostFrequentLen:
+ - length: 5
+ freq: 1
+ sample:
+ - "20:03"
+ leastFrequentLen: []
diff --git a/testdata/data0/data_input.jsonl b/testdata/data0/data_input.jsonl
new file mode 100644
index 0000000..9157492
--- /dev/null
+++ b/testdata/data0/data_input.jsonl
@@ -0,0 +1,3 @@
+{"address": "PSC", "age": null, "major": true, "empty": null}
+{"address": "095", "age": null, "major": false, "empty": null}
+{"address": "06210", "age": 61, "major": true, "empty": null}
diff --git a/testdata/data0/data_input2.jsonl b/testdata/data0/data_input2.jsonl
new file mode 100644
index 0000000..6af2498
--- /dev/null
+++ b/testdata/data0/data_input2.jsonl
@@ -0,0 +1,4 @@
+{"string" : "Hello World", "time" : "20:03"}
+{"string" : "Hello World2", "time" : "20:03"}
+{"string" : "Hello World3", "time" : "20:03"}
+{"string" : "Hello World5", "time" : "20:03"}