diff --git a/CHANGELOG.md b/CHANGELOG.md index 792ec12..60bc5a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,10 @@ Types of changes - `Fixed` for any bug fixes. - `Security` in case of vulnerabilities. +## [0.4.0] + +- `Added` `Open` and `Close` methods to column/value reader interfaces. + ## [0.3.0] - `Added` moved `min` and `max` to the main metric. diff --git a/internal/infra/filesReader.go b/internal/infra/filesReader.go index 9503edd..c0d8245 100644 --- a/internal/infra/filesReader.go +++ b/internal/infra/filesReader.go @@ -51,6 +51,9 @@ func NewJSONLFolderReader(folderpath string) (*JSONLFolderReader, error) { }, nil } +func (r *JSONLFolderReader) Open() error { return nil } +func (r *JSONLFolderReader) Close() error { return nil } + func (r *JSONLFolderReader) BaseName() string { return r.basename } @@ -155,6 +158,9 @@ func NewJSONLColReader(table, column string, decoder *json.Decoder) *JSONLColRea } } +func (cr *JSONLColReader) Open() error { return nil } +func (cr *JSONLColReader) Close() error { return nil } + func (cr *JSONLColReader) ColName() string { return cr.column } diff --git a/pkg/rimo/driven.go b/pkg/rimo/driven.go index fb79719..098b5b4 100644 --- a/pkg/rimo/driven.go +++ b/pkg/rimo/driven.go @@ -19,7 +19,13 @@ package rimo import "github.com/cgi-fr/rimo/pkg/model" +type Resource interface { + Open() error + Close() error +} + type ColReader interface { + Resource ColName() string TableName() string Next() bool @@ -27,6 +33,7 @@ type ColReader interface { } type Reader interface { + Resource BaseName() string Next() bool Col() (ColReader, error) diff --git a/pkg/rimo/driver.go b/pkg/rimo/driver.go index 6e28979..98f71ff 100644 --- a/pkg/rimo/driver.go +++ b/pkg/rimo/driver.go @@ -32,102 +32,130 @@ type Driver struct { Distinct bool } -//nolint:funlen,cyclop,gocognit -func (d Driver) AnalyseBase(reader Reader, writer Writer) error { +func (d Driver) AnalyseBase(reader Reader, writer Writer) (err error) { + if err := reader.Open(); 
err != nil { + return fmt.Errorf("failed to open column reader : %w", err) + } + + defer func() { + if localerr := reader.Close(); localerr != nil && err == nil { // surface a close failure only when no earlier error is pending, never overwrite a real error + err = localerr + } + }() + baseName := reader.BaseName() base := model.NewBase(baseName) tables := map[string]model.Table{} for reader.Next() { // itère colonne par colonne - valreader, err := reader.Col() - if err != nil { - return fmt.Errorf("failed to get column reader : %w", err) + if err := d.analyse(reader, tables); err != nil { + return err } + } - nilcount := 0 + for _, table := range tables { + sort.SliceStable(table.Columns, func(i, j int) bool { + return table.Columns[i].Name < table.Columns[j].Name + }) - for valreader.Next() { - val, err := valreader.Value() - if err != nil { - return fmt.Errorf("failed to read value : %w", err) - } + base.Tables = append(base.Tables, table) + } - log.Debug().Msgf("Processing [%s base][%s table][%s column]", baseName, valreader.TableName(), valreader.ColName()) + sort.SliceStable(base.Tables, func(i, j int) bool { + return base.Tables[i].Name < base.Tables[j].Name + }) - switch valtyped := val.(type) { - case string: - col, err := d.AnalyseString(nilcount, valtyped, valreader) - if err != nil { - return fmt.Errorf("failed to analyse column : %w", err) - } + err = writer.Export(base) + if err != nil { + return fmt.Errorf("failed to export base : %w", err) + } - table, exists := tables[valreader.TableName()] - if !exists { - table = model.Table{ - Name: valreader.TableName(), - Columns: []model.Column{}, - } - } + return nil +} - table.Columns = append(table.Columns, col) +//nolint:funlen,cyclop +func (d Driver) analyse(reader Reader, tables map[string]model.Table) (err error) { + valreader, err := reader.Col() + if err != nil { + return fmt.Errorf("failed to get column reader : %w", err) + } - tables[valreader.TableName()] = table - case float64, float32, int, int8, int16, int32, int64, uint, uint8, uint16, uint32, uint64: - col, err := d.AnalyseNumeric(nilcount, valtyped, 
valreader) - if err != nil { - return fmt.Errorf("failed to analyse column : %w", err) - } + if err := valreader.Open(); err != nil { + return fmt.Errorf("failed to open value reader : %w", err) + } - table, exists := tables[valreader.TableName()] - if !exists { - table = model.Table{ - Name: valreader.TableName(), - Columns: []model.Column{}, - } - } + defer func() { + if localerr := valreader.Close(); localerr != nil && err == nil { // close the value reader opened above (the base reader is closed by AnalyseBase) and keep any earlier error + err = localerr + } + }() - table.Columns = append(table.Columns, col) + nilcount := 0 - tables[valreader.TableName()] = table - case bool: - col, err := d.AnalyseBool(nilcount, valtyped, valreader) - if err != nil { - return fmt.Errorf("failed to analyse column : %w", err) - } + for valreader.Next() { + val, err := valreader.Value() + if err != nil { + return fmt.Errorf("failed to read value : %w", err) + } - table, exists := tables[valreader.TableName()] - if !exists { - table = model.Table{ - Name: valreader.TableName(), - Columns: []model.Column{}, - } + log.Debug().Msgf("Processing [%s][%s][%s]", reader.BaseName(), valreader.TableName(), valreader.ColName()) + + switch valtyped := val.(type) { + case string: + col, err := d.AnalyseString(nilcount, valtyped, valreader) + if err != nil { + return fmt.Errorf("failed to analyse column : %w", err) + } + + table, exists := tables[valreader.TableName()] + if !exists { + table = model.Table{ + Name: valreader.TableName(), + Columns: []model.Column{}, } + } - table.Columns = append(table.Columns, col) + table.Columns = append(table.Columns, col) - tables[valreader.TableName()] = table - case nil: - nilcount++ + tables[valreader.TableName()] = table + case float64, float32, int, int8, int16, int32, int64, uint, uint8, uint16, uint32, uint64: + col, err := d.AnalyseNumeric(nilcount, valtyped, valreader) + if err != nil { + return fmt.Errorf("failed to analyse column : %w", err) } - } - } - for _, table := range tables { - sort.SliceStable(table.Columns, func(i, j int) bool { - return table.Columns[i].Name < 
table.Columns[j].Name - }) + table, exists := tables[valreader.TableName()] + if !exists { + table = model.Table{ + Name: valreader.TableName(), + Columns: []model.Column{}, + } + } - base.Tables = append(base.Tables, table) - } + table.Columns = append(table.Columns, col) - sort.SliceStable(base.Tables, func(i, j int) bool { - return base.Tables[i].Name < base.Tables[j].Name - }) + tables[valreader.TableName()] = table + case bool: + col, err := d.AnalyseBool(nilcount, valtyped, valreader) + if err != nil { + return fmt.Errorf("failed to analyse column : %w", err) + } - err := writer.Export(base) - if err != nil { - return fmt.Errorf("failed to export base : %w", err) + table, exists := tables[valreader.TableName()] + if !exists { + table = model.Table{ + Name: valreader.TableName(), + Columns: []model.Column{}, + } + } + + table.Columns = append(table.Columns, col) + + tables[valreader.TableName()] = table + case nil: + nilcount++ + } } return nil diff --git a/test/suites/testdata/data1/output/data1.yaml b/test/suites/testdata/data1/output/data1.yaml new file mode 100644 index 0000000..e158360 --- /dev/null +++ b/test/suites/testdata/data1/output/data1.yaml @@ -0,0 +1,244 @@ +database: data1 +tables: + - name: data_input + columns: + - name: address + type: string + config: + concept: "" + constraint: [] + confidential: null + mainMetric: + count: 10 + empty: 0 + nulls: 0 + min: 0301 Amy Grove Apt. 325 Janefort, MA 84102 + max: PSC 4713, Box 9649 APO AA 43433 + samples: + - 0301 Amy Grove Apt. 325 Janefort, MA 84102 + - 095 Jennifer Turnpike Castrobury, NY 98111 + - 06210 David Court South Kimberly, IL 10236 + - 9038 Frye Ramp South Cheryltown, CT 54262 + - 275 Stone Ridges Suite 885 East Aliciafurt, MH 15407 + stringMetric: + minLen: 31 + maxLen: 52 + countLen: 7 + lengths: + - length: 42 + freq: 0.3 + metrics: + count: 3 + empty: 0 + nulls: 0 + min: 0301 Amy Grove Apt. 
325 Janefort, MA 84102 + max: 095 Jennifer Turnpike Castrobury, NY 98111 + samples: + - 095 Jennifer Turnpike Castrobury, NY 98111 + - 06210 David Court South Kimberly, IL 10236 + - 0301 Amy Grove Apt. 325 Janefort, MA 84102 + - length: 41 + freq: 0.2 + metrics: + count: 2 + empty: 0 + nulls: 0 + min: 536 Robinson Estates Austinside, NV 69535 + max: 9038 Frye Ramp South Cheryltown, CT 54262 + samples: + - 536 Robinson Estates Austinside, NV 69535 + - 9038 Frye Ramp South Cheryltown, CT 54262 + - length: 37 + freq: 0.1 + metrics: + count: 1 + empty: 0 + nulls: 0 + min: 25545 Cole Court Newtonfurt, KY 13882 + max: 25545 Cole Court Newtonfurt, KY 13882 + samples: + - 25545 Cole Court Newtonfurt, KY 13882 + - length: 31 + freq: 0.1 + metrics: + count: 1 + empty: 0 + nulls: 0 + min: PSC 4713, Box 9649 APO AA 43433 + max: PSC 4713, Box 9649 APO AA 43433 + samples: + - PSC 4713, Box 9649 APO AA 43433 + - length: 45 + freq: 0.1 + metrics: + count: 1 + empty: 0 + nulls: 0 + min: 2035 Simmons Islands Heatherchester, IN 46152 + max: 2035 Simmons Islands Heatherchester, IN 46152 + samples: + - 2035 Simmons Islands Heatherchester, IN 46152 + - length: 52 + freq: 0.1 + metrics: + count: 1 + empty: 0 + nulls: 0 + min: 275 Stone Ridges Suite 885 East Aliciafurt, MH 15407 + max: 275 Stone Ridges Suite 885 East Aliciafurt, MH 15407 + samples: + - 275 Stone Ridges Suite 885 East Aliciafurt, MH 15407 + - length: 43 + freq: 0.1 + metrics: + count: 1 + empty: 0 + nulls: 0 + min: 38432 Moreno Turnpike Garrettland, TN 72939 + max: 38432 Moreno Turnpike Garrettland, TN 72939 + samples: + - 38432 Moreno Turnpike Garrettland, TN 72939 + - name: age + type: numeric + config: + concept: "" + constraint: [] + confidential: null + mainMetric: + count: 10 + empty: 0 + nulls: 0 + min: 29 + max: 95 + samples: + - 47 + - 35 + - 61 + - 73 + - 45 + numericMetric: + mean: 57.7 + - name: date + type: string + config: + concept: "" + constraint: [] + confidential: null + mainMetric: + count: 10 + empty: 
0 + nulls: 0 + min: "2001-08-23" + max: "2022-04-23" + samples: + - "2001-08-23" + - "2014-09-09" + - "2003-10-11" + - "2011-07-13" + - "2022-04-23" + stringMetric: + minLen: 10 + maxLen: 10 + countLen: 1 + lengths: + - length: 10 + freq: 1 + metrics: + count: 10 + empty: 0 + nulls: 0 + min: "2001-08-23" + max: "2022-04-23" + samples: + - "2013-06-11" + - "2011-07-13" + - "2003-10-11" + - "2001-08-23" + - "2010-11-18" + - name: phone + type: string + config: + concept: "" + constraint: [] + confidential: null + mainMetric: + count: 10 + empty: 0 + nulls: 0 + min: (330)616-7639x7810 + max: 828-755-3826 + samples: + - 001-958-985-3039 + - 001-845-854-2110 + - "7795418893" + - 260-587-0590 + - 001-533-758-7269 + stringMetric: + minLen: 10 + maxLen: 21 + countLen: 6 + lengths: + - length: 16 + freq: 0.4 + metrics: + count: 4 + empty: 0 + nulls: 0 + min: 001-533-758-7269 + max: 001-958-985-3039 + samples: + - 001-958-985-3039 + - 001-866-271-0116 + - 001-845-854-2110 + - 001-533-758-7269 + - length: 12 + freq: 0.2 + metrics: + count: 2 + empty: 0 + nulls: 0 + min: 260-587-0590 + max: 828-755-3826 + samples: + - 828-755-3826 + - 260-587-0590 + - length: 13 + freq: 0.1 + metrics: + count: 1 + empty: 0 + nulls: 0 + min: (517)819-3454 + max: (517)819-3454 + samples: + - (517)819-3454 + - length: 21 + freq: 0.1 + metrics: + count: 1 + empty: 0 + nulls: 0 + min: +1-407-997-8293x68130 + max: +1-407-997-8293x68130 + samples: + - +1-407-997-8293x68130 + - length: 10 + freq: 0.1 + metrics: + count: 1 + empty: 0 + nulls: 0 + min: "7795418893" + max: "7795418893" + samples: + - "7795418893" + - length: 18 + freq: 0.1 + metrics: + count: 1 + empty: 0 + nulls: 0 + min: (330)616-7639x7810 + max: (330)616-7639x7810 + samples: + - (330)616-7639x7810 diff --git a/test/suites/testdata/main/output/main.yaml b/test/suites/testdata/main/output/main.yaml new file mode 100644 index 0000000..a39f0db --- /dev/null +++ b/test/suites/testdata/main/output/main.yaml @@ -0,0 +1,107 @@ +database: main 
+tables: + - name: data + columns: + - name: bool + type: bool + config: + concept: "" + constraint: [] + confidential: null + mainMetric: + count: 10 + empty: 4 + nulls: 1 + distinct: 2 + samples: + - false + - true + - true + - false + - true + boolMetric: + trueRatio: 0.5555555555555556 + - name: numeric + type: numeric + config: + concept: "" + constraint: [] + confidential: null + mainMetric: + count: 10 + empty: 3 + nulls: 1 + distinct: 7 + min: -235 + max: 100 + samples: + - -0 + - 1 + - 3.1415 + - -235 + - 2.12e-06 + numericMetric: + mean: -13.539833097777777 + - name: string + type: string + config: + concept: "" + constraint: [] + confidential: null + mainMetric: + count: 10 + empty: 1 + nulls: 1 + distinct: 9 + min: "" + max: 教育漢字 + samples: + - \ + - 教育漢字 + - "tabs\t " + - "new\nline " + - 'hello world ' + stringMetric: + minLen: 0 + maxLen: 12 + countLen: 3 + lengths: + - length: 12 + freq: 0.5 + metrics: + count: 5 + empty: 0 + nulls: 0 + distinct: 5 + min: 'hello world ' + max: 教育漢字 + samples: + - "new\nline " + - 'hello world ' + - "tabs\t " + - 教育漢字 + - '€ ' + - length: 1 + freq: 0.3 + metrics: + count: 3 + empty: 0 + nulls: 0 + distinct: 3 + min: ' ' + max: _ + samples: + - ' ' + - _ + - \ + - length: 0 + freq: 0.1 + metrics: + count: 1 + empty: 1 + nulls: 0 + distinct: 1 + min: "" + max: "" + samples: + - ""