From a9492868d668d8bfac84058a7db92498786b5e8b Mon Sep 17 00:00:00 2001 From: "jianchao.ma" Date: Mon, 22 Apr 2024 11:59:57 +0000 Subject: [PATCH 01/31] feat: add null value should return error venom test --- test/suites/04-run-anonymizer.yml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/test/suites/04-run-anonymizer.yml b/test/suites/04-run-anonymizer.yml index 7d9203a..1ccab1f 100644 --- a/test/suites/04-run-anonymizer.yml +++ b/test/suites/04-run-anonymizer.yml @@ -52,3 +52,21 @@ testcases: - result.code ShouldEqual 0 - result.systemout ShouldBeEmpty + - name: null value in dataset should return error + steps: + - script: rm -f output_sigo.json + - script: rm -f output_jq.json + - script: |- + sigo -q taille,poids,fruit,natation,course,voltige -s meurtre -k 2 -l 2 -a general > output_sigo.json < Date: Tue, 23 Apr 2024 08:41:25 +0000 Subject: [PATCH 02/31] feat: quaisi identifier check value type is float 64 --- internal/infra/source.go | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/internal/infra/source.go b/internal/infra/source.go index 8b016ee..e0e71b3 100644 --- a/internal/infra/source.go +++ b/internal/infra/source.go @@ -19,7 +19,9 @@ package infra import ( "errors" + "fmt" "io" + "strconv" "github.com/cgi-fr/jsonline/pkg/jsonline" "github.com/cgi-fr/sigo/pkg/sigo" @@ -35,14 +37,36 @@ type JSONLineRecord struct { sensitives *[]string } -func (jlr JSONLineRecord) QuasiIdentifer() []float64 { +func (jlr JSONLineRecord) QuasiIdentifer() ([]float64, error) { result := []float64{} for _, key := range *jlr.quasiIdentifers { - result = append(result, (*jlr.row).GetFloat64(key)) + value, _ := (*jlr.row).Get(key) + if value == nil { + //nolint: goerr113 + err := errors.New("null value in dataset") + return []float64{}, err + } + + switch v := value.(type) { + case float64: + result = append(result, v) + case int: + result = append(result, float64(v)) + case string: + floatValue, err := strconv.ParseFloat(v, 64) + if err != nil { + return []float64{}, err + } + result = append(result, floatValue) + default: + err := fmt.Errorf("unsupported type: %T", v) + return []float64{}, err + } + } - return result + return result, nil } func (jlr JSONLineRecord) Sensitives() []interface{} { From cfaa5873bfeabd347413facf46191c60ab0e91f8 Mon Sep 17 00:00:00 2001 From: "jianchao.ma" Date: Tue, 23 Apr 2024 15:15:31 +0000 Subject: [PATCH 03/31] feat:return error when a null value in dataset --- cmd/sigo/main.go | 4 +- internal/infra/source.go | 33 ++++++++-------- pkg/sigo/anonymizer.go | 43 ++++++++++++++------ pkg/sigo/driver.go | 9 +++-- pkg/sigo/driver_test.go | 24 +++++++++++ pkg/sigo/info.go | 2 +- pkg/sigo/kdtree.go | 66 +++++++++++++++++++++++-------- pkg/sigo/model.go | 4 +- test/suites/04-run-anonymizer.yml | 6 +-- 9 files changed, 135 insertions(+), 56 deletions(-) diff --git a/cmd/sigo/main.go b/cmd/sigo/main.go index 5a6fb7f..ffd28ef 100644 --- a/cmd/sigo/main.go +++ b/cmd/sigo/main.go @@ -175,7 +175,9 @@ func run(definition pdef, logs logs) { err = sigo.Anonymize(source, sigo.NewKDTreeFactory(), definition.k, definition.l, len(definition.qi), newAnonymizer(definition.method, definition.args), sink, debugger) if err != nil { - panic(err) + log.Err(err).Msg("Anonymize failed") + log.Warn().Msg("End SIGO") + os.Exit(1) } if logs.profiling { diff --git a/internal/infra/source.go b/internal/infra/source.go index e0e71b3..45d9f76 100644 --- a/internal/infra/source.go +++ b/internal/infra/source.go @@ -19,9 +19,7 @@ package infra import ( "errors" - "fmt" "io" - "strconv" "github.com/cgi-fr/jsonline/pkg/jsonline" "github.com/cgi-fr/sigo/pkg/sigo" @@ -45,25 +43,26 @@ func (jlr JSONLineRecord) QuasiIdentifer() ([]float64, error) { if value == nil { //nolint: goerr113 err := errors.New("null value in dataset") - return []float64{}, err - } - switch v := value.(type) { - case float64: - result = append(result, v) - case int: - result = append(result, float64(v)) - case string: - floatValue, err := strconv.ParseFloat(v, 64) - if err != nil { - return []float64{}, err - } - result = append(result, floatValue) - default: - err := fmt.Errorf("unsupported type: %T", v) return []float64{}, err } + result = append(result, (*jlr.row).GetFloat64(key)) + + // switch v := value.(type) { + // case float64: + // result = append(result, v) + // case json.Number: + // floatValue, err := v.Float64() + // if err != nil { + // return []float64{}, err + // } + // result = append(result, floatValue) + // default: + // err := fmt.Errorf("unsupported type: %T", v) + // return []float64{}, err + // } + } return result, nil diff --git a/pkg/sigo/anonymizer.go b/pkg/sigo/anonymizer.go index 40ea218..37fa90f 100644 --- a/pkg/sigo/anonymizer.go +++ b/pkg/sigo/anonymizer.go @@ -18,6 +18,7 @@ package sigo import ( + "fmt" "os" "github.com/cgi-fr/jsonline/pkg/cast" @@ -97,7 +98,7 @@ type ( } ) -func (ar AnonymizedRecord) QuasiIdentifer() []float64 { +func (ar AnonymizedRecord) QuasiIdentifer() ([]float64, error) { return ar.original.QuasiIdentifer() } @@ -142,7 +143,7 @@ func (a GeneralAnonymizer) Anonymize(rec Record, clus Cluster, qi, s []string) R // ComputeGeneralization calculates the min and max values of the cluster for each qi. func (a GeneralAnonymizer) ComputeGeneralization(clus Cluster, qi []string) { - values := listValues(clus, qi) + values, _ := listValues(clus, qi) boundsVal := make(map[string]bounds) @@ -175,7 +176,7 @@ func (a AggregationAnonymizer) Anonymize(rec Record, clus Cluster, qi, s []strin // ComputeAggregation calculates the mean (method meanAggreagtion) // or median (method medianAggregation) value of the cluster for each qi. func (a AggregationAnonymizer) ComputeAggregation(clus Cluster, qi []string) { - values := listValues(clus, qi) + values, _ := listValues(clus, qi) valAggregation := make(map[string]float64) @@ -196,7 +197,7 @@ func (a AggregationAnonymizer) ComputeAggregation(clus Cluster, qi []string) { // if the record is > Q3 then it takes the Q3 value // if the record is < Q1 then it takes the Q1 value. func (a CodingAnonymizer) Anonymize(rec Record, clus Cluster, qi, s []string) Record { - values := listValues(clus, qi) + values, _ := listValues(clus, qi) mask := map[string]interface{}{} for i, key := range qi { @@ -205,7 +206,12 @@ func (a CodingAnonymizer) Anonymize(rec Record, clus Cluster, qi, s []string) Re bottom := q.Q1 top := q.Q3 - val := rec.QuasiIdentifer()[i] + recVals, err := rec.QuasiIdentifer() + if err != nil { + fmt.Println(err) + break + } + val := recVals[i] switch { case val < bottom: @@ -224,11 +230,16 @@ func (a CodingAnonymizer) Anonymize(rec Record, clus Cluster, qi, s []string) Re // the record takes as value the original value added to a Laplacian or Gaussian noise // the anonymized value stays within the bounds of the cluster. func (a NoiseAnonymizer) Anonymize(rec Record, clus Cluster, qi, s []string) Record { - values := listValues(clus, qi) + values, _ := listValues(clus, qi) mask := map[string]interface{}{} for i, key := range qi { - val := rec.QuasiIdentifer()[i] + recVals, err := rec.QuasiIdentifer() + if err != nil { + fmt.Println(err) + break + } + val := recVals[i] laplaceVal := Scaling(val, values[key], laplace) gaussianVal := Scaling(val, values[key], gaussian) @@ -281,7 +292,8 @@ func (a SwapAnonymizer) Anonymize(rec Record, clus Cluster, qi, s []string) Reco func (a SwapAnonymizer) Swap(clus Cluster, qi []string) { // retrieve the cluster values for each qi - values := listValues(clus, qi) + values, _ := listValues(clus, qi) + swapVal := make(map[string][]float64) for _, key := range qi { @@ -416,7 +428,8 @@ func (r Reidentification) Statistics(idCluster string, q string) (mean float64, // ComputeSimilarity computes the similarity score between the record rec and the anonymized cluster data. func (r Reidentification) ComputeSimilarity(rec Record, clus Cluster, - qi []string, s []string) map[float64]interface{} { + qi []string, s []string, +) map[float64]interface{} { scores := make(map[float64]interface{}) x := make(map[string]interface{}) @@ -448,14 +461,20 @@ func (r Reidentification) ComputeSimilarity(rec Record, clus Cluster, } // Returns the list of values present in the cluster for each qi. -func listValues(clus Cluster, qi []string) (mapValues map[string][]float64) { +func listValues(clus Cluster, qi []string) (mapValues map[string][]float64, err error) { mapValues = make(map[string][]float64) for _, record := range clus.Records() { for i, key := range qi { - mapValues[key] = append(mapValues[key], record.QuasiIdentifer()[i]) + vals, _ := record.QuasiIdentifer() + // if err != nil { + // fmt.Println(err) + // return map[string][]float64{}, err + // } + val := vals[i] + mapValues[key] = append(mapValues[key], val) } } - return mapValues + return mapValues, nil } diff --git a/pkg/sigo/driver.go b/pkg/sigo/driver.go index 214e9ba..99538c0 100644 --- a/pkg/sigo/driver.go +++ b/pkg/sigo/driver.go @@ -24,12 +24,12 @@ import ( ) func Anonymize(source RecordSource, factory GeneralizerFactory, - k int, l int, dim int, anonymyzer Anonymizer, sink RecordSink, debugger Debugger) error { + k int, l int, dim int, anonymyzer Anonymizer, sink RecordSink, debugger Debugger, +) error { generalizer := factory.New(k, l, dim, source.QuasiIdentifer()) count := 0 log.Info().Msg("Reading source") - for source.Next() { if source.Err() != nil { return fmt.Errorf("%w", source.Err()) @@ -42,7 +42,10 @@ func Anonymize(source RecordSource, factory GeneralizerFactory, log.Info().Msgf("%v individuals to anonymize", count) log.Info().Msg("Tree building") - generalizer.Build() + err := generalizer.Build() + if err != nil { + return err + } log.Info().Msg("Cluster Anonymization") diff --git a/pkg/sigo/driver_test.go b/pkg/sigo/driver_test.go index 0be482a..fa717da 100644 --- a/pkg/sigo/driver_test.go +++ b/pkg/sigo/driver_test.go @@ -144,3 +144,27 @@ func BenchmarkLongClustering(b *testing.B) { jsonBytes.Close() } } + +func TestNullValueShouldReturnError(t *testing.T) { + t.Parallel() + + // nolint: goconst + sourceText := `{"x":0, "y":0, "foo":"bar"} + {"x":null, "y":1, "foo":"bar"} + {"x":0, "y":null, "foo":"bar"} + {"x":2, "y":1, "foo":"null"} + {"x":3, "y":2, "foo":"baz"} + {"x":2, "y":3, "foo":"baz"}` + + expectedMessage := "null value in dataset" + + source, err := infra.NewJSONLineSource(strings.NewReader(sourceText), []string{"x", "y"}, []string{"foo"}) + assert.Nil(t, err) + + result := []map[string]interface{}{} + sink := infra.NewSliceDictionariesSink(&result) + err = sigo.Anonymize(source, sigo.NewKDTreeFactory(), 2, 1, 2, sigo.NewNoAnonymizer(), sink, + sigo.NewSequenceDebugger("clusterID")) + assert.NotNil(t, err) + assert.Equal(t, expectedMessage, err.Error()) +} diff --git a/pkg/sigo/info.go b/pkg/sigo/info.go index b3b9735..e86b33c 100644 --- a/pkg/sigo/info.go +++ b/pkg/sigo/info.go @@ -29,7 +29,7 @@ type InfosRecord struct { infos map[string]interface{} } -func (ir InfosRecord) QuasiIdentifer() []float64 { +func (ir InfosRecord) QuasiIdentifer() ([]float64, error) { return ir.original.QuasiIdentifer() } diff --git a/pkg/sigo/kdtree.go b/pkg/sigo/kdtree.go index a866e63..f79e189 100644 --- a/pkg/sigo/kdtree.go +++ b/pkg/sigo/kdtree.go @@ -63,8 +63,8 @@ func (t KDTree) Add(r Record) { } // Build starts building the tree. -func (t KDTree) Build() { - t.root.build() +func (t KDTree) Build() error { + return t.root.build() } // Clusters returns the list of clusters in the tree. @@ -77,7 +77,7 @@ func (t KDTree) String() string { return t.root.string(0) } -//nolint: revive, golint +// nolint: revive, golint func NewNode(tree *KDTree, path string, rot int) node { return node{ tree: tree, @@ -111,7 +111,7 @@ func (n *node) incRot() { } // build creates nodes. -func (n *node) build() { +func (n *node) build() error { log.Debug(). Str("Dimension", n.tree.qi[n.rot]). Str("Path", n.clusterPath). @@ -123,11 +123,14 @@ func (n *node) build() { var ( lower, upper node valide bool + err error ) for i := 1; i <= n.tree.dim; i++ { - lower, upper, valide = n.split() - if !valide { + lower, upper, valide, err = n.split() + if err != nil { + return err + } else if !valide { n.incRot() } else { break @@ -135,7 +138,7 @@ func (n *node) build() { } if !valide { - return + return nil } lower.validate() @@ -147,17 +150,43 @@ func (n *node) build() { } n.cluster = nil - n.subNodes[0].build() - n.subNodes[1].build() + err = n.subNodes[0].build() + if err != nil { + return err + } + err = n.subNodes[1].build() + if err != nil { + return err + } } + + return nil } // split creates 2 subnodes by ordering the node and splitting in order to have 2 equal parts // and all elements having the same value in the same subnode. -func (n *node) split() (node, node, bool) { - sort.SliceStable(n.cluster, func(i int, j int) bool { - return n.cluster[i].QuasiIdentifer()[n.rot] < n.cluster[j].QuasiIdentifer()[n.rot] - }) +func (n *node) split() (node, node, bool, error) { + var globalError error + + less := func(i, j int) bool { + valueI, err := n.cluster[i].QuasiIdentifer() + if err != nil { + // Stocker l'erreur dans la variable globale + globalError = err + return false + } + valueJ, err := n.cluster[j].QuasiIdentifer() + if err != nil { + globalError = err + return false + } + return valueI[n.rot] < valueJ[n.rot] + } + + sort.SliceStable(n.cluster, less) + if globalError != nil { + return node{}, node{}, false, globalError + } n.pivot = nil lower := NewNode(n.tree, n.clusterPath+"-l", n.rot+1) @@ -168,21 +197,23 @@ func (n *node) split() (node, node, bool) { previous := n.cluster[0] for _, row := range n.cluster { + rowValue, _ := row.QuasiIdentifer() + previousValue, _ := previous.QuasiIdentifer() // equal subnodes and all elements having the same value in the same subnode - if lowerSize < len(n.cluster)/2 || row.QuasiIdentifer()[n.rot] == previous.QuasiIdentifer()[n.rot] { + if lowerSize < len(n.cluster)/2 || rowValue[n.rot] == previousValue[n.rot] { lower.Add(row) previous = row lowerSize++ } else { if n.pivot == nil { - n.pivot = row.QuasiIdentifer() + n.pivot = rowValue } upper.Add(row) upperSize++ } } - return lower, upper, upperSize >= n.tree.k && lower.wellLDiv() && upper.wellLDiv() + return lower, upper, upperSize >= n.tree.k && lower.wellLDiv() && upper.wellLDiv(), nil } // Records returns the list of records in the node. @@ -214,7 +245,8 @@ func (n *node) string(offset int) string { result := "[" for _, rec := range n.cluster { // result += fmt.Sprintf("%v ", rec.QuasiIdentifer()[n.rot]) - result += fmt.Sprintf("%v ", rec.QuasiIdentifer()) + recValue, _ := rec.QuasiIdentifer() + result += fmt.Sprintf("%v ", recValue) } result += "]" diff --git a/pkg/sigo/model.go b/pkg/sigo/model.go index b0cc461..9a9c491 100644 --- a/pkg/sigo/model.go +++ b/pkg/sigo/model.go @@ -30,7 +30,7 @@ type RecordSink interface { } type Record interface { - QuasiIdentifer() []float64 + QuasiIdentifer() ([]float64, error) Sensitives() []interface{} Row() map[string]interface{} } @@ -44,7 +44,7 @@ type Generalizer interface { Add(Record) Clusters() []Cluster String() string - Build() + Build() error } type GeneralizerFactory interface { diff --git a/test/suites/04-run-anonymizer.yml b/test/suites/04-run-anonymizer.yml index 1ccab1f..8e9e42f 100644 --- a/test/suites/04-run-anonymizer.yml +++ b/test/suites/04-run-anonymizer.yml @@ -57,7 +57,7 @@ testcases: - script: rm -f output_sigo.json - script: rm -f output_jq.json - script: |- - sigo -q taille,poids,fruit,natation,course,voltige -s meurtre -k 2 -l 2 -a general > output_sigo.json < output_sigo.json < Date: Tue, 23 Apr 2024 15:49:46 +0000 Subject: [PATCH 04/31] refactor: repair lint warning --- cmd/sigo/main.go | 9 +++++---- internal/infra/source.go | 1 - pkg/sigo/anonymizer.go | 27 +++++++++++---------------- pkg/sigo/driver.go | 1 + pkg/sigo/driver_test.go | 1 - pkg/sigo/kdtree.go | 13 ++++++++++++- pkg/sigo/kdtree_test.go | 12 +++++++----- 7 files changed, 36 insertions(+), 28 deletions(-) diff --git a/cmd/sigo/main.go b/cmd/sigo/main.go index ffd28ef..cfb0527 100644 --- a/cmd/sigo/main.go +++ b/cmd/sigo/main.go @@ -77,7 +77,7 @@ Copyright (C) 2022 CGI France \n License GPLv3: GNU GPL version 3 Q3 then it takes the Q3 value // if the record is < Q1 then it takes the Q1 value. func (a CodingAnonymizer) Anonymize(rec Record, clus Cluster, qi, s []string) Record { - values, _ := listValues(clus, qi) + values := listValues(clus, qi) mask := map[string]interface{}{} for i, key := range qi { @@ -206,11 +206,8 @@ func (a CodingAnonymizer) Anonymize(rec Record, clus Cluster, qi, s []string) Re bottom := q.Q1 top := q.Q3 - recVals, err := rec.QuasiIdentifer() - if err != nil { - fmt.Println(err) - break - } + recVals, _ := rec.QuasiIdentifer() + val := recVals[i] switch { @@ -230,15 +227,17 @@ func (a CodingAnonymizer) Anonymize(rec Record, clus Cluster, qi, s []string) Re // the record takes as value the original value added to a Laplacian or Gaussian noise // the anonymized value stays within the bounds of the cluster. func (a NoiseAnonymizer) Anonymize(rec Record, clus Cluster, qi, s []string) Record { - values, _ := listValues(clus, qi) + values := listValues(clus, qi) mask := map[string]interface{}{} for i, key := range qi { recVals, err := rec.QuasiIdentifer() if err != nil { fmt.Println(err) + break } + val := recVals[i] laplaceVal := Scaling(val, values[key], laplace) @@ -292,7 +291,7 @@ func (a SwapAnonymizer) Anonymize(rec Record, clus Cluster, qi, s []string) Reco func (a SwapAnonymizer) Swap(clus Cluster, qi []string) { // retrieve the cluster values for each qi - values, _ := listValues(clus, qi) + values := listValues(clus, qi) swapVal := make(map[string][]float64) @@ -461,20 +460,16 @@ func (r Reidentification) ComputeSimilarity(rec Record, clus Cluster, } // Returns the list of values present in the cluster for each qi. -func listValues(clus Cluster, qi []string) (mapValues map[string][]float64, err error) { +func listValues(clus Cluster, qi []string) (mapValues map[string][]float64) { mapValues = make(map[string][]float64) for _, record := range clus.Records() { for i, key := range qi { vals, _ := record.QuasiIdentifer() - // if err != nil { - // fmt.Println(err) - // return map[string][]float64{}, err - // } val := vals[i] mapValues[key] = append(mapValues[key], val) } } - return mapValues, nil + return mapValues } diff --git a/pkg/sigo/driver.go b/pkg/sigo/driver.go index 99538c0..59ce58d 100644 --- a/pkg/sigo/driver.go +++ b/pkg/sigo/driver.go @@ -30,6 +30,7 @@ func Anonymize(source RecordSource, factory GeneralizerFactory, count := 0 log.Info().Msg("Reading source") + for source.Next() { if source.Err() != nil { return fmt.Errorf("%w", source.Err()) diff --git a/pkg/sigo/driver_test.go b/pkg/sigo/driver_test.go index fa717da..1aced71 100644 --- a/pkg/sigo/driver_test.go +++ b/pkg/sigo/driver_test.go @@ -148,7 +148,6 @@ func BenchmarkLongClustering(b *testing.B) { func TestNullValueShouldReturnError(t *testing.T) { t.Parallel() - // nolint: goconst sourceText := `{"x":0, "y":0, "foo":"bar"} {"x":null, "y":1, "foo":"bar"} {"x":0, "y":null, "foo":"bar"} diff --git a/pkg/sigo/kdtree.go b/pkg/sigo/kdtree.go index f79e189..6b372eb 100644 --- a/pkg/sigo/kdtree.go +++ b/pkg/sigo/kdtree.go @@ -118,6 +118,7 @@ func (n *node) build() error { Int("Size", len(n.cluster)). Msg("Cluster:") + //nolint: nestif if n.isValid() && len(n.cluster) >= 2*n.tree.k { // rollback to simple node var ( @@ -130,7 +131,9 @@ func (n *node) build() error { lower, upper, valide, err = n.split() if err != nil { return err - } else if !valide { + } + + if !valide { n.incRot() } else { break @@ -151,9 +154,11 @@ func (n *node) build() error { n.cluster = nil err = n.subNodes[0].build() + if err != nil { return err } + err = n.subNodes[1].build() if err != nil { return err @@ -173,17 +178,22 @@ func (n *node) split() (node, node, bool, error) { if err != nil { // Stocker l'erreur dans la variable globale globalError = err + return false } + valueJ, err := n.cluster[j].QuasiIdentifer() if err != nil { globalError = err + return false } + return valueI[n.rot] < valueJ[n.rot] } sort.SliceStable(n.cluster, less) + if globalError != nil { return node{}, node{}, false, globalError } @@ -243,6 +253,7 @@ func (n *node) Clusters() []Cluster { func (n *node) string(offset int) string { if n.cluster != nil { result := "[" + for _, rec := range n.cluster { // result += fmt.Sprintf("%v ", rec.QuasiIdentifer()[n.rot]) recValue, _ := rec.QuasiIdentifer() diff --git a/pkg/sigo/kdtree_test.go b/pkg/sigo/kdtree_test.go index 86493e3..d3bbad6 100644 --- a/pkg/sigo/kdtree_test.go +++ b/pkg/sigo/kdtree_test.go @@ -50,7 +50,7 @@ func TestAddRecord(t *testing.T) { assert.Equal(t, 0, clusters[0].Records()[0].Row()["x"]) } -// nolint: funlen +//nolint: funlen func TestAddNRecords(t *testing.T) { t.Parallel() @@ -112,7 +112,7 @@ func TestAddNRecords(t *testing.T) { {k: 6, n: 1000, d: 6, s: 5}, } - // nolint: paralleltest + //nolint: paralleltest for i, tc := range tests { t.Run(fmt.Sprintf("test %d", i), func(t *testing.T) { t.Parallel() @@ -127,9 +127,9 @@ func TestAddNRecords(t *testing.T) { rows := []jsonline.Row{} for i := 0; i < N; i++ { - // nolint: gosec + //nolint: gosec x := rand.Intn(N) - // nolint: gosec + //nolint: gosec y := rand.Intn(N) for j := 0; j < D; j++ { row := jsonline.NewRow() @@ -186,7 +186,9 @@ func TestAddClusterInfos(t *testing.T) { kdtree.Add(record) } - kdtree.Build() + err := kdtree.Build() + assert.Nil(t, err) + clusters := kdtree.Clusters() for _, cluster := range clusters { From 264cc9292e5e18dac3fb7bd5ec743ff22381b225 Mon Sep 17 00:00:00 2001 From: "jianchao.ma" Date: Tue, 23 Apr 2024 15:57:29 +0000 Subject: [PATCH 05/31] refactor: repair lint warning --- internal/infra/source.go | 14 -------------- pkg/sigo/kdtree.go | 2 +- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/internal/infra/source.go b/internal/infra/source.go index a85561d..80de769 100644 --- a/internal/infra/source.go +++ b/internal/infra/source.go @@ -48,20 +48,6 @@ func (jlr JSONLineRecord) QuasiIdentifer() ([]float64, error) { } result = append(result, (*jlr.row).GetFloat64(key)) - - // switch v := value.(type) { - // case float64: - // result = append(result, v) - // case json.Number: - // floatValue, err := v.Float64() - // if err != nil { - // return []float64{}, err - // } - // result = append(result, floatValue) - // default: - // err := fmt.Errorf("unsupported type: %T", v) - // return []float64{}, err - // } } return result, nil diff --git a/pkg/sigo/kdtree.go b/pkg/sigo/kdtree.go index 6b372eb..5e83807 100644 --- a/pkg/sigo/kdtree.go +++ b/pkg/sigo/kdtree.go @@ -253,7 +253,7 @@ func (n *node) Clusters() []Cluster { func (n *node) string(offset int) string { if n.cluster != nil { result := "[" - + for _, rec := range n.cluster { // result += fmt.Sprintf("%v ", rec.QuasiIdentifer()[n.rot]) recValue, _ := rec.QuasiIdentifer() From 43f54e9c2a493f35707cc087a017164367570add Mon Sep 17 00:00:00 2001 From: "jianchao.ma" Date: Wed, 24 Apr 2024 08:50:34 +0000 Subject: [PATCH 06/31] feat: add type check in quasi identifier --- internal/infra/source.go | 24 +++++++++++++++++++++++- pkg/sigo/driver_test.go | 23 ++++++++++++++++++++++- pkg/sigo/kdtree.go | 1 + 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/internal/infra/source.go b/internal/infra/source.go index 80de769..93f6dbb 100644 --- a/internal/infra/source.go +++ b/internal/infra/source.go @@ -18,8 +18,11 @@ package infra import ( + "encoding/json" "errors" + "fmt" "io" + "strconv" "github.com/cgi-fr/jsonline/pkg/jsonline" "github.com/cgi-fr/sigo/pkg/sigo" @@ -47,7 +50,26 @@ func (jlr JSONLineRecord) QuasiIdentifer() ([]float64, error) { return []float64{}, err } - result = append(result, (*jlr.row).GetFloat64(key)) + var val float64 + switch t := value.(type) { + case int: + val = float64(t) + case string: + val, _ = strconv.ParseFloat(t, 64) + case float32: + val = float64(t) + case json.Number: + val, _ = t.Float64() + case float64: + val = t + default: + //nolint: goerr113 + err := fmt.Errorf("unsupported type: %T", t) + + return []float64{}, err + } + + result = append(result, val) } return result, nil diff --git a/pkg/sigo/driver_test.go b/pkg/sigo/driver_test.go index 1aced71..3b6e0c9 100644 --- a/pkg/sigo/driver_test.go +++ b/pkg/sigo/driver_test.go @@ -164,6 +164,27 @@ func TestNullValueShouldReturnError(t *testing.T) { sink := infra.NewSliceDictionariesSink(&result) err = sigo.Anonymize(source, sigo.NewKDTreeFactory(), 2, 1, 2, sigo.NewNoAnonymizer(), sink, sigo.NewSequenceDebugger("clusterID")) - assert.NotNil(t, err) + assert.Equal(t, expectedMessage, err.Error()) +} + +func TestWrongTypeShouldReturnError(t *testing.T) { + t.Parallel() + + sourceText := `{"x":0, "y":false, "foo":"bar"} + {"x":0, "y":1, "foo":"bar"} + {"x":0, "y":2, "foo":"bar"} + {"x":2, "y":1, "foo":"baz"} + {"x":3, "y":2, "foo":"baz"} + {"x":2, "y":3, "foo":"baz"}` + + expectedMessage := "unsupported type: bool" + + source, err := infra.NewJSONLineSource(strings.NewReader(sourceText), []string{"x", "y"}, []string{"foo"}) + assert.Nil(t, err) + + result := []map[string]interface{}{} + sink := infra.NewSliceDictionariesSink(&result) + err = sigo.Anonymize(source, sigo.NewKDTreeFactory(), 2, 1, 2, sigo.NewNoAnonymizer(), sink, + sigo.NewSequenceDebugger("clusterID")) assert.Equal(t, expectedMessage, err.Error()) } diff --git a/pkg/sigo/kdtree.go b/pkg/sigo/kdtree.go index 5e83807..4c45e17 100644 --- a/pkg/sigo/kdtree.go +++ b/pkg/sigo/kdtree.go @@ -206,6 +206,7 @@ func (n *node) split() (node, node, bool, error) { upperSize := 0 previous := n.cluster[0] + // All cluster are passed err check from sort func for _, row := range n.cluster { rowValue, _ := row.QuasiIdentifer() previousValue, _ := previous.QuasiIdentifer() From 3776cb059f4a7c2f783837c0d636ba4e023b8e28 Mon Sep 17 00:00:00 2001 From: "jianchao.ma" Date: Wed, 24 Apr 2024 09:21:01 +0000 Subject: [PATCH 07/31] feat: add venom test wront type error --- internal/infra/source.go | 1 + test/suites/04-run-anonymizer.yml | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/internal/infra/source.go b/internal/infra/source.go index 93f6dbb..8595a82 100644 --- a/internal/infra/source.go +++ b/internal/infra/source.go @@ -55,6 +55,7 @@ func (jlr JSONLineRecord) QuasiIdentifer() ([]float64, error) { case int: val = float64(t) case string: + //nolint: gomnd val, _ = strconv.ParseFloat(t, 64) case float32: val = float64(t) diff --git a/test/suites/04-run-anonymizer.yml b/test/suites/04-run-anonymizer.yml index 8e9e42f..ca8dd41 100644 --- a/test/suites/04-run-anonymizer.yml +++ b/test/suites/04-run-anonymizer.yml @@ -70,3 +70,22 @@ testcases: assertions: - result.code ShouldEqual 1 - result.systemerr ShouldContainSubstring Anonymize failed + + - name: wrong type in dataset should return error + steps: + - script: rm -f output_sigo.json + - script: rm -f output_jq.json + - script: |- + sigo -q taille,poids,fruit,natation,course,voltige -s meurtre -k 2 -l 2 -a general -v 5 > output_sigo.json < Date: Wed, 24 Apr 2024 12:14:40 +0000 Subject: [PATCH 08/31] feat: handling all quasi identifer error --- pkg/sigo/anonymizer.go | 24 +++++++++++++++++------- pkg/sigo/kdtree.go | 9 ++++++++- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/pkg/sigo/anonymizer.go b/pkg/sigo/anonymizer.go index f498fad..9ea6d6f 100644 --- a/pkg/sigo/anonymizer.go +++ b/pkg/sigo/anonymizer.go @@ -18,7 +18,6 @@ package sigo import ( - "fmt" "os" "github.com/cgi-fr/jsonline/pkg/cast" @@ -206,7 +205,12 @@ func (a CodingAnonymizer) Anonymize(rec Record, clus Cluster, qi, s []string) Re bottom := q.Q1 top := q.Q3 - recVals, _ := rec.QuasiIdentifer() + recVals, err := rec.QuasiIdentifer() + if err != nil { + log.Err(err).Msg("Cannot cast quasi-identifier to float64") + log.Warn().Int("return", 1).Msg("End SIGO") + os.Exit(1) + } val := recVals[i] @@ -233,9 +237,9 @@ func (a NoiseAnonymizer) Anonymize(rec Record, clus Cluster, qi, s []string) Rec for i, key := range qi { recVals, err := rec.QuasiIdentifer() if err != nil { - fmt.Println(err) - - break + log.Err(err).Msg("Cannot cast quasi-identifier to float64") + log.Warn().Int("return", 1).Msg("End SIGO") + os.Exit(1) } val := recVals[i] @@ -465,8 +469,14 @@ func listValues(clus Cluster, qi []string) (mapValues map[string][]float64) { for _, record := range clus.Records() { for i, key := range qi { - vals, _ := record.QuasiIdentifer() - val := vals[i] + recVals, err := record.QuasiIdentifer() + if err != nil { + log.Err(err).Msg("Cannot cast quasi-identifier to float64") + log.Warn().Int("return", 1).Msg("End SIGO") + os.Exit(1) + } + + val := recVals[i] mapValues[key] = append(mapValues[key], val) } } diff --git a/pkg/sigo/kdtree.go b/pkg/sigo/kdtree.go index 4c45e17..72b0daf 100644 --- a/pkg/sigo/kdtree.go +++ b/pkg/sigo/kdtree.go @@ -20,6 +20,7 @@ package sigo import ( "fmt" "math" + "os" "sort" "strings" @@ -257,7 +258,13 @@ func (n *node) string(offset int) string { for _, rec := range n.cluster { // result += fmt.Sprintf("%v ", rec.QuasiIdentifer()[n.rot]) - recValue, _ := rec.QuasiIdentifer() + recValue, err := rec.QuasiIdentifer() + if err != nil { + log.Err(err).Msg("Cannot cast quasi-identifier to float64 in node") + log.Warn().Int("return", 1).Msg("End SIGO") + os.Exit(1) + } + result += fmt.Sprintf("%v ", recValue) } From 36f945384e10819147bd9d0fb79d3091557d53bd Mon Sep 17 00:00:00 2001 From: "jianchao.ma" Date: Wed, 24 Apr 2024 12:21:17 +0000 Subject: [PATCH 09/31] docs: update changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f344f18..32a2c21 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,9 @@ Types of changes - `Fixed` for any bug fixes. - `Security` in case of vulnerabilities. +## [0.3.1] +- `Fixed` panic when a null value in dataset (#22) + ## [0.3.0] - `Added`export configuration in yaml file (#32) From 9cf9b70bc5198fdf95fc1806cdce28e8b65644e8 Mon Sep 17 00:00:00 2001 From: "jianchao.ma" Date: Wed, 24 Apr 2024 12:22:04 +0000 Subject: [PATCH 10/31] docs: update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 32a2c21..7fc1cf1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ Types of changes - `Security` in case of vulnerabilities. ## [0.3.1] + - `Fixed` panic when a null value in dataset (#22) ## [0.3.0] From 8ddf40f3c70803af70feda12ffa8a39825967663 Mon Sep 17 00:00:00 2001 From: "jianchao.ma" Date: Thu, 25 Apr 2024 08:16:13 +0000 Subject: [PATCH 11/31] refactor: reverted commits until a949286 --- cmd/sigo/main.go | 13 +++---- internal/infra/source.go | 37 ++---------------- pkg/sigo/anonymizer.go | 34 +++------------- pkg/sigo/driver.go | 8 +--- pkg/sigo/driver_test.go | 44 --------------------- pkg/sigo/info.go | 2 +- pkg/sigo/kdtree.go | 83 ++++++++-------------------------------- pkg/sigo/kdtree_test.go | 12 +++--- pkg/sigo/model.go | 4 +- 9 files changed, 39 insertions(+), 198 deletions(-) diff --git a/cmd/sigo/main.go b/cmd/sigo/main.go index cfb0527..5a6fb7f 100644 --- a/cmd/sigo/main.go +++ b/cmd/sigo/main.go @@ -77,7 +77,7 @@ Copyright (C) 2022 CGI France \n License GPLv3: GNU GPL version 3 = 2*n.tree.k { // rollback to simple node var ( lower, upper node valide bool - err error ) for i := 1; i <= n.tree.dim; i++ { - lower, upper, valide, err = n.split() - if err != nil { - return err - } - + lower, upper, valide = n.split() if !valide { n.incRot() } else { @@ -142,7 +135,7 @@ func (n *node) build() error { } if !valide { - return nil + return } lower.validate() @@ -154,50 +147,17 @@ func (n *node) build() error { } n.cluster = nil - err = n.subNodes[0].build() - - if err != nil { - return err - } - - err = n.subNodes[1].build() - if err != nil { - return err - } + n.subNodes[0].build() + n.subNodes[1].build() } - - return nil } // split creates 2 subnodes by ordering the node and splitting in order to have 2 equal parts // and all elements having the same value in the same subnode. -func (n *node) split() (node, node, bool, error) { - var globalError error - - less := func(i, j int) bool { - valueI, err := n.cluster[i].QuasiIdentifer() - if err != nil { - // Stocker l'erreur dans la variable globale - globalError = err - - return false - } - - valueJ, err := n.cluster[j].QuasiIdentifer() - if err != nil { - globalError = err - - return false - } - - return valueI[n.rot] < valueJ[n.rot] - } - - sort.SliceStable(n.cluster, less) - - if globalError != nil { - return node{}, node{}, false, globalError - } +func (n *node) split() (node, node, bool) { + sort.SliceStable(n.cluster, func(i int, j int) bool { + return n.cluster[i].QuasiIdentifer()[n.rot] < n.cluster[j].QuasiIdentifer()[n.rot] + }) n.pivot = nil lower := NewNode(n.tree, n.clusterPath+"-l", n.rot+1) @@ -207,25 +167,22 @@ func (n *node) split() (node, node, bool, error) { upperSize := 0 previous := n.cluster[0] - // All cluster are passed err check from sort func for _, row := range n.cluster { - rowValue, _ := row.QuasiIdentifer() - previousValue, _ := previous.QuasiIdentifer() // equal subnodes and all elements having the same value in the same subnode - if lowerSize < len(n.cluster)/2 || rowValue[n.rot] == previousValue[n.rot] { + if lowerSize < len(n.cluster)/2 || row.QuasiIdentifer()[n.rot] == previous.QuasiIdentifer()[n.rot] { lower.Add(row) previous = row lowerSize++ } else { if n.pivot == nil { - n.pivot = rowValue + n.pivot = row.QuasiIdentifer() } upper.Add(row) upperSize++ } } - return lower, upper, upperSize >= n.tree.k && lower.wellLDiv() && upper.wellLDiv(), nil + return lower, upper, upperSize >= n.tree.k && lower.wellLDiv() && upper.wellLDiv() } // Records returns the list of records in the node. @@ -255,17 +212,9 @@ func (n *node) Clusters() []Cluster { func (n *node) string(offset int) string { if n.cluster != nil { result := "[" - for _, rec := range n.cluster { // result += fmt.Sprintf("%v ", rec.QuasiIdentifer()[n.rot]) - recValue, err := rec.QuasiIdentifer() - if err != nil { - log.Err(err).Msg("Cannot cast quasi-identifier to float64 in node") - log.Warn().Int("return", 1).Msg("End SIGO") - os.Exit(1) - } - - result += fmt.Sprintf("%v ", recValue) + result += fmt.Sprintf("%v ", rec.QuasiIdentifer()) } result += "]" diff --git a/pkg/sigo/kdtree_test.go b/pkg/sigo/kdtree_test.go index d3bbad6..86493e3 100644 --- a/pkg/sigo/kdtree_test.go +++ b/pkg/sigo/kdtree_test.go @@ -50,7 +50,7 @@ func TestAddRecord(t *testing.T) { assert.Equal(t, 0, clusters[0].Records()[0].Row()["x"]) } -//nolint: funlen +// nolint: funlen func TestAddNRecords(t *testing.T) { t.Parallel() @@ -112,7 +112,7 @@ func TestAddNRecords(t *testing.T) { {k: 6, n: 1000, d: 6, s: 5}, } - //nolint: paralleltest + // nolint: paralleltest for i, tc := range tests { t.Run(fmt.Sprintf("test %d", i), func(t *testing.T) { t.Parallel() @@ -127,9 +127,9 @@ func TestAddNRecords(t *testing.T) { rows := []jsonline.Row{} for i := 0; i < N; i++ { - //nolint: gosec + // nolint: gosec x := rand.Intn(N) - //nolint: gosec + // nolint: gosec y := rand.Intn(N) for j := 0; j < D; j++ { row := jsonline.NewRow() @@ -186,9 +186,7 @@ func TestAddClusterInfos(t *testing.T) { kdtree.Add(record) } - err := kdtree.Build() - assert.Nil(t, err) - + kdtree.Build() clusters := kdtree.Clusters() for _, cluster := range clusters { diff --git a/pkg/sigo/model.go b/pkg/sigo/model.go index 9a9c491..b0cc461 100644 --- a/pkg/sigo/model.go +++ b/pkg/sigo/model.go @@ -30,7 +30,7 @@ type RecordSink interface { } type Record interface { - QuasiIdentifer() ([]float64, error) + QuasiIdentifer() []float64 Sensitives() []interface{} Row() map[string]interface{} } @@ -44,7 +44,7 @@ type Generalizer interface { Add(Record) Clusters() []Cluster String() string - Build() error + Build() } type GeneralizerFactory interface { From 1178cd02c1c0688c0afc7ac592adadb509e29f10 Mon Sep 17 00:00:00 2001 From: "jianchao.ma" Date: Thu, 25 Apr 2024 09:53:16 +0000 Subject: [PATCH 12/31] feat: creation of data validator and float64 data validator --- internal/infra/validator.go | 52 +++++++++++++++++++++++++++++++++++++ pkg/sigo/model.go | 4 +++ 2 files changed, 56 insertions(+) create mode 100644 internal/infra/validator.go diff --git a/internal/infra/validator.go b/internal/infra/validator.go new file mode 100644 index 0000000..c131471 --- /dev/null +++ b/internal/infra/validator.go @@ -0,0 +1,52 @@ +// Copyright (C) 2022 CGI France +// +// This file is part of SIGO. +// +// SIGO is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// SIGO is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with SIGO. If not, see . + +package infra + +import ( + "errors" + + "github.com/cgi-fr/sigo/pkg/sigo" +) + +type Float64DataValidator struct { + source sigo.RecordSource +} + +func NewFloat64DataValidator(source sigo.RecordSource) Float64DataValidator { + return Float64DataValidator{source: source} +} + +func (v Float64DataValidator) Validation() error { + // Check Null value + // if value == nil { + // //nolint: goerr113 + // err := errors.New("null value in dataset") + + // return err + // } + + // Check all data can transfer to float64 + // for _, value := range data { + // _, err := strconv.ParseFloat(value, 64) + // if err != nil { + // return errors.New("unsupported type in dataset") + // } + // } + //nolint: goerr113 + return errors.Unwrap(errors.New("not valide")) +} diff --git a/pkg/sigo/model.go b/pkg/sigo/model.go index b0cc461..c6273bb 100644 --- a/pkg/sigo/model.go +++ b/pkg/sigo/model.go @@ -58,3 +58,7 @@ type Anonymizer interface { type Debugger interface { Information(Record, Cluster) Record } + +type DataValidator interface { + Validation() error +} From 87cb2780adf9f8e0114aab941cf5703a4cd9b310 Mon Sep 17 00:00:00 2001 From: "jianchao.ma" Date: Thu, 25 Apr 2024 10:08:01 +0000 Subject: [PATCH 13/31] feat: integrates validator in main --- cmd/sigo/main.go | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/cmd/sigo/main.go b/cmd/sigo/main.go index 5a6fb7f..e35a70c 100644 --- a/cmd/sigo/main.go +++ b/cmd/sigo/main.go @@ -145,6 +145,15 @@ func run(definition pdef, logs logs) { os.Exit(1) } + validator := infra.NewFloat64DataValidator(source) + err = validator.Validation() + + if err != nil { + log.Err(err).Msg("Unsupported input data.") + log.Warn().Int("return", 1).Msg("End SIGO") + os.Exit(1) + } + sink := infra.NewJSONLineSink(os.Stdout) var debugger sigo.Debugger @@ -233,7 +242,7 @@ func initLog(logs logs, entropy bool) { log.Info().Msgf("%v %v (commit=%v date=%v by=%v)", name, version, commit, buildDate, builtBy) } -//nolint: cyclop +// nolint: cyclop func newAnonymizer(name string, args []string) sigo.Anonymizer { switch name { case "general": From 307621d6f55e9b486b4eb1474cf95f9dc46e12a5 Mon Sep 17 00:00:00 2001 From: "jianchao.ma" Date: Fri, 26 Apr 2024 10:15:52 +0000 Subject: [PATCH 14/31] refactor: move validator in sigo --- cmd/sigo/main.go | 15 ++----- pkg/sigo/anonymizer.go | 3 +- pkg/sigo/driver.go | 12 +++++- pkg/sigo/driver_test.go | 41 ++++++++++++++++++ {internal/infra => pkg/sigo}/validator.go | 51 +++++++++++++---------- test/suites/04-run-anonymizer.yml | 6 +-- 6 files changed, 88 insertions(+), 40 deletions(-) rename {internal/infra => pkg/sigo}/validator.go (56%) diff --git a/cmd/sigo/main.go b/cmd/sigo/main.go index e35a70c..36ff8a5 100644 --- a/cmd/sigo/main.go +++ b/cmd/sigo/main.go @@ -145,15 +145,6 @@ func run(definition pdef, logs logs) { os.Exit(1) } - validator := infra.NewFloat64DataValidator(source) - err = validator.Validation() - - if err != nil { - log.Err(err).Msg("Unsupported input data.") - log.Warn().Int("return", 1).Msg("End SIGO") - os.Exit(1) - } - sink := infra.NewJSONLineSink(os.Stdout) var debugger sigo.Debugger @@ -184,7 +175,9 @@ func run(definition pdef, logs logs) { err = sigo.Anonymize(source, sigo.NewKDTreeFactory(), definition.k, definition.l, len(definition.qi), newAnonymizer(definition.method, definition.args), sink, debugger) if err != nil { - panic(err) + log.Err(err).Msg("Unsupported input data.") + log.Warn().Int("return", 1).Msg("End SIGO") + os.Exit(1) } if logs.profiling { @@ -192,7 +185,7 @@ func run(definition pdef, logs logs) { } } -// nolint: cyclop +//nolint: cyclop func initLog(logs logs, entropy bool) { color := false diff --git a/pkg/sigo/anonymizer.go b/pkg/sigo/anonymizer.go index 40ea218..e04d041 100644 --- a/pkg/sigo/anonymizer.go +++ b/pkg/sigo/anonymizer.go @@ -416,7 +416,8 @@ func (r Reidentification) Statistics(idCluster string, q string) (mean float64, // ComputeSimilarity computes the similarity score between the record rec and the anonymized cluster data. func (r Reidentification) ComputeSimilarity(rec Record, clus Cluster, - qi []string, s []string) map[float64]interface{} { + qi []string, s []string, +) map[float64]interface{} { scores := make(map[float64]interface{}) x := make(map[string]interface{}) diff --git a/pkg/sigo/driver.go b/pkg/sigo/driver.go index 214e9ba..f2a2ed1 100644 --- a/pkg/sigo/driver.go +++ b/pkg/sigo/driver.go @@ -24,10 +24,11 @@ import ( ) func Anonymize(source RecordSource, factory GeneralizerFactory, - k int, l int, dim int, anonymyzer Anonymizer, sink RecordSink, debugger Debugger) error { + k int, l int, dim int, anonymyzer Anonymizer, sink RecordSink, debugger Debugger, +) error { generalizer := factory.New(k, l, dim, source.QuasiIdentifer()) count := 0 - + records := []Record{} log.Info().Msg("Reading source") for source.Next() { @@ -36,9 +37,16 @@ func Anonymize(source RecordSource, factory GeneralizerFactory, } generalizer.Add(source.Value()) + records = append(records, source.Value()) count++ } + validator := NewFloat64DataValidator(records, source.QuasiIdentifer()) + err := validator.Validation() + if err != nil { + return err + } + log.Info().Msgf("%v individuals to anonymize", count) log.Info().Msg("Tree building") diff --git a/pkg/sigo/driver_test.go b/pkg/sigo/driver_test.go index 0be482a..edcb898 100644 --- a/pkg/sigo/driver_test.go +++ b/pkg/sigo/driver_test.go @@ -144,3 +144,44 @@ func BenchmarkLongClustering(b *testing.B) { jsonBytes.Close() } } + +func TestDataValidatorShouldReturnErrorWithNullValue(t *testing.T) { + t.Parallel() + + sourceText := `{"x":0, "y":2, "foo":"bar"} + {"x":1, "y":1, "foo":"bar"} + {"x":0, "y":null, "foo":"bar"} + {"x":2, "y":1, "foo":"baz"} + {"x":3, "y":2, "foo":"baz"}` + + source, err := infra.NewJSONLineSource(strings.NewReader(sourceText), []string{"x", "y"}, []string{"foo"}) + assert.Nil(t, err) + + expectedMessage := "null value in dataset" + result := []map[string]interface{}{} + sink := infra.NewSliceDictionariesSink(&result) + err = sigo.Anonymize(source, sigo.NewKDTreeFactory(), 2, 1, 2, sigo.NewNoAnonymizer(), sink, + sigo.NewSequenceDebugger("clusterID")) + assert.Equal(t, expectedMessage, err.Error()) +} + +func TestDataValidatorShouldReturnErrorWithList(t *testing.T) { + t.Parallel() + + sourceText := `{"fruit":[0,1],"taille":[1,2],"poids":[1,2],"meurtre":0,"natation":[0,1],"course":[0,1],"voltige":[0,1],"animal":"souris"} + {"fruit":[0,1],"taille":[1,2],"poids":[1,2],"meurtre":0,"natation":[0,1],"course":[0,1],"voltige":[0,1],"animal":"saumon"} + {"fruit":[0,1],"taille":[1,2],"poids":[1,2],"meurtre":1,"natation":[0,1],"course":[0,1],"voltige":[0,1],"animal":"chouette"} + {"fruit":[0,1],"taille":[1,2],"poids":null,"meurtre":0,"natation":[0,1],"course":[0,1],"voltige":[0,1],"animal":"canard"} + {"fruit":[0,1],"taille":[3,3],"poids":[3,4],"meurtre":1,"natation":[0,1],"course":[0,1],"voltige":[0,1],"animal":"loup"} + {"fruit":[0,1],"taille":[3,3],"poids":[3,4],"meurtre":0,"natation":[0,1],"course":[0,1],"voltige":[0,1],"animal":"singe"}` + + source, err := infra.NewJSONLineSource(strings.NewReader(sourceText), []string{"poids", "taille"}, []string{"animal"}) + assert.Nil(t, err) + + expectedMessage := "null value in dataset" + result := []map[string]interface{}{} + sink := infra.NewSliceDictionariesSink(&result) + err = sigo.Anonymize(source, sigo.NewKDTreeFactory(), 2, 1, 2, sigo.NewNoAnonymizer(), sink, + sigo.NewSequenceDebugger("clusterID")) + assert.Equal(t, expectedMessage, err.Error()) +} diff --git a/internal/infra/validator.go b/pkg/sigo/validator.go similarity index 56% rename from internal/infra/validator.go rename to pkg/sigo/validator.go index c131471..653f07e 100644 --- a/internal/infra/validator.go +++ b/pkg/sigo/validator.go @@ -15,38 +15,43 @@ // You should have received a copy of the GNU General Public License // along with SIGO. If not, see . -package infra +package sigo import ( "errors" - - "github.com/cgi-fr/sigo/pkg/sigo" + "fmt" ) type Float64DataValidator struct { - source sigo.RecordSource + records []Record + quasiIdentifers []string } -func NewFloat64DataValidator(source sigo.RecordSource) Float64DataValidator { - return Float64DataValidator{source: source} +func NewFloat64DataValidator(records []Record, quasiIdentifers []string) Float64DataValidator { + return Float64DataValidator{records: records, quasiIdentifers: quasiIdentifers} } +// nolint: cyclop func (v Float64DataValidator) Validation() error { - // Check Null value - // if value == nil { - // //nolint: goerr113 - // err := errors.New("null value in dataset") - - // return err - // } - - // Check all data can transfer to float64 - // for _, value := range data { - // _, err := strconv.ParseFloat(value, 64) - // if err != nil { - // return errors.New("unsupported type in dataset") - // } - // } - //nolint: goerr113 - return errors.Unwrap(errors.New("not valide")) + for _, record := range v.records { + row := record.Row() + + for _, key := range v.quasiIdentifers { + if row[key] == nil { + //nolint: goerr113 + err := errors.New("null value in dataset") + + return err + } + + switch t := row[key].(type) { + case bool: + err := fmt.Errorf("unsupported type: %T", t) + + return err + } + } + } + + return nil } diff --git a/test/suites/04-run-anonymizer.yml b/test/suites/04-run-anonymizer.yml index ca8dd41..9f554c1 100644 --- a/test/suites/04-run-anonymizer.yml +++ b/test/suites/04-run-anonymizer.yml @@ -61,15 +61,15 @@ testcases: {"fruit":1, "taille": 2, "poids": 2 , "meurtre": 0, "natation":1, "course":1, "voltige":1 , "animal": "canard"} {"fruit":1, "taille": 5, "poids": 5 , "meurtre": 0, "natation":1, "course":1, "voltige":0 , "animal": "elephant"} {"fruit":1, "taille": 4, "poids": 4 , "meurtre": 1, "natation":1, "course":1, "voltige":0 , "animal": "ours"} - {"fruit":1, "taille": 1, "poids": 1 , "meurtre": 0, "natation":0, "course":1, "voltige":0, "animal":"souris" } + {"fruit":1, "taille": 1, "poids": null , "meurtre": 0, "natation":0, "course":1, "voltige":0, "animal":"souris" } {"fruit":0, "taille": 2, "poids": 2 , "meurtre": 0, "natation":1, "course":0, "voltige":0 , "animal": "saumon"} {"fruit":0, "taille": 2, "poids": 1 , "meurtre": 1, "natation":0, "course":0, "voltige":1, "animal" : "chouette" } {"fruit":1, "taille": 3, "poids": 3 , "meurtre": 0, "natation":1, "course":0, "voltige":1 , "animal": "singe"} - {"fruit":0, "taille": 3, "poids": null , "meurtre": 1, "natation":0, "course":1, "voltige":0 , "animal" : "loup"} + {"fruit":0, "taille": 3, "poids": 2 , "meurtre": 1, "natation":0, "course":1, "voltige":0 , "animal" : "loup"} EOF assertions: - result.code ShouldEqual 1 - - result.systemerr ShouldContainSubstring Anonymize failed + - result.systemerr ShouldContainSubstring null value - name: wrong type in dataset should return error steps: From c187edbee944e84962d9470f0bb58bf0241996b9 Mon Sep 17 00:00:00 2001 From: "jianchao.ma" Date: Fri, 26 Apr 2024 12:29:53 +0000 Subject: [PATCH 15/31] refactor: fix lint error --- cmd/sigo/main.go | 3 +-- pkg/sigo/driver.go | 2 ++ pkg/sigo/driver_test.go | 3 +-- pkg/sigo/validator.go | 3 ++- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/cmd/sigo/main.go b/cmd/sigo/main.go index 36ff8a5..97e9130 100644 --- a/cmd/sigo/main.go +++ b/cmd/sigo/main.go @@ -146,7 +146,7 @@ func run(definition pdef, logs logs) { } sink := infra.NewJSONLineSink(os.Stdout) - + var debugger sigo.Debugger if logs.info != "" { @@ -156,7 +156,6 @@ func run(definition pdef, logs logs) { } var cpuProfiler interface{ Stop() } - if logs.profiling { cpuProfiler = profile.Start(profile.ProfilePath(".")) } diff --git a/pkg/sigo/driver.go b/pkg/sigo/driver.go index f2a2ed1..a388ac8 100644 --- a/pkg/sigo/driver.go +++ b/pkg/sigo/driver.go @@ -29,6 +29,7 @@ func Anonymize(source RecordSource, factory GeneralizerFactory, generalizer := factory.New(k, l, dim, source.QuasiIdentifer()) count := 0 records := []Record{} + log.Info().Msg("Reading source") for source.Next() { @@ -43,6 +44,7 @@ func Anonymize(source RecordSource, factory GeneralizerFactory, validator := NewFloat64DataValidator(records, source.QuasiIdentifer()) err := validator.Validation() + if err != nil { return err } diff --git a/pkg/sigo/driver_test.go b/pkg/sigo/driver_test.go index edcb898..9295a04 100644 --- a/pkg/sigo/driver_test.go +++ b/pkg/sigo/driver_test.go @@ -167,9 +167,8 @@ func TestDataValidatorShouldReturnErrorWithNullValue(t *testing.T) { func TestDataValidatorShouldReturnErrorWithList(t *testing.T) { t.Parallel() - + //nolint: go-golangci-lint sourceText := `{"fruit":[0,1],"taille":[1,2],"poids":[1,2],"meurtre":0,"natation":[0,1],"course":[0,1],"voltige":[0,1],"animal":"souris"} - {"fruit":[0,1],"taille":[1,2],"poids":[1,2],"meurtre":0,"natation":[0,1],"course":[0,1],"voltige":[0,1],"animal":"saumon"} {"fruit":[0,1],"taille":[1,2],"poids":[1,2],"meurtre":1,"natation":[0,1],"course":[0,1],"voltige":[0,1],"animal":"chouette"} {"fruit":[0,1],"taille":[1,2],"poids":null,"meurtre":0,"natation":[0,1],"course":[0,1],"voltige":[0,1],"animal":"canard"} {"fruit":[0,1],"taille":[3,3],"poids":[3,4],"meurtre":1,"natation":[0,1],"course":[0,1],"voltige":[0,1],"animal":"loup"} diff --git a/pkg/sigo/validator.go b/pkg/sigo/validator.go index 653f07e..6fb9587 100644 --- a/pkg/sigo/validator.go +++ b/pkg/sigo/validator.go @@ -31,7 +31,6 @@ func NewFloat64DataValidator(records []Record, quasiIdentifers []string) Float64 return Float64DataValidator{records: records, quasiIdentifers: quasiIdentifers} } -// nolint: cyclop func (v Float64DataValidator) Validation() error { for _, record := range v.records { row := record.Row() @@ -44,8 +43,10 @@ func (v Float64DataValidator) Validation() error { return err } + //nolint: gocritic switch t := row[key].(type) { case bool: + //nolint: goerr113 err := fmt.Errorf("unsupported type: %T", t) return err From 83201e6ae43ff099c8283a73ddc76ec20665b4b6 Mon Sep 17 00:00:00 2001 From: "jianchao.ma" Date: Fri, 26 Apr 2024 12:54:32 +0000 Subject: [PATCH 16/31] refactor: fix lint error --- cmd/sigo/main.go | 13 +++++++------ pkg/sigo/driver.go | 2 +- pkg/sigo/driver_test.go | 10 +++++----- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/cmd/sigo/main.go b/cmd/sigo/main.go index 97e9130..d3afe0e 100644 --- a/cmd/sigo/main.go +++ b/cmd/sigo/main.go @@ -33,7 +33,7 @@ import ( "github.com/spf13/cobra" ) -// nolint: gochecknoglobals +//nolint: gochecknoglobals var ( name string version string @@ -94,7 +94,7 @@ There is NO WARRANTY, to the extent permitted by law.`, version, commit, buildDa rootCmd.PersistentFlags(). BoolVar(&logs.jsonlog, "log-json", false, "output logs in JSON format") rootCmd.PersistentFlags().StringVar(&logs.colormode, "color", "auto", "use colors in log outputs : yes, no or auto") - // nolint: gomnd + //nolint: gomnd rootCmd.PersistentFlags().IntVarP(&definition.k, "k-value", "k", 3, "k-value for k-anonymization") rootCmd.PersistentFlags().IntVarP(&definition.l, "l-value", "l", 1, "l-value for l-diversity") rootCmd.PersistentFlags(). @@ -120,7 +120,7 @@ There is NO WARRANTY, to the extent permitted by law.`, version, commit, buildDa os.Exit(1) } } - +//nolint: funlen func run(definition pdef, logs logs) { initLog(logs, definition.entropy) @@ -146,7 +146,7 @@ func run(definition pdef, logs logs) { } sink := infra.NewJSONLineSink(os.Stdout) - + var debugger sigo.Debugger if logs.info != "" { @@ -155,7 +155,8 @@ func run(definition pdef, logs logs) { debugger = sigo.NewNoDebugger() } - var cpuProfiler interface{ Stop() } + var cpuProfiler interface{ Stop()} + if logs.profiling { cpuProfiler = profile.Start(profile.ProfilePath(".")) } @@ -234,7 +235,7 @@ func initLog(logs logs, entropy bool) { log.Info().Msgf("%v %v (commit=%v date=%v by=%v)", name, version, commit, buildDate, builtBy) } -// nolint: cyclop +//nolint: cyclop func newAnonymizer(name string, args []string) sigo.Anonymizer { switch name { case "general": diff --git a/pkg/sigo/driver.go b/pkg/sigo/driver.go index a388ac8..e90916a 100644 --- a/pkg/sigo/driver.go +++ b/pkg/sigo/driver.go @@ -43,8 +43,8 @@ func Anonymize(source RecordSource, factory GeneralizerFactory, } validator := NewFloat64DataValidator(records, source.QuasiIdentifer()) + err := validator.Validation() - if err != nil { return err } diff --git a/pkg/sigo/driver_test.go b/pkg/sigo/driver_test.go index 9295a04..743a3b6 100644 --- a/pkg/sigo/driver_test.go +++ b/pkg/sigo/driver_test.go @@ -168,11 +168,11 @@ func TestDataValidatorShouldReturnErrorWithNullValue(t *testing.T) { func TestDataValidatorShouldReturnErrorWithList(t *testing.T) { t.Parallel() //nolint: go-golangci-lint - sourceText := `{"fruit":[0,1],"taille":[1,2],"poids":[1,2],"meurtre":0,"natation":[0,1],"course":[0,1],"voltige":[0,1],"animal":"souris"} - {"fruit":[0,1],"taille":[1,2],"poids":[1,2],"meurtre":1,"natation":[0,1],"course":[0,1],"voltige":[0,1],"animal":"chouette"} - {"fruit":[0,1],"taille":[1,2],"poids":null,"meurtre":0,"natation":[0,1],"course":[0,1],"voltige":[0,1],"animal":"canard"} - {"fruit":[0,1],"taille":[3,3],"poids":[3,4],"meurtre":1,"natation":[0,1],"course":[0,1],"voltige":[0,1],"animal":"loup"} - {"fruit":[0,1],"taille":[3,3],"poids":[3,4],"meurtre":0,"natation":[0,1],"course":[0,1],"voltige":[0,1],"animal":"singe"}` + sourceText := `{"fruit":[0,1],"taille":[1,2],"poids":[1,2],"animal":"souris"} + {"fruit":[0,1],"taille":[1,2],"poids":[1,2],"animal":"chouette"} + {"fruit":[0,1],"taille":[1,2],"poids":null,"animal":"canard"} + {"fruit":[0,1],"taille":[3,3],"poids":[3,4],"animal":"loup"} + {"fruit":[0,1],"taille":[3,3],"poids":[3,4],"animal":"singe"}` source, err := infra.NewJSONLineSource(strings.NewReader(sourceText), []string{"poids", "taille"}, []string{"animal"}) assert.Nil(t, err) From abd0d21b266f4612e727ccab7ca64399e21fe9e1 Mon Sep 17 00:00:00 2001 From: "jianchao.ma" Date: Fri, 26 Apr 2024 12:59:27 +0000 Subject: [PATCH 17/31] refactor: fix lint error --- cmd/sigo/main.go | 1 + 1 file changed, 1 insertion(+) diff --git a/cmd/sigo/main.go b/cmd/sigo/main.go index d3afe0e..60bd3c2 100644 --- a/cmd/sigo/main.go +++ b/cmd/sigo/main.go @@ -120,6 +120,7 @@ There is NO WARRANTY, to the extent permitted by law.`, version, commit, buildDa os.Exit(1) } } + //nolint: funlen func run(definition pdef, logs logs) { initLog(logs, definition.entropy) From 8bb01374d35fc2524b009ee639d44dcc9cb6c28b Mon Sep 17 00:00:00 2001 From: "jianchao.ma" Date: Fri, 26 Apr 2024 13:05:22 +0000 Subject: [PATCH 18/31] refactor: fix lint error --- cmd/sigo/main.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmd/sigo/main.go b/cmd/sigo/main.go index 60bd3c2..82a3fa0 100644 --- a/cmd/sigo/main.go +++ b/cmd/sigo/main.go @@ -77,7 +77,7 @@ Copyright (C) 2022 CGI France \n License GPLv3: GNU GPL version 3 Date: Fri, 26 Apr 2024 13:55:40 +0000 Subject: [PATCH 19/31] feat: add check type func --- pkg/sigo/driver_test.go | 20 ++++++++++++++++++ pkg/sigo/validator.go | 45 ++++++++++++++++++++++++++++++++++------- 2 files changed, 58 insertions(+), 7 deletions(-) diff --git a/pkg/sigo/driver_test.go b/pkg/sigo/driver_test.go index 743a3b6..27f4e4d 100644 --- a/pkg/sigo/driver_test.go +++ b/pkg/sigo/driver_test.go @@ -184,3 +184,23 @@ func TestDataValidatorShouldReturnErrorWithList(t *testing.T) { sigo.NewSequenceDebugger("clusterID")) assert.Equal(t, expectedMessage, err.Error()) } + +func TestDataValidatorShouldReturnErrorWithStringValue(t *testing.T) { + t.Parallel() + + sourceText := `{"x":0, "y":2, "foo":"bar"} + {"x":1, "y":1, "foo":"bar"} + {"x":0, "y":"hello", "foo":"bar"} + {"x":2, "y":1, "foo":"baz"} + {"x":3, "y":2, "foo":"baz"}` + + source, err := infra.NewJSONLineSource(strings.NewReader(sourceText), []string{"x", "y"}, []string{"foo"}) + assert.Nil(t, err) + + expectedMessage := "unsupported type: string" + result := []map[string]interface{}{} + sink := infra.NewSliceDictionariesSink(&result) + err = sigo.Anonymize(source, sigo.NewKDTreeFactory(), 2, 1, 2, sigo.NewNoAnonymizer(), sink, + sigo.NewSequenceDebugger("clusterID")) + assert.Equal(t, expectedMessage, err.Error()) +} diff --git a/pkg/sigo/validator.go b/pkg/sigo/validator.go index 6fb9587..ed784c9 100644 --- a/pkg/sigo/validator.go +++ b/pkg/sigo/validator.go @@ -18,8 +18,10 @@ package sigo import ( + "encoding/json" "errors" "fmt" + "strconv" ) type Float64DataValidator struct { @@ -36,6 +38,7 @@ func (v Float64DataValidator) Validation() error { row := record.Row() for _, key := range v.quasiIdentifers { + // Null value check if row[key] == nil { //nolint: goerr113 err := errors.New("null value in dataset") @@ -43,16 +46,44 @@ func (v Float64DataValidator) Validation() error { return err } - //nolint: gocritic - switch t := row[key].(type) { - case bool: - //nolint: goerr113 - err := fmt.Errorf("unsupported type: %T", t) - - return err + // Type check + isValide, typeErr := checkType(row, key) + if isValide { + return typeErr } } } return nil } + +func checkType(row map[string]interface{}, key string) (bool, error) { + //nolint: varnamelen + switch t := row[key].(type) { + case int: + return false, nil + case string: + _, err := strconv.ParseFloat(t, 64) + if err != nil { + //nolint: goerr113 + err = fmt.Errorf("unsupported type: %T", t) + + return true, err + } + case float32: + return false, nil + case json.Number: + return false, nil + case float64: + return false, nil + case []interface{}: + return false, nil + default: + //nolint: goerr113 + err := fmt.Errorf("unsupported type: %T", t) + + return true, err + } + + return false, nil +} From 59c99a97a34859f5a80522744b66885d5b95077c Mon Sep 17 00:00:00 2001 From: "jianchao.ma" Date: Fri, 26 Apr 2024 14:03:49 +0000 Subject: [PATCH 20/31] refactor: fix lint error --- pkg/sigo/validator.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/sigo/validator.go b/pkg/sigo/validator.go index ed784c9..f7ed091 100644 --- a/pkg/sigo/validator.go +++ b/pkg/sigo/validator.go @@ -63,6 +63,7 @@ func checkType(row map[string]interface{}, key string) (bool, error) { case int: return false, nil case string: + //nolint: gomnd _, err := strconv.ParseFloat(t, 64) if err != nil { //nolint: goerr113 From 8ed4b40cf0e4106f640a6976428707a36bbf5c72 Mon Sep 17 00:00:00 2001 From: "jianchao.ma" Date: Mon, 13 May 2024 10:05:09 +0000 Subject: [PATCH 21/31] feat: add transform type func in validator --- pkg/sigo/driver.go | 3 ++- pkg/sigo/validator.go | 53 ++++++++++++++++++++++++++++++------------- 2 files changed, 39 insertions(+), 17 deletions(-) diff --git a/pkg/sigo/driver.go b/pkg/sigo/driver.go index e90916a..b3f3d03 100644 --- a/pkg/sigo/driver.go +++ b/pkg/sigo/driver.go @@ -44,11 +44,12 @@ func Anonymize(source RecordSource, factory GeneralizerFactory, validator := NewFloat64DataValidator(records, source.QuasiIdentifer()) - err := validator.Validation() + vals, err := validator.Validation() if err != nil { return err } + fmt.Println(vals) log.Info().Msgf("%v individuals to anonymize", count) log.Info().Msg("Tree building") diff --git a/pkg/sigo/validator.go b/pkg/sigo/validator.go index f7ed091..eff49b7 100644 --- a/pkg/sigo/validator.go +++ b/pkg/sigo/validator.go @@ -33,7 +33,9 @@ func NewFloat64DataValidator(records []Record, quasiIdentifers []string) Float64 return Float64DataValidator{records: records, quasiIdentifers: quasiIdentifers} } -func (v Float64DataValidator) Validation() error { +func (v Float64DataValidator) Validation() ([]float64, error) { + results := []float64{} + for _, record := range v.records { row := record.Row() @@ -43,48 +45,67 @@ func (v Float64DataValidator) Validation() error { //nolint: goerr113 err := errors.New("null value in dataset") - return err + return nil, err } // Type check - isValide, typeErr := checkType(row, key) - if isValide { - return typeErr + valFloat64, typeErr := transformType(row, key) + if typeErr != nil { + return nil, typeErr } + + results = append(results, valFloat64) } } - return nil + return results, nil } -func checkType(row map[string]interface{}, key string) (bool, error) { +func transformType(row map[string]interface{}, key string) (float64, error) { + var result float64 + //nolint: varnamelen switch t := row[key].(type) { case int: - return false, nil + result = float64(t) case string: //nolint: gomnd - _, err := strconv.ParseFloat(t, 64) + val, err := strconv.ParseFloat(t, 64) if err != nil { //nolint: goerr113 err = fmt.Errorf("unsupported type: %T", t) - return true, err + return result, err } + result = val case float32: - return false, nil + result = float64(t) case json.Number: - return false, nil + val, err := t.Float64() + if err != nil { + //nolint: goerr113 + err = fmt.Errorf("unsupported type: %T", t) + + return result, err + } + result = val case float64: - return false, nil + result = t case []interface{}: - return false, nil + for _, val := range t { + if val == nil { + //nolint: goerr113 + err := errors.New("null value in dataset") + + return result, err + } + } default: //nolint: goerr113 err := fmt.Errorf("unsupported type: %T", t) - return true, err + return result, err } - return false, nil + return result, nil } From 5d3224d0c3c9d2de2f1bb663bffd688d74cb7d7b Mon Sep 17 00:00:00 2001 From: "jianchao.ma" Date: Mon, 13 May 2024 12:48:03 +0000 Subject: [PATCH 22/31] feat: add null value in list should return err test --- pkg/sigo/driver.go | 3 +-- pkg/sigo/driver_test.go | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/pkg/sigo/driver.go b/pkg/sigo/driver.go index b3f3d03..de66fd4 100644 --- a/pkg/sigo/driver.go +++ b/pkg/sigo/driver.go @@ -44,12 +44,11 @@ func Anonymize(source RecordSource, factory GeneralizerFactory, validator := NewFloat64DataValidator(records, source.QuasiIdentifer()) - vals, err := validator.Validation() + _, err := validator.Validation() if err != nil { return err } - fmt.Println(vals) log.Info().Msgf("%v individuals to anonymize", count) log.Info().Msg("Tree building") diff --git a/pkg/sigo/driver_test.go b/pkg/sigo/driver_test.go index 27f4e4d..aa5560d 100644 --- a/pkg/sigo/driver_test.go +++ b/pkg/sigo/driver_test.go @@ -204,3 +204,23 @@ func TestDataValidatorShouldReturnErrorWithStringValue(t *testing.T) { sigo.NewSequenceDebugger("clusterID")) assert.Equal(t, expectedMessage, err.Error()) } + +func TestDataValidatorShouldReturnErrorWithNullvalueInList(t *testing.T) { + t.Parallel() + //nolint: go-golangci-lint + sourceText := `{"fruit":[0,1],"taille":[1,2],"poids":[1,2],"animal":"souris"} + {"fruit":[0,1],"taille":[1,2],"poids":[1,2],"animal":"chouette"} + {"fruit":[0,1],"taille":[1,2],"poids":[1, null],"animal":"canard"} + {"fruit":[0,1],"taille":[3,3],"poids":[3,4],"animal":"loup"} + {"fruit":[0,1],"taille":[3,3],"poids":[3,4],"animal":"singe"}` + + source, err := infra.NewJSONLineSource(strings.NewReader(sourceText), []string{"poids", "taille"}, []string{"animal"}) + assert.Nil(t, err) + + expectedMessage := "null value in dataset" + result := []map[string]interface{}{} + sink := infra.NewSliceDictionariesSink(&result) + err = sigo.Anonymize(source, sigo.NewKDTreeFactory(), 2, 1, 2, sigo.NewNoAnonymizer(), sink, + sigo.NewSequenceDebugger("clusterID")) + assert.Equal(t, expectedMessage, err.Error()) +} From a0e90641b100d5646afe915c026260d0accf6bb1 Mon Sep 17 00:00:00 2001 From: "jianchao.ma" Date: Mon, 13 May 2024 13:17:53 +0000 Subject: [PATCH 23/31] refactor: remove list value type check and test --- pkg/sigo/driver_test.go | 40 ---------------------------------------- pkg/sigo/validator.go | 9 --------- 2 files changed, 49 deletions(-) diff --git a/pkg/sigo/driver_test.go b/pkg/sigo/driver_test.go index aa5560d..dd0624d 100644 --- a/pkg/sigo/driver_test.go +++ b/pkg/sigo/driver_test.go @@ -165,26 +165,6 @@ func TestDataValidatorShouldReturnErrorWithNullValue(t *testing.T) { assert.Equal(t, expectedMessage, err.Error()) } -func TestDataValidatorShouldReturnErrorWithList(t *testing.T) { - t.Parallel() - //nolint: go-golangci-lint - sourceText := `{"fruit":[0,1],"taille":[1,2],"poids":[1,2],"animal":"souris"} - {"fruit":[0,1],"taille":[1,2],"poids":[1,2],"animal":"chouette"} - {"fruit":[0,1],"taille":[1,2],"poids":null,"animal":"canard"} - {"fruit":[0,1],"taille":[3,3],"poids":[3,4],"animal":"loup"} - {"fruit":[0,1],"taille":[3,3],"poids":[3,4],"animal":"singe"}` - - source, err := infra.NewJSONLineSource(strings.NewReader(sourceText), []string{"poids", "taille"}, []string{"animal"}) - assert.Nil(t, err) - - expectedMessage := "null value in dataset" - result := []map[string]interface{}{} - sink := infra.NewSliceDictionariesSink(&result) - err = sigo.Anonymize(source, sigo.NewKDTreeFactory(), 2, 1, 2, sigo.NewNoAnonymizer(), sink, - sigo.NewSequenceDebugger("clusterID")) - assert.Equal(t, expectedMessage, err.Error()) -} - func TestDataValidatorShouldReturnErrorWithStringValue(t *testing.T) { t.Parallel() @@ -204,23 +184,3 @@ func TestDataValidatorShouldReturnErrorWithStringValue(t *testing.T) { sigo.NewSequenceDebugger("clusterID")) assert.Equal(t, expectedMessage, err.Error()) } - -func TestDataValidatorShouldReturnErrorWithNullvalueInList(t *testing.T) { - t.Parallel() - //nolint: go-golangci-lint - sourceText := `{"fruit":[0,1],"taille":[1,2],"poids":[1,2],"animal":"souris"} - {"fruit":[0,1],"taille":[1,2],"poids":[1,2],"animal":"chouette"} - {"fruit":[0,1],"taille":[1,2],"poids":[1, null],"animal":"canard"} - {"fruit":[0,1],"taille":[3,3],"poids":[3,4],"animal":"loup"} - {"fruit":[0,1],"taille":[3,3],"poids":[3,4],"animal":"singe"}` - - source, err := infra.NewJSONLineSource(strings.NewReader(sourceText), []string{"poids", "taille"}, []string{"animal"}) - assert.Nil(t, err) - - expectedMessage := "null value in dataset" - result := []map[string]interface{}{} - sink := infra.NewSliceDictionariesSink(&result) - err = sigo.Anonymize(source, sigo.NewKDTreeFactory(), 2, 1, 2, sigo.NewNoAnonymizer(), sink, - sigo.NewSequenceDebugger("clusterID")) - assert.Equal(t, expectedMessage, err.Error()) -} diff --git a/pkg/sigo/validator.go b/pkg/sigo/validator.go index eff49b7..de672fb 100644 --- a/pkg/sigo/validator.go +++ b/pkg/sigo/validator.go @@ -91,15 +91,6 @@ func transformType(row map[string]interface{}, key string) (float64, error) { result = val case float64: result = t - case []interface{}: - for _, val := range t { - if val == nil { - //nolint: goerr113 - err := errors.New("null value in dataset") - - return result, err - } - } default: //nolint: goerr113 err := fmt.Errorf("unsupported type: %T", t) From 3226fbdbada113ed5e2f27d5c4716125c12e24e6 Mon Sep 17 00:00:00 2001 From: "jianchao.ma" Date: Mon, 13 May 2024 13:21:46 +0000 Subject: [PATCH 24/31] refactor: fix lint err --- pkg/sigo/validator.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/sigo/validator.go b/pkg/sigo/validator.go index de672fb..0aa622f 100644 --- a/pkg/sigo/validator.go +++ b/pkg/sigo/validator.go @@ -77,6 +77,7 @@ func transformType(row map[string]interface{}, key string) (float64, error) { return result, err } + result = val case float32: result = float64(t) @@ -88,6 +89,7 @@ func transformType(row map[string]interface{}, key string) (float64, error) { return result, err } + result = val case float64: result = t From 426ea0963d5459b3bd00190f201c5af744175249 Mon Sep 17 00:00:00 2001 From: "jianchao.ma" Date: Tue, 14 May 2024 09:38:50 +0000 Subject: [PATCH 25/31] feat: add get qi func in record --- internal/infra/source.go | 6 +++++- pkg/sigo/anonymizer.go | 6 ++++-- pkg/sigo/driver.go | 4 ++-- pkg/sigo/info.go | 4 ++++ pkg/sigo/model.go | 1 + 5 files changed, 16 insertions(+), 5 deletions(-) diff --git a/internal/infra/source.go b/internal/infra/source.go index 8b016ee..78b1eda 100644 --- a/internal/infra/source.go +++ b/internal/infra/source.go @@ -26,13 +26,14 @@ import ( ) func NewJSONLineRecord(row *jsonline.Row, quasiIdentifers *[]string, sensitives *[]string) JSONLineRecord { - return JSONLineRecord{row, quasiIdentifers, sensitives} + return JSONLineRecord{row, quasiIdentifers, sensitives, make(map[string]float64)} } type JSONLineRecord struct { row *jsonline.Row quasiIdentifers *[]string sensitives *[]string + float64QI map[string]float64 } func (jlr JSONLineRecord) QuasiIdentifer() []float64 { @@ -65,6 +66,9 @@ func (jlr JSONLineRecord) Row() map[string]interface{} { return result.(map[string]interface{}) } +func (jlr JSONLineRecord) GetQI() map[string]float64 { + return jlr.float64QI +} func NewJSONLineSource(r io.Reader, quasiIdentifers []string, sensitives []string) (sigo.RecordSource, error) { // nolint: exhaustivestruct source := &JSONLineSource{importer: jsonline.NewImporter(r), quasiIdentifers: quasiIdentifers, sensitives: sensitives} diff --git a/pkg/sigo/anonymizer.go b/pkg/sigo/anonymizer.go index e04d041..0ea87b3 100644 --- a/pkg/sigo/anonymizer.go +++ b/pkg/sigo/anonymizer.go @@ -105,6 +105,10 @@ func (ar AnonymizedRecord) Sensitives() []interface{} { return ar.original.Sensitives() } +func (ar AnonymizedRecord) GetQI() map[string]float64 { + return ar.original.GetQI() +} + func (ar AnonymizedRecord) Row() map[string]interface{} { original := ar.original.Row() for k, v := range ar.mask { @@ -419,7 +423,6 @@ func (r Reidentification) ComputeSimilarity(rec Record, clus Cluster, qi []string, s []string, ) map[float64]interface{} { scores := make(map[float64]interface{}) - x := make(map[string]interface{}) for _, q := range qi { @@ -438,7 +441,6 @@ func (r Reidentification) ComputeSimilarity(rec Record, clus Cluster, } Y := MapItoMapF(y) - // Compute similarity score := Similarity(ComputeDistance("", X, Y)) diff --git a/pkg/sigo/driver.go b/pkg/sigo/driver.go index de66fd4..6eaa7dd 100644 --- a/pkg/sigo/driver.go +++ b/pkg/sigo/driver.go @@ -44,11 +44,11 @@ func Anonymize(source RecordSource, factory GeneralizerFactory, validator := NewFloat64DataValidator(records, source.QuasiIdentifer()) - _, err := validator.Validation() + valsFloat64, err := validator.Validation() if err != nil { return err } - + fmt.Println(valsFloat64) log.Info().Msgf("%v individuals to anonymize", count) log.Info().Msg("Tree building") diff --git a/pkg/sigo/info.go b/pkg/sigo/info.go index b3b9735..c6f0c43 100644 --- a/pkg/sigo/info.go +++ b/pkg/sigo/info.go @@ -46,6 +46,10 @@ func (ir InfosRecord) Row() map[string]interface{} { return original } +func (ir InfosRecord) GetQI() map[string]float64 { + return ir.original.GetQI() +} + // id returns the path of cluster c coverts to integer. func (d SequenceDebugger) id(c Cluster) int { count := len(d.cache) diff --git a/pkg/sigo/model.go b/pkg/sigo/model.go index c6273bb..da3b2a8 100644 --- a/pkg/sigo/model.go +++ b/pkg/sigo/model.go @@ -33,6 +33,7 @@ type Record interface { QuasiIdentifer() []float64 Sensitives() []interface{} Row() map[string]interface{} + GetQI() map[string]float64 } type Cluster interface { From 0e0bd65ea5ae211cfadf3b10a0ed62335fe1c2e8 Mon Sep 17 00:00:00 2001 From: "jianchao.ma" Date: Tue, 14 May 2024 12:45:33 +0000 Subject: [PATCH 26/31] refactor: change float64 qi to pointer for swap anonymizer --- internal/infra/source.go | 7 ++++--- pkg/sigo/driver.go | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/internal/infra/source.go b/internal/infra/source.go index 78b1eda..63e434e 100644 --- a/internal/infra/source.go +++ b/internal/infra/source.go @@ -26,14 +26,14 @@ import ( ) func NewJSONLineRecord(row *jsonline.Row, quasiIdentifers *[]string, sensitives *[]string) JSONLineRecord { - return JSONLineRecord{row, quasiIdentifers, sensitives, make(map[string]float64)} + return JSONLineRecord{row, quasiIdentifers, sensitives, &map[string]float64{}} } type JSONLineRecord struct { row *jsonline.Row quasiIdentifers *[]string sensitives *[]string - float64QI map[string]float64 + float64QI *map[string]float64 } func (jlr JSONLineRecord) QuasiIdentifer() []float64 { @@ -67,8 +67,9 @@ func (jlr JSONLineRecord) Row() map[string]interface{} { } func (jlr JSONLineRecord) GetQI() map[string]float64 { - return jlr.float64QI + return *jlr.float64QI } + func NewJSONLineSource(r io.Reader, quasiIdentifers []string, sensitives []string) (sigo.RecordSource, error) { // nolint: exhaustivestruct source := &JSONLineSource{importer: jsonline.NewImporter(r), quasiIdentifers: quasiIdentifers, sensitives: sensitives} diff --git a/pkg/sigo/driver.go b/pkg/sigo/driver.go index 6eaa7dd..de66fd4 100644 --- a/pkg/sigo/driver.go +++ b/pkg/sigo/driver.go @@ -44,11 +44,11 @@ func Anonymize(source RecordSource, factory GeneralizerFactory, validator := NewFloat64DataValidator(records, source.QuasiIdentifer()) - valsFloat64, err := validator.Validation() + _, err := validator.Validation() if err != nil { return err } - fmt.Println(valsFloat64) + log.Info().Msgf("%v individuals to anonymize", count) log.Info().Msg("Tree building") From 1c8a9edd5cb00e521598c6fea851fa7bbd3cc57d Mon Sep 17 00:00:00 2001 From: "jianchao.ma" Date: Tue, 14 May 2024 14:34:10 +0000 Subject: [PATCH 27/31] feat: add set qi fun for recordd --- internal/infra/source.go | 4 ++++ pkg/sigo/anonymizer.go | 4 ++++ pkg/sigo/info.go | 4 ++++ pkg/sigo/model.go | 1 + 4 files changed, 13 insertions(+) diff --git a/internal/infra/source.go b/internal/infra/source.go index 63e434e..01e70fa 100644 --- a/internal/infra/source.go +++ b/internal/infra/source.go @@ -70,6 +70,10 @@ func (jlr JSONLineRecord) GetQI() map[string]float64 { return *jlr.float64QI } +func (jlr JSONLineRecord) SetQI(float64Map map[string]float64) { + jlr.float64QI = &float64Map +} + func NewJSONLineSource(r io.Reader, quasiIdentifers []string, sensitives []string) (sigo.RecordSource, error) { // nolint: exhaustivestruct source := &JSONLineSource{importer: jsonline.NewImporter(r), quasiIdentifers: quasiIdentifers, sensitives: sensitives} diff --git a/pkg/sigo/anonymizer.go b/pkg/sigo/anonymizer.go index 0ea87b3..66fd4b6 100644 --- a/pkg/sigo/anonymizer.go +++ b/pkg/sigo/anonymizer.go @@ -109,6 +109,10 @@ func (ar AnonymizedRecord) GetQI() map[string]float64 { return ar.original.GetQI() } +func (ar AnonymizedRecord) SetQI(float64Map map[string]float64) { + ar.original.SetQI(float64Map) +} + func (ar AnonymizedRecord) Row() map[string]interface{} { original := ar.original.Row() for k, v := range ar.mask { diff --git a/pkg/sigo/info.go b/pkg/sigo/info.go index c6f0c43..33a6324 100644 --- a/pkg/sigo/info.go +++ b/pkg/sigo/info.go @@ -50,6 +50,10 @@ func (ir InfosRecord) GetQI() map[string]float64 { return ir.original.GetQI() } +func (ir InfosRecord) SetQI(float64Map map[string]float64) { + ir.original.SetQI(float64Map) +} + // id returns the path of cluster c coverts to integer. func (d SequenceDebugger) id(c Cluster) int { count := len(d.cache) diff --git a/pkg/sigo/model.go b/pkg/sigo/model.go index da3b2a8..87fe538 100644 --- a/pkg/sigo/model.go +++ b/pkg/sigo/model.go @@ -34,6 +34,7 @@ type Record interface { Sensitives() []interface{} Row() map[string]interface{} GetQI() map[string]float64 + SetQI(map[string]float64) } type Cluster interface { From a007de38039af2ecab47e118357c55f899502287 Mon Sep 17 00:00:00 2001 From: "jianchao.ma" Date: Tue, 14 May 2024 15:18:32 +0000 Subject: [PATCH 28/31] feat: add test get qi and set qi --- internal/infra/source.go | 8 +++++++- pkg/sigo/anonymizer_test.go | 16 ++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/internal/infra/source.go b/internal/infra/source.go index 01e70fa..e3dfde3 100644 --- a/internal/infra/source.go +++ b/internal/infra/source.go @@ -71,7 +71,13 @@ func (jlr JSONLineRecord) GetQI() map[string]float64 { } func (jlr JSONLineRecord) SetQI(float64Map map[string]float64) { - jlr.float64QI = &float64Map + copiedMap := make(map[string]float64) + + for key, value := range float64Map { + copiedMap[key] = value + } + + jlr.float64QI = &copiedMap } func NewJSONLineSource(r io.Reader, quasiIdentifers []string, sensitives []string) (sigo.RecordSource, error) { diff --git a/pkg/sigo/anonymizer_test.go b/pkg/sigo/anonymizer_test.go index e30bdcb..ad3245b 100644 --- a/pkg/sigo/anonymizer_test.go +++ b/pkg/sigo/anonymizer_test.go @@ -321,3 +321,19 @@ func createRowReidentification(x, y float64, qi []string, o json.Number, s []str return infra.NewJSONLineRecord(&row, &qi, &s) } + +func TestGetAndSetQI(t *testing.T) { + t.Parallel() + + qi := []string{"x", "y"} + record := createRow(1, 4, qi, "", []string{}) + assert.Equal(t, 0, len(record.GetQI())) + + float64QI := make(map[string]float64) + float64QI["x"] = float64(1) + float64QI["y"] = float64(2) + assert.Equal(t, 2, len(float64QI)) + + record.SetQI(float64QI) + assert.Equal(t, 2, len(record.GetQI())) +} From f7c658efc7e5cd86d640f3e78d4c0e7c0db74b65 Mon Sep 17 00:00:00 2001 From: "jianchao.ma" Date: Wed, 15 May 2024 08:50:18 +0000 Subject: [PATCH 29/31] feat: add float64 qi as arg in newj jsonline record --- internal/infra/source.go | 8 ++++++-- pkg/sigo/anonymizer_test.go | 22 ++++++++++++++++------ 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/internal/infra/source.go b/internal/infra/source.go index e3dfde3..fdbe09c 100644 --- a/internal/infra/source.go +++ b/internal/infra/source.go @@ -25,8 +25,12 @@ import ( "github.com/cgi-fr/sigo/pkg/sigo" ) -func NewJSONLineRecord(row *jsonline.Row, quasiIdentifers *[]string, sensitives *[]string) JSONLineRecord { - return JSONLineRecord{row, quasiIdentifers, sensitives, &map[string]float64{}} +func NewJSONLineRecord(row *jsonline.Row, quasiIdentifers *[]string, sensitives *[]string, options ...*map[string]float64) JSONLineRecord { + if len(options) != 0 { + return JSONLineRecord{row, quasiIdentifers, sensitives, options[0]} + } else { + return JSONLineRecord{row, quasiIdentifers, sensitives, &map[string]float64{}} + } } type JSONLineRecord struct { diff --git a/pkg/sigo/anonymizer_test.go b/pkg/sigo/anonymizer_test.go index ad3245b..06ae0de 100644 --- a/pkg/sigo/anonymizer_test.go +++ b/pkg/sigo/anonymizer_test.go @@ -326,14 +326,24 @@ func TestGetAndSetQI(t *testing.T) { t.Parallel() qi := []string{"x", "y"} - record := createRow(1, 4, qi, "", []string{}) - assert.Equal(t, 0, len(record.GetQI())) - float64QI := make(map[string]float64) float64QI["x"] = float64(1) - float64QI["y"] = float64(2) - assert.Equal(t, 2, len(float64QI)) + float64QI["y"] = float64(4) + record := createRowWithFloatQI(1, 4, qi, "", []string{}, make(map[string]float64)) + assert.Equal(t, 0, len(record.GetQI())) + + assert.Equal(t, 2, len(float64QI)) + recordWithFloat64QI := createRowWithFloatQI(1, 4, qi, "", []string{}, float64QI) record.SetQI(float64QI) - assert.Equal(t, 2, len(record.GetQI())) + assert.Equal(t, 2, len(recordWithFloat64QI.GetQI())) +} + +func createRowWithFloatQI(x, y float64, qi []string, z string, s []string, float64QI map[string]float64) infra.JSONLineRecord { + row := jsonline.NewRow() + row.Set("x", x) + row.Set("y", y) + row.Set("z", z) + + return infra.NewJSONLineRecord(&row, &qi, &s, &float64QI) } From c6a9469221848532e0a18ed1f2a00f9c94dfdb94 Mon Sep 17 00:00:00 2001 From: "jianchao.ma" Date: Wed, 15 May 2024 08:55:14 +0000 Subject: [PATCH 30/31] refactor: remove set qi func --- internal/infra/source.go | 10 ---------- pkg/sigo/anonymizer.go | 4 ---- pkg/sigo/anonymizer_test.go | 1 - pkg/sigo/info.go | 3 --- pkg/sigo/model.go | 1 - 5 files changed, 19 deletions(-) diff --git a/internal/infra/source.go b/internal/infra/source.go index fdbe09c..968a511 100644 --- a/internal/infra/source.go +++ b/internal/infra/source.go @@ -74,16 +74,6 @@ func (jlr JSONLineRecord) GetQI() map[string]float64 { return *jlr.float64QI } -func (jlr JSONLineRecord) SetQI(float64Map map[string]float64) { - copiedMap := make(map[string]float64) - - for key, value := range float64Map { - copiedMap[key] = value - } - - jlr.float64QI = &copiedMap -} - func NewJSONLineSource(r io.Reader, quasiIdentifers []string, sensitives []string) (sigo.RecordSource, error) { // nolint: exhaustivestruct source := &JSONLineSource{importer: jsonline.NewImporter(r), quasiIdentifers: quasiIdentifers, sensitives: sensitives} diff --git a/pkg/sigo/anonymizer.go b/pkg/sigo/anonymizer.go index 66fd4b6..0ea87b3 100644 --- a/pkg/sigo/anonymizer.go +++ b/pkg/sigo/anonymizer.go @@ -109,10 +109,6 @@ func (ar AnonymizedRecord) GetQI() map[string]float64 { return ar.original.GetQI() } -func (ar AnonymizedRecord) SetQI(float64Map map[string]float64) { - ar.original.SetQI(float64Map) -} - func (ar AnonymizedRecord) Row() map[string]interface{} { original := ar.original.Row() for k, v := range ar.mask { diff --git a/pkg/sigo/anonymizer_test.go b/pkg/sigo/anonymizer_test.go index 06ae0de..ca6d095 100644 --- a/pkg/sigo/anonymizer_test.go +++ b/pkg/sigo/anonymizer_test.go @@ -335,7 +335,6 @@ func TestGetAndSetQI(t *testing.T) { assert.Equal(t, 2, len(float64QI)) recordWithFloat64QI := createRowWithFloatQI(1, 4, qi, "", []string{}, float64QI) - record.SetQI(float64QI) assert.Equal(t, 2, len(recordWithFloat64QI.GetQI())) } diff --git a/pkg/sigo/info.go b/pkg/sigo/info.go index 33a6324..4c049f2 100644 --- a/pkg/sigo/info.go +++ b/pkg/sigo/info.go @@ -50,9 +50,6 @@ func (ir InfosRecord) GetQI() map[string]float64 { return ir.original.GetQI() } -func (ir InfosRecord) SetQI(float64Map map[string]float64) { - ir.original.SetQI(float64Map) -} // id returns the path of cluster c coverts to integer. func (d SequenceDebugger) id(c Cluster) int { diff --git a/pkg/sigo/model.go b/pkg/sigo/model.go index 87fe538..da3b2a8 100644 --- a/pkg/sigo/model.go +++ b/pkg/sigo/model.go @@ -34,7 +34,6 @@ type Record interface { Sensitives() []interface{} Row() map[string]interface{} GetQI() map[string]float64 - SetQI(map[string]float64) } type Cluster interface { From a0e43dfa4ad65d5bf4be09e20ed763b9ce76e5cb Mon Sep 17 00:00:00 2001 From: "jianchao.ma" Date: Wed, 15 May 2024 14:21:28 +0000 Subject: [PATCH 31/31] refactor: replace validator in source --- internal/infra/source.go | 98 ++++++++++++++++++++++++++++++++++-- pkg/sigo/driver.go | 9 ---- pkg/sigo/model.go | 3 +- pkg/sigo/validator.go | 104 --------------------------------------- 4 files changed, 97 insertions(+), 117 deletions(-) delete mode 100644 pkg/sigo/validator.go diff --git a/internal/infra/source.go b/internal/infra/source.go index 968a511..ad98c0b 100644 --- a/internal/infra/source.go +++ b/internal/infra/source.go @@ -18,8 +18,11 @@ package infra import ( + "encoding/json" "errors" + "fmt" "io" + "strconv" "github.com/cgi-fr/jsonline/pkg/jsonline" "github.com/cgi-fr/sigo/pkg/sigo" @@ -76,7 +79,12 @@ func (jlr JSONLineRecord) GetQI() map[string]float64 { func NewJSONLineSource(r io.Reader, quasiIdentifers []string, sensitives []string) (sigo.RecordSource, error) { // nolint: exhaustivestruct - source := &JSONLineSource{importer: jsonline.NewImporter(r), quasiIdentifers: quasiIdentifers, sensitives: sensitives} + source := &JSONLineSource{ + importer: jsonline.NewImporter(r), + quasiIdentifers: quasiIdentifers, + sensitives: sensitives, + DataValidator: NewFloat64DataValidator(), + } //nolint: goerr113 err := errors.New("indicate the list of quasi-identifiers") @@ -88,6 +96,7 @@ func NewJSONLineSource(r io.Reader, quasiIdentifers []string, sensitives []strin } type JSONLineSource struct { + sigo.DataValidator importer jsonline.Importer err error record sigo.Record @@ -110,11 +119,22 @@ func (s *JSONLineSource) Next() bool { s.err = err if s.err != nil { - return false + return true + } + rowInterface, err2 := row.Export() + s.err = err2 + if err2 != nil { + return true } - s.record = NewJSONLineRecord(&row, &s.quasiIdentifers, &s.sensitives) + float64Map, err3 := s.DataValidator.Validation(rowInterface.(map[string]interface{}), s.quasiIdentifers) + s.err = err3 + + if s.err != nil { + return true + } + s.record = NewJSONLineRecord(&row, &s.quasiIdentifers, &s.sensitives, &float64Map) return true } @@ -129,3 +149,75 @@ func (s *JSONLineSource) QuasiIdentifer() []string { func (s *JSONLineSource) Sensitive() []string { return s.sensitives } + +type Float64DataValidator struct{} + +func NewFloat64DataValidator() Float64DataValidator { + return Float64DataValidator{} +} + +func (v Float64DataValidator) Validation(row map[string]interface{}, quasiIdentifers []string) (map[string]float64, error) { + result := make(map[string]float64) + + for _, key := range quasiIdentifers { + // Null value check + if row[key] == nil { + //nolint: goerr113 + err := errors.New("null value in dataset") + + return nil, err + } + + // Type check + valFloat64, typeErr := transformType(row, key) + if typeErr != nil { + return nil, typeErr + } + + result[key] = valFloat64 + } + + return result, nil +} + +func transformType(row map[string]interface{}, key string) (float64, error) { + var result float64 + + //nolint: varnamelen + switch t := row[key].(type) { + case int: + result = float64(t) + case string: + //nolint: gomnd + val, err := strconv.ParseFloat(t, 64) + if err != nil { + //nolint: goerr113 + err = fmt.Errorf("unsupported type: %T", t) + + return result, err + } + + result = val + case float32: + result = float64(t) + case json.Number: + val, err := t.Float64() + if err != nil { + //nolint: goerr113 + err = fmt.Errorf("unsupported type: %T", t) + + return result, err + } + + result = val + case float64: + result = t + default: + //nolint: goerr113 + err := fmt.Errorf("unsupported type: %T", t) + + return result, err + } + + return result, nil +} diff --git a/pkg/sigo/driver.go b/pkg/sigo/driver.go index de66fd4..51bea66 100644 --- a/pkg/sigo/driver.go +++ b/pkg/sigo/driver.go @@ -28,7 +28,6 @@ func Anonymize(source RecordSource, factory GeneralizerFactory, ) error { generalizer := factory.New(k, l, dim, source.QuasiIdentifer()) count := 0 - records := []Record{} log.Info().Msg("Reading source") @@ -38,17 +37,9 @@ func Anonymize(source RecordSource, factory GeneralizerFactory, } generalizer.Add(source.Value()) - records = append(records, source.Value()) count++ } - validator := NewFloat64DataValidator(records, source.QuasiIdentifer()) - - _, err := validator.Validation() - if err != nil { - return err - } - log.Info().Msgf("%v individuals to anonymize", count) log.Info().Msg("Tree building") diff --git a/pkg/sigo/model.go b/pkg/sigo/model.go index da3b2a8..876112c 100644 --- a/pkg/sigo/model.go +++ b/pkg/sigo/model.go @@ -18,6 +18,7 @@ package sigo type RecordSource interface { + DataValidator Next() bool Err() error Value() Record @@ -61,5 +62,5 @@ type Debugger interface { } type DataValidator interface { - Validation() error + Validation(row map[string]interface{}, quasiIdentifers []string) (map[string]float64, error) } diff --git a/pkg/sigo/validator.go b/pkg/sigo/validator.go deleted file mode 100644 index 0aa622f..0000000 --- a/pkg/sigo/validator.go +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (C) 2022 CGI France -// -// This file is part of SIGO. -// -// SIGO is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// SIGO is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with SIGO. If not, see . - -package sigo - -import ( - "encoding/json" - "errors" - "fmt" - "strconv" -) - -type Float64DataValidator struct { - records []Record - quasiIdentifers []string -} - -func NewFloat64DataValidator(records []Record, quasiIdentifers []string) Float64DataValidator { - return Float64DataValidator{records: records, quasiIdentifers: quasiIdentifers} -} - -func (v Float64DataValidator) Validation() ([]float64, error) { - results := []float64{} - - for _, record := range v.records { - row := record.Row() - - for _, key := range v.quasiIdentifers { - // Null value check - if row[key] == nil { - //nolint: goerr113 - err := errors.New("null value in dataset") - - return nil, err - } - - // Type check - valFloat64, typeErr := transformType(row, key) - if typeErr != nil { - return nil, typeErr - } - - results = append(results, valFloat64) - } - } - - return results, nil -} - -func transformType(row map[string]interface{}, key string) (float64, error) { - var result float64 - - //nolint: varnamelen - switch t := row[key].(type) { - case int: - result = float64(t) - case string: - //nolint: gomnd - val, err := strconv.ParseFloat(t, 64) - if err != nil { - //nolint: goerr113 - err = fmt.Errorf("unsupported type: %T", t) - - return result, err - } - - result = val - case float32: - result = float64(t) - case json.Number: - val, err := t.Float64() - if err != nil { - //nolint: goerr113 - err = fmt.Errorf("unsupported type: %T", t) - - return result, err - } - - result = val - case float64: - result = t - default: - //nolint: goerr113 - err := fmt.Errorf("unsupported type: %T", t) - - return result, err - } - - return result, nil -}