Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft bugfix: panic null value in dataset #91

Open
wants to merge 31 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
a949286
feat: add null value should return error venom test
Chao-Ma5566 Apr 22, 2024
e7f8a6c
feat: quasi identifier check value type is float64
Chao-Ma5566 Apr 23, 2024
cfaa587
feat:return error when a null value in dataset
Chao-Ma5566 Apr 23, 2024
79d86df
refactor: repair lint warning
Chao-Ma5566 Apr 23, 2024
264cc92
refactor: repair lint warning
Chao-Ma5566 Apr 23, 2024
43f54e9
feat: add type check in quasi identifier
Chao-Ma5566 Apr 24, 2024
3776cb0
feat: add venom test wrong type error
Chao-Ma5566 Apr 24, 2024
f62b758
feat: handling all quasi identifier error
Chao-Ma5566 Apr 24, 2024
36f9453
docs: update changelog
Chao-Ma5566 Apr 24, 2024
9cf9b70
docs: update changelog
Chao-Ma5566 Apr 24, 2024
8ddf40f
refactor: reverted commits until a949286
Chao-Ma5566 Apr 25, 2024
1178cd0
feat: creation of data validator and float64 data validator
Chao-Ma5566 Apr 25, 2024
87cb278
feat: integrates validator in main
Chao-Ma5566 Apr 25, 2024
307621d
refactor: move validator in sigo
Chao-Ma5566 Apr 26, 2024
c187edb
refactor: fix lint error
Chao-Ma5566 Apr 26, 2024
83201e6
refactor: fix lint error
Chao-Ma5566 Apr 26, 2024
abd0d21
refactor: fix lint error
Chao-Ma5566 Apr 26, 2024
8bb0137
refactor: fix lint error
Chao-Ma5566 Apr 26, 2024
f473bd9
feat: add check type func
Chao-Ma5566 Apr 26, 2024
59c99a9
refactor: fix lint error
Chao-Ma5566 Apr 26, 2024
8ed4b40
feat: add transform type func in validator
Chao-Ma5566 May 13, 2024
5d3224d
feat: add null value in list should return err test
Chao-Ma5566 May 13, 2024
a0e9064
refactor: remove list value type check and test
Chao-Ma5566 May 13, 2024
3226fbd
refactor: fix lint err
Chao-Ma5566 May 13, 2024
426ea09
feat: add get qi func in record
Chao-Ma5566 May 14, 2024
0e0bd65
refactor: change float64 qi to pointer for swap anonymizer
Chao-Ma5566 May 14, 2024
1c8a9ed
feat: add set qi func for record
Chao-Ma5566 May 14, 2024
a007de3
feat: add test get qi and set qi
Chao-Ma5566 May 14, 2024
f7c658e
feat: add float64 qi as arg in new jsonline record
Chao-Ma5566 May 15, 2024
c6a9469
refactor: remove set qi func
Chao-Ma5566 May 15, 2024
a0e43df
refactor: replace validator in source
Chao-Ma5566 May 15, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ Types of changes
- `Fixed` for any bug fixes.
- `Security` in case of vulnerabilities.

## [0.3.1]

- `Fixed` panic when a null value is present in the dataset (#22)

## [0.3.0]

- `Added` export configuration in yaml file (#32)
Expand Down
13 changes: 8 additions & 5 deletions cmd/sigo/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ Copyright (C) 2022 CGI France \n License GPLv3: GNU GPL version 3 <https://gnu.o
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.`, version, commit, buildDate, builtBy),
Run: func(cmd *cobra.Command, args []string) {
// nolint: exhaustivestruct
//nolint: exhaustivestruct
log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr})

definition.flagIsSet(*cmd)
Expand All @@ -94,7 +94,7 @@ There is NO WARRANTY, to the extent permitted by law.`, version, commit, buildDa
rootCmd.PersistentFlags().
BoolVar(&logs.jsonlog, "log-json", false, "output logs in JSON format")
rootCmd.PersistentFlags().StringVar(&logs.colormode, "color", "auto", "use colors in log outputs : yes, no or auto")
// nolint: gomnd
//nolint: gomnd
rootCmd.PersistentFlags().IntVarP(&definition.k, "k-value", "k", 3, "k-value for k-anonymization")
rootCmd.PersistentFlags().IntVarP(&definition.l, "l-value", "l", 1, "l-value for l-diversity")
rootCmd.PersistentFlags().
Expand All @@ -121,6 +121,7 @@ There is NO WARRANTY, to the extent permitted by law.`, version, commit, buildDa
}
}

//nolint: funlen
func run(definition pdef, logs logs) {
initLog(logs, definition.entropy)

Expand Down Expand Up @@ -175,15 +176,17 @@ func run(definition pdef, logs logs) {
err = sigo.Anonymize(source, sigo.NewKDTreeFactory(), definition.k, definition.l,
len(definition.qi), newAnonymizer(definition.method, definition.args), sink, debugger)
if err != nil {
panic(err)
log.Err(err).Msg("Anonymize failed")
log.Warn().Msg("End SIGO")
os.Exit(1)
}

if logs.profiling {
cpuProfiler.Stop()
}
}

// nolint: cyclop
//nolint: cyclop
func initLog(logs logs, entropy bool) {
color := false

Expand All @@ -200,7 +203,7 @@ func initLog(logs logs, entropy bool) {
if logs.jsonlog {
logger = zerolog.New(os.Stderr)
} else {
// nolint: exhaustivestruct
//nolint: exhaustivestruct
logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr, NoColor: !color})
}

Expand Down
37 changes: 34 additions & 3 deletions internal/infra/source.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,11 @@
package infra

import (
"encoding/json"
"errors"
"fmt"
"io"
"strconv"

"github.com/cgi-fr/jsonline/pkg/jsonline"
"github.com/cgi-fr/sigo/pkg/sigo"
Expand All @@ -35,14 +38,42 @@ type JSONLineRecord struct {
sensitives *[]string
}

func (jlr JSONLineRecord) QuasiIdentifer() []float64 {
func (jlr JSONLineRecord) QuasiIdentifer() ([]float64, error) {
result := []float64{}

for _, key := range *jlr.quasiIdentifers {
result = append(result, (*jlr.row).GetFloat64(key))
value, _ := (*jlr.row).Get(key)
if value == nil {
//nolint: goerr113
err := errors.New("null value in dataset")

return []float64{}, err
}

var val float64
switch t := value.(type) {
case int:
val = float64(t)
case string:
//nolint: gomnd
val, _ = strconv.ParseFloat(t, 64)
case float32:
val = float64(t)
case json.Number:
val, _ = t.Float64()
case float64:
val = t
default:
//nolint: goerr113
err := fmt.Errorf("unsupported type: %T", t)

return []float64{}, err
}

result = append(result, val)
}

return result
return result, nil
}

func (jlr JSONLineRecord) Sensitives() []interface{} {
Expand Down
34 changes: 29 additions & 5 deletions pkg/sigo/anonymizer.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ type (
}
)

// QuasiIdentifer delegates to the wrapped original record and returns its
// result unchanged.
func (ar AnonymizedRecord) QuasiIdentifer() []float64 {
func (ar AnonymizedRecord) QuasiIdentifer() ([]float64, error) {
return ar.original.QuasiIdentifer()
}

Expand Down Expand Up @@ -205,7 +205,14 @@ func (a CodingAnonymizer) Anonymize(rec Record, clus Cluster, qi, s []string) Re
bottom := q.Q1
top := q.Q3

val := rec.QuasiIdentifer()[i]
recVals, err := rec.QuasiIdentifer()
if err != nil {
log.Err(err).Msg("Cannot cast quasi-identifier to float64")
log.Warn().Int("return", 1).Msg("End SIGO")
os.Exit(1)
}

val := recVals[i]

switch {
case val < bottom:
Expand All @@ -228,7 +235,14 @@ func (a NoiseAnonymizer) Anonymize(rec Record, clus Cluster, qi, s []string) Rec
mask := map[string]interface{}{}

for i, key := range qi {
val := rec.QuasiIdentifer()[i]
recVals, err := rec.QuasiIdentifer()
if err != nil {
log.Err(err).Msg("Cannot cast quasi-identifier to float64")
log.Warn().Int("return", 1).Msg("End SIGO")
os.Exit(1)
}

val := recVals[i]

laplaceVal := Scaling(val, values[key], laplace)
gaussianVal := Scaling(val, values[key], gaussian)
Expand Down Expand Up @@ -282,6 +296,7 @@ func (a SwapAnonymizer) Anonymize(rec Record, clus Cluster, qi, s []string) Reco
func (a SwapAnonymizer) Swap(clus Cluster, qi []string) {
// retrieve the cluster values for each qi
values := listValues(clus, qi)

swapVal := make(map[string][]float64)

for _, key := range qi {
Expand Down Expand Up @@ -416,7 +431,8 @@ func (r Reidentification) Statistics(idCluster string, q string) (mean float64,

// ComputeSimilarity computes the similarity score between the record rec and the anonymized cluster data.
func (r Reidentification) ComputeSimilarity(rec Record, clus Cluster,
qi []string, s []string) map[float64]interface{} {
qi []string, s []string,
) map[float64]interface{} {
scores := make(map[float64]interface{})

x := make(map[string]interface{})
Expand Down Expand Up @@ -453,7 +469,15 @@ func listValues(clus Cluster, qi []string) (mapValues map[string][]float64) {

for _, record := range clus.Records() {
for i, key := range qi {
mapValues[key] = append(mapValues[key], record.QuasiIdentifer()[i])
recVals, err := record.QuasiIdentifer()
if err != nil {
log.Err(err).Msg("Cannot cast quasi-identifier to float64")
log.Warn().Int("return", 1).Msg("End SIGO")
os.Exit(1)
}

val := recVals[i]
mapValues[key] = append(mapValues[key], val)
}
}

Expand Down
8 changes: 6 additions & 2 deletions pkg/sigo/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ import (
)

func Anonymize(source RecordSource, factory GeneralizerFactory,
k int, l int, dim int, anonymyzer Anonymizer, sink RecordSink, debugger Debugger) error {
k int, l int, dim int, anonymyzer Anonymizer, sink RecordSink, debugger Debugger,
) error {
generalizer := factory.New(k, l, dim, source.QuasiIdentifer())
count := 0

Expand All @@ -42,7 +43,10 @@ func Anonymize(source RecordSource, factory GeneralizerFactory,
log.Info().Msgf("%v individuals to anonymize", count)
log.Info().Msg("Tree building")

generalizer.Build()
err := generalizer.Build()
if err != nil {
return err
}

log.Info().Msg("Cluster Anonymization")

Expand Down
44 changes: 44 additions & 0 deletions pkg/sigo/driver_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,3 +144,47 @@ func BenchmarkLongClustering(b *testing.B) {
jsonBytes.Close()
}
}

// TestNullValueShouldReturnError checks that Anonymize reports an error,
// rather than panicking, when a quasi-identifier holds a null value.
func TestNullValueShouldReturnError(t *testing.T) {
	t.Parallel()

	sourceText := `{"x":0, "y":0, "foo":"bar"}
{"x":null, "y":1, "foo":"bar"}
{"x":0, "y":null, "foo":"bar"}
{"x":2, "y":1, "foo":"null"}
{"x":3, "y":2, "foo":"baz"}
{"x":2, "y":3, "foo":"baz"}`

	expectedMessage := "null value in dataset"

	source, err := infra.NewJSONLineSource(strings.NewReader(sourceText), []string{"x", "y"}, []string{"foo"})
	if !assert.NoError(t, err) {
		// Without a usable source the rest of the test is meaningless.
		return
	}

	result := []map[string]interface{}{}
	sink := infra.NewSliceDictionariesSink(&result)
	err = sigo.Anonymize(source, sigo.NewKDTreeFactory(), 2, 1, 2, sigo.NewNoAnonymizer(), sink,
		sigo.NewSequenceDebugger("clusterID"))

	// EqualError fails cleanly on a nil error, whereas err.Error() would
	// panic with a nil pointer dereference.
	assert.EqualError(t, err, expectedMessage)
}

// TestWrongTypeShouldReturnError checks that Anonymize reports an error when
// a quasi-identifier holds a value of an unsupported type (here a boolean).
func TestWrongTypeShouldReturnError(t *testing.T) {
	t.Parallel()

	sourceText := `{"x":0, "y":false, "foo":"bar"}
{"x":0, "y":1, "foo":"bar"}
{"x":0, "y":2, "foo":"bar"}
{"x":2, "y":1, "foo":"baz"}
{"x":3, "y":2, "foo":"baz"}
{"x":2, "y":3, "foo":"baz"}`

	expectedMessage := "unsupported type: bool"

	source, err := infra.NewJSONLineSource(strings.NewReader(sourceText), []string{"x", "y"}, []string{"foo"})
	if !assert.NoError(t, err) {
		// Without a usable source the rest of the test is meaningless.
		return
	}

	result := []map[string]interface{}{}
	sink := infra.NewSliceDictionariesSink(&result)
	err = sigo.Anonymize(source, sigo.NewKDTreeFactory(), 2, 1, 2, sigo.NewNoAnonymizer(), sink,
		sigo.NewSequenceDebugger("clusterID"))

	// EqualError fails cleanly on a nil error, whereas err.Error() would
	// panic with a nil pointer dereference.
	assert.EqualError(t, err, expectedMessage)
}
2 changes: 1 addition & 1 deletion pkg/sigo/info.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ type InfosRecord struct {
infos map[string]interface{}
}

// QuasiIdentifer delegates to the wrapped original record and returns its
// result unchanged.
func (ir InfosRecord) QuasiIdentifer() []float64 {
func (ir InfosRecord) QuasiIdentifer() ([]float64, error) {
return ir.original.QuasiIdentifer()
}

Expand Down
Loading
Loading