Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(scan): alias and filters #4

Merged
merged 10 commits into from
Mar 27, 2024
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,13 @@ Types of changes
- `Fixed` for any bug fixes.
- `Security` in case of vulnerabilities.

## [0.2.0]

- `Added` flag `--include` (short `-i`) to only scan/dump a specific list of fields, this flag is repeatable
- `Added` flag `--alias` (short `-a`) to rename fields on the fly, this flag is repeatable
- `Added` flag `--watch` (short `-w`) to the dump command
- `Fixed` self reference link are no longer counted in the links counter while scanning

## [0.1.0]

- `Added` initial version
28 changes: 28 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,34 @@ $ silo scan my-silo < input.jsonl

Analysis data is persisted on disk on the `my-silo` path relative to the current directory.

#### passthrough stdin to stdout

Use `--passthrough` (short : `-p`) to pass input to stdout instead of diplaying informations.

```console
$ silo scan my-silo --passthrough < input.jsonl
{"ID_CLIENT":"0001","EMAIL_CLIENT":"[email protected]","ACCOUNT_NUMBER":null}
{"ID_CLIENT":null,"EMAIL_CLIENT":null,"ACCOUNT_NUMBER":"C01"}
```

#### include only specific fields/columns

Use `--include <fieldname>` (short : `-i <fieldname>`, repeatable) to select only given columns to scan.

```console
$ silo scan my-silo --include ID_CLIENT --include EMAIL_CLIENT < input.jsonl
⣾ Scanned 5 rows, found 15 links (4084 row/s) [0s]
```

#### rename fields/columns on the fly

Use `--alias <fieldname>=<alias>` (short : `-a <fieldname>=<alias>`, repeatable) to rename fields before storing links.

```console
$ silo scan my-silo --alias ID_CLIENT=CLIENT --alias EMAIL_CLIENT=EMAIL < input.jsonl
⣾ Scanned 5 rows, found 15 links (4084 row/s) [0s]
```

### silo dump

The silo dump command is used to dump each connected entity into a file. This allows users to create a referential of all entities discovered within the JSONLine data. Here's how to use it:
Expand Down
25 changes: 21 additions & 4 deletions internal/app/cli/dump.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,36 +28,53 @@ import (
)

func NewDumpCommand(parent string, stderr *os.File, stdout *os.File, stdin *os.File) *cobra.Command {
var (
include []string
watch bool
)

cmd := &cobra.Command{ //nolint:exhaustruct
Use: "dump path",
Short: "Dump silo database stored in given path into stdout",
Example: " " + parent + " dump clients",
Args: cobra.ExactArgs(1),
Run: func(_ *cobra.Command, args []string) {
if err := dump(args[0]); err != nil {
if err := dump(args[0], include, watch); err != nil {
log.Fatal().Err(err).Int("return", 1).Msg("end SILO")
}
},
}

cmd.Flags().StringSliceVarP(&include, "include", "i", []string{}, "include only these columns, exclude all others")
cmd.Flags().BoolVarP(&watch, "watch", "w", false, "watch statistics about dumped entities in stderr")

cmd.Flags().SortFlags = false

cmd.SetOut(stdout)
cmd.SetErr(stderr)
cmd.SetIn(stdin)

return cmd
}

func dump(path string) error {
func dump(path string, include []string, watch bool) error {
backend, err := infra.NewBackend(path)
if err != nil {
return fmt.Errorf("%w", err)
}

defer backend.Close()

driver := silo.NewDriver(backend, infra.NewDumpJSONLine())
driver := silo.NewDriver(backend, infra.NewDumpJSONLine(), silo.WithKeys(include))

if watch {
observer := infra.NewDumpObserver()
defer observer.Close()

if err := driver.Dump(); err != nil {
if err := driver.Dump(observer); err != nil {
return fmt.Errorf("%w", err)
}
} else if err := driver.Dump(); err != nil {
return fmt.Errorf("%w", err)
}

Expand Down
18 changes: 13 additions & 5 deletions internal/app/cli/scan.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,21 +28,29 @@ import (
)

func NewScanCommand(parent string, stderr *os.File, stdout *os.File, stdin *os.File) *cobra.Command {
var passthrough bool
var (
passthrough bool
include []string
aliases map[string]string
)

cmd := &cobra.Command{ //nolint:exhaustruct
Use: "scan path",
Short: "Ingest data from stdin and update silo database stored in given path",
Example: " lino pull database --table client | " + parent + " scan clients",
Example: " " + parent + " scan clients < clients.jsonl",
Args: cobra.ExactArgs(1),
Run: func(cmd *cobra.Command, args []string) {
if err := scan(cmd, args[0], passthrough); err != nil {
if err := scan(cmd, args[0], passthrough, include, aliases); err != nil {
log.Fatal().Err(err).Int("return", 1).Msg("end SILO")
}
},
}

cmd.Flags().BoolVarP(&passthrough, "passthrough", "p", false, "pass stdin to stdout")
cmd.Flags().StringSliceVarP(&include, "include", "i", []string{}, "include only these columns, exclude all others")
cmd.Flags().StringToStringVarP(&aliases, "alias", "a", map[string]string{}, "use given aliases for each columns")

cmd.Flags().SortFlags = false

cmd.SetOut(stdout)
cmd.SetErr(stderr)
Expand All @@ -51,15 +59,15 @@ func NewScanCommand(parent string, stderr *os.File, stdout *os.File, stdin *os.F
return cmd
}

func scan(cmd *cobra.Command, path string, passthrough bool) error {
func scan(cmd *cobra.Command, path string, passthrough bool, include []string, aliases map[string]string) error {
backend, err := infra.NewBackend(path)
if err != nil {
return fmt.Errorf("%w", err)
}

defer backend.Close()

driver := silo.NewDriver(backend, nil)
driver := silo.NewDriver(backend, nil, silo.WithKeys(include), silo.WithAliases(aliases))

var reader silo.DataRowReader

Expand Down
87 changes: 87 additions & 0 deletions internal/infra/dump_observer.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
// Copyright (C) 2024 CGI France
//
// This file is part of SILO.
//
// SILO is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// SILO is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with SILO. If not, see <http://www.gnu.org/licenses/>.

package infra

import (
"fmt"
"os"
"time"

"github.com/cgi-fr/silo/pkg/silo"
"github.com/schollz/progressbar/v3"
)

type DumpObserver struct {
countTotal int
countComplete int
countConsistent int
countInconsistent int
countEmpty int
bar *progressbar.ProgressBar
}

func NewDumpObserver() *DumpObserver {
//nolint:gomnd
pgb := progressbar.NewOptions(-1,
progressbar.OptionSetDescription("Dumping ... "),
progressbar.OptionSetItsString("entity"),
progressbar.OptionSetWriter(os.Stderr),
progressbar.OptionShowIts(),
progressbar.OptionSpinnerType(11),
progressbar.OptionThrottle(time.Millisecond*10),
progressbar.OptionOnCompletion(func() { fmt.Fprintln(os.Stderr) }),
// progressbar.OptionShowDescriptionAtLineEnd(),
)

return &DumpObserver{
countTotal: 0,
countComplete: 0,
countConsistent: 0,
countInconsistent: 0,
countEmpty: 0,
bar: pgb,
}
}

func (o *DumpObserver) Entity(status silo.Status, _ map[string]int) {
o.countTotal++

switch status {
case silo.StatusEntityComplete:
o.countComplete++
case silo.StatusEntityConsistent:
o.countConsistent++
case silo.StatusEntityInconsistent:
o.countInconsistent++
case silo.StatusEntityEmpty:
o.countEmpty++
}

_ = o.bar.Add(1)

o.bar.Describe(fmt.Sprintf("Dumped %d entities / complete=%d / consistent=%d / inconsistent=%d / empty=%d",
o.countTotal,
o.countComplete,
o.countConsistent,
o.countInconsistent,
o.countEmpty))
}

func (o *DumpObserver) Close() {
_ = o.bar.Close()
}
52 changes: 52 additions & 0 deletions pkg/silo/config.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
// Copyright (C) 2024 CGI France
//
// This file is part of SILO.
//
// SILO is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// SILO is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with SILO. If not, see <http://www.gnu.org/licenses/>.

package silo

import "errors"

type config struct {
include map[string]bool
includeList []string
aliases map[string]string
}

func newConfig() *config {
config := config{
include: map[string]bool{},
includeList: []string{},
aliases: map[string]string{},
}

return &config
}

func (cfg *config) validate() error {
var errs []error

for key := range cfg.aliases {
if _, ok := cfg.include[key]; !ok && len(cfg.include) > 0 {
errs = append(errs, &ConfigScanAliasIsNotIncludedError{alias: key})
}
}

if len(errs) != 0 {
return errors.Join(errs...)
}

return nil
}
4 changes: 4 additions & 0 deletions pkg/silo/driven.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,7 @@ type ScanObserver interface {
IngestedRow(row DataRow)
IngestedLink(link DataLink)
}

type DumpObserver interface {
Entity(status Status, counts map[string]int)
}
Loading
Loading