Skip to content

Commit

Permalink
Add example, docs & more tests
Browse files Browse the repository at this point in the history
  • Loading branch information
eadanfahey committed May 28, 2020
1 parent f416fa0 commit 7ff5be3
Show file tree
Hide file tree
Showing 8 changed files with 189 additions and 22 deletions.
80 changes: 78 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,88 @@
# FastCDC-Go

A Go implementation of the FastCDC content-defined chunking algorithm.
FastCDC-Go is a Go library implementing the [FastCDC](#references) content-defined chunking algorithm.

Install:
```
go get -u github.com/iotafs/fastcdc-go
```

## Example

```go
import (
"bytes"
"fmt"
"log"
"math/rand"
"io"

"github.com/iotafs/fastcdc-go"
)

opts := fastcdc.Options{
    MinSize:     256 * 1024,
    AverageSize: 1 * 1024 * 1024,
    MaxSize:     4 * 1024 * 1024,
}

data := make([]byte, 10 * 1024 * 1024)
rand.Read(data)
chunker, _ := fastcdc.NewChunker(bytes.NewReader(data), opts)

for {
chunk, err := chunker.Next()
if err == io.EOF {
break
}
if err != nil {
log.Fatal(err)
}

fmt.Printf("%x %d\n", chunk.Data[:10], chunk.Length)
}
```

## Command line tool

This package also includes a useful CLI for testing the chunking output. Install it by running:

```
go install ./cmd/fastcdc
```

Example:
```bash
# Outputs the position and size of each chunk to stdout
fastcdc -csv -file random.txt
```

## Performance

FastCDC-Go is fast. Chunking speed on an Intel i5 7200U is >1GiB/s. Compared to [`restic/chunker`](https://github.com/restic/chunker), another CDC library for Go, it's about 2.9 times faster.

Benchmark ([code](https://gist.github.com/eadanfahey/ce2ba2733028e2b3b62a479ba2b9f62a)):
```
BenchmarkRestic-4 23384482467 ns/op 448.41 MB/s 8943320 B/op 15 allocs/op
BenchmarkFastCDC-4 8080957045 ns/op 1297.59 MB/s 16777336 B/op 3 allocs/op
```

## Normalization

A key feature of FastCDC is chunk size normalization. Normalization helps to improve the distribution of chunk sizes, increasing the number of chunks close to the target average size and reducing the number of chunks clipped by the maximum chunk size, as compared to the [Rabin-based](https://en.wikipedia.org/wiki/Rabin_fingerprint) chunking algorithm used in `restic/chunker`.

The histograms below show the chunk size distribution for `fastcdc-go` and `restic/chunker` on 1GiB of random data, each with average chunk size 1MiB, minimum chunk size 256 KiB and maximum chunk size 4MiB. The normalization level for `fastcdc-go` is set to 2.

![](./img/fastcdcgo_norm2_dist.png) ![](./img/restic_dist.png)

Compared to `restic/chunker`, the distribution of `fastcdc-go` is less skewed (standard deviation 345KiB vs. 964KiB).

## License

FastCDC-Go is licensed under the Apache 2.0 License. See [LICENSE](./LICENSE) for details.

## References

- Xia, Wen, et al. "FastCDC: a fast and efficient content-defined chunking approach for data deduplication." 2016 USENIX Annual Technical Conference
[pdf](https://www.usenix.org/system/files/conference/atc16/atc16-paper-xia.pdf)

4 changes: 4 additions & 0 deletions cmd/cli/main.go → cmd/fastcdc/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ func main() {
if err == io.EOF {
break
}
if err != nil {
fmt.Fprintf(os.Stderr, "ERROR: %v\n", err)
os.Exit(1)
}
if *csv {
fmt.Printf("%d,%d\n", chunk.Offset, chunk.Length)
} else {
Expand Down
5 changes: 5 additions & 0 deletions doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
/*
Package fastcdc is a Go implementation of the FastCDC content defined chunking algorithm.
See https://www.usenix.org/system/files/conference/atc16/atc16-paper-xia.pdf for details.
*/
package fastcdc
26 changes: 11 additions & 15 deletions fastcdc.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ type Options struct {
// AverageSize * 4.
MaxSize int

// (Optional) Sets the the chunk normalization level to improve the distribution of
// chunk sizes. It must take one of the values: 1, 2 or 3, unless DisableNormalization
// is set, in which case it's ignored. By default, it's set to 2.
// (Optional) Sets the chunk normalization level. It may be set to 1, 2 or 3,
// unless DisableNormalization is set, in which case it's ignored. By default,
// it's set to 2.
Normalization int

// (Optional) DisableNormalization turns normalization off. By default, it's set to
Expand Down Expand Up @@ -78,7 +78,7 @@ func (opts *Options) setDefaults() {
if opts.BufSize == 0 {
opts.BufSize = opts.MaxSize * 2
}
if opts.Normalization == 0 {
if !opts.DisableNormalization && opts.Normalization == 0 {
opts.Normalization = 2
}
}
Expand Down Expand Up @@ -110,11 +110,6 @@ func NewChunker(rd io.Reader, opts Options) (*Chunker, error) {
table[i] = table[i] ^ opts.Seed
}

if opts.BufSize == 0 {
opts.BufSize = opts.MaxSize * 2
}
buf := make([]byte, opts.BufSize)

normalization := opts.Normalization
if opts.DisableNormalization {
normalization = 0
Expand All @@ -130,9 +125,10 @@ func NewChunker(rd io.Reader, opts Options) (*Chunker, error) {
maskS: (1 << smallBits) - 1,
maskL: (1 << largeBits) - 1,
rd: rd,
buf: buf,
cursor: len(buf),
buf: make([]byte, opts.BufSize),
cursor: opts.BufSize,
}

return chunker, nil
}

Expand Down Expand Up @@ -215,6 +211,9 @@ func (c *Chunker) nextChunk(data []byte) (int, uint64) {
}

func (opts Options) validate() error {
if opts.AverageSize == 0 {
return errors.New("option AverageSize is required")
}
if opts.MinSize < minSize || opts.MinSize > maxSize {
return errors.New("option MinSize must be in range 64B to 1GiB")
}
Expand All @@ -227,10 +226,7 @@ func (opts Options) validate() error {
if opts.AverageSize > opts.MaxSize || opts.AverageSize < opts.MinSize {
return errors.New("option AverageSize must be betweeen MinSize and MaxSize")
}
if opts.MaxSize-opts.MinSize < opts.AverageSize {
return errors.New("difference between options MinSize and MaxSize must be greater than AverageSize")
}
if !opts.DisableNormalization && (opts.Normalization < 0 || opts.Normalization > 4) {
if !opts.DisableNormalization && (opts.Normalization <= 0 || opts.Normalization > 4) {
return errors.New("option Normalization must be 0, 1, 2 or 3")
}
if opts.BufSize <= opts.MaxSize {
Expand Down
53 changes: 53 additions & 0 deletions fastcdc_example_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package fastcdc_test

import (
"bytes"
"crypto/md5"
"fmt"
"io"
"log"
"math/rand"

"github.com/iotafs/fastcdc-go"
)

// Example_basic chunks 10 MiB of deterministic pseudo-random data with the
// default chunker options and prints the MD5 checksum and size of each chunk.
func Example_basic() {
	// Seed the generator so the input — and therefore the chunk
	// boundaries asserted in the Output comment below — is reproducible.
	rand.Seed(4542)
	input := make([]byte, 10*1024*1024)
	rand.Read(input)

	c, err := fastcdc.NewChunker(bytes.NewReader(input), fastcdc.Options{
		AverageSize: 1024 * 1024, // target 1 MiB average chunk size
	})
	if err != nil {
		log.Fatal(err)
	}

	fmt.Printf("%-32s %s\n", "CHECKSUM", "CHUNK SIZE")

	// Iterate until the chunker reports io.EOF; any other error is fatal.
	for {
		chunk, err := c.Next()
		if err == io.EOF {
			break
		}
		if err != nil {
			log.Fatal(err)
		}
		fmt.Printf("%x %d\n", md5.Sum(chunk.Data), chunk.Length)
	}

	// Output:
	// CHECKSUM                         CHUNK SIZE
	// d5bb40f862d68f4c3a2682e6d433f0d7 1788060
	// 113a0aa2023d7dce6a2aac1f807b5bd2 1117240
	// 5b9147b10d4fe6f96282da481ce848ca 1180487
	// dcc4644befb599fa644635b0c5a1ea2c 1655501
	// 224db3de422ad0dd2c840e3e24e0cb03 363172
	// e071658eccda587789f1dabb6f773851 1227750
	// 215868103f0b4ea7f715e179e5b9a6c7 1451026
	// 21e65e40970ec22f5b13ddf60493b746 1150129
	// b8209a1dbef955ef64636af796450252 552395
}
43 changes: 38 additions & 5 deletions fastcdc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import (

var defaultOpts = Options{
AverageSize: 1024,
Seed: 84372,
Seed: 84372,
}

var defaultData = randBytes(4321, 7291)
Expand Down Expand Up @@ -77,7 +77,7 @@ func TestChunking(t *testing.T) {
}
}

func TestChunkingBasic(t *testing.T) {
func TestChunkingRandom(t *testing.T) {
data := randBytes(1e6, 63)
chunker, err := NewChunker(bytes.NewReader(data), defaultOpts)
assertNoError(t, err)
Expand Down Expand Up @@ -113,7 +113,9 @@ func TestChunkingBasic(t *testing.T) {
func TestMinSize(t *testing.T) {
// Test with data smaller than min chunk size
data := randBytes(10, 51)
chunker, err := NewChunker(bytes.NewReader(data), defaultOpts)
opts := defaultOpts
opts.DisableNormalization = true
chunker, err := NewChunker(bytes.NewReader(data), opts)
assertNoError(t, err)

c, err := chunker.Next()
Expand All @@ -131,9 +133,37 @@ func TestMinSize(t *testing.T) {
}
}

// TestOptionsValidation checks that NewChunker rejects each class of
// invalid Options with a non-nil error.
func TestOptionsValidation(t *testing.T) {
	const avg = 1024 * 64
	invalid := []Options{
		{},                                       // AverageSize not set
		{AverageSize: avg, MinSize: 1},           // MinSize too small
		{AverageSize: avg, MaxSize: maxSize + 1}, // MaxSize too big
		{AverageSize: avg, MaxSize: avg / 2, MinSize: avg * 2}, // MaxSize less than MinSize
		{AverageSize: avg, MinSize: 2 * avg, MaxSize: 4 * avg}, // AverageSize less than MinSize
		{AverageSize: avg, Normalization: 100},                 // bad normalization
		{AverageSize: avg, BufSize: 1},                         // BufSize too small
	}

	// The reader is never consumed: validation must fail before any read.
	var rd bytes.Reader
	for i, opts := range invalid {
		if _, err := NewChunker(&rd, opts); err == nil {
			t.Fatalf("%d: expected error", i)
		}
	}
}

func BenchmarkFastCDC(b *testing.B) {
// 10GiB data total to chunk
n := 100
// total of 10GiB of data to chunk
n := 10
benchData := randBytes(100*1024*1024, 345)
r := newLoopReader(benchData, n)

Expand All @@ -149,6 +179,9 @@ func BenchmarkFastCDC(b *testing.B) {
if err == io.EOF {
break
}
if err != nil {
b.Fatal(err)
}
}
b.SetBytes(int64(n * len(benchData)))
}
Expand Down
Binary file added img/fastcdcgo_norm2_dist.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/restic_dist.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit 7ff5be3

Please sign in to comment.