Skip to content

Commit

Permalink
Add example, docs & more tests
Browse files Browse the repository at this point in the history
  • Loading branch information
eadanfahey committed May 28, 2020
1 parent f416fa0 commit 7ff5be3
Show file tree
Hide file tree
Showing 8 changed files with 189 additions and 22 deletions.
80 changes: 78 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,88 @@
# FastCDC-Go

A Go implementation of the FastCDC content-defined chunking algorithm.
FastCDC-Go is a Go library implementing the [FastCDC](#references) content-defined chunking algorithm.

Install:
```
go get -u github.com/iotafs/fastcdc-go
```

## Example

```go
import (
"bytes"
"fmt"
"log"
"math/rand"
"io"

"github.com/iotafs/fastcdc-go"
)

opts := fastcdc.Options{
    MinSize:     256 * 1024,
    AverageSize: 1 * 1024 * 1024,
    MaxSize:     4 * 1024 * 1024,
}

data := make([]byte, 10 * 1024 * 1024)
rand.Read(data)
chunker, _ := fastcdc.NewChunker(bytes.NewReader(data), opts)

for {
chunk, err := chunker.Next()
if err == io.EOF {
break
}
if err != nil {
log.Fatal(err)
}

fmt.Printf("%x %d\n", chunk.Data[:10], chunk.Length)
}
```

## Command line tool

This package also includes a useful CLI for testing the chunking output. Install it by running:

```
go install ./cmd/fastcdc
```

Example:
```bash
# Outputs the position and size of each chunk to stdout
fastcdc -csv -file random.txt
```

## Performance

FastCDC-Go is fast. Chunking speed on an Intel i5 7200U is >1GiB/s. Compared to [`restic/chunker`](https://github.com/restic/chunker), another CDC library for Go, it's about 2.9 times faster.

Benchmark ([code](https://gist.github.com/eadanfahey/ce2ba2733028e2b3b62a479ba2b9f62a)):
```
BenchmarkRestic-4 23384482467 ns/op 448.41 MB/s 8943320 B/op 15 allocs/op
BenchmarkFastCDC-4 8080957045 ns/op 1297.59 MB/s 16777336 B/op 3 allocs/op
```

## Normalization

A key feature of FastCDC is chunk size normalization. Normalization helps to improve the distribution of chunk sizes, increasing the number of chunks close to the target average size and reducing the number of chunks clipped by the maximum chunk size, as compared to the [Rabin-based](https://en.wikipedia.org/wiki/Rabin_fingerprint) chunking algorithm used in `restic/chunker`.

The histograms below show the chunk size distribution for `fastcdc-go` and `restic/chunker` on 1GiB of random data, each with average chunk size 1MiB, minimum chunk size 256 KiB and maximum chunk size 4MiB. The normalization level for `fastcdc-go` is set to 2.

![](./img/fastcdcgo_norm2_dist.png) ![](./img/restic_dist.png)

Compared to `restic/chunker`, the distribution of `fastcdc-go` is less skewed (standard deviation 345KiB vs. 964KiB).

## License

FastCDC-Go is licensed under the Apache 2.0 License. See [LICENSE](./LICENSE) for details.

## References

- Xia, Wen, et al. "FastCDC: a fast and efficient content-defined chunking approach for data deduplication." 2016 USENIX Annual Technical Conference
[pdf](https://www.usenix.org/system/files/conference/atc16/atc16-paper-xia.pdf)

4 changes: 4 additions & 0 deletions cmd/cli/main.go → cmd/fastcdc/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ func main() {
if err == io.EOF {
break
}
if err != nil {
fmt.Fprintf(os.Stderr, "ERROR: %v\n", err)
os.Exit(1)
}
if *csv {
fmt.Printf("%d,%d\n", chunk.Offset, chunk.Length)
} else {
Expand Down
5 changes: 5 additions & 0 deletions doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
/*
Package fastcdc is a Go implementation of the FastCDC content defined chunking algorithm.
See https://www.usenix.org/system/files/conference/atc16/atc16-paper-xia.pdf for details.
*/
package fastcdc
26 changes: 11 additions & 15 deletions fastcdc.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ type Options struct {
// AverageSize * 4.
MaxSize int

// (Optional) Sets the the chunk normalization level to improve the distribution of
// chunk sizes. It must take one of the values: 1, 2 or 3, unless DisableNormalization
// is set, in which case it's ignored. By default, it's set to 2.
// (Optional) Sets the chunk normalization level. It may be set to 1, 2 or 3,
// unless DisableNormalization is set, in which case it's ignored. By default,
// it's set to 2.
Normalization int

// (Optional) DisableNormalization turns normalization off. By default, it's set to
Expand Down Expand Up @@ -78,7 +78,7 @@ func (opts *Options) setDefaults() {
if opts.BufSize == 0 {
opts.BufSize = opts.MaxSize * 2
}
if opts.Normalization == 0 {
if !opts.DisableNormalization && opts.Normalization == 0 {
opts.Normalization = 2
}
}
Expand Down Expand Up @@ -110,11 +110,6 @@ func NewChunker(rd io.Reader, opts Options) (*Chunker, error) {
table[i] = table[i] ^ opts.Seed
}

if opts.BufSize == 0 {
opts.BufSize = opts.MaxSize * 2
}
buf := make([]byte, opts.BufSize)

normalization := opts.Normalization
if opts.DisableNormalization {
normalization = 0
Expand All @@ -130,9 +125,10 @@ func NewChunker(rd io.Reader, opts Options) (*Chunker, error) {
maskS: (1 << smallBits) - 1,
maskL: (1 << largeBits) - 1,
rd: rd,
buf: buf,
cursor: len(buf),
buf: make([]byte, opts.BufSize),
cursor: opts.BufSize,
}

return chunker, nil
}

Expand Down Expand Up @@ -215,6 +211,9 @@ func (c *Chunker) nextChunk(data []byte) (int, uint64) {
}

func (opts Options) validate() error {
if opts.AverageSize == 0 {
return errors.New("option AverageSize is required")
}
if opts.MinSize < minSize || opts.MinSize > maxSize {
return errors.New("option MinSize must be in range 64B to 1GiB")
}
Expand All @@ -227,10 +226,7 @@ func (opts Options) validate() error {
if opts.AverageSize > opts.MaxSize || opts.AverageSize < opts.MinSize {
return errors.New("option AverageSize must be betweeen MinSize and MaxSize")
}
if opts.MaxSize-opts.MinSize < opts.AverageSize {
return errors.New("difference between options MinSize and MaxSize must be greater than AverageSize")
}
if !opts.DisableNormalization && (opts.Normalization < 0 || opts.Normalization > 4) {
if !opts.DisableNormalization && (opts.Normalization <= 0 || opts.Normalization > 4) {
return errors.New("option Normalization must be 0, 1, 2 or 3")
}
if opts.BufSize <= opts.MaxSize {
Expand Down
53 changes: 53 additions & 0 deletions fastcdc_example_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package fastcdc_test

import (
"bytes"
"crypto/md5"
"fmt"
"io"
"log"
"math/rand"

"github.com/iotafs/fastcdc-go"
)

// Example_basic chunks 10 MiB of deterministic pseudo-random data with the
// default chunker options and prints the MD5 checksum and size of each chunk.
func Example_basic() {
	// Seed the generator so the input — and therefore the chunk
	// boundaries asserted in the Output comment below — is reproducible.
	rand.Seed(4542)
	input := make([]byte, 10*1024*1024)
	rand.Read(input)

	c, err := fastcdc.NewChunker(bytes.NewReader(input), fastcdc.Options{
		AverageSize: 1024 * 1024, // target 1 MiB average chunk size
	})
	if err != nil {
		log.Fatal(err)
	}

	fmt.Printf("%-32s %s\n", "CHECKSUM", "CHUNK SIZE")

	// Iterate until the chunker reports io.EOF; any other error is fatal.
	for {
		chunk, err := c.Next()
		if err == io.EOF {
			break
		}
		if err != nil {
			log.Fatal(err)
		}
		fmt.Printf("%x %d\n", md5.Sum(chunk.Data), chunk.Length)
	}

	// Output:
	// CHECKSUM                         CHUNK SIZE
	// d5bb40f862d68f4c3a2682e6d433f0d7 1788060
	// 113a0aa2023d7dce6a2aac1f807b5bd2 1117240
	// 5b9147b10d4fe6f96282da481ce848ca 1180487
	// dcc4644befb599fa644635b0c5a1ea2c 1655501
	// 224db3de422ad0dd2c840e3e24e0cb03 363172
	// e071658eccda587789f1dabb6f773851 1227750
	// 215868103f0b4ea7f715e179e5b9a6c7 1451026
	// 21e65e40970ec22f5b13ddf60493b746 1150129
	// b8209a1dbef955ef64636af796450252 552395
}
43 changes: 38 additions & 5 deletions fastcdc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import (

var defaultOpts = Options{
AverageSize: 1024,
Seed: 84372,
Seed: 84372,
}

var defaultData = randBytes(4321, 7291)
Expand Down Expand Up @@ -77,7 +77,7 @@ func TestChunking(t *testing.T) {
}
}

func TestChunkingBasic(t *testing.T) {
func TestChunkingRandom(t *testing.T) {
data := randBytes(1e6, 63)
chunker, err := NewChunker(bytes.NewReader(data), defaultOpts)
assertNoError(t, err)
Expand Down Expand Up @@ -113,7 +113,9 @@ func TestChunkingBasic(t *testing.T) {
func TestMinSize(t *testing.T) {
// Test with data smaller than min chunk size
data := randBytes(10, 51)
chunker, err := NewChunker(bytes.NewReader(data), defaultOpts)
opts := defaultOpts
opts.DisableNormalization = true
chunker, err := NewChunker(bytes.NewReader(data), opts)
assertNoError(t, err)

c, err := chunker.Next()
Expand All @@ -131,9 +133,37 @@ func TestMinSize(t *testing.T) {
}
}

// TestOptionsValidation checks that NewChunker rejects each class of
// invalid Options with a non-nil error.
func TestOptionsValidation(t *testing.T) {
	const avg = 1024 * 64
	invalid := []Options{
		{},                                       // AverageSize not set
		{AverageSize: avg, MinSize: 1},           // MinSize too small
		{AverageSize: avg, MaxSize: maxSize + 1}, // MaxSize too big
		{AverageSize: avg, MaxSize: avg / 2, MinSize: avg * 2}, // MaxSize less than MinSize
		{AverageSize: avg, MinSize: 2 * avg, MaxSize: 4 * avg}, // AverageSize less than MinSize
		{AverageSize: avg, Normalization: 100},                 // bad normalization
		{AverageSize: avg, BufSize: 1},                         // BufSize too small
	}

	// The reader is never consumed: validation must fail before any read.
	var rd bytes.Reader
	for i, opts := range invalid {
		if _, err := NewChunker(&rd, opts); err == nil {
			t.Fatalf("%d: expected error", i)
		}
	}
}

func BenchmarkFastCDC(b *testing.B) {
// 10GiB data total to chunk
n := 100
// total of 10GiB of data to chunk
n := 10
benchData := randBytes(100*1024*1024, 345)
r := newLoopReader(benchData, n)

Expand All @@ -149,6 +179,9 @@ func BenchmarkFastCDC(b *testing.B) {
if err == io.EOF {
break
}
if err != nil {
b.Fatal(err)
}
}
b.SetBytes(int64(n * len(benchData)))
}
Expand Down
Binary file added img/fastcdcgo_norm2_dist.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/restic_dist.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit 7ff5be3

Please sign in to comment.