Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve performance for small inputs #4

Merged
merged 5 commits into from
Aug 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 16 additions & 5 deletions fastcdc.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ type Chunker struct {

buf []byte
cursor int
end int
offset int
eof bool
}
Expand Down Expand Up @@ -167,13 +168,23 @@ func NewChunker(rd io.Reader, opts Options) (*Chunker, error) {
rd: rd,
buf: make([]byte, opts.BufSize),
cursor: opts.BufSize,
end: opts.BufSize,
}

return chunker, nil
}

// Reset points the chunker at rd and rewinds its bookkeeping so it can be
// reused for a new stream. The existing buffer is kept as-is; cursor and end
// are both placed at len(buf), which matches the state NewChunker sets up and
// leaves no buffered bytes pending (end - cursor == 0).
func (c *Chunker) Reset(rd io.Reader) {
	size := len(c.buf)

	// Mark the buffer as fully consumed.
	c.cursor, c.end = size, size

	// Restart stream position and end-of-input tracking.
	c.offset, c.eof = 0, false

	c.rd = rd
}

func (c *Chunker) fillBuffer() error {
n := len(c.buf) - c.cursor
n := c.end - c.cursor
if n >= c.maxSize {
return nil
}
Expand All @@ -183,14 +194,14 @@ func (c *Chunker) fillBuffer() error {
c.cursor = 0

if c.eof {
c.buf = c.buf[:n]
c.end = n
return nil
}

// Fill the rest of the buffer
m, err := io.ReadFull(c.rd, c.buf[n:])
if err == io.EOF || err == io.ErrUnexpectedEOF {
c.buf = c.buf[:n+m]
c.end = n+m
c.eof = true
} else if err != nil {
return err
Expand All @@ -204,11 +215,11 @@ func (c *Chunker) Next() (Chunk, error) {
if err := c.fillBuffer(); err != nil {
return Chunk{}, err
}
if len(c.buf) == 0 {
if c.end == 0 {
return Chunk{}, io.EOF
}

length, fp := c.nextChunk(c.buf[c.cursor:])
length, fp := c.nextChunk(c.buf[c.cursor:c.end])

chunk := Chunk{
Offset: c.offset,
Expand Down
104 changes: 104 additions & 0 deletions fastcdc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ var defaultOpts = Options{
}

var defaultData = randBytes(4321, 7291)
var resetData = randBytes(1234, 1729)

// Chunks generated with defaultOpts from defaultData
var defaultChunks = []Chunk{
Expand Down Expand Up @@ -43,6 +44,14 @@ var defaultChunks = []Chunk{
Data: decodeHex("f137c8c9cc9d5bf8e9194dc4ca95e069c0c537b69e9f4d327bc39883a342c15d90c362fd68ab7443f58eeb18dc4a6f1834662ddab5800bdb3d95c49697667c027050fc53e0928a0047ff8dac3b41407456560f98b22805cf9dea6dd458fe0611351c2d8a9f738a38ad6336a9f54f167afde26ccb8bfa4e83261e58c79a880ab4dfd1d69556fca8d9b317a445dd4b5a4fe7ec483d01f3726f7402303abca81df9f1761029edb87e0c52a3ed1852193772ae582a883865b2a014cdde703295f1e413e9149ea44b52d82e4795c676a1046519dab66342d5ca2c28d526b08a70b36d968e3517762b5936486982e251bba6325a812fdeb14cc76f3e7c3e98e8fbabd9e2e61fa17029c946811371351ed0b294ac8013a534aa25bbba3b3c547f70649a66a36b948bcbfe6879f389dcfcf6a564c47e7a1abd051860f6426db6f23f815faa29b484d3323772b616da282253a321f369e1961e1304db3e484cb4357a184f087281955450da9f55bcc502876991f8863918152fe02ebb6ca13a06852fb1ef6d627e0387056bc2f5b0fb7758ede4d494be94860398eb6d8af22898b33e08628f6822a710a7c1775da705f02688e7647940beeabe09381688cd80623fbbe6720451a785d9dd010e03cc10f84aa291e0760940e3ff901742060832725730947783c15671e75b732e99d3e5f0360ef5715113bc9f63d9c5069550297a43f512195e48440ebc8523a71c3ed7bd71ab0718d70a4b8b059d54b278962c6bced7cbb398c9ea65791fd4cbede0bfa26f06a9e2d64a7b455e0e9618e66f4c552507f5bacad9936415ef170acafb69469f4f5abdc68e919024c9254a9c7257aa90eb9e86d7b742b87bad69a9b60559a9eaa1409e5e0faf1414884de6829a52399cd181f32be59b81911eece3e6c6ba33"),
},
}
var resetChunks []Chunk = []Chunk{
{
Offset: 0,
Length: 1234,
Fingerprint: 4311516274091363630,
Data: decodeHex("387a518e109f76ee1fedd636dbc7e8d3d240f2a68c16c15b128e80172106886c1789c62d80fbd67e24fc3dd28fb7bdec1c3461e1ea9ac7c90156244f25e8a14f2e25fa29ceb081bbd1bbf3ac0a6e1f0358f90d993e643844b0309c60b4622d3000ea729db93f149c87d558e04af5de4bfa497401ddfc403249ed9903c4097cd2244ffc73c448b56d7db4962dab55d2df6f74408baa19fc5b18c28e0d0c2bd07d733031381ee33b77403b227e49f55a53e1d2c561fcd2cb98c6188552d06e167d60a685a2e2d51139daedda5033ecf080ab86407be70312458dcb47c39cd8586ccdc9c3c4e08e731ef84d37e1f445d958e7580c6de64d8ce6358557e5089e27810b5c9c7fd15ccb8575e0616bb66cde78b67d399c866a9903fe5387f4a11984eec8607e795e51cae9bad177c5bd83862ddf6014465fc3beaa518016ae559f441cf736dffacf042b57587e72db0ebf68971749212873b1b4710e5144b39fa8ad76e7ce039fb92c9f3f63e296434f9e273b3a8cbce07431e2436af2fab0e2511f7a14acb683c5f1f073d51609333339b7fbb4a1c54abaca7e8e6d0ac10df2827af2ecfb993929db4295cf22675b6315b1db98584d4ee2a91afc9a666ed6cabeccdbaec6ac05d5ec18031db42e93264674a6ca4e5c4b4b180f1f9c1b12c7b5979591078449002aebc2d275ea859ea8c6d38087c558e3db28ea3db0730b6722dac763b55820e90cd0b76cf6ddeb1ade14af5824fb3faeb84e28baf5ac40a6490a58788dea56a8d662135fcf9f8f36352335476c437269fc75c185c6775a23e2b2669068e183353771be80efe5508ff88aae99db088daa42ccc0027f23bb7bef8f6c8769f8a1bfc67086fb105a32e35a7040883158d40d1ce1fba8647d731abcadd21a6f82d64b5cd5cb198588fe9fcd642f1c20e1a0e98a89805d03cba7bd4c05ff5105d11449e15f7ed6716412747ea79335929dd2aa03fda4f994e7bf105c5ccb194f67e75399cfbf0e7942035bb0fd535ba5a67d61edb1e9d1ac814ce65a7b427ad346f930918f204bbef63f148092608dd4b76faab5594c2715a7c464ff109479e272a5e995676c3ac035390d0b35ce2fecd6a97cc2a718ff7a3a14e0ef8ace998802c031641c3735b8416355851129e25a3ca476a15af03d6d59427b7f5fcab7ae3429db87e57d657c5fd428243ba33e752a06ba609bf5361aa66de3f7374cd8f60daac41248424c0c361fae92869b93e68c41b9503f686d63e320ee0449c9a0d9f4d8e540e29f533e4a876ee58694ada344e2df1d77e057aa4e5fb36f38b8f08102bf8dac67ad22ce5f95fd833b5bb98a922a1211464ce10707b1ef564005eea856dd7c46a12076ea79cb265761c9b5c095e1a7121cf12555923fdfceaaf26
215da9334d0e38b7292227c7649255ae48feba4761f7df85d8e639b4b0d145aced87e78b8b09d49fd62f9becc298cc0f6ba8cbf5d0d9d8e52a377aba200e47a38a801f9ad0da953ccd584d051db214284feecec28c05f0abfb5d383efc3bb94e5a6308541468b64e18fc90ecc7f90e3d78ef9531d46feacb0c1ec831e167e984aa2e4d9b58fefca1c37efd820ccf992d5206cf1565c2306da61fb6c4bd5c679543dddb7a7e4505fc5138939d9b833378369a14842337fb681247e10ad78306d5e763aa3956d904e838689f88de6f532264b070cf2eaa52ccf565e96f8685f7a7f57d7062e806beac9baa327d866e528a110a5"),
},
}

func TestChunking(t *testing.T) {
chunker, err := NewChunker(bytes.NewReader(defaultData), defaultOpts)
Expand All @@ -69,6 +78,29 @@ func TestChunking(t *testing.T) {
t.Errorf("chunk %d: data not equal to expected, actual: %s", i, hex.EncodeToString(c.Data))
}
}

chunker.Reset(bytes.NewReader(resetData))
for i := 0; ; i++ {
c, err := chunker.Next()
if err == io.EOF {
break
}
assertNoError(t, err)
cdef := resetChunks[i]

if c.Offset != cdef.Offset {
t.Errorf("chunk %d: expected offset %d but received %d", i, cdef.Offset, c.Offset)
}
if c.Length != cdef.Length {
t.Errorf("chunk %d: expected length %d but received %d", i, cdef.Length, c.Length)
}
if c.Fingerprint != cdef.Fingerprint {
t.Errorf("chunk %d: expected fp %d but received %d", i, cdef.Fingerprint, c.Fingerprint)
}
if !bytes.Equal(cdef.Data, c.Data) {
t.Errorf("chunk %d: data not equal to expected, actual: %s", i, hex.EncodeToString(c.Data))
}
}
}

func TestChunkingRandom(t *testing.T) {
Expand Down Expand Up @@ -180,6 +212,78 @@ func BenchmarkFastCDC(b *testing.B) {
b.SetBytes(int64(n * len(benchData)))
}

// bencSpec describes a single benchmark case: the input size in bytes and the
// label used as the sub-benchmark name.
type bencSpec struct {
	size int
	name string
}

// bSizes is the table of input sizes run by BenchmarkFastCDCSize, ordered from
// 1 KiB up to 1 GiB.
var bSizes = []bencSpec{
	// KiB range
	{size: 1 << 10, name: "1k"},
	{size: 4 << 10, name: "4k"},
	{size: 16 << 10, name: "16k"},
	{size: 32 << 10, name: "32k"},
	{size: 64 << 10, name: "64k"},
	{size: 128 << 10, name: "128k"},
	{size: 256 << 10, name: "256k"},
	{size: 512 << 10, name: "512k"},

	// MiB range
	{size: 1 << 20, name: "1M"},
	{size: 4 << 20, name: "4M"},
	{size: 16 << 20, name: "16M"},
	{size: 32 << 20, name: "32M"},
	{size: 64 << 20, name: "64M"},
	{size: 128 << 20, name: "128M"},
	{size: 512 << 20, name: "512M"},

	// GiB range
	{size: 1 << 30, name: "1G"},
}

// BenchmarkFastCDCSize runs benchmarkFastCDCSize once per entry in bSizes,
// each as a sub-benchmark named after the entry.
func BenchmarkFastCDCSize(b *testing.B) {
	for i := range bSizes {
		// Per-iteration copy so the closure never observes a later entry.
		spec := bSizes[i]
		b.Run(spec.name, func(sub *testing.B) {
			benchmarkFastCDCSize(sub, spec.size)
		})
	}
}

// benchmarkFastCDCSize measures chunking of a single pseudo-random input of
// size bytes. One Chunker is created up front and reused across all b.N
// iterations via Reset, so the timed region covers only resetting the reader
// and chunker and draining Next until io.EOF.
//
// Besides throughput (SetBytes) and allocations (ReportAllocs), it reports the
// average number of chunks produced per iteration as "chunks/op".
func benchmarkFastCDCSize(b *testing.B, size int) {
	// Fixed seed: every run chunks the same input.
	rng := rand.New(rand.NewSource(1))
	data := make([]byte, size)
	rng.Read(data)

	r := bytes.NewReader(data)

	cnkr, err := NewChunker(r, Options{
		AverageSize: 1 * miB,
	})
	if err != nil {
		b.Fatal(err)
	}

	b.SetBytes(int64(size))
	b.ReportAllocs()
	// Reset the timer only after all setup (input generation and chunker
	// construction) so one-time allocation cost is excluded from the timing
	// and allocation statistics.
	b.ResetTimer()

	var res uint64
	var nchks int64

	for i := 0; i < b.N; i++ {
		r.Reset(data)
		cnkr.Reset(r)

		for {
			chunk, err := cnkr.Next()
			if err == io.EOF {
				break
			}
			if err != nil {
				b.Fatal(err)
			}
			res = res + uint64(len(chunk.Data))
			nchks++
		}
	}

	// Per-iteration metric, so the unit follows the "/op" convention.
	b.ReportMetric(float64(nchks)/float64(b.N), "chunks/op")
}

// loopReader implements io.Reader, looping over a provided buffer a given number of
// times.
type loopReader struct {
Expand Down