From 99b9fc121943865f17a2212b24c7dae83df065c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emil=20W=C3=A5reus?= Date: Fri, 16 Feb 2024 13:21:17 +0100 Subject: [PATCH 1/4] initial snippet implementation --- internal/fingerprint/snippet.go | 146 ++++++++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 internal/fingerprint/snippet.go diff --git a/internal/fingerprint/snippet.go b/internal/fingerprint/snippet.go new file mode 100644 index 00000000..04c945f1 --- /dev/null +++ b/internal/fingerprint/snippet.go @@ -0,0 +1,146 @@ +package fingerprint + +import ( + "crypto/md5" + "encoding/hex" + "fmt" + "hash/crc32" + "io/ioutil" + "path/filepath" + "strings" +) + +const ( + Gram = 30 + Window = 64 + MinFileSize = 256 + MaxPostSize = 64 * 1024 // 64k + MaxLongLineChars = 1000 + ASCII0 = 48 + ASCII9 = 57 + ASCIILF = 10 + ASCIIBackslash = 92 + MaxCRC32 = 4294967295 + SkipSnippetExtSize = 29 +) + +var SkipSnippetExt = map[string]bool{ + ".exe": true, ".zip": true, // Add all extensions as in the Python example +} + +type Winnowing struct { + sizeLimit bool + skipSnippets bool + maxPostSize int + allExtensions bool + obfuscate bool + fileMap map[string]string + crc8MaximTable []uint8 +} + +func NewWinnowing(sizeLimit, skipSnippets, allExtensions, obfuscate bool, postSize int) *Winnowing { + return &Winnowing{ + sizeLimit: sizeLimit, + skipSnippets: skipSnippets, + maxPostSize: postSize * 1024, + allExtensions: allExtensions, + obfuscate: obfuscate, + fileMap: make(map[string]string), + crc8MaximTable: make([]uint8, 0), + } +} + +func (w *Winnowing) NormalizeByte(b byte) byte { + if b < ASCII0 || b > ASCII9 { + return 0 + } + return b +} + +func (w *Winnowing) ShouldSkipFile(filePath string) bool { + extension := strings.ToLower(filepath.Ext(filePath)) + if _, ok := SkipSnippetExt[extension]; ok && !w.allExtensions { + return true + } + return false +} + +func (w *Winnowing) ReadFile(filePath string) ([]byte, error) { + if w.ShouldSkipFile(filePath) { + return nil, fmt.Errorf("file skipped due to extension: %s", filePath) + } + content, err := ioutil.ReadFile(filePath) + if err != nil { + return nil, err + } + if len(content) < MinFileSize { + return nil, fmt.Errorf("file ignored due to size: %s", filePath) + } + return content, nil +} + +func (w *Winnowing) GenerateWFP(filePath string) { + content, err := w.ReadFile(filePath) + if err != nil { + fmt.Println("Error reading file:", err) + return + } + fmt.Printf("File: %s, MD5: %s\n", filePath, w.calculateMD5(content)) + + // Normalize content and generate k-grams + normalizedContent := w.normalizeContent(content) + kgrams := w.generateKgrams(normalizedContent, Gram) + + // Calculate hash for each k-gram + hashes := make([]uint32, len(kgrams)) + for i, kgram := range kgrams { + hashes[i] = crc32c(kgram) + } + + // Select minimum hashes within each window of size `Window` + var fingerprints []uint32 + for i := 0; i <= len(hashes)-Window; i++ { + window := hashes[i : i+Window] + minHash := uint32(MaxCRC32) + for _, hash := range window { + if hash < minHash { + minHash = hash + } + } + fingerprints = append(fingerprints, minHash) + } + + // Print fingerprints for demonstration + for i, fingerprint := range fingerprints { + fmt.Printf("Window %d: Hash %x\n", i, fingerprint) + } +} + +func (w *Winnowing) calculateMD5(content []byte) string { + hash := md5.Sum(content) + return hex.EncodeToString(hash[:]) +} + +// Placeholder for the crc32c function +func crc32c(data []byte) uint32 { + // This should be replaced with an actual crc32c implementation + return crc32.ChecksumIEEE(data) +} + +func (w *Winnowing) normalizeContent(content []byte) []byte { + normalized := make([]byte, 0, len(content)) + for _, b := range content { + if (b >= ASCII0 && b <= ASCII9) || (b >= ASCIILF && b <= ASCIIBackslash) { + normalized = append(normalized, b) + } + } + return normalized +} + +func (w *Winnowing) generateKgrams(content []byte, k int) [][]byte { + var kgrams [][]byte + for i := 0; i <= len(content)-k; i++ { + kgrams = append(kgrams, content[i:i+k]) + } + return kgrams +} From 4f3d376922ab042a45e139011eb1800b952c650d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emil=20W=C3=A5reus?= Date: Wed, 21 Feb 2024 20:10:38 +0100 Subject: [PATCH 2/4] snippet implementation --- internal/file/exclusion.go | 2 +- internal/fingerprint/fingerprint.go | 63 ++++- internal/fingerprint/snippet.go | 248 ++++++++++++------ internal/fingerprint/snippet_test.go | 21 ++ internal/fingerprint/testdata/snippet/main.py | 38 +++ 5 files changed, 283 insertions(+), 89 deletions(-) create mode 100644 internal/fingerprint/snippet_test.go create mode 100644 internal/fingerprint/testdata/snippet/main.py diff --git a/internal/file/exclusion.go b/internal/file/exclusion.go index 9bfcc68d..a629e4a4 100644 --- a/internal/file/exclusion.go +++ b/internal/file/exclusion.go @@ -33,7 +33,7 @@ func Exclusions() []string { var EXCLUDED_DIRS_FINGERPRINT = []string{ "nbproject", "nbbuild", "nbdist", "node_modules", - "__pycache__", "_yardoc", "eggs", + "__pycache__", "_yardoc", "eggs", ".git", "wheels", "htmlcov", "__pypackages__"} var EXCLUDED_DIRS_FINGERPRINT_RAW = []string{"**/*.egg-info/**", "**/*venv/**"} diff --git a/internal/fingerprint/fingerprint.go b/internal/fingerprint/fingerprint.go index 55be4b30..28aa7d69 100644 --- a/internal/fingerprint/fingerprint.go +++ b/internal/fingerprint/fingerprint.go @@ -116,12 +116,28 @@ type FileFingerprint struct { path string contentLength int64 fingerprint []byte + snippets []Snippet } func (f FileFingerprint) ToString() string { path := filepath.ToSlash(f.path) + output := fmt.Sprintf("file=%x,%d,%s", f.fingerprint, f.contentLength, path) + + if len(f.snippets) != 0 { + snippetOutput := "" + prevLine := -1 + for _, snippet := range f.snippets { + if snippet.Line == prevLine { + snippetOutput += fmt.Sprintf(",%s", snippet.Hash) + } else { + snippetOutput += fmt.Sprintf("\n%d=%s", snippet.Line, snippet.Hash) + } + prevLine = snippet.Line + } - return fmt.Sprintf("file=%x,%d,%s", f.fingerprint, f.contentLength, path) + output += snippetOutput + } + return output } func (f *Fingerprinter) FingerprintFiles(rootPath string, exclusions []string, fingerprintCompressedContent bool) (Fingerprints, error) { @@ -239,28 +255,42 @@ func shouldProcessFile(fileInfo os.FileInfo, exclusions []string, path string) b } func computeHashForFile(filename string) (FileFingerprint, error) { - data, err := os.ReadFile(filename) + rc, err := os.Open(filename) if err != nil { return FileFingerprint{}, err } + defer rc.Close() hasher := newHasher() - if _, err := hasher.Write(data); err != nil { + _, err = io.Copy(hasher, rc) // #nosec + if err != nil { return FileFingerprint{}, err } - contentLength := int64(len(data)) + contentLength := int64(hasher.Size()) if err != nil { return FileFingerprint{}, err } - return FileFingerprint{ + fingerprint := FileFingerprint{ path: filename, contentLength: contentLength, fingerprint: hasher.Sum(nil), - }, nil + } + winnowing := NewWinnowing(nil) + + snippets, err := winnowing.GenerateWFP(filename) + if err != nil { + return FileFingerprint{}, err + } + + if snippets != nil { + fingerprint.snippets = *snippets + } + + return fingerprint, nil } type Fingerprints struct { @@ -327,22 +357,33 @@ func inMemFingerprintingCompressedContent(filename string, exclusions []string) return nil, err } + defer rc.Close() + hasher := newHasher() _, err = io.Copy(hasher, rc) // #nosec if err != nil { - rc.Close() - return nil, err } - fingerprints = append(fingerprints, FileFingerprint{ + fingerprint := FileFingerprint{ path: longFileName, contentLength: int64(f.UncompressedSize64), fingerprint: hasher.Sum(nil), - }) + } + + winnowing := NewWinnowing(nil) + + snippets, err := winnowing.GenerateWFP(longFileName) + if err != nil { + return nil, err + } + + if snippets != nil { + fingerprint.snippets = *snippets + } - rc.Close() + fingerprints = append(fingerprints, fingerprint) } return fingerprints, nil diff --git a/internal/fingerprint/snippet.go b/internal/fingerprint/snippet.go index 04c945f1..1cee4145 100644 --- a/internal/fingerprint/snippet.go +++ b/internal/fingerprint/snippet.go @@ -1,52 +1,115 @@ package fingerprint import ( - "crypto/md5" - "encoding/hex" "fmt" "hash/crc32" - "io/ioutil" + "io" + "os" "path/filepath" - "strings" ) const ( Gram = 30 Window = 64 - MinFileSize = 256 - MaxPostSize = 64 * 1024 // 64k + MinFileSizeDefault = 256 MaxLongLineChars = 1000 ASCII0 = 48 ASCII9 = 57 + ASCIIA = 65 + ASCIIZ = 90 + ASCIIa = 97 + ASCIIz = 122 ASCIILF = 10 ASCIIBackslash = 92 - MaxCRC32 = 4294967295 - SkipSnippetExtSize = 29 + MaxCRC32 = uint32(4294967295) ) -var SkipSnippetExt = map[string]bool{ - ".exe": true, ".zip": true, // Add all extensions as in the Python example +var IncludedExtensions = map[string]bool{ + // C + ".c": true, + ".h": true, + + // C++ + ".cc": true, + ".cpp": true, + ".hpp": true, + + // C# + ".cs": true, + + // Go + ".go": true, + + // Java + ".java": true, + + // Kotlin + ".kt": true, + + // JavaScript + TypeScript + frameworks + ".js": true, + ".ts": true, + ".jsx": true, + ".tsx": true, + ".vue": true, + ".svelte": true, + ".elm": true, + ".coffee": true, + ".litcoffee": true, + ".cjsx": true, + ".iced": true, + ".es": true, + ".es6": true, + ".mjs": true, + + // Ruby + ".rb": true, + + // Rust + ".rs": true, + + // Swift + ".swift": true, + + // Objective-C + ".m": true, + ".mm": true, + + // PHP + ".php": true, + + // Python + ".py": true, + + // CSS + ".css": true, + + // Scala + ".scala": true, } type Winnowing struct { - sizeLimit bool - skipSnippets bool - maxPostSize int - allExtensions bool - obfuscate bool - fileMap map[string]string crc8MaximTable []uint8 + MinFileSize int + results *[]Snippet +} + +type Snippet struct { + Content string + Hash string + Line int } -func NewWinnowing(sizeLimit, skipSnippets, allExtensions, obfuscate bool, postSize int) *Winnowing { +func NewWinnowing(minFileSize *int) *Winnowing { + var MinFileSize int + if minFileSize != nil { + MinFileSize = *minFileSize + } else { + MinFileSize = MinFileSizeDefault + } return &Winnowing{ - sizeLimit: sizeLimit, - skipSnippets: skipSnippets, - maxPostSize: postSize * 1024, - allExtensions: allExtensions, - obfuscate: obfuscate, - fileMap: make(map[string]string), crc8MaximTable: make([]uint8, 0), + MinFileSize: MinFileSize, } } @@ -58,83 +121,114 @@ func (w *Winnowing) NormalizeByte(b byte) byte { } func (w *Winnowing) ShouldSkipFile(filePath string) bool { - extension := strings.ToLower(filepath.Ext(filePath)) - if _, ok := SkipSnippetExt[extension]; ok && !w.allExtensions { + ext := filepath.Ext(filePath) + if _, ok := IncludedExtensions[ext]; !ok { return true } return false } -func (w *Winnowing) ReadFile(filePath string) ([]byte, error) { - if w.ShouldSkipFile(filePath) { - return nil, fmt.Errorf("file skipped due to extension: %s", filePath) - } - content, err := ioutil.ReadFile(filePath) - if err != nil { - return nil, err - } - if len(content) < MinFileSize { - return nil, fmt.Errorf("file ignored due to size: %s", filePath) - } - return content, nil -} +func (w *Winnowing) Write(p []byte) (n int, err error) { + var output []Snippet -func (w *Winnowing) GenerateWFP(filePath string) { - content, err := w.ReadFile(filePath) - if err != nil { - fmt.Println("Error reading file:", err) - return + content := p + content_len := len(content) + if content_len < w.MinFileSize { + return len(p), nil } - fmt.Printf("File: %s, MD5: %s\n", filePath, w.calculateMD5(content)) + line := 1 + window := []uint32{} + gram := []byte{} + last_hash := MaxCRC32 + last_content_window_end := 0 + for i, bt := range content { + + if bt == ASCIILF { + line++ + continue + } - // Normalize content and generate k-grams - normalizedContent := w.normalizeContent(content) - kgrams := w.generateKgrams(normalizedContent, Gram) + btNorm, process := w.normalizeContent(bt) + if !process { + continue + } - // Calculate hash for each k-gram - hashes := make([]uint32, len(kgrams)) - for i, kgram := range kgrams { - hashes[i] = crc32c(kgram) - } + gram = append(gram, btNorm) + + if len(gram) >= Gram { + gramCrc32 := crc32c(gram) + window = append(window, gramCrc32) + + if len(window) >= Window { + minHash := minHash(window) + + if minHash != last_hash { - // Select minimum hashes within each window of size `Window` - var fingerprints []uint32 - for i := 0; i <= len(hashes)-Window; i++ { - window := hashes[i : i+Window] - minHash := uint32(MaxCRC32) - for _, hash := range window { - if hash < minHash { - minHash = hash + // Hashing the hash to balance the distribution + crc := crc32c([]byte{byte(minHash & 0xff), byte((minHash >> 8) & 0xff), byte((minHash >> 16) & 0xff), byte((minHash >> 24) & 0xff)}) + output = append(output, Snippet{Content: string(content[last_content_window_end:i]), Hash: fmt.Sprintf("%x", crc), Line: line}) + last_content_window_end = i + } + last_hash = minHash + window = window[1:] } + gram = gram[1:] } - fingerprints = append(fingerprints, minHash) + } - // Print fingerprints for demonstration - for i, fingerprint := range fingerprints { - fmt.Printf("Window %d: Hash %x\n", i, fingerprint) + w.results = &output + + return len(p), nil +} + +func minHash(window []uint32) uint32 { + min := MaxCRC32 + for _, hash := range window { + if hash < min { + min = hash + } } + return min } -func (w *Winnowing) calculateMD5(content []byte) string { - hash := md5.Sum(content) - return hex.EncodeToString(hash[:]) +func (w *Winnowing) GenerateWFP(filePath string) (*[]Snippet, error) { + rc, err := os.Open(filePath) + if err != nil { + return nil, err + } + defer rc.Close() + + if w.ShouldSkipFile(filePath) { + return nil, nil + } + + _, err = io.Copy(w, rc) + if err != nil { + return nil, err + } + + return w.results, nil } -// Placeholder for the crc32c function func crc32c(data []byte) uint32 { - // This should be replaced with an actual crc32c implementation - return crc32.ChecksumIEEE(data) + // Create a table for the Castagnoli polynomial. + castagnoliTable := crc32.MakeTable(crc32.Castagnoli) + + // crc32.ChecksumIEEE(data) + return crc32.Checksum(data, castagnoliTable) } -func (w *Winnowing) normalizeContent(content []byte) []byte { - normalized := make([]byte, 0, len(content)) - for _, b := range content { - if (b >= ASCII0 && b <= ASCII9) || (b >= ASCIILF && b <= ASCIIBackslash) { - normalized = append(normalized, b) - } +func (w *Winnowing) normalizeContent(b byte) (byte, bool) { + if b < ASCII0 || b > ASCIIz { + return 0, false + } else if b <= ASCII9 || b >= ASCIIa { + return b, true + } else if b >= 65 && b <= 90 { + return b + 32, true + } else { + return 0, false } - return normalized } func (w *Winnowing) generateKgrams(content []byte, k int) [][]byte { diff --git a/internal/fingerprint/snippet_test.go b/internal/fingerprint/snippet_test.go new file mode 100644 index 00000000..04da6fb3 --- /dev/null +++ b/internal/fingerprint/snippet_test.go @@ -0,0 +1,21 @@ +package fingerprint + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestGenerateWFP(t *testing.T) { + + minFileSize := 1 + w := NewWinnowing(&minFileSize) + + snippets, err := w.GenerateWFP("testdata/snippet/main.py") + assert.NoError(t, err) + assert.Equal(t, 8, len(*snippets)) + assert.Equal(t, "5e6ddca9", (*snippets)[0].Hash) + assert.Equal(t, 14, (*snippets)[0].Line) + assert.Equal(t, "def test():\n print(\"Hello, World!\")\n\n\ndef test2():\n print(\"Hello, World!2\")\n\n\ndef test3():\n print(\"Hello, World!3\")\n\n\ndef test4():\n print(\"Hello, Worl", (*snippets)[0].Content) + +} diff --git a/internal/fingerprint/testdata/snippet/main.py b/internal/fingerprint/testdata/snippet/main.py new file mode 100644 index 00000000..c908fa05 --- /dev/null +++ b/internal/fingerprint/testdata/snippet/main.py @@ -0,0 +1,38 @@ +def test(): + print("Hello, World!") + + +def test2(): + print("Hello, World!2") + + +def test3(): + print("Hello, World!3") + + +def test4(): + print("Hello, World!4") + + +def test5(): + print("Hello, World!5") + + +def test6(): + print("Hello, World!6") + + +def test7(): + print("Hello, World!7") + + +def test8(): + print("Hello, World!8") + + +def test9(): + print("Hello, World!9") + + +def test10(): + print("Hello, World!10") From dd0a965c5cf4578e6284448b41915d0a3e8b5b9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emil=20W=C3=A5reus?= Date: Thu, 22 Feb 2024 14:37:54 +0100 Subject: [PATCH 3/4] add command to snippet fingerprinting --- internal/cmd/fingerprint/fingerprint.go | 8 ++- internal/file/exclusion_test.go | 1 + internal/fingerprint/fingerprint.go | 61 ++++++++++--------- internal/fingerprint/fingerprint_test.go | 12 ++-- internal/fingerprint/snippet.go | 18 +++--- .../testdata/fingerprinter_mock.go | 2 +- internal/scan/scanner.go | 2 +- 7 files changed, 57 insertions(+), 47 deletions(-) diff --git a/internal/cmd/fingerprint/fingerprint.go b/internal/cmd/fingerprint/fingerprint.go index 0d25623f..a56f8864 100644 --- a/internal/cmd/fingerprint/fingerprint.go +++ b/internal/cmd/fingerprint/fingerprint.go @@ -15,9 +15,11 @@ var exclusions = file.DefaultExclusionsFingerprint() const ( ExclusionFlag = "exclusion" FingerprintCompressedContent = "fingerprint-compressed-content" + IncludeSnippet = "snippet" ) var shouldFingerprintCompressedContent bool +var includeSnippet bool func NewFingerprintCmd(fingerprinter fingerprint.IFingerprint) *cobra.Command { @@ -48,7 +50,9 @@ Example: $ debricked files fingerprint . `+exampleFlags) cmd.Flags().BoolVar(&shouldFingerprintCompressedContent, FingerprintCompressedContent, false, `Fingerprint the contents of compressed files by unpacking them in memory, Supported files: `+fmt.Sprintf("%v", fingerprint.FILES_TO_UNPACK)) - + cmd.Flags().BoolVar(&includeSnippet, IncludeSnippet, false, "Snippet fingerprint file content.") + // hide the flag from the help message + _ = cmd.Flags().MarkHidden(IncludeSnippet) viper.MustBindEnv(ExclusionFlag) return cmd @@ -61,7 +65,7 @@ func RunE(f fingerprint.IFingerprint) func(_ *cobra.Command, args []string) erro path = args[0] } - output, err := f.FingerprintFiles(path, exclusions, shouldFingerprintCompressedContent) + output, err := f.FingerprintFiles(path, exclusions, shouldFingerprintCompressedContent, includeSnippet) if err != nil { return err diff --git a/internal/file/exclusion_test.go b/internal/file/exclusion_test.go index c42a0dc5..a019f634 100644 --- a/internal/file/exclusion_test.go +++ b/internal/file/exclusion_test.go @@ -73,6 +73,7 @@ func TestDefaultExclusionsFingerprint(t *testing.T) { filepath.Join("**", "__pycache__", "**"), filepath.Join("**", "_yardoc", "**"), filepath.Join("**", "eggs", "**"), + filepath.Join("**", ".git", "**"), filepath.Join("**", "wheels", "**"), filepath.Join("**", "htmlcov", "**"), filepath.Join("**", "__pypackages__", "**"), diff --git a/internal/fingerprint/fingerprint.go b/internal/fingerprint/fingerprint.go index 28aa7d69..87a0f83d 100644 --- a/internal/fingerprint/fingerprint.go +++ b/internal/fingerprint/fingerprint.go @@ -99,7 +99,7 @@ func isExcludedByEnding(filename string) bool { } type IFingerprint interface { - FingerprintFiles(rootPath string, exclusions []string, fingerprintCompressedContent bool) (Fingerprints, error) + FingerprintFiles(rootPath string, exclusions []string, fingerprintCompressedContent bool, includeSnippet bool) (Fingerprints, error) } type Fingerprinter struct { @@ -137,10 +137,11 @@ func (f FileFingerprint) ToString() string { output += snippetOutput } + return output } -func (f *Fingerprinter) FingerprintFiles(rootPath string, exclusions []string, fingerprintCompressedContent bool) (Fingerprints, error) { +func (f *Fingerprinter) FingerprintFiles(rootPath string, exclusions []string, fingerprintCompressedContent bool, includeSnippet bool) (Fingerprints, error) { if len(rootPath) == 0 { rootPath = filepath.Base("") } @@ -158,7 +159,7 @@ func (f *Fingerprinter) FingerprintFiles(rootPath string, exclusions []string, f return err } - fingerprintsZip, err := computeHashForFileAndZip(fileInfo, path, exclusions, fingerprintCompressedContent) + fingerprintsZip, err := computeHashForFileAndZip(fileInfo, path, exclusions, fingerprintCompressedContent, includeSnippet) if err != nil { return err } @@ -188,7 +189,7 @@ func (f *Fingerprinter) FingerprintFiles(rootPath string, exclusions []string, f return fingerprints, err } -func computeHashForFileAndZip(fileInfo os.FileInfo, path string, exclusions []string, fingerprintCompressedContent bool) ([]FileFingerprint, error) { +func computeHashForFileAndZip(fileInfo os.FileInfo, path string, exclusions []string, fingerprintCompressedContent bool, includeSnippet bool) ([]FileFingerprint, error) { if !shouldProcessFile(fileInfo, exclusions, path) { return nil, nil } @@ -197,7 +198,7 @@ func computeHashForFileAndZip(fileInfo os.FileInfo, path string, exclusions []st // If the file should be unzipped, try to unzip and fingerprint it if isCompressedFile(path) && fingerprintCompressedContent { - fingerprintsZip, err := inMemFingerprintingCompressedContent(path, exclusions) + fingerprintsZip, err := inMemFingerprintingCompressedContent(path, exclusions, includeSnippet) if err != nil { if errors.Is(err, zip.ErrFormat) { fmt.Printf("WARNING: Could not unpack and fingerprint contents of compressed file [%s]. Error: %v\n", path, err) @@ -208,7 +209,7 @@ func computeHashForFileAndZip(fileInfo os.FileInfo, path string, exclusions []st fingerprints = append(fingerprints, fingerprintsZip...) } - fingerprint, err := computeHashForFile(path) + fingerprint, err := computeHashForFile(path, includeSnippet) if err != nil { return nil, err } @@ -254,7 +255,7 @@ func shouldProcessFile(fileInfo os.FileInfo, exclusions []string, path string) b return !isSymlink } -func computeHashForFile(filename string) (FileFingerprint, error) { +func computeHashForFile(filename string, includeSnippet bool) (FileFingerprint, error) { rc, err := os.Open(filename) if err != nil { return FileFingerprint{}, err @@ -263,31 +264,32 @@ func computeHashForFile(filename string) (FileFingerprint, error) { hasher := newHasher() - _, err = io.Copy(hasher, rc) // #nosec + contentLen, err := io.Copy(hasher, rc) // #nosec if err != nil { return FileFingerprint{}, err } - contentLength := int64(hasher.Size()) - if err != nil { return FileFingerprint{}, err } fingerprint := FileFingerprint{ path: filename, - contentLength: contentLength, + contentLength: contentLen, fingerprint: hasher.Sum(nil), } - winnowing := NewWinnowing(nil) - snippets, err := winnowing.GenerateWFP(filename) - if err != nil { - return FileFingerprint{}, err - } + if includeSnippet { + winnowing := NewWinnowing(nil) - if snippets != nil { - fingerprint.snippets = *snippets + snippets, err := winnowing.GenerateWFP(filename) + if err != nil { + return FileFingerprint{}, err + } + + if snippets != nil { + fingerprint.snippets = *snippets + } } return fingerprint, nil @@ -333,7 +335,7 @@ func isCompressedFile(filename string) bool { return false } -func inMemFingerprintingCompressedContent(filename string, exclusions []string) ([]FileFingerprint, error) { +func inMemFingerprintingCompressedContent(filename string, exclusions []string, includeSnippet bool) ([]FileFingerprint, error) { r, err := zip.OpenReader(filename) if err != nil { @@ -361,26 +363,27 @@ func inMemFingerprintingCompressedContent(filename string, exclusions []string) hasher := newHasher() - _, err = io.Copy(hasher, rc) // #nosec + contentLen, err := io.Copy(hasher, rc) // #nosec if err != nil { return nil, err } fingerprint := FileFingerprint{ path: longFileName, - contentLength: int64(f.UncompressedSize64), + contentLength: contentLen, fingerprint: hasher.Sum(nil), } + if includeSnippet { + winnowing := NewWinnowing(nil) - winnowing := NewWinnowing(nil) - - snippets, err := winnowing.GenerateWFP(longFileName) - if err != nil { - return nil, err - } + snippets, err := winnowing.GenerateWFP(longFileName) + if err != nil { + return nil, err + } - if snippets != nil { - fingerprint.snippets = *snippets + if snippets != nil { + fingerprint.snippets = *snippets + } } fingerprints = append(fingerprints, fingerprint) diff --git a/internal/fingerprint/fingerprint_test.go b/internal/fingerprint/fingerprint_test.go index 2456f57d..04302c78 100644 --- a/internal/fingerprint/fingerprint_test.go +++ b/internal/fingerprint/fingerprint_test.go @@ -151,7 +151,7 @@ func TestFingerprinterInterface(t *testing.T) { func TestFingerprintFiles(t *testing.T) { fingerprinter := NewFingerprinter() - fingerprints, err := fingerprinter.FingerprintFiles("testdata/fingerprinter", []string{}, true) + fingerprints, err := fingerprinter.FingerprintFiles("testdata/fingerprinter", []string{}, true, false) assert.NoError(t, err) assert.NotNil(t, fingerprints) assert.NotEmpty(t, fingerprints) @@ -159,7 +159,7 @@ func TestFingerprintFiles(t *testing.T) { assert.Equal(t, "file=634c5485de8e22b27094affadd8a6e3b,21,testdata/fingerprinter/testfile.py", fingerprints.Entries[0].ToString()) // Test no file - fingerprints, err = fingerprinter.FingerprintFiles("", []string{}, true) + fingerprints, err = fingerprinter.FingerprintFiles("", []string{}, true, false) assert.NoError(t, err) assert.NotNil(t, fingerprints) assert.NotEmpty(t, fingerprints) @@ -198,11 +198,11 @@ func TestFileFingerprintToString(t *testing.T) { func TestComputeMD5(t *testing.T) { // Test file not found - _, err := computeHashForFile("testdata/fingerprinter/testfile-not-found.py") + _, err := computeHashForFile("testdata/fingerprinter/testfile-not-found.py", false) assert.Error(t, err) // Test file found - entry, err := computeHashForFile("testdata/fingerprinter/testfile.py") + entry, err := computeHashForFile("testdata/fingerprinter/testfile.py", false) assert.NoError(t, err) entryS := fmt.Sprintf("%x", entry.fingerprint) assert.Equal(t, "634c5485de8e22b27094affadd8a6e3b", entryS) @@ -349,7 +349,7 @@ func TestInMemFingerprintingCompressedContent(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { fingerprinter := NewFingerprinter() - fingerprints, err := fingerprinter.FingerprintFiles(tt.path, []string{}, tt.shouldUnzip) + fingerprints, err := fingerprinter.FingerprintFiles(tt.path, []string{}, tt.shouldUnzip, false) assert.NoError(t, err) assert.NotNil(t, fingerprints) assert.NotEmpty(t, fingerprints) @@ -376,7 +376,7 @@ func TestComputeHashForFile(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - _, err := computeHashForFile(tt.file) + _, err := computeHashForFile(tt.file, false) if (err != nil) != tt.wantErr { t.Errorf("computeHashForFile() error = %v, wantErr %v", err, tt.wantErr) } diff --git a/internal/fingerprint/snippet.go b/internal/fingerprint/snippet.go index 1cee4145..6b75fe05 100644 --- a/internal/fingerprint/snippet.go +++ b/internal/fingerprint/snippet.go @@ -107,6 +107,7 @@ func NewWinnowing(minFileSize *int) *Winnowing { } else { MinFileSize = MinFileSizeDefault } + return &Winnowing{ crc8MaximTable: make([]uint8, 0), MinFileSize: MinFileSize, @@ -117,14 +118,17 @@ func (w *Winnowing) NormalizeByte(b byte) byte { if b < ASCII0 || b > ASCII9 { return 0 } + return b } func (w *Winnowing) ShouldSkipFile(filePath string) bool { ext := filepath.Ext(filePath) if _, ok := IncludedExtensions[ext]; !ok { + return true } + return false } @@ -134,6 +138,7 @@ func (w *Winnowing) Write(p []byte) (n int, err error) { content := p content_len := len(content) if content_len < w.MinFileSize { + return len(p), nil } line := 1 @@ -145,11 +150,13 @@ func (w *Winnowing) Write(p []byte) (n int, err error) { if bt == ASCIILF { line++ + continue } btNorm, process := w.normalizeContent(bt) if !process { + continue } @@ -189,6 +196,7 @@ func minHash(window []uint32) uint32 { min = hash } } + return min } @@ -200,11 +208,13 @@ func (w *Winnowing) GenerateWFP(filePath string) (*[]Snippet, error) { defer rc.Close() if w.ShouldSkipFile(filePath) { + return nil, nil } _, err = io.Copy(w, rc) if err != nil { + return nil, err } @@ -230,11 +240,3 @@ func (w *Winnowing) normalizeContent(b byte) (byte, bool) { return 0, false } } - -func (w *Winnowing) generateKgrams(content []byte, k int) [][]byte { - var kgrams [][]byte - for i := 0; i <= len(content)-k; i++ { - kgrams = append(kgrams, content[i:i+k]) - } - return kgrams -} diff --git a/internal/fingerprint/testdata/fingerprinter_mock.go b/internal/fingerprint/testdata/fingerprinter_mock.go index 8dc4735c..42d510f1 100644 --- a/internal/fingerprint/testdata/fingerprinter_mock.go +++ b/internal/fingerprint/testdata/fingerprinter_mock.go @@ -14,6 +14,6 @@ func NewFingerprintMock() *FingerprintMock { } } -func (f *FingerprintMock) FingerprintFiles(rootPath string, exclusions []string, fingerprintCompressedContent bool) (fingerprint.Fingerprints, error) { +func (f *FingerprintMock) FingerprintFiles(rootPath string, exclusions []string, fingerprintCompressedContent bool, includeSnippet bool) (fingerprint.Fingerprints, error) { return fingerprint.Fingerprints{}, f.error } diff --git a/internal/scan/scanner.go b/internal/scan/scanner.go index e6555dba..cfada318 100644 --- a/internal/scan/scanner.go +++ b/internal/scan/scanner.go @@ -157,7 +157,7 @@ func (dScanner *DebrickedScanner) scanResolve(options DebrickedOptions) error { func (dScanner *DebrickedScanner) scanFingerprint(options DebrickedOptions) error { if options.Fingerprint { - fingerprints, err := dScanner.fingerprint.FingerprintFiles(options.Path, file.DefaultExclusionsFingerprint(), false) + fingerprints, err := dScanner.fingerprint.FingerprintFiles(options.Path, file.DefaultExclusionsFingerprint(), false, false) if err != nil { return err } From 0d1ac461964e3663fdaf630142a00f8720667207 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emil=20W=C3=A5reus?= Date: Fri, 23 Feb 2024 10:12:13 +0100 Subject: [PATCH 4/4] fix lint errors --- .gitignore | 1 + internal/fingerprint/fingerprint.go | 90 ++++++++++++++++++----------- internal/fingerprint/snippet.go | 3 +- 3 files changed, 60 insertions(+), 34 deletions(-) diff --git a/.gitignore b/.gitignore index 162f33f8..0b5c3d93 100644 --- a/.gitignore +++ b/.gitignore @@ -43,3 +43,4 @@ test/resolve/testdata/gradle/*/** **.gradle-init-script.debricked.groovy test/resolve/testdata/gradle/gradle.debricked.lock /mvnproj/target +internal/scan/testdata/npm/result.json diff --git a/internal/fingerprint/fingerprint.go b/internal/fingerprint/fingerprint.go index 87a0f83d..97f03659 100644 --- a/internal/fingerprint/fingerprint.go +++ b/internal/fingerprint/fingerprint.go @@ -335,58 +335,82 @@ func isCompressedFile(filename string) bool { return false } -func inMemFingerprintingCompressedContent(filename string, exclusions []string, includeSnippet bool) ([]FileFingerprint, error) { +type NotProcessableError struct { + Message string +} - r, err := zip.OpenReader(filename) +func (e *NotProcessableError) Error() string { + return e.Message +} + +func processFile(f *zip.File, filename string, exclusions []string, includeSnippet bool) (*FileFingerprint, error) { + longFileName := filepath.Join(filename, f.Name) // #nosec + + if !shouldProcessFile(f.FileInfo(), exclusions, longFileName) { + + return nil, &NotProcessableError{ + Message: "file is not processable", + } + } + rc, err := f.Open() if err != nil { return nil, err } - defer r.Close() - fingerprints := []FileFingerprint{} + defer rc.Close() - for _, f := range r.File { - if filepath.IsAbs(f.Name) || strings.HasPrefix(f.Name, "..") { - continue - } - longFileName := filepath.Join(filename, f.Name) // #nosec + hasher := newHasher() - if !shouldProcessFile(f.FileInfo(), exclusions, longFileName) { - continue - } - rc, err := f.Open() + contentLen, err := io.Copy(hasher, rc) // #nosec + if err != nil { + return nil, err + } + + fingerprint := FileFingerprint{ + path: longFileName, + contentLength: contentLen, + fingerprint: hasher.Sum(nil), + } + if includeSnippet { + winnowing := NewWinnowing(nil) + + snippets, err := winnowing.GenerateWFP(longFileName) if err != nil { return nil, err } - defer rc.Close() + if snippets != nil { + fingerprint.snippets = *snippets + } + } + + return &fingerprint, nil +} - hasher := newHasher() +func inMemFingerprintingCompressedContent(filename string, exclusions []string, includeSnippet bool) ([]FileFingerprint, error) { - contentLen, err := io.Copy(hasher, rc) // #nosec - if err != nil { - return nil, err - } + r, err := zip.OpenReader(filename) + if err != nil { + return nil, err + } + defer r.Close() - fingerprint := FileFingerprint{ - path: longFileName, - contentLength: contentLen, - fingerprint: hasher.Sum(nil), + fingerprints := []FileFingerprint{} + + for _, f := range r.File { + if filepath.IsAbs(f.Name) || strings.HasPrefix(f.Name, "..") { + continue } - if includeSnippet { - winnowing := NewWinnowing(nil) - snippets, err := winnowing.GenerateWFP(longFileName) - if err != nil { + fingerprint, err := processFile(f, filename, exclusions, includeSnippet) + if err != nil { + if _, ok := err.(*NotProcessableError); ok { + continue + } else { return nil, err } - - if snippets != nil { - fingerprint.snippets = *snippets - } } - - fingerprints = append(fingerprints, fingerprint) + fingerprints = append(fingerprints, *fingerprint) } return fingerprints, nil diff --git a/internal/fingerprint/snippet.go b/internal/fingerprint/snippet.go index 6b75fe05..697bd1d8 100644 --- a/internal/fingerprint/snippet.go +++ b/internal/fingerprint/snippet.go @@ -208,8 +208,9 @@ func (w *Winnowing) GenerateWFP(filePath string) (*[]Snippet, error) { defer rc.Close() if w.ShouldSkipFile(filePath) { + emptySnippets := make([]Snippet, 0) - return nil, nil + return &emptySnippets, nil } _, err = io.Copy(w, rc)