diff --git a/.gitignore b/.gitignore index 162f33f8..0b5c3d93 100644 --- a/.gitignore +++ b/.gitignore @@ -43,3 +43,4 @@ test/resolve/testdata/gradle/*/** **.gradle-init-script.debricked.groovy test/resolve/testdata/gradle/gradle.debricked.lock /mvnproj/target +internal/scan/testdata/npm/result.json diff --git a/internal/cmd/fingerprint/fingerprint.go b/internal/cmd/fingerprint/fingerprint.go index 0d25623f..a56f8864 100644 --- a/internal/cmd/fingerprint/fingerprint.go +++ b/internal/cmd/fingerprint/fingerprint.go @@ -15,9 +15,11 @@ var exclusions = file.DefaultExclusionsFingerprint() const ( ExclusionFlag = "exclusion" FingerprintCompressedContent = "fingerprint-compressed-content" + IncludeSnippet = "snippet" ) var shouldFingerprintCompressedContent bool +var includeSnippet bool func NewFingerprintCmd(fingerprinter fingerprint.IFingerprint) *cobra.Command { @@ -48,7 +50,9 @@ Example: $ debricked files fingerprint . `+exampleFlags) cmd.Flags().BoolVar(&shouldFingerprintCompressedContent, FingerprintCompressedContent, false, `Fingerprint the contents of compressed files by unpacking them in memory, Supported files: `+fmt.Sprintf("%v", fingerprint.FILES_TO_UNPACK)) - + cmd.Flags().BoolVar(&includeSnippet, IncludeSnippet, false, "Snippet fingerprint file content.") + // hide the flag from the help message + _ = cmd.Flags().MarkHidden(IncludeSnippet) viper.MustBindEnv(ExclusionFlag) return cmd @@ -61,7 +65,7 @@ func RunE(f fingerprint.IFingerprint) func(_ *cobra.Command, args []string) erro path = args[0] } - output, err := f.FingerprintFiles(path, exclusions, shouldFingerprintCompressedContent) + output, err := f.FingerprintFiles(path, exclusions, shouldFingerprintCompressedContent, includeSnippet) if err != nil { return err diff --git a/internal/file/exclusion.go b/internal/file/exclusion.go index 9bfcc68d..a629e4a4 100644 --- a/internal/file/exclusion.go +++ b/internal/file/exclusion.go @@ -33,7 +33,7 @@ func Exclusions() []string { var EXCLUDED_DIRS_FINGERPRINT = []string{ "nbproject", "nbbuild", "nbdist", "node_modules", - "__pycache__", "_yardoc", "eggs", + "__pycache__", "_yardoc", "eggs", ".git", "wheels", "htmlcov", "__pypackages__"} var EXCLUDED_DIRS_FINGERPRINT_RAW = []string{"**/*.egg-info/**", "**/*venv/**"} diff --git a/internal/file/exclusion_test.go b/internal/file/exclusion_test.go index c42a0dc5..a019f634 100644 --- a/internal/file/exclusion_test.go +++ b/internal/file/exclusion_test.go @@ -73,6 +73,7 @@ func TestDefaultExclusionsFingerprint(t *testing.T) { filepath.Join("**", "__pycache__", "**"), filepath.Join("**", "_yardoc", "**"), filepath.Join("**", "eggs", "**"), + filepath.Join("**", ".git", "**"), filepath.Join("**", "wheels", "**"), filepath.Join("**", "htmlcov", "**"), filepath.Join("**", "__pypackages__", "**"), diff --git a/internal/fingerprint/fingerprint.go b/internal/fingerprint/fingerprint.go index 55be4b30..97f03659 100644 --- a/internal/fingerprint/fingerprint.go +++ b/internal/fingerprint/fingerprint.go @@ -99,7 +99,7 @@ func isExcludedByEnding(filename string) bool { } type IFingerprint interface { - FingerprintFiles(rootPath string, exclusions []string, fingerprintCompressedContent bool) (Fingerprints, error) + FingerprintFiles(rootPath string, exclusions []string, fingerprintCompressedContent bool, includeSnippet bool) (Fingerprints, error) } type Fingerprinter struct { @@ -116,15 +116,32 @@ type FileFingerprint struct { path string contentLength int64 fingerprint []byte + snippets []Snippet } func (f FileFingerprint) ToString() string { path := filepath.ToSlash(f.path) + output := fmt.Sprintf("file=%x,%d,%s", f.fingerprint, f.contentLength, path) + + if len(f.snippets) != 0 { + snippetOutput := "" + prevLine := -1 + for _, snippet := range f.snippets { + if snippet.Line == prevLine { + snippetOutput += fmt.Sprintf(",%s", snippet.Hash) + } else { + snippetOutput += fmt.Sprintf("\n%d=%s", snippet.Line, snippet.Hash) + } + prevLine = snippet.Line + } + + output += snippetOutput + } - return fmt.Sprintf("file=%x,%d,%s", f.fingerprint, f.contentLength, path) + return output } -func (f *Fingerprinter) FingerprintFiles(rootPath string, exclusions []string, fingerprintCompressedContent bool) (Fingerprints, error) { +func (f *Fingerprinter) FingerprintFiles(rootPath string, exclusions []string, fingerprintCompressedContent bool, includeSnippet bool) (Fingerprints, error) { if len(rootPath) == 0 { rootPath = filepath.Base("") } @@ -142,7 +159,7 @@ func (f *Fingerprinter) FingerprintFiles(rootPath string, exclusions []string, f return err } - fingerprintsZip, err := computeHashForFileAndZip(fileInfo, path, exclusions, fingerprintCompressedContent) + fingerprintsZip, err := computeHashForFileAndZip(fileInfo, path, exclusions, fingerprintCompressedContent, includeSnippet) if err != nil { return err } @@ -172,7 +189,7 @@ func (f *Fingerprinter) FingerprintFiles(rootPath string, exclusions []string, f return fingerprints, err } -func computeHashForFileAndZip(fileInfo os.FileInfo, path string, exclusions []string, fingerprintCompressedContent bool) ([]FileFingerprint, error) { +func computeHashForFileAndZip(fileInfo os.FileInfo, path string, exclusions []string, fingerprintCompressedContent bool, includeSnippet bool) ([]FileFingerprint, error) { if !shouldProcessFile(fileInfo, exclusions, path) { return nil, nil } @@ -181,7 +198,7 @@ func computeHashForFileAndZip(fileInfo os.FileInfo, path string, exclusions []st // If the file should be unzipped, try to unzip and fingerprint it if isCompressedFile(path) && fingerprintCompressedContent { - fingerprintsZip, err := inMemFingerprintingCompressedContent(path, exclusions) + fingerprintsZip, err := inMemFingerprintingCompressedContent(path, exclusions, includeSnippet) if err != nil { if errors.Is(err, zip.ErrFormat) { fmt.Printf("WARNING: Could not unpack and fingerprint contents of compressed file [%s]. Error: %v\n", path, err) @@ -192,7 +209,7 @@ func computeHashForFileAndZip(fileInfo os.FileInfo, path string, exclusions []st fingerprints = append(fingerprints, fingerprintsZip...) } - fingerprint, err := computeHashForFile(path) + fingerprint, err := computeHashForFile(path, includeSnippet) if err != nil { return nil, err } @@ -238,29 +255,44 @@ func shouldProcessFile(fileInfo os.FileInfo, exclusions []string, path string) b return !isSymlink } -func computeHashForFile(filename string) (FileFingerprint, error) { - data, err := os.ReadFile(filename) +func computeHashForFile(filename string, includeSnippet bool) (FileFingerprint, error) { + rc, err := os.Open(filename) if err != nil { return FileFingerprint{}, err } + defer rc.Close() hasher := newHasher() - if _, err := hasher.Write(data); err != nil { + contentLen, err := io.Copy(hasher, rc) // #nosec + if err != nil { return FileFingerprint{}, err } - contentLength := int64(len(data)) - if err != nil { return FileFingerprint{}, err } - return FileFingerprint{ + fingerprint := FileFingerprint{ path: filename, - contentLength: contentLength, + contentLength: contentLen, fingerprint: hasher.Sum(nil), - }, nil + } + + if includeSnippet { + winnowing := NewWinnowing(nil) + + snippets, err := winnowing.GenerateWFP(filename) + if err != nil { + return FileFingerprint{}, err + } + + if snippets != nil { + fingerprint.snippets = *snippets + } + } + + return fingerprint, nil } type Fingerprints struct { @@ -303,7 +335,59 @@ func isCompressedFile(filename string) bool { return false } -func inMemFingerprintingCompressedContent(filename string, exclusions []string) ([]FileFingerprint, error) { +type NotProcessableError struct { + Message string +} + +func (e *NotProcessableError) Error() string { + return e.Message +} + +func processFile(f *zip.File, filename string, exclusions []string, includeSnippet bool) (*FileFingerprint, error) { + longFileName := filepath.Join(filename, f.Name) // #nosec + + if !shouldProcessFile(f.FileInfo(), exclusions, longFileName) { + + return nil, &NotProcessableError{ + Message: "file is not processable", + } + } + rc, err := f.Open() + if err != nil { + return nil, err + } + + defer rc.Close() + + hasher := newHasher() + + contentLen, err := io.Copy(hasher, rc) // #nosec + if err != nil { + return nil, err + } + + fingerprint := FileFingerprint{ + path: longFileName, + contentLength: contentLen, + fingerprint: hasher.Sum(nil), + } + if includeSnippet { + winnowing := NewWinnowing(nil) + + snippets, err := winnowing.GenerateWFP(longFileName) + if err != nil { + return nil, err + } + + if snippets != nil { + fingerprint.snippets = *snippets + } + } + + return &fingerprint, nil +} + +func inMemFingerprintingCompressedContent(filename string, exclusions []string, includeSnippet bool) ([]FileFingerprint, error) { r, err := zip.OpenReader(filename) if err != nil { @@ -317,32 +401,16 @@ func inMemFingerprintingCompressedContent(filename string, exclusions []string) if filepath.IsAbs(f.Name) || strings.HasPrefix(f.Name, "..") { continue } - longFileName := filepath.Join(filename, f.Name) // #nosec - - if !shouldProcessFile(f.FileInfo(), exclusions, longFileName) { - continue - } - rc, err := f.Open() - if err != nil { - return nil, err - } - - hasher := newHasher() - _, err = io.Copy(hasher, rc) // #nosec + fingerprint, err := processFile(f, filename, exclusions, includeSnippet) if err != nil { - rc.Close() - - return nil, err + if _, ok := err.(*NotProcessableError); ok { + continue + } else { + return nil, err + } } - - fingerprints = append(fingerprints, FileFingerprint{ - path: longFileName, - contentLength: int64(f.UncompressedSize64), - fingerprint: hasher.Sum(nil), - }) - - rc.Close() + fingerprints = append(fingerprints, *fingerprint) } return fingerprints, nil diff --git a/internal/fingerprint/fingerprint_test.go b/internal/fingerprint/fingerprint_test.go index 2456f57d..04302c78 100644 --- a/internal/fingerprint/fingerprint_test.go +++ b/internal/fingerprint/fingerprint_test.go @@ -151,7 +151,7 @@ func TestFingerprinterInterface(t *testing.T) { func TestFingerprintFiles(t *testing.T) { fingerprinter := NewFingerprinter() - fingerprints, err := fingerprinter.FingerprintFiles("testdata/fingerprinter", []string{}, true) + fingerprints, err := fingerprinter.FingerprintFiles("testdata/fingerprinter", []string{}, true, false) assert.NoError(t, err) assert.NotNil(t, fingerprints) assert.NotEmpty(t, fingerprints) @@ -159,7 +159,7 @@ func TestFingerprintFiles(t *testing.T) { assert.Equal(t, "file=634c5485de8e22b27094affadd8a6e3b,21,testdata/fingerprinter/testfile.py", fingerprints.Entries[0].ToString()) // Test no file - fingerprints, err = fingerprinter.FingerprintFiles("", []string{}, true) + fingerprints, err = fingerprinter.FingerprintFiles("", []string{}, true, false) assert.NoError(t, err) assert.NotNil(t, fingerprints) assert.NotEmpty(t, fingerprints) @@ -198,11 +198,11 @@ func TestFileFingerprintToString(t *testing.T) { func TestComputeMD5(t *testing.T) { // Test file not found - _, err := computeHashForFile("testdata/fingerprinter/testfile-not-found.py") + _, err := computeHashForFile("testdata/fingerprinter/testfile-not-found.py", false) assert.Error(t, err) // Test file found - entry, err := computeHashForFile("testdata/fingerprinter/testfile.py") + entry, err := computeHashForFile("testdata/fingerprinter/testfile.py", false) assert.NoError(t, err) entryS := fmt.Sprintf("%x", entry.fingerprint) assert.Equal(t, "634c5485de8e22b27094affadd8a6e3b", entryS) @@ -349,7 +349,7 @@ func TestInMemFingerprintingCompressedContent(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { fingerprinter := NewFingerprinter() - fingerprints, err := fingerprinter.FingerprintFiles(tt.path, []string{}, tt.shouldUnzip) + fingerprints, err := fingerprinter.FingerprintFiles(tt.path, []string{}, tt.shouldUnzip, false) assert.NoError(t, err) assert.NotNil(t, fingerprints) assert.NotEmpty(t, fingerprints) @@ -376,7 +376,7 @@ func TestComputeHashForFile(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - _, err := computeHashForFile(tt.file) + _, err := computeHashForFile(tt.file, false) if (err != nil) != tt.wantErr { t.Errorf("computeHashForFile() error = %v, wantErr %v", err, tt.wantErr) } diff --git a/internal/fingerprint/snippet.go b/internal/fingerprint/snippet.go new file mode 100644 index 00000000..697bd1d8 --- /dev/null +++ b/internal/fingerprint/snippet.go @@ -0,0 +1,243 @@ +package fingerprint + +import ( + "fmt" + "hash/crc32" + "io" + "os" + "path/filepath" +) + +const ( + Gram = 30 + Window = 64 + MinFileSizeDefault = 256 + MaxLongLineChars = 1000 + ASCII0 = 48 + ASCII9 = 57 + ASCIIA = 65 + ASCIIZ = 90 + ASCIIa = 97 + ASCIIz = 122 + ASCIILF = 10 + ASCIIBackslash = 92 + MaxCRC32 = uint32(4294967295) +) + +var IncludedExtensions = map[string]bool{ + // C + ".c": true, + ".h": true, + + // C++ + ".cc": true, + ".cpp": true, + ".hpp": true, + + // C# + ".cs": true, + + // Go + ".go": true, + + // Java + ".java": true, + + // Kotlin + ".kt": true, + + // JavaScript + TypeScript + frameworks + ".js": true, + ".ts": true, + ".jsx": true, + ".tsx": true, + ".vue": true, + ".svelte": true, + ".elm": true, + ".coffee": true, + ".litcoffee": true, + ".cjsx": true, + ".iced": true, + ".es": true, + ".es6": true, + ".mjs": true, + + // Ruby + ".rb": true, + + // Rust + ".rs": true, + + // Swift + ".swift": true, + + // Objective-C + ".m": true, + ".mm": true, + + // PHP + ".php": true, + + // Python + ".py": true, + + // CSS + ".css": true, + + // Scala + ".scala": true, +} + +type Winnowing struct { + crc8MaximTable []uint8 + MinFileSize int + results *[]Snippet +} + +type Snippet struct { + Content string + Hash string + Line int +} + +func NewWinnowing(minFileSize *int) *Winnowing { + var MinFileSize int + if minFileSize != nil { + MinFileSize = *minFileSize + } else { + MinFileSize = MinFileSizeDefault + } + + return &Winnowing{ + crc8MaximTable: make([]uint8, 0), + MinFileSize: MinFileSize, + } +} + +func (w *Winnowing) NormalizeByte(b byte) byte { + if b < ASCII0 || b > ASCII9 { + return 0 + } + + return b +} + +func (w *Winnowing) ShouldSkipFile(filePath string) bool { + ext := filepath.Ext(filePath) + if _, ok := IncludedExtensions[ext]; !ok { + + return true + } + + return false +} + +func (w *Winnowing) Write(p []byte) (n int, err error) { + var output []Snippet + + content := p + content_len := len(content) + if content_len < w.MinFileSize { + + return len(p), nil + } + line := 1 + window := []uint32{} + gram := []byte{} + last_hash := MaxCRC32 + last_content_window_end := 0 + for i, bt := range content { + + if bt == ASCIILF { + line++ + + continue + } + + btNorm, process := w.normalizeContent(bt) + if !process { + + continue + } + + gram = append(gram, btNorm) + + if len(gram) >= Gram { + gramCrc32 := crc32c(gram) + window = append(window, gramCrc32) + + if len(window) >= Window { + minHash := minHash(window) + + if minHash != last_hash { + + // Hashing the hash to balance the distribution + crc := crc32c([]byte{byte(minHash & 0xff), byte((minHash >> 8) & 0xff), byte((minHash >> 16) & 0xff), byte((minHash >> 24) & 0xff)}) + output = append(output, Snippet{Content: string(content[last_content_window_end:i]), Hash: fmt.Sprintf("%x", crc), Line: line}) + last_content_window_end = i + } + last_hash = minHash + window = window[1:] + } + gram = gram[1:] + } + + } + + w.results = &output + + return len(p), nil +} + +func minHash(window []uint32) uint32 { + min := MaxCRC32 + for _, hash := range window { + if hash < min { + min = hash + } + } + + return min +} + +func (w *Winnowing) GenerateWFP(filePath string) (*[]Snippet, error) { + rc, err := os.Open(filePath) + if err != nil { + return nil, err + } + defer rc.Close() + + if w.ShouldSkipFile(filePath) { + emptySnippets := make([]Snippet, 0) + + return &emptySnippets, nil + } + + _, err = io.Copy(w, rc) + if err != nil { + + return nil, err + } + + return w.results, nil +} + +func crc32c(data []byte) uint32 { + // Create a table for the Castagnoli polynomial. + castagnoliTable := crc32.MakeTable(crc32.Castagnoli) + + // crc32.ChecksumIEEE(data) + return crc32.Checksum(data, castagnoliTable) +} + +func (w *Winnowing) normalizeContent(b byte) (byte, bool) { + if b < ASCII0 || b > ASCIIz { + return 0, false + } else if b <= ASCII9 || b >= ASCIIa { + return b, true + } else if b >= 65 && b <= 90 { + return b + 32, true + } else { + return 0, false + } +} diff --git a/internal/fingerprint/snippet_test.go b/internal/fingerprint/snippet_test.go new file mode 100644 index 00000000..04da6fb3 --- /dev/null +++ b/internal/fingerprint/snippet_test.go @@ -0,0 +1,21 @@ +package fingerprint + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestGenerateWFP(t *testing.T) { + + minFileSize := 1 + w := NewWinnowing(&minFileSize) + + snippets, err := w.GenerateWFP("testdata/snippet/main.py") + assert.NoError(t, err) + assert.Equal(t, 8, len(*snippets)) + assert.Equal(t, "5e6ddca9", (*snippets)[0].Hash) + assert.Equal(t, 14, (*snippets)[0].Line) + assert.Equal(t, "def test():\n print(\"Hello, World!\")\n\n\ndef test2():\n print(\"Hello, World!2\")\n\n\ndef test3():\n print(\"Hello, World!3\")\n\n\ndef test4():\n print(\"Hello, Worl", (*snippets)[0].Content) + +} diff --git a/internal/fingerprint/testdata/fingerprinter_mock.go b/internal/fingerprint/testdata/fingerprinter_mock.go index 8dc4735c..42d510f1 100644 --- a/internal/fingerprint/testdata/fingerprinter_mock.go +++ b/internal/fingerprint/testdata/fingerprinter_mock.go @@ -14,6 +14,6 @@ func NewFingerprintMock() *FingerprintMock { } } -func (f *FingerprintMock) FingerprintFiles(rootPath string, exclusions []string, fingerprintCompressedContent bool) (fingerprint.Fingerprints, error) { +func (f *FingerprintMock) FingerprintFiles(rootPath string, exclusions []string, fingerprintCompressedContent bool, includeSnippet bool) (fingerprint.Fingerprints, error) { return fingerprint.Fingerprints{}, f.error } diff --git a/internal/fingerprint/testdata/snippet/main.py b/internal/fingerprint/testdata/snippet/main.py new file mode 100644 index 00000000..c908fa05 --- /dev/null +++ b/internal/fingerprint/testdata/snippet/main.py @@ -0,0 +1,38 @@ +def test(): + print("Hello, World!") + + +def test2(): + print("Hello, World!2") + + +def test3(): + print("Hello, World!3") + + +def test4(): + print("Hello, World!4") + + +def test5(): + print("Hello, World!5") + + +def test6(): + print("Hello, World!6") + + +def test7(): + print("Hello, World!7") + + +def test8(): + print("Hello, World!8") + + +def test9(): + print("Hello, World!9") + + +def test10(): + print("Hello, World!10") diff --git a/internal/scan/scanner.go b/internal/scan/scanner.go index e6555dba..cfada318 100644 --- a/internal/scan/scanner.go +++ b/internal/scan/scanner.go @@ -157,7 +157,7 @@ func (dScanner *DebrickedScanner) scanResolve(options DebrickedOptions) error { func (dScanner *DebrickedScanner) scanFingerprint(options DebrickedOptions) error { if options.Fingerprint { - fingerprints, err := dScanner.fingerprint.FingerprintFiles(options.Path, file.DefaultExclusionsFingerprint(), false) + fingerprints, err := dScanner.fingerprint.FingerprintFiles(options.Path, file.DefaultExclusionsFingerprint(), false, false) if err != nil { return err }