From 019df1c52fe16675f8021c17443f859c86c73068 Mon Sep 17 00:00:00 2001 From: adammcclenaghan Date: Fri, 18 Oct 2024 10:06:44 +0100 Subject: [PATCH] Expose unindexed directory resolver option - Implements some new functions on unindexed directory which previously had 'panic' for their impl - Fixed RelativeFileByPath for unindexed directory resolver Signed-off-by: adammcclenaghan --- cmd/syft/internal/commands/scan.go | 3 +- cmd/syft/internal/options/catalog.go | 4 + syft/get_source_config.go | 5 + .../fileresolver/unindexed_directory.go | 126 +++++++++++++----- .../fileresolver/unindexed_directory_test.go | 49 +++++++ .../directorysource/directory_source.go | 24 ++-- .../directory_source_provider.go | 29 ++-- .../directorysource/directory_source_test.go | 19 +++ .../sourceproviders/source_provider_config.go | 6 + .../sourceproviders/source_providers.go | 2 +- 10 files changed, 208 insertions(+), 59 deletions(-) diff --git a/cmd/syft/internal/commands/scan.go b/cmd/syft/internal/commands/scan.go index 48fcc8c1ebc..a58d2ef4ace 100644 --- a/cmd/syft/internal/commands/scan.go +++ b/cmd/syft/internal/commands/scan.go @@ -224,7 +224,8 @@ func getSource(ctx context.Context, opts *options.Catalog, userInput string, sou }). WithBasePath(opts.Source.BasePath). WithSources(sources...). - WithDefaultImagePullSource(opts.Source.Image.DefaultPullSource) + WithDefaultImagePullSource(opts.Source.Image.DefaultPullSource). + WithUnindexed(opts.Unindexed) var err error var platform *image.Platform diff --git a/cmd/syft/internal/options/catalog.go b/cmd/syft/internal/options/catalog.go index c7e0e4f6d25..d65b1061881 100644 --- a/cmd/syft/internal/options/catalog.go +++ b/cmd/syft/internal/options/catalog.go @@ -53,6 +53,7 @@ type Catalog struct { Platform string `yaml:"platform" json:"platform" mapstructure:"platform"` Source sourceConfig `yaml:"source" json:"source" mapstructure:"source"` Exclusions []string `yaml:"exclude" json:"exclude" mapstructure:"exclude"` + Unindexed bool `yaml:"unindexed" json:"unindexed" mapstructure:"unindexed"` // configuration for inclusion of unknown information within elements Unknowns unknownsConfig `yaml:"unknowns" mapstructure:"unknowns"` @@ -228,6 +229,9 @@ func (cfg *Catalog) AddFlags(flags clio.FlagSet) { flags.StringVarP(&cfg.Source.BasePath, "base-path", "", "base directory for scanning, no links will be followed above this directory, and all paths will be reported relative to this directory") + + flags.BoolVarP(&cfg.Unindexed, "unindexed", "", + "whether to index the file system or not, indexing can improve scan times but incurs a memory overhead, default false.") } func (cfg *Catalog) DescribeFields(descriptions fangs.FieldDescriptionSet) { diff --git a/syft/get_source_config.go b/syft/get_source_config.go index 7a163f1d1b6..443400656bf 100644 --- a/syft/get_source_config.go +++ b/syft/get_source_config.go @@ -26,6 +26,11 @@ func (c *GetSourceConfig) WithAlias(alias source.Alias) *GetSourceConfig { return c } +func (c *GetSourceConfig) WithUnindexed(unindexed bool) *GetSourceConfig { + c.SourceProviderConfig = c.SourceProviderConfig.WithUnindexed(unindexed) + return c +} + func (c *GetSourceConfig) WithRegistryOptions(registryOptions *image.RegistryOptions) *GetSourceConfig { c.SourceProviderConfig = c.SourceProviderConfig.WithRegistryOptions(registryOptions) return c diff --git a/syft/internal/fileresolver/unindexed_directory.go b/syft/internal/fileresolver/unindexed_directory.go index 4037cc1a817..2ffdf2210e4 100644 --- a/syft/internal/fileresolver/unindexed_directory.go +++ b/syft/internal/fileresolver/unindexed_directory.go @@ -18,30 +18,32 @@ import ( "github.com/mitchellh/go-homedir" "github.com/spf13/afero" + stereoscopeFile "github.com/anchore/stereoscope/pkg/file" "github.com/anchore/syft/internal/log" - "github.com/anchore/syft/syft/file" + syftFile "github.com/anchore/syft/syft/file" ) -var _ file.Resolver = (*UnindexedDirectory)(nil) -var _ file.WritableResolver = (*UnindexedDirectory)(nil) +var _ syftFile.Resolver = (*UnindexedDirectory)(nil) +var _ syftFile.WritableResolver = (*UnindexedDirectory)(nil) type UnindexedDirectory struct { - ls afero.Lstater - lr afero.LinkReader - base string - dir string - fs afero.Fs + ls afero.Lstater + lr afero.LinkReader + base string + dir string + fs afero.Fs + pathFilters []PathIndexVisitor } -func NewFromUnindexedDirectory(dir string) file.WritableResolver { +func NewFromUnindexedDirectory(dir string) syftFile.WritableResolver { return NewFromUnindexedDirectoryFS(afero.NewOsFs(), dir, "") } -func NewFromRootedUnindexedDirectory(dir string, base string) file.WritableResolver { - return NewFromUnindexedDirectoryFS(afero.NewOsFs(), dir, base) +func NewFromRootedUnindexedDirectory(dir string, base string, pathVisitors ...PathIndexVisitor) syftFile.WritableResolver { + return NewFromUnindexedDirectoryFS(afero.NewOsFs(), dir, base, pathVisitors...) } -func NewFromUnindexedDirectoryFS(fs afero.Fs, dir string, base string) file.WritableResolver { +func NewFromUnindexedDirectoryFS(fs afero.Fs, dir string, base string, pathVisitors ...PathIndexVisitor) syftFile.WritableResolver { ls, ok := fs.(afero.Lstater) if !ok { panic(fmt.Sprintf("unable to get afero.Lstater interface from: %+v", fs)) @@ -70,15 +72,16 @@ func NewFromUnindexedDirectoryFS(fs afero.Fs, dir string, base string) file.Writ } } return UnindexedDirectory{ - base: base, - dir: dir, - fs: fs, - ls: ls, - lr: lr, + base: base, + dir: dir, + fs: fs, + ls: ls, + lr: lr, + pathFilters: pathVisitors, } } -func (u UnindexedDirectory) FileContentsByLocation(location file.Location) (io.ReadCloser, error) { +func (u UnindexedDirectory) FileContentsByLocation(location syftFile.Location) (io.ReadCloser, error) { p := u.absPath(u.scrubInputPath(location.RealPath)) f, err := u.fs.Open(p) if err != nil { @@ -149,11 +152,11 @@ func (u UnindexedDirectory) absPath(p string) string { // - full symlink resolution should be performed on all requests // - only returns locations to files (NOT directories) -func (u UnindexedDirectory) FilesByPath(paths ...string) (out []file.Location, _ error) { +func (u UnindexedDirectory) FilesByPath(paths ...string) (out []syftFile.Location, _ error) { return u.filesByPath(true, false, paths...) } -func (u UnindexedDirectory) filesByPath(resolveLinks bool, includeDirs bool, paths ...string) (out []file.Location, _ error) { +func (u UnindexedDirectory) filesByPath(resolveLinks bool, includeDirs bool, paths ...string) (out []syftFile.Location, _ error) { // sort here for stable output sort.Strings(paths) nextPath: @@ -183,11 +186,11 @@ nextPath: // - full symlink resolution should be performed on all requests // - if multiple paths to the same file are found, the best single match should be returned // - only returns locations to files (NOT directories) -func (u UnindexedDirectory) FilesByGlob(patterns ...string) (out []file.Location, _ error) { +func (u UnindexedDirectory) FilesByGlob(patterns ...string) (out []syftFile.Location, _ error) { return u.filesByGlob(true, false, patterns...) } -func (u UnindexedDirectory) filesByGlob(resolveLinks bool, includeDirs bool, patterns ...string) (out []file.Location, _ error) { +func (u UnindexedDirectory) filesByGlob(resolveLinks bool, includeDirs bool, patterns ...string) (out []syftFile.Location, _ error) { f := unindexedDirectoryResolverFS{ u: u, } @@ -206,14 +209,62 @@ func (u UnindexedDirectory) filesByGlob(resolveLinks bool, includeDirs bool, pat return u.filesByPath(resolveLinks, includeDirs, paths...) } -func (u UnindexedDirectory) FilesByMIMEType(_ ...string) ([]file.Location, error) { - panic("FilesByMIMEType unsupported") +// FilesByMIMEType fetches all of the files which match the provided MIME types. +// This requires walking the filetree from u.base and checking the MIME type of each file we encounter +// Handling of errors while walking is ignored unless a filterFn wants the directory to be skipped. +// TODO: afero.Walk will read all files in a single directory into memory, providing the same lexical ordering +// guarantees that golang's filepath.Walk implementation provides. However, when a single directory contains +// many files, this could cause Syft to run OOM. We could consider using a custom walk function in future +// which uses Readdir with a hardcoded value N >= 1 so that entire directories aren't read into memory each time. +func (u UnindexedDirectory) FilesByMIMEType(types ...string) ([]syftFile.Location, error) { + uniqueLocations := make([]syftFile.Location, 0) + err := afero.Walk(u.fs, u.absPath(u.base), func(p string, fi fs.FileInfo, walkErr error) error { + // Ignore any path for which a filter function returns true + for _, filterFn := range u.pathFilters { + if filterFn == nil { + continue + } + + if filterErr := filterFn(u.base, p, fi, walkErr); filterErr != nil { + if errors.Is(filterErr, fs.SkipDir) { + // signal to walk() to skip this directory entirely (even if we're processing a file) + return filterErr + } + // skip this path but don't affect walk() trajectory + return nil + } + } + + // If we get here, then the MIME type of the file should be checked + mimeType := stereoscopeFile.NewMetadataFromPath(p, fi).MIMEType + for _, mType := range types { + if mimeType == mType { + // Tidy the path, same as AllLocations + p = strings.TrimPrefix(p, u.dir) + if p == "" { + return nil + } + p = strings.TrimPrefix(p, "/") + uniqueLocations = append(uniqueLocations, syftFile.NewLocation(p)) + } + } + + // Continue to walk() + return nil + }) + + if err != nil { + log.Debug(err) + return nil, err + } + + return uniqueLocations, nil } // RelativeFileByPath fetches a single file at the given path relative to the layer squash of the given reference. // This is helpful when attempting to find a file that is in the same layer or lower as another file. -func (u UnindexedDirectory) RelativeFileByPath(l file.Location, p string) *file.Location { - p = path.Clean(path.Join(l.RealPath, p)) +func (u UnindexedDirectory) RelativeFileByPath(l syftFile.Location, p string) *syftFile.Location { + p = path.Clean(p) locs, err := u.filesByPath(true, false, p) if err != nil || len(locs) == 0 { return nil @@ -228,8 +279,8 @@ func (u UnindexedDirectory) RelativeFileByPath(l file.Location, p string) *file. // - NO symlink resolution should be performed on results // - returns locations for any file or directory -func (u UnindexedDirectory) AllLocations(ctx context.Context) <-chan file.Location { - out := make(chan file.Location) +func (u UnindexedDirectory) AllLocations(ctx context.Context) <-chan syftFile.Location { + out := make(chan syftFile.Location) errWalkCanceled := fmt.Errorf("walk canceled") go func() { defer close(out) @@ -240,7 +291,7 @@ func (u UnindexedDirectory) AllLocations(ctx context.Context) <-chan file.Locati } p = strings.TrimPrefix(p, "/") select { - case out <- file.NewLocation(p): + case out <- syftFile.NewLocation(p): return nil case <-ctx.Done(): return errWalkCanceled @@ -253,11 +304,18 @@ func (u UnindexedDirectory) AllLocations(ctx context.Context) <-chan file.Locati return out } -func (u UnindexedDirectory) FileMetadataByLocation(_ file.Location) (file.Metadata, error) { - panic("FileMetadataByLocation unsupported") +func (u UnindexedDirectory) FileMetadataByLocation(loc syftFile.Location) (syftFile.Metadata, error) { + p := u.absPath(u.scrubInputPath(loc.RealPath)) + finfo, err := u.fs.Stat(p) + if err != nil { + return syftFile.Metadata{}, err + } + + metadata := stereoscopeFile.NewMetadataFromPath(p, finfo) + return metadata, nil } -func (u UnindexedDirectory) Write(location file.Location, reader io.Reader) error { +func (u UnindexedDirectory) Write(location syftFile.Location, reader io.Reader) error { filePath := location.RealPath if path.IsAbs(filePath) { filePath = filePath[1:] @@ -266,7 +324,7 @@ func (u UnindexedDirectory) Write(location file.Location, reader io.Reader) erro return afero.WriteReader(u.fs, absPath, reader) } -func (u UnindexedDirectory) newLocation(filePath string, resolveLinks bool) *file.Location { +func (u UnindexedDirectory) newLocation(filePath string, resolveLinks bool) *syftFile.Location { filePath = path.Clean(filePath) virtualPath := filePath @@ -287,7 +345,7 @@ func (u UnindexedDirectory) newLocation(filePath string, resolveLinks bool) *fil } } - l := file.NewVirtualLocation(realPath, virtualPath) + l := syftFile.NewVirtualLocation(realPath, virtualPath) return &l } diff --git a/syft/internal/fileresolver/unindexed_directory_test.go b/syft/internal/fileresolver/unindexed_directory_test.go index d21615fdd4f..55565f59fa5 100644 --- a/syft/internal/fileresolver/unindexed_directory_test.go +++ b/syft/internal/fileresolver/unindexed_directory_test.go @@ -1271,6 +1271,55 @@ func Test_WritableUnindexedDirectoryResolver(t *testing.T) { require.Equal(t, c, string(bytes)) } +func Test_UnindexedDirectoryResolver_FilesByMIMEType(t *testing.T) { + tests := []struct { + fixturePath string + mimeType string + expectedPaths *strset.Set + }{ + { + fixturePath: "./test-fixtures/image-simple", + mimeType: "text/plain", + expectedPaths: strset.New("file-1.txt", "file-2.txt", "target/really/nested/file-3.txt", "Dockerfile"), + }, + } + for _, test := range tests { + t.Run(test.fixturePath, func(t *testing.T) { + resolver := NewFromUnindexedDirectory(test.fixturePath) + locations, err := resolver.FilesByMIMEType(test.mimeType) + assert.NoError(t, err) + assert.Equal(t, test.expectedPaths.Size(), len(locations)) + for _, l := range locations { + assert.True(t, test.expectedPaths.Has(l.RealPath), "does not have path %q", l.RealPath) + } + }) + } +} + +func Test_UnindexedDirectoryResolver_RelativeFileByPath(t *testing.T) { + cases := []struct { + name string + root string + searchFile string + expected *strset.Set + }{ + { + name: "should find nested file from root", + root: "./test-fixtures/image-simple", + searchFile: "target/really/nested/file-3.txt", + expected: strset.New("target/really/nested/file-3.txt"), + }, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + resolver := NewFromUnindexedDirectory(c.root) + rootLoc := file.NewLocation(c.root) + loc := resolver.RelativeFileByPath(rootLoc, c.searchFile) + assert.True(t, c.expected.Has(loc.Coordinates.RealPath), "does not have path %q", loc.RealPath) + }) + } +} + func testWithTimeout(t *testing.T, timeout time.Duration, test func(*testing.T)) { done := make(chan bool) go func() { diff --git a/syft/source/directorysource/directory_source.go b/syft/source/directorysource/directory_source.go index 0ab06980fe5..226ca8f52e8 100644 --- a/syft/source/directorysource/directory_source.go +++ b/syft/source/directorysource/directory_source.go @@ -21,16 +21,17 @@ import ( var _ source.Source = (*directorySource)(nil) type Config struct { - Path string - Base string - Exclude source.ExcludeConfig - Alias source.Alias + Path string + Base string + Exclude source.ExcludeConfig + Alias source.Alias + Unindexed bool } type directorySource struct { id artifact.ID config Config - resolver *fileresolver.Directory + resolver file.Resolver mutex *sync.Mutex } @@ -145,12 +146,15 @@ func (s *directorySource) FileResolver(_ source.Scope) (file.Resolver, error) { // this should be the only file resolver that might have overlap with where files are cached exclusionFunctions = append(exclusionFunctions, excludeCachePathVisitors()...) - res, err := fileresolver.NewFromDirectory(s.config.Path, s.config.Base, exclusionFunctions...) - if err != nil { - return nil, fmt.Errorf("unable to create directory resolver: %w", err) + if s.config.Unindexed { + s.resolver = fileresolver.NewFromRootedUnindexedDirectory(s.config.Path, s.config.Base) + } else { + res, err := fileresolver.NewFromDirectory(s.config.Path, s.config.Base, exclusionFunctions...) + if err != nil { + return nil, fmt.Errorf("unable to create directory resolver: %w", err) + } + s.resolver = res } - - s.resolver = res } return s.resolver, nil diff --git a/syft/source/directorysource/directory_source_provider.go b/syft/source/directorysource/directory_source_provider.go index 11eed73d864..04109918e14 100644 --- a/syft/source/directorysource/directory_source_provider.go +++ b/syft/source/directorysource/directory_source_provider.go @@ -10,20 +10,22 @@ import ( "github.com/anchore/syft/syft/source" ) -func NewSourceProvider(path string, exclude source.ExcludeConfig, alias source.Alias, basePath string) source.Provider { +func NewSourceProvider(path string, exclude source.ExcludeConfig, alias source.Alias, basePath string, unindexed bool) source.Provider { return &directorySourceProvider{ - path: path, - basePath: basePath, - exclude: exclude, - alias: alias, + path: path, + basePath: basePath, + exclude: exclude, + alias: alias, + unindexed: unindexed, } } type directorySourceProvider struct { - path string - basePath string - exclude source.ExcludeConfig - alias source.Alias + path string + basePath string + exclude source.ExcludeConfig + alias source.Alias + unindexed bool } func (l directorySourceProvider) Name() string { @@ -48,10 +50,11 @@ func (l directorySourceProvider) Provide(_ context.Context) (source.Source, erro return New( Config{ - Path: location, - Base: basePath(l.basePath, location), - Exclude: l.exclude, - Alias: l.alias, + Path: location, + Base: basePath(l.basePath, location), + Exclude: l.exclude, + Alias: l.alias, + Unindexed: l.unindexed, }, ) } diff --git a/syft/source/directorysource/directory_source_test.go b/syft/source/directorysource/directory_source_test.go index 751d4766289..f3b9077da59 100644 --- a/syft/source/directorysource/directory_source_test.go +++ b/syft/source/directorysource/directory_source_test.go @@ -85,6 +85,25 @@ func TestNewFromDirectory(t *testing.T) { } } +func Test_NewFromDirectory_Unindexed(t *testing.T) { + testutil.Chdir(t, "..") // run with source/test-fixtures + + cfg := Config{ + Path: "test-fixtures", + Unindexed: true, + } + + src, err := New(cfg) + require.NoError(t, err) + t.Cleanup(func() { + require.NoError(t, src.Close()) + }) + + resolver, err := src.FileResolver("") + require.NoError(t, err) + require.IsType(t, fileresolver.UnindexedDirectory{}, resolver) +} + func Test_DirectorySource_FilesByGlob(t *testing.T) { testutil.Chdir(t, "..") // run with source/test-fixtures diff --git a/syft/source/sourceproviders/source_provider_config.go b/syft/source/sourceproviders/source_provider_config.go index dbc48bc0ba9..245df86d99a 100644 --- a/syft/source/sourceproviders/source_provider_config.go +++ b/syft/source/sourceproviders/source_provider_config.go @@ -15,6 +15,7 @@ type Config struct { Exclude source.ExcludeConfig DigestAlgorithms []crypto.Hash BasePath string + Unindexed bool } func (c *Config) WithAlias(alias source.Alias) *Config { @@ -22,6 +23,11 @@ func (c *Config) WithAlias(alias source.Alias) *Config { return c } +func (c *Config) WithUnindexed(unindexed bool) *Config { + c.Unindexed = unindexed + return c +} + func (c *Config) WithRegistryOptions(registryOptions *image.RegistryOptions) *Config { c.RegistryOptions = registryOptions return c diff --git a/syft/source/sourceproviders/source_providers.go b/syft/source/sourceproviders/source_providers.go index 107f3ad4825..a5e9f754077 100644 --- a/syft/source/sourceproviders/source_providers.go +++ b/syft/source/sourceproviders/source_providers.go @@ -27,7 +27,7 @@ func All(userInput string, cfg *Config) []collections.TaggedValue[source.Provide // --from file, dir, oci-archive, etc. Join(stereoscopeProviders.Select(FileTag, DirTag)...). Join(tagProvider(filesource.NewSourceProvider(userInput, cfg.Exclude, cfg.DigestAlgorithms, cfg.Alias), FileTag)). - Join(tagProvider(directorysource.NewSourceProvider(userInput, cfg.Exclude, cfg.Alias, cfg.BasePath), DirTag)). + Join(tagProvider(directorysource.NewSourceProvider(userInput, cfg.Exclude, cfg.Alias, cfg.BasePath, cfg.Unindexed), DirTag)). // --from docker, registry, etc. Join(stereoscopeProviders.Select(PullTag)...)