diff --git a/README.md b/README.md index d5aed59..36366f9 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ crawley -headless -delay 0 -depth -1 -dirs only http://some-test.site # installation -- [binaries](https://github.com/s0rg/crawley/releases) for Linux, FreeBSD, macOS and Windows, just download and run. +- [binaries / deb / rpm](https://github.com/s0rg/crawley/releases) for Linux, FreeBSD, macOS and Windows. - [archlinux](https://aur.archlinux.org/packages/crawley-bin/) you can use your favourite AUR helper to install it, e. g. `paru -S crawley-bin`. # usage @@ -61,7 +61,7 @@ crawley -headless -delay 0 -depth -1 -dirs only http://some-test.site ``` crawley [flags] url -possible flags: +possible flags with default values: -brute scan html comments diff --git a/cmd/crawley/main.go b/cmd/crawley/main.go index 3069cc4..664a9f8 100644 --- a/cmd/crawley/main.go +++ b/cmd/crawley/main.go @@ -8,7 +8,9 @@ import ( "io" "log" "os" + "path/filepath" "runtime" + "strings" "time" "github.com/s0rg/crawley/pkg/crawler" @@ -17,32 +19,30 @@ import ( const ( appName = "Crawley" + appHelp = "the unix-way web crawler" appSite = "https://github.com/s0rg/crawley" defaultDelay = 150 * time.Millisecond ) +// build-time values. var ( - GitHash string GitTag string + GitHash string BuildDate string defaultUA = "Mozilla/5.0 (compatible; Win64; x64) Mr." + appName + "/" + GitTag + "-" + GitHash +) - cookies, headers values.Smart - tags, ignored values.Simple - - fDepth = flag.Int("depth", 0, "scan depth (set -1 for unlimited)") - fWorkers = flag.Int("workers", runtime.NumCPU(), "number of workers") - fBrute = flag.Bool("brute", false, "scan html comments") - fNoHeads = flag.Bool("headless", false, "disable pre-flight HEAD requests") - fScanJS = flag.Bool("js", false, "scan js files for endpoints") - fSkipSSL = flag.Bool("skip-ssl", false, "skip ssl verification") - fSilent = flag.Bool("silent", false, "suppress info and error messages in stderr") - fVersion = flag.Bool("version", false, "show version") - fDirsPolicy = flag.String("dirs", "show", "policy for non-resource urls: show / hide / only") - fProxyAuth = flag.String("proxy-auth", "", "credentials for proxy: user:password") - fRobotsPolicy = flag.String("robots", "ignore", "policy for robots.txt: ignore / crawl / respect") - fUA = flag.String("user-agent", defaultUA, "user-agent string") - fDelay = flag.Duration("delay", defaultDelay, "per-request delay (0 - disable)") +// command-line flags. 
+var ( + fDepth, fWorkers int + fSilent, fVersion bool + fBrute, fNoHeads bool + fSkipSSL, fScanJS bool + fDirsPolicy, fProxyAuth string + fRobotsPolicy, fUA string + fDelay time.Duration + cookies, headers values.Smart + tags, ignored values.List ) func version() string { @@ -56,6 +56,29 @@ func version() string { ) } +func usage() { + var sb strings.Builder + + const twoCR = "\n\n" + + sb.WriteString(appName) + sb.WriteString(" - ") + sb.WriteString(appHelp) + sb.WriteString(", usage:") + sb.WriteString(twoCR) + + sb.WriteString(filepath.Base(os.Args[0])) + sb.WriteString(" [flags] url") + sb.WriteString(twoCR) + + sb.WriteString("possible flags with default values:") + sb.WriteString(twoCR) + + _, _ = os.Stderr.WriteString(sb.String()) + + flag.PrintDefaults() +} + func puts(s string) { _, _ = os.Stdout.WriteString(s + "\n") } @@ -102,14 +125,14 @@ func loadSmart() (h, c []string, err error) { } func initOptions() (rv []crawler.Option, err error) { - robots, err := crawler.ParseRobotsPolicy(*fRobotsPolicy) + robots, err := crawler.ParseRobotsPolicy(fRobotsPolicy) if err != nil { err = fmt.Errorf("robots policy: %w", err) return } - dirs, err := crawler.ParseDirsPolicy(*fDirsPolicy) + dirs, err := crawler.ParseDirsPolicy(fDirsPolicy) if err != nil { err = fmt.Errorf("dirs policy: %w", err) @@ -124,58 +147,72 @@ func initOptions() (rv []crawler.Option, err error) { } rv = []crawler.Option{ - crawler.WithUserAgent(*fUA), - crawler.WithDelay(*fDelay), - crawler.WithMaxCrawlDepth(*fDepth), - crawler.WithWorkersCount(*fWorkers), - crawler.WithSkipSSL(*fSkipSSL), - crawler.WithBruteMode(*fBrute), + crawler.WithUserAgent(fUA), + crawler.WithDelay(fDelay), + crawler.WithMaxCrawlDepth(fDepth), + crawler.WithWorkersCount(fWorkers), + crawler.WithSkipSSL(fSkipSSL), + crawler.WithBruteMode(fBrute), crawler.WithDirsPolicy(dirs), crawler.WithRobotsPolicy(robots), - crawler.WithoutHeads(*fNoHeads), - crawler.WithScanJS(*fScanJS), + crawler.WithoutHeads(fNoHeads), + crawler.WithScanJS(fScanJS), crawler.WithExtraHeaders(h), crawler.WithExtraCookies(c), crawler.WithTagsFilter(tags.Values), crawler.WithIgnored(ignored.Values), - crawler.WithProxyAuth(*fProxyAuth), + crawler.WithProxyAuth(fProxyAuth), } return rv, nil } -func main() { - flag.Var( - &headers, - "header", +func setupFlags() { + flag.Var(&headers, "header", "extra headers for request, can be used multiple times, accept files with '@'-prefix", ) - flag.Var( - &cookies, - "cookie", + flag.Var(&cookies, "cookie", "extra cookies for request, can be used multiple times, accept files with '@'-prefix", ) - flag.Var( - &tags, - "tag", - "tags filter, single or comma-separated tag names", - ) - flag.Var( - &ignored, - "ignore", - "patterns (in urls) to be ignored in crawl process", - ) + + flag.Var(&tags, "tag", "tags filter, single or comma-separated tag names") + flag.Var(&ignored, "ignore", "patterns (in urls) to be ignored in crawl process") + + flag.IntVar(&fDepth, "depth", 0, "scan depth (set -1 for unlimited)") + flag.IntVar(&fWorkers, "workers", runtime.NumCPU(), "number of workers") + + flag.BoolVar(&fBrute, "brute", false, "scan html comments") + flag.BoolVar(&fNoHeads, "headless", false, "disable pre-flight HEAD requests") + flag.BoolVar(&fScanJS, "js", false, "scan js files for endpoints") + flag.BoolVar(&fSkipSSL, "skip-ssl", false, "skip ssl verification") + flag.BoolVar(&fSilent, "silent", false, "suppress info and error messages in stderr") + flag.BoolVar(&fVersion, "version", false, "show version") + + flag.StringVar(&fDirsPolicy, 
"dirs", crawler.DefaultDirsPolicy, + "policy for non-resource urls: show / hide / only") + flag.StringVar(&fRobotsPolicy, "robots", crawler.DefaultRobotsPolicy, + "policy for robots.txt: ignore / crawl / respect") + flag.StringVar(&fUA, "user-agent", defaultUA, "user-agent string") + flag.StringVar(&fProxyAuth, "proxy-auth", "", "credentials for proxy: user:password") + + flag.DurationVar(&fDelay, "delay", defaultDelay, "per-request delay (0 - disable)") + + flag.Usage = usage +} + +func main() { + setupFlags() flag.Parse() - if *fVersion { + if fVersion { puts(version()) return } if flag.NArg() != 1 { - flag.Usage() + usage() return } @@ -185,7 +222,7 @@ func main() { log.Fatal("[-] options:", err) } - if *fSilent { + if fSilent { log.SetOutput(io.Discard) } diff --git a/pkg/crawler/config_test.go b/pkg/crawler/config_test.go index 5a55e60..ddb6715 100644 --- a/pkg/crawler/config_test.go +++ b/pkg/crawler/config_test.go @@ -190,10 +190,12 @@ func TestString(t *testing.T) { func TestProxyAuth(t *testing.T) { t.Parallel() + const creds = "user:pass" + var ( c = &config{} - opts = []Option{WithProxyAuth("user:pass")} - headers = []string{proxyAuthHdr + ": " + proxyAuthTyp + " dXNlcjpwYXNz"} + opts = []Option{WithProxyAuth(creds)} + headers = []string{proxyAuthHeader(creds)} ) for _, o := range opts { diff --git a/pkg/crawler/crawler.go b/pkg/crawler/crawler.go index cab5685..6ae86e7 100644 --- a/pkg/crawler/crawler.go +++ b/pkg/crawler/crawler.go @@ -46,6 +46,7 @@ const ( type crawlResult struct { URI string + Hash uint64 Flag taskFlag } @@ -92,8 +93,8 @@ func (c *Crawler) Run(uri string, fn func(string)) (err error) { defer c.close() - seen := make(set.URI) - seen.Add(uri) + seen := make(set.Set[uint64]) + seen.Add(urlhash(uri)) web := client.New( c.cfg.UserAgent, @@ -122,7 +123,7 @@ func (c *Crawler) Run(uri string, fn func(string)) (err error) { switch { case t.Flag == TaskDone: w-- - case seen.Add(t.URI): + case seen.TryAdd(t.Hash): if t.Flag == TaskCrawl && c.crawl(base, &t) { w++ } @@ -260,14 +261,6 @@ func (c *Crawler) crawlRobots(host *url.URL) { } } -func (c *Crawler) sitemapHandler(s string) { - c.linkHandler(atom.A, s) -} - -func (c *Crawler) jsHandler(s string) { - c.linkHandler(atom.Link, s) -} - func (c *Crawler) isIgnored(v string) (yes bool) { if len(c.cfg.Ignored) == 0 { return @@ -283,7 +276,10 @@ func (c *Crawler) isIgnored(v string) (yes bool) { } func (c *Crawler) linkHandler(a atom.Atom, s string) { - r := crawlResult{URI: s} + r := crawlResult{ + URI: s, + Hash: urlhash(s), + } fetch := (a == atom.A || a == atom.Iframe) || (c.cfg.ScanJS && a == atom.Script) @@ -328,9 +324,13 @@ func (c *Crawler) fetch( Handler: c.linkHandler, }) case isSitemap(uri): - links.ExtractSitemap(body, base, c.sitemapHandler) + links.ExtractSitemap(body, base, func(s string) { + c.linkHandler(atom.A, s) + }) case c.cfg.ScanJS && isJS(content, uri): - links.ExtractJS(body, c.jsHandler) + links.ExtractJS(body, func(s string) { + c.linkHandler(atom.Link, s) + }) } client.Discard(body) diff --git a/pkg/crawler/crawler_test.go b/pkg/crawler/crawler_test.go index 4c5a18f..f2a6c26 100644 --- a/pkg/crawler/crawler_test.go +++ b/pkg/crawler/crawler_test.go @@ -759,7 +759,7 @@ func TestCrawlerProxyAuth(t *testing.T) { ) ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - creds := r.Header.Get(proxyAuthHdr) + creds := r.Header.Get(proxyAuthKey) if creds == "" { t.Fatal("auth header empty") } @@ -769,7 +769,7 @@ func TestCrawlerProxyAuth(t *testing.T) { 
t.Fatalf("invalid fields count: %d", len(parts)) } - if !strings.EqualFold(parts[0], proxyAuthTyp) { + if !strings.EqualFold(parts[0], proxyAuthBasic) { t.Fatalf("invalid auth type: %s", parts[0]) } diff --git a/pkg/crawler/options.go b/pkg/crawler/options.go index 784d6b9..580c420 100644 --- a/pkg/crawler/options.go +++ b/pkg/crawler/options.go @@ -1,7 +1,6 @@ package crawler import ( - "encoding/base64" "time" ) @@ -109,9 +108,6 @@ func WithScanJS(v bool) Option { // WithProxyAuth enables proxy credentials. func WithProxyAuth(v string) Option { return func(c *config) { - c.Headers = append( - c.Headers, - proxyAuthHdr+": "+proxyAuthTyp+" "+base64.StdEncoding.EncodeToString([]byte(v)), - ) + c.Headers = append(c.Headers, proxyAuthHeader(v)) } } diff --git a/pkg/crawler/policies.go b/pkg/crawler/policies.go index b18bac3..dbfff85 100644 --- a/pkg/crawler/policies.go +++ b/pkg/crawler/policies.go @@ -5,6 +5,13 @@ import ( "strings" ) +const ( + // DefaultRobotsPolicy is a default policy name for robots handling. + DefaultRobotsPolicy = "ignore" + // DefaultDirsPolicy is a default policy name for non-resource URLs. + DefaultDirsPolicy = "show" +) + // ErrUnknownPolicy is returned when requested policy unknown. var ErrUnknownPolicy = errors.New("unknown policy") @@ -33,21 +40,21 @@ const ( ) // ParseRobotsPolicy parses robots policy from string. -func ParseRobotsPolicy(s string) (a RobotsPolicy, err error) { +func ParseRobotsPolicy(s string) (p RobotsPolicy, err error) { switch strings.ToLower(s) { case "ignore": - a = RobotsIgnore + p = RobotsIgnore case "crawl": - a = RobotsCrawl + p = RobotsCrawl case "respect": - a = RobotsRespect + p = RobotsRespect default: err = ErrUnknownPolicy return } - return a, nil + return p, nil } // ParseDirsPolicy parses dirs policy from string. 
diff --git a/pkg/crawler/util.go b/pkg/crawler/util.go index 55ca8cc..d1a54b3 100644 --- a/pkg/crawler/util.go +++ b/pkg/crawler/util.go @@ -1,6 +1,9 @@ package crawler import ( + "encoding/base64" + "hash/fnv" + "io" "log" "mime" "net/url" @@ -16,12 +19,13 @@ import ( ) const ( - proxyAuthHdr = "Proxy-Authorization" - proxyAuthTyp = "Basic" - contentType = "Content-Type" - contentHTML = "text/html" - contentJS = "application/javascript" - fileExtJS = ".js" + proxyAuthKey = "Proxy-Authorization" + proxyAuthBasic = "Basic" + + contentType = "Content-Type" + contentHTML = "text/html" + contentJS = "application/javascript" + fileExtJS = ".js" ) var parsableExts = make(set.Set[string]).Load( @@ -37,6 +41,10 @@ var parsableExts = make(set.Set[string]).Load( ".js", ) +func proxyAuthHeader(v string) (rv string) { + return proxyAuthKey + ": " + proxyAuthBasic + " " + base64.StdEncoding.EncodeToString([]byte(v)) +} + func prepareFilter(tags []string) links.TokenFilter { if len(tags) == 0 { return links.AllowALL @@ -182,3 +190,10 @@ func isJS(v, n string) (yes bool) { return webExt(n) == fileExtJS } + +func urlhash(s string) (rv uint64) { + hash := fnv.New64() + _, _ = io.WriteString(hash, strings.ToLower(s)) + + return hash.Sum64() +} diff --git a/pkg/crawler/util_test.go b/pkg/crawler/util_test.go index 1b6568f..8dd620d 100644 --- a/pkg/crawler/util_test.go +++ b/pkg/crawler/util_test.go @@ -226,3 +226,28 @@ func TestIsSitemap(t *testing.T) { } } } + +func TestUrlHash(t *testing.T) { + t.Parallel() + + const val = "http://test/some/path?foo" + + h1, h2 := urlhash(val), urlhash(val) + + if h1 != h2 { + t.Error("hashes mismatch") + } +} + +func TestProxyAuthHeader(t *testing.T) { + t.Parallel() + + const ( + got = "user:pass" + want = "Proxy-Authorization: Basic dXNlcjpwYXNz" + ) + + if rv := proxyAuthHeader(got); rv != want { + t.Errorf("invalid header want: '%s' got: '%s'", want, rv) + } +} diff --git a/pkg/set/set.go b/pkg/set/set.go index 4689c95..35698eb 100644 --- a/pkg/set/set.go +++ b/pkg/set/set.go @@ -1,12 +1,28 @@ package set -type Set[T comparable] map[T]stub +type ( + stub struct{} + + // Set represents hashset for comparable types. + Set[T comparable] map[T]stub +) // Add add value to set, replacing previous instances. func (s Set[T]) Add(v T) { s[v] = stub{} } +// TryAdd takes attempt to add value to set, returns false if value already exists. +func (s Set[T]) TryAdd(v T) (ok bool) { + if s.Has(v) { + return false + } + + s.Add(v) + + return true +} + // Has checks if value is already present in set. func (s Set[T]) Has(v T) (ok bool) { _, ok = s[v] diff --git a/pkg/set/set_test.go b/pkg/set/set_test.go index da50506..0775ac5 100644 --- a/pkg/set/set_test.go +++ b/pkg/set/set_test.go @@ -30,4 +30,12 @@ func TestSet(t *testing.T) { if s.Has(val3) { t.Error("has val3") } + + if !s.TryAdd(val3) { + t.Error("TryAdd(val3) == false") + } + + if s.TryAdd(val3) { + t.Error("TryAdd(val3) == true") + } } diff --git a/pkg/set/stub.go b/pkg/set/stub.go deleted file mode 100644 index 3d99ebb..0000000 --- a/pkg/set/stub.go +++ /dev/null @@ -1,3 +0,0 @@ -package set - -type stub struct{} diff --git a/pkg/set/uri.go b/pkg/set/uri.go deleted file mode 100644 index fbf0421..0000000 --- a/pkg/set/uri.go +++ /dev/null @@ -1,29 +0,0 @@ -package set - -import ( - "hash/fnv" - "io" - "strings" -) - -// URI holds set of uint64 hashes. 
-type URI map[uint64]stub - -func (u URI) Add(v string) (ok bool) { - h := hash(v) - - if _, ok = u[h]; ok { - return false - } - - u[h] = stub{} - - return true -} - -func hash(s string) (rv uint64) { - hash := fnv.New64() - _, _ = io.WriteString(hash, strings.ToLower(s)) - - return hash.Sum64() -} diff --git a/pkg/set/uri_test.go b/pkg/set/uri_test.go deleted file mode 100644 index 2119e66..0000000 --- a/pkg/set/uri_test.go +++ /dev/null @@ -1,44 +0,0 @@ -package set - -import ( - "testing" -) - -func TestURI(t *testing.T) { - t.Parallel() - - s := make(URI) - - const ( - val1 = "http://test/1" - val2 = "http://test/2" - ) - - if !s.Add(val1) { - t.Errorf("add val1 - step 1 failure") - } - - if !s.Add(val2) { - t.Errorf("add val2 - step 1 failure") - } - - if s.Add(val1) { - t.Errorf("add val1 - step 2 failure") - } - - if s.Add(val2) { - t.Errorf("add val2 - step 2 failure") - } -} - -func TestHash(t *testing.T) { - t.Parallel() - - const val = "http://test/some/path?foo" - - h1, h2 := hash(val), hash(val) - - if h1 != h2 { - t.Error("hashes mismatch") - } -} diff --git a/pkg/values/simple.go b/pkg/values/list.go similarity index 70% rename from pkg/values/simple.go rename to pkg/values/list.go index 54153d8..37100b5 100644 --- a/pkg/values/simple.go +++ b/pkg/values/list.go @@ -2,11 +2,11 @@ package values import "strings" -type Simple struct { +type List struct { Values []string } -func (s *Simple) Set(val string) (err error) { +func (s *List) Set(val string) (err error) { switch { case strings.ContainsRune(val, ','): s.Values = append(s.Values, strings.Split(val, ",")...) @@ -17,6 +17,6 @@ func (s *Simple) Set(val string) (err error) { return } -func (s *Simple) String() (rv string) { +func (s *List) String() (rv string) { return strings.Join(s.Values, ",") } diff --git a/pkg/values/simple_test.go b/pkg/values/list_test.go similarity index 97% rename from pkg/values/simple_test.go rename to pkg/values/list_test.go index 470175f..609cae7 100644 --- a/pkg/values/simple_test.go +++ b/pkg/values/list_test.go @@ -8,7 +8,7 @@ func TestSimpleSet(t *testing.T) { t.Parallel() var ( - l Simple + l List err error ) @@ -40,7 +40,7 @@ func TestSimpleSet(t *testing.T) { func TestSimpleString(t *testing.T) { t.Parallel() - var l Simple + var l List if l.String() != "" { t.Fatal("non-empty result")
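
To make the seen-URL refactor above concrete: pkg/set/uri.go and pkg/set/stub.go are gone, and deduplication now combines the generic set.Set[uint64] with the crawler-local urlhash helper. Below is a minimal sketch of that interplay — not part of the patch; it re-implements urlhash locally (the real helper in pkg/crawler/util.go is unexported) and assumes the module path used in the cmd/crawley/main.go imports:

    package main

    import (
    	"fmt"
    	"hash/fnv"
    	"io"
    	"strings"

    	"github.com/s0rg/crawley/pkg/set"
    )

    // urlhash mirrors pkg/crawler/util.go: a 64-bit FNV hash of the lower-cased
    // URL, so the crawler stores 8-byte hashes instead of full URL strings and
    // dedup stays case-insensitive.
    func urlhash(s string) uint64 {
    	h := fnv.New64()
    	_, _ = io.WriteString(h, strings.ToLower(s))

    	return h.Sum64()
    }

    func main() {
    	seen := make(set.Set[uint64])

    	for _, u := range []string{
    		"http://test/1",
    		"http://TEST/1", // same hash after ToLower, so TryAdd reports false
    		"http://test/2",
    	} {
    		if seen.TryAdd(urlhash(u)) {
    			fmt.Println("crawl:", u)

    			continue
    		}

    		fmt.Println("skip :", u)
    	}
    }

Splitting TryAdd out of Add keeps Set usable both as a plain hashset (Add/Has, as with parsableExts) and as a first-seen filter (TryAdd, as in Crawler.Run), which is what allows the dedicated URI set type to be deleted.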