Release/v1.5.3 (#40)
* code grooming, better usage info
s0rg authored Nov 2, 2022
1 parent ed2cdb2 commit 2daad72
Showing 16 changed files with 196 additions and 166 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -53,15 +53,15 @@ crawley -headless -delay 0 -depth -1 -dirs only http://some-test.site

# installation

- [binaries](https://github.com/s0rg/crawley/releases) for Linux, FreeBSD, macOS and Windows, just download and run.
- [binaries / deb / rpm](https://github.com/s0rg/crawley/releases) for Linux, FreeBSD, macOS and Windows.
- [archlinux](https://aur.archlinux.org/packages/crawley-bin/) you can use your favourite AUR helper to install it, e. g. `paru -S crawley-bin`.

# usage

```
crawley [flags] url
possible flags:
possible flags with default values:
-brute
scan html comments
133 changes: 85 additions & 48 deletions cmd/crawley/main.go
@@ -8,7 +8,9 @@ import (
"io"
"log"
"os"
"path/filepath"
"runtime"
"strings"
"time"

"github.com/s0rg/crawley/pkg/crawler"
@@ -17,32 +19,30 @@ import (

const (
appName = "Crawley"
appHelp = "the unix-way web crawler"
appSite = "https://github.com/s0rg/crawley"
defaultDelay = 150 * time.Millisecond
)

// build-time values.
var (
GitHash string
GitTag string
GitHash string
BuildDate string
defaultUA = "Mozilla/5.0 (compatible; Win64; x64) Mr." + appName + "/" + GitTag + "-" + GitHash
)

cookies, headers values.Smart
tags, ignored values.Simple

fDepth = flag.Int("depth", 0, "scan depth (set -1 for unlimited)")
fWorkers = flag.Int("workers", runtime.NumCPU(), "number of workers")
fBrute = flag.Bool("brute", false, "scan html comments")
fNoHeads = flag.Bool("headless", false, "disable pre-flight HEAD requests")
fScanJS = flag.Bool("js", false, "scan js files for endpoints")
fSkipSSL = flag.Bool("skip-ssl", false, "skip ssl verification")
fSilent = flag.Bool("silent", false, "suppress info and error messages in stderr")
fVersion = flag.Bool("version", false, "show version")
fDirsPolicy = flag.String("dirs", "show", "policy for non-resource urls: show / hide / only")
fProxyAuth = flag.String("proxy-auth", "", "credentials for proxy: user:password")
fRobotsPolicy = flag.String("robots", "ignore", "policy for robots.txt: ignore / crawl / respect")
fUA = flag.String("user-agent", defaultUA, "user-agent string")
fDelay = flag.Duration("delay", defaultDelay, "per-request delay (0 - disable)")
// command-line flags.
var (
fDepth, fWorkers int
fSilent, fVersion bool
fBrute, fNoHeads bool
fSkipSSL, fScanJS bool
fDirsPolicy, fProxyAuth string
fRobotsPolicy, fUA string
fDelay time.Duration
cookies, headers values.Smart
tags, ignored values.List
)

func version() string {
@@ -56,6 +56,29 @@ func version() string {
)
}

func usage() {
var sb strings.Builder

const twoCR = "\n\n"

sb.WriteString(appName)
sb.WriteString(" - ")
sb.WriteString(appHelp)
sb.WriteString(", usage:")
sb.WriteString(twoCR)

sb.WriteString(filepath.Base(os.Args[0]))
sb.WriteString(" [flags] url")
sb.WriteString(twoCR)

sb.WriteString("possible flags with default values:")
sb.WriteString(twoCR)

_, _ = os.Stderr.WriteString(sb.String())

flag.PrintDefaults()
}

func puts(s string) {
_, _ = os.Stdout.WriteString(s + "\n")
}
@@ -102,14 +125,14 @@ func loadSmart() (h, c []string, err error) {
}

func initOptions() (rv []crawler.Option, err error) {
robots, err := crawler.ParseRobotsPolicy(*fRobotsPolicy)
robots, err := crawler.ParseRobotsPolicy(fRobotsPolicy)
if err != nil {
err = fmt.Errorf("robots policy: %w", err)

return
}

dirs, err := crawler.ParseDirsPolicy(*fDirsPolicy)
dirs, err := crawler.ParseDirsPolicy(fDirsPolicy)
if err != nil {
err = fmt.Errorf("dirs policy: %w", err)

@@ -124,58 +147,72 @@ func initOptions() (rv []crawler.Option, err error) {
}

rv = []crawler.Option{
crawler.WithUserAgent(*fUA),
crawler.WithDelay(*fDelay),
crawler.WithMaxCrawlDepth(*fDepth),
crawler.WithWorkersCount(*fWorkers),
crawler.WithSkipSSL(*fSkipSSL),
crawler.WithBruteMode(*fBrute),
crawler.WithUserAgent(fUA),
crawler.WithDelay(fDelay),
crawler.WithMaxCrawlDepth(fDepth),
crawler.WithWorkersCount(fWorkers),
crawler.WithSkipSSL(fSkipSSL),
crawler.WithBruteMode(fBrute),
crawler.WithDirsPolicy(dirs),
crawler.WithRobotsPolicy(robots),
crawler.WithoutHeads(*fNoHeads),
crawler.WithScanJS(*fScanJS),
crawler.WithoutHeads(fNoHeads),
crawler.WithScanJS(fScanJS),
crawler.WithExtraHeaders(h),
crawler.WithExtraCookies(c),
crawler.WithTagsFilter(tags.Values),
crawler.WithIgnored(ignored.Values),
crawler.WithProxyAuth(*fProxyAuth),
crawler.WithProxyAuth(fProxyAuth),
}

return rv, nil
}

func main() {
flag.Var(
&headers,
"header",
func setupFlags() {
flag.Var(&headers, "header",
"extra headers for request, can be used multiple times, accept files with '@'-prefix",
)
flag.Var(
&cookies,
"cookie",
flag.Var(&cookies, "cookie",
"extra cookies for request, can be used multiple times, accept files with '@'-prefix",
)
flag.Var(
&tags,
"tag",
"tags filter, single or comma-separated tag names",
)
flag.Var(
&ignored,
"ignore",
"patterns (in urls) to be ignored in crawl process",
)

flag.Var(&tags, "tag", "tags filter, single or comma-separated tag names")
flag.Var(&ignored, "ignore", "patterns (in urls) to be ignored in crawl process")

flag.IntVar(&fDepth, "depth", 0, "scan depth (set -1 for unlimited)")
flag.IntVar(&fWorkers, "workers", runtime.NumCPU(), "number of workers")

flag.BoolVar(&fBrute, "brute", false, "scan html comments")
flag.BoolVar(&fNoHeads, "headless", false, "disable pre-flight HEAD requests")
flag.BoolVar(&fScanJS, "js", false, "scan js files for endpoints")
flag.BoolVar(&fSkipSSL, "skip-ssl", false, "skip ssl verification")
flag.BoolVar(&fSilent, "silent", false, "suppress info and error messages in stderr")
flag.BoolVar(&fVersion, "version", false, "show version")

flag.StringVar(&fDirsPolicy, "dirs", crawler.DefaultDirsPolicy,
"policy for non-resource urls: show / hide / only")
flag.StringVar(&fRobotsPolicy, "robots", crawler.DefaultRobotsPolicy,
"policy for robots.txt: ignore / crawl / respect")
flag.StringVar(&fUA, "user-agent", defaultUA, "user-agent string")
flag.StringVar(&fProxyAuth, "proxy-auth", "", "credentials for proxy: user:password")

flag.DurationVar(&fDelay, "delay", defaultDelay, "per-request delay (0 - disable)")

flag.Usage = usage
}

func main() {
setupFlags()

flag.Parse()

if *fVersion {
if fVersion {
puts(version())

return
}

if flag.NArg() != 1 {
flag.Usage()
usage()

return
}
Expand All @@ -185,7 +222,7 @@ func main() {
log.Fatal("[-] options:", err)
}

if *fSilent {
if fSilent {
log.SetOutput(io.Discard)
}

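The main.go change above replaces the package-level `flag.Int`/`flag.Bool` calls (which return pointers, hence the `*fDepth`-style dereferences removed throughout the diff) with `flag.IntVar`/`flag.BoolVar` bindings to plain variables, collected in `setupFlags()`, and wires the new `usage()` banner into `flag.Usage`. A minimal standalone sketch of that pattern, with illustrative flag names rather than crawley's full set:

```
package main

import (
	"flag"
	"fmt"
	"os"
	"path/filepath"
)

var (
	depth  int  // illustrative stand-in for fDepth
	silent bool // illustrative stand-in for fSilent
)

func setupFlags() {
	flag.IntVar(&depth, "depth", 0, "scan depth (set -1 for unlimited)")
	flag.BoolVar(&silent, "silent", false, "suppress info and error messages in stderr")

	// custom usage banner, written to stderr before the generated flag defaults
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "%s [flags] url\n\npossible flags with default values:\n\n",
			filepath.Base(os.Args[0]))
		flag.PrintDefaults()
	}
}

func main() {
	setupFlags()
	flag.Parse()

	if flag.NArg() != 1 {
		flag.Usage()

		return
	}

	fmt.Println("url:", flag.Arg(0), "depth:", depth, "silent:", silent)
}
```

Binding flags to plain variables also lets the test-unfriendly package-level pointers disappear, which is what allows the simpler `if fVersion` / `if fSilent` checks later in main().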
6 changes: 4 additions & 2 deletions pkg/crawler/config_test.go
@@ -190,10 +190,12 @@ func TestString(t *testing.T) {
func TestProxyAuth(t *testing.T) {
t.Parallel()

const creds = "user:pass"

var (
c = &config{}
opts = []Option{WithProxyAuth("user:pass")}
headers = []string{proxyAuthHdr + ": " + proxyAuthTyp + " dXNlcjpwYXNz"}
opts = []Option{WithProxyAuth(creds)}
headers = []string{proxyAuthHeader(creds)}
)

for _, o := range opts {
28 changes: 14 additions & 14 deletions pkg/crawler/crawler.go
@@ -46,6 +46,7 @@ const (

type crawlResult struct {
URI string
Hash uint64
Flag taskFlag
}

@@ -92,8 +93,8 @@ func (c *Crawler) Run(uri string, fn func(string)) (err error) {

defer c.close()

seen := make(set.URI)
seen.Add(uri)
seen := make(set.Set[uint64])
seen.Add(urlhash(uri))

web := client.New(
c.cfg.UserAgent,
@@ -122,7 +123,7 @@
switch {
case t.Flag == TaskDone:
w--
case seen.Add(t.URI):
case seen.TryAdd(t.Hash):
if t.Flag == TaskCrawl && c.crawl(base, &t) {
w++
}
@@ -260,14 +261,6 @@ func (c *Crawler) crawlRobots(host *url.URL) {
}
}

func (c *Crawler) sitemapHandler(s string) {
c.linkHandler(atom.A, s)
}

func (c *Crawler) jsHandler(s string) {
c.linkHandler(atom.Link, s)
}

func (c *Crawler) isIgnored(v string) (yes bool) {
if len(c.cfg.Ignored) == 0 {
return
@@ -283,7 +276,10 @@
}

func (c *Crawler) linkHandler(a atom.Atom, s string) {
r := crawlResult{URI: s}
r := crawlResult{
URI: s,
Hash: urlhash(s),
}

fetch := (a == atom.A || a == atom.Iframe) ||
(c.cfg.ScanJS && a == atom.Script)
@@ -328,9 +324,13 @@ func (c *Crawler) fetch(
Handler: c.linkHandler,
})
case isSitemap(uri):
links.ExtractSitemap(body, base, c.sitemapHandler)
links.ExtractSitemap(body, base, func(s string) {
c.linkHandler(atom.A, s)
})
case c.cfg.ScanJS && isJS(content, uri):
links.ExtractJS(body, c.jsHandler)
links.ExtractJS(body, func(s string) {
c.linkHandler(atom.Link, s)
})
}

client.Discard(body)
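The crawler.go change swaps the string-keyed `set.URI` for a generic `set.Set[uint64]` keyed by `urlhash`, and carries the hash in `crawlResult` so the crawl loop de-duplicates via `seen.TryAdd(t.Hash)`. Neither the internal `set` package nor `urlhash` appears in this excerpt, so the sketch below is only an assumption of what they could look like (collapsed into one file for brevity, FNV-1a picked arbitrarily), not crawley's actual implementation:

```
package main

import (
	"fmt"
	"hash/fnv"
)

// Set is a generic membership set keyed by any comparable type
// (the real code keeps this in an internal set package).
type Set[T comparable] map[T]struct{}

// Add inserts v unconditionally.
func (s Set[T]) Add(v T) { s[v] = struct{}{} }

// TryAdd inserts v and reports whether it was absent before —
// the shape the crawl loop relies on in `case seen.TryAdd(t.Hash):`.
func (s Set[T]) TryAdd(v T) bool {
	if _, ok := s[v]; ok {
		return false
	}

	s[v] = struct{}{}

	return true
}

// urlhash reduces a URL to a uint64 so the seen-set stores fixed-size
// hashes instead of full URL strings; the hash function is an assumption.
func urlhash(s string) uint64 {
	h := fnv.New64a()
	_, _ = h.Write([]byte(s))

	return h.Sum64()
}

func main() {
	seen := make(Set[uint64])
	seen.Add(urlhash("https://example.com/"))

	fmt.Println(seen.TryAdd(urlhash("https://example.com/")))  // false: already seen
	fmt.Println(seen.TryAdd(urlhash("https://example.com/a"))) // true: new URL
}
```

Storing hashes instead of URL strings keeps the seen-set's memory footprint fixed per entry, which matters on large crawls.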
4 changes: 2 additions & 2 deletions pkg/crawler/crawler_test.go
@@ -759,7 +759,7 @@ func TestCrawlerProxyAuth(t *testing.T) {
)

ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
creds := r.Header.Get(proxyAuthHdr)
creds := r.Header.Get(proxyAuthKey)
if creds == "" {
t.Fatal("auth header empty")
}
@@ -769,7 +769,7 @@
t.Fatalf("invalid fields count: %d", len(parts))
}

if !strings.EqualFold(parts[0], proxyAuthTyp) {
if !strings.EqualFold(parts[0], proxyAuthBasic) {
t.Fatalf("invalid auth type: %s", parts[0])
}

6 changes: 1 addition & 5 deletions pkg/crawler/options.go
@@ -1,7 +1,6 @@
package crawler

import (
"encoding/base64"
"time"
)

@@ -109,9 +108,6 @@ func WithScanJS(v bool) Option {
// WithProxyAuth enables proxy credentials.
func WithProxyAuth(v string) Option {
return func(c *config) {
c.Headers = append(
c.Headers,
proxyAuthHdr+": "+proxyAuthTyp+" "+base64.StdEncoding.EncodeToString([]byte(v)),
)
c.Headers = append(c.Headers, proxyAuthHeader(v))
}
}
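
`WithProxyAuth` now delegates header construction to a `proxyAuthHeader` helper whose body is not part of this excerpt. Based on the inline code it replaces and the `dXNlcjpwYXNz` value asserted in config_test.go (base64 of `user:pass`), it plausibly looks like the sketch below; the constant names follow the renamed identifiers seen in crawler_test.go, but their values are assumptions:

```
package main

import (
	"encoding/base64"
	"fmt"
)

// Assumed values: the standard proxy auth header and Basic scheme.
const (
	proxyAuthKey   = "Proxy-Authorization"
	proxyAuthBasic = "Basic"
)

// proxyAuthHeader builds a ready-to-send header line from "user:password".
func proxyAuthHeader(creds string) string {
	return proxyAuthKey + ": " + proxyAuthBasic + " " +
		base64.StdEncoding.EncodeToString([]byte(creds))
}

func main() {
	// matches the value checked in config_test.go above
	fmt.Println(proxyAuthHeader("user:pass")) // Proxy-Authorization: Basic dXNlcjpwYXNz
}
```

Pulling the encoding into one helper lets the option, the config test, and the crawler test all agree on a single source of truth for the header format.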
17 changes: 12 additions & 5 deletions pkg/crawler/policies.go
@@ -5,6 +5,13 @@ import (
"strings"
)

const (
// DefaultRobotsPolicy is a default policy name for robots handling.
DefaultRobotsPolicy = "ignore"
// DefaultDirsPolicy is a default policy name for non-resource URLs.
DefaultDirsPolicy = "show"
)

// ErrUnknownPolicy is returned when requested policy unknown.
var ErrUnknownPolicy = errors.New("unknown policy")

@@ -33,21 +40,21 @@
)

// ParseRobotsPolicy parses robots policy from string.
func ParseRobotsPolicy(s string) (a RobotsPolicy, err error) {
func ParseRobotsPolicy(s string) (p RobotsPolicy, err error) {
switch strings.ToLower(s) {
case "ignore":
a = RobotsIgnore
p = RobotsIgnore
case "crawl":
a = RobotsCrawl
p = RobotsCrawl
case "respect":
a = RobotsRespect
p = RobotsRespect
default:
err = ErrUnknownPolicy

return
}

return a, nil
return p, nil
}

// ParseDirsPolicy parses dirs policy from string.
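Exporting `DefaultRobotsPolicy` and `DefaultDirsPolicy` lets main.go pass them as flag defaults, so the CLI defaults and the parsers stay in one place. A brief usage sketch, assuming the import path seen in main.go:

```
package main

import (
	"fmt"
	"log"

	"github.com/s0rg/crawley/pkg/crawler"
)

func main() {
	// DefaultRobotsPolicy is "ignore", the same string main.go now hands to
	// flag.StringVar, so the flag default can never drift from the parser.
	p, err := crawler.ParseRobotsPolicy(crawler.DefaultRobotsPolicy)
	if err != nil {
		log.Fatal("robots policy:", err)
	}

	fmt.Println("parsed robots policy:", p)
}
```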