Feature/css parser (#83)
* js/css parsers
s0rg authored Nov 19, 2023
1 parent 678d2b0 commit 1618fbf
Showing 42 changed files with 434 additions and 129 deletions.
6 changes: 5 additions & 1 deletion .golangci.yml
@@ -20,6 +20,7 @@ linters:
- tagliatelle
- nosnakecase
- exhaustruct
+ - inamedparam
- exhaustive
- varnamelen
- interfacer
@@ -40,6 +41,8 @@ linters-settings:
govet:
check-shadowing: true
enable-all: true
+ cyclop:
+   max-complexity: 15
gocritic:
enabled-tags:
- performance
@@ -49,7 +52,7 @@

issues:
exclude-rules:
- - path: pkg/client/http.go
+ - path: internal/client/http.go
text: "G402" # G402: TLS InsecureSkipVerify set true.
linters:
- gosec
@@ -59,6 +62,7 @@ issues:
- ifshort
- cyclop
- funlen
+ - dupl
- path: cmd/crawley/main.go
linters:
- nakedret
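
In short: `inamedparam` joins the disabled linters, the new `cyclop` block lifts the per-function cyclomatic-complexity ceiling to 15 (the linter defaults to 10), and the excluded paths now live under `internal/` rather than `pkg/`, matching the package move further down.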
20 changes: 11 additions & 9 deletions README.md
@@ -18,20 +18,20 @@ Crawls web pages and prints any link it can find.

# features

- - fast html SAX-parser (powered by `golang.org/x/net/html`)
+ - fast html SAX-parser (powered by [x/net/html](https://golang.org/x/net/html))
+ - js/css lexical parsers (powered by [tdewolff/parse](https://github.com/tdewolff/parse)) - extract api endpoints from js code and `url()` properties
- small (below 1500 SLOC), idiomatic, 100% test covered codebase
- grabs most of the useful resource urls (pics, videos, audios, forms, etc...)
- found urls are streamed to stdout and guaranteed to be unique (with fragments omitted)
- scan depth (limited by starting host and path, by default - 0) can be configured
- - can crawl rules and sitemaps from `robots.txt`
+ - can be polite - crawl rules and sitemaps from `robots.txt`
- `brute` mode - scan html comments for urls (this can lead to bogus results)
- make use of `HTTP_PROXY` / `HTTPS_PROXY` environment values + handles proxy auth (use `HTTP_PROXY="socks5://127.0.0.1:1080/" crawley` for socks5)
- directory-only scan mode (aka `fast-scan`)
- user-defined cookies, in curl-compatible format (i.e. `-cookie "ONE=1; TWO=2" -cookie "ITS=ME" -cookie @cookie-file`)
- user-defined headers, same as curl: `-header "ONE: 1" -header "TWO: 2" -header @headers-file`
- tag filter - allows specifying tags to crawl for (single: `-tag a -tag form`, multiple: `-tag a,form`, or mixed)
- url ignore - allows excluding urls that contain given substrings from the crawl (i.e.: `-ignore logout`)
- - js parser - extract api endpoints from js code, this done by regexp, so results can be messy

# examples

@@ -64,26 +64,28 @@ crawley [flags] url
possible flags with default values:
+ -all
+ scan all known sources (js/css/...)
-brute
scan html comments
-cookie value
extra cookies for request, can be used multiple times, accept files with '@'-prefix
+ -css
+ scan css for urls
-delay duration
per-request delay (0 - disable) (default 150ms)
-depth int
- scan depth (-1 - unlimited)
+ scan depth (set -1 for unlimited)
-dirs string
policy for non-resource urls: show / hide / only (default "show")
-header value
extra headers for request, can be used multiple times, accept files with '@'-prefix
-headless
disable pre-flight HEAD requests
-help
this flags (and their defaults) description
-ignore value
patterns (in urls) to be ignored in crawl process
-js
- scan js files for endpoints
+ scan js code for endpoints
-proxy-auth string
credentials for proxy: user:password
-robots string
@@ -93,15 +95,15 @@ possible flags with default values:
-skip-ssl
skip ssl verification
-tag value
- tags filter, single or comma-separated tag names allowed
+ tags filter, single or comma-separated tag names
-timeout duration
request timeout (min: 1 second, max: 10 minutes) (default 5s)
-user-agent string
user-agent string
-version
show version
-workers int
number of workers (default - number of CPU cores)
```
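
Note that `-all` is shorthand for enabling every lexical source at once; judging by the `parseFlags` change below, `crawley -all <url>` behaves the same as `crawley -js -css <url>`.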

# flags autocompletion
4 changes: 2 additions & 2 deletions SECURITY.md
@@ -4,8 +4,8 @@

| Version | Supported |
| ------- | ------------------ |
- | 1.6.x | :white_check_mark: |
- | < 1.6 | :x: |
+ | 1.7.x | :white_check_mark: |
+ | < 1.7 | :x: |

## Reporting a Vulnerability

22 changes: 16 additions & 6 deletions cmd/crawley/main.go
@@ -15,8 +15,8 @@ import (

"github.com/s0rg/compflag"

"github.com/s0rg/crawley/pkg/crawler"
"github.com/s0rg/crawley/pkg/values"
"github.com/s0rg/crawley/internal/crawler"
"github.com/s0rg/crawley/internal/values"
)

const (
@@ -40,6 +40,7 @@ var (
fSilent, fVersion bool
fBrute, fNoHeads bool
fSkipSSL, fScanJS bool
+ fScanCSS, fScanALL bool
fDirsPolicy, fProxyAuth string
fRobotsPolicy, fUA string
fDelay time.Duration
@@ -116,7 +117,7 @@ func loadSmart() (h, c []string, err error) {
return h, c, nil
}

- func initOptions() (rv []crawler.Option, err error) {
+ func parseFlags() (rv []crawler.Option, err error) {
robots, err := crawler.ParseRobotsPolicy(fRobotsPolicy)
if err != nil {
err = fmt.Errorf("robots policy: %w", err)
@@ -138,6 +139,12 @@ func initOptions() (rv []crawler.Option, err error) {
return
}

+ scanJS, scanCSS := fScanJS, fScanCSS

+ if fScanALL {
+ scanJS, scanCSS = true, true
+ }

rv = []crawler.Option{
crawler.WithUserAgent(fUA),
crawler.WithDelay(fDelay),
@@ -148,7 +155,8 @@ func initOptions() (rv []crawler.Option, err error) {
crawler.WithDirsPolicy(dirs),
crawler.WithRobotsPolicy(robots),
crawler.WithoutHeads(fNoHeads),
- crawler.WithScanJS(fScanJS),
+ crawler.WithScanJS(scanJS),
+ crawler.WithScanCSS(scanCSS),
crawler.WithExtraHeaders(uheaders),
crawler.WithExtraCookies(ucookies),
crawler.WithTagsFilter(tags.Values),
@@ -174,9 +182,11 @@ func setupFlags() {
flag.IntVar(&fDepth, "depth", 0, "scan depth (set -1 for unlimited)")
flag.IntVar(&fWorkers, "workers", runtime.NumCPU(), "number of workers")

flag.BoolVar(&fScanALL, "all", false, "scan all known sources (js/css/...)")
flag.BoolVar(&fBrute, "brute", false, "scan html comments")
flag.BoolVar(&fScanCSS, "css", false, "scan css for urls")
flag.BoolVar(&fNoHeads, "headless", false, "disable pre-flight HEAD requests")
flag.BoolVar(&fScanJS, "js", false, "scan js files for endpoints")
flag.BoolVar(&fScanJS, "js", false, "scan js code for endpoints")
flag.BoolVar(&fSkipSSL, "skip-ssl", false, "skip ssl verification")
flag.BoolVar(&fSilent, "silent", false, "suppress info and error messages in stderr")
flag.BoolVar(&fVersion, "version", false, "show version")
@@ -215,7 +225,7 @@ func main() {
return
}

- opts, err := initOptions()
+ opts, err := parseFlags()
if err != nil {
log.Fatal("[-] options:", err)
}
5 changes: 3 additions & 2 deletions go.mod
@@ -1,9 +1,10 @@
module github.com/s0rg/crawley

- go 1.21.3
+ go 1.21.4

require (
github.com/s0rg/compflag v1.1.0
github.com/s0rg/set v1.2.0
- golang.org/x/net v0.17.0
+ github.com/tdewolff/parse/v2 v2.7.5
+ golang.org/x/net v0.18.0
)
8 changes: 6 additions & 2 deletions go.sum
@@ -2,5 +2,9 @@ github.com/s0rg/compflag v1.1.0 h1:xhCUPLy+5Ue/Q9I/nIcLti2Ul6P42JYx4UvtYoDXmlQ=
github.com/s0rg/compflag v1.1.0/go.mod h1:XMntVpc3+jpmBe0s8xo4w9swH8T9ARGkMC9HFiDRoUw=
github.com/s0rg/set v1.2.0 h1:53b207YMktNQJXYei/oHuTR5oOO2e9+eieZOncYsh9g=
github.com/s0rg/set v1.2.0/go.mod h1:xz3nDbjF4nyMLvAHvmE7rigXpNrKKTsi6iANznIB1/4=
- golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM=
- golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE=
+ github.com/tdewolff/parse/v2 v2.7.5 h1:RdcN3Ja6zAMSvnxxO047xRoWexX3RrXKi3H6EQHzXto=
+ github.com/tdewolff/parse/v2 v2.7.5/go.mod h1:3FbJWZp3XT9OWVN3Hmfp0p/a08v4h8J9W1aghka0soA=
+ github.com/tdewolff/test v1.0.11-0.20231101010635-f1265d231d52 h1:gAQliwn+zJrkjAHVcBEYW/RFvd2St4yYimisvozAYlA=
+ github.com/tdewolff/test v1.0.11-0.20231101010635-f1265d231d52/go.mod h1:6DAvZliBAAnD7rhVgwaM7DE5/d9NMOAJ09SqYqeK4QE=
+ golang.org/x/net v0.18.0 h1:mIYleuAkSbHh0tCv7RvjL3F6ZVbLjq4+R7zbOn3Kokg=
+ golang.org/x/net v0.18.0/go.mod h1:/czyP5RqHAH4odGYxBJ1qz0+CE5WZ+2j1YgoEo8F2jQ=
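
The only new dependency is `tdewolff/parse`, which provides the js/css lexers. As a rough, hypothetical sketch of the kind of extraction it enables - not crawley's actual code - pulling `url(...)` tokens out of a stylesheet with the library's css lexer looks like this:

```go
package main

import (
	"fmt"

	"github.com/tdewolff/parse/v2"
	"github.com/tdewolff/parse/v2/css"
)

func main() {
	// Feed a css fragment to the lexer and pick out url(...) tokens.
	in := parse.NewInputString(`body { background: url(/img/bg.png); }`)
	lex := css.NewLexer(in)

	for {
		tt, data := lex.Next()
		switch tt {
		case css.ErrorToken: // io.EOF, or a real lexing error - stop either way
			return
		case css.URLToken:
			fmt.Println(string(data)) // prints: url(/img/bg.png)
		}
	}
}
```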
9 files renamed without changes (pkg/ → internal/).
13 changes: 9 additions & 4 deletions pkg/crawler/config.go → internal/crawler/config.go
@@ -5,7 +5,7 @@ import (
"strings"
"time"

"github.com/s0rg/crawley/pkg/client"
"github.com/s0rg/crawley/internal/client"
)

const (
@@ -28,6 +28,7 @@ type config struct {
Brute bool
NoHEAD bool
ScanJS bool
+ ScanCSS bool
}

func (c *config) validate() {
@@ -46,12 +47,16 @@ func (c *config) String() (rv string) {
sb.WriteString(" brute: on")
}

+ if c.Delay > 0 {
+ fmt.Fprintf(&sb, " delay: %s", c.Delay)
+ }

if c.ScanJS {
sb.WriteString(" js: on")
sb.WriteString(" +js")
}

- if c.Delay > 0 {
- fmt.Fprintf(&sb, " delay: %s", c.Delay)
+ if c.ScanCSS {
+ sb.WriteString(" +css")
}

return sb.String()
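`main.go` above passes `crawler.WithScanCSS(scanCSS)`, but the option's definition is outside this diff. Assuming crawley follows the usual functional-options pattern over the config struct shown here, a sketch of its likely shape (hypothetical reconstruction, not the repository's actual code):

```go
// Minimal stand-in for the config struct from the diff above.
type config struct {
	ScanJS  bool
	ScanCSS bool
}

// Option is assumed to be a closure over *config, per the With* naming.
type Option func(*config)

// WithScanCSS would toggle css scanning on the crawler config.
func WithScanCSS(v bool) Option {
	return func(c *config) {
		c.ScanCSS = v
	}
}

// Applying options is then just a loop over the slice built in parseFlags.
func newConfig(opts ...Option) *config {
	c := &config{}
	for _, o := range opts {
		o(c)
	}

	return c
}
```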
17 changes: 11 additions & 6 deletions pkg/crawler/config_test.go → internal/crawler/config_test.go
@@ -6,7 +6,7 @@ import (
"testing"
"time"

"github.com/s0rg/crawley/pkg/client"
"github.com/s0rg/crawley/internal/client"
)

func TestValidate(t *testing.T) {
@@ -153,10 +153,11 @@ func TestString(t *testing.T) {
t.Parallel()

c := &config{
Client: client.Config{Workers: 13},
Depth: 666,
Brute: true,
ScanJS: true,
+ ScanCSS: true,
}

c.validate()
@@ -175,10 +176,14 @@
t.Error("1 - bad brute mode")
}

if !strings.Contains(v, "js: on") {
if !strings.Contains(v, "+js") {
t.Error("1 - bad js mode")
}

+ if !strings.Contains(v, "+css") {
+ t.Error("1 - bad css mode")
+ }

if strings.Contains(v, "delay") {
t.Error("1 - delay found")
}