main.go
package main

import (
	"flag"
	"fmt"
	"net/url"
	"os"
	"time"

	"github.com/rtlong/web-spider/spider"
)
var (
	concurrency = flag.Int("c", 100, "Max number of simultaneous open connections")
	jsonOutput  = flag.Bool("j", false, "Dump output as JSON to get much more information than the default summary output")
	depth       = flag.Int("d", 20, "Maximum depth of spidering (-1 indicates no limit)")
	redundancy  = flag.Int("r", 1, "Max number of fetches per URL")
	maxURLs     = flag.Int("m", 200000, "Max number of unique URLs to request")
	timeout     = flag.Duration("t", time.Second*20, "Timeout for any request")

	seedURL url.URL
	logger  Logger
)
func main() {
	flag.Parse()

	// Choose the output format before anything can be logged.
	if *jsonOutput {
		logger = new(JSONLogger)
	} else {
		logger = new(PlaintextLogger)
	}
	logger.SetOutput(os.Stdout)

	// The first positional argument is the seed URL; it must be absolute.
	seedURL, err := url.Parse(flag.Arg(0))
	if err != nil {
		logger.Fatal(fmt.Sprintf("Failed to parse input URL: %s", err))
	} else if !seedURL.IsAbs() {
		logger.Fatal("You must supply an absolute URL to start with")
	}

	results := make(chan *spider.Result)
	s := spider.Spider{
		Fetcher: &spider.SimpleHTMLFetcher{
			Timeout: *timeout,
		},
		Results:     results,
		MaxDepth:    *depth,
		Concurrency: *concurrency,
		Redundancy:  *redundancy,
		MaxURLs:     *maxURLs,
		// Only follow links that stay on the seed URL's host.
		LinkFilterFunc: func(l spider.Link) bool {
			return l.URL.Host == seedURL.Host
		},
	}

	// Crawl in the background and close the results channel when finished,
	// so logResults below can range over it until completion.
	go func() {
		s.Crawl(seedURL)
		close(s.Results)
	}()

	logResults(results)
}
func logResults(results <-chan *spider.Result) {
	for r := range results {
		logger.PrintResult(r)
	}
}
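
Note: the Logger type and its JSONLogger and PlaintextLogger implementations are not defined in main.go; they live in other files of this package. Based only on the calls made above (SetOutput with os.Stdout, Fatal with a formatted string, PrintResult with a *spider.Result), the interface presumably looks roughly like the sketch below; the actual definitions in the repository may differ.

// logger.go (hypothetical sketch): mirrors only the calls made in main.go;
// the real definitions elsewhere in the repository may differ.
package main

import (
	"io"

	"github.com/rtlong/web-spider/spider"
)

type Logger interface {
	SetOutput(w io.Writer)        // main.go passes os.Stdout
	Fatal(msg string)             // report the message and stop the program
	PrintResult(r *spider.Result) // emit one crawl result in the chosen format
}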