-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathurl.go
155 lines (129 loc) · 3.08 KB
/
url.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
// Copyright 2021 Wayback Archiver. All rights reserved.
// Use of this source code is governed by the MIT
// license that can be found in the LICENSE file.
/*
Package helper provides common helper functions for the wayback application, written in Go.
*/
package helper // import "github.com/wabarc/helper"
import (
"io/ioutil"
"net/http"
"net/url"
"strings"
"time"
"mvdan.cc/xurls/v2"
)
// MatchURL extracts every URL that the strict xurls matcher finds in text
// and passes each one through strip before returning it.
//
// The returned slice is never nil; it is empty when text has no match.
func MatchURL(text string) []string {
	found := xurls.Strict().FindAllString(text, -1)

	// Always allocate so callers get a non-nil slice even with zero matches.
	cleaned := make([]string, 0, len(found))
	for i := range found {
		cleaned = append(cleaned, strip(found[i]))
	}
	return cleaned
}
// MatchURLFallback extracts every URL that the strict xurls matcher finds
// in text, passes each one through strip, and probes it with NotFound.
// A URL for which NotFound reports true is replaced by the Google cache
// endpoint for that URL.
//
// The returned slice is never nil; it is empty when text has no match.
func MatchURLFallback(text string) []string {
	const googleCache = "https://webcache.googleusercontent.com/search?q=cache:"

	found := xurls.Strict().FindAllString(text, -1)

	// One output slot per match; a zero-length make still yields a non-nil slice.
	resolved := make([]string, len(found))
	for i, raw := range found {
		link := strip(raw)
		if NotFound(link) {
			link = googleCache + link
		}
		resolved[i] = link
	}
	return resolved
}
// IsURL reports whether str parses as a URL that has a non-empty scheme
// and a host containing at least one dot.
func IsURL(str string) bool {
	parsed, err := url.Parse(str)
	switch {
	case err != nil:
		return false
	case parsed.Scheme == "":
		return false
	}
	return strings.Contains(parsed.Host, ".")
}
// NotFound reports whether a HEAD request to uri is answered with
// HTTP 404 Not Found.
//
// Redirects are not followed: the status of the first response is the one
// inspected. The request carries a browser-like User-Agent and is bounded
// by a 10-second timeout. Any failure along the way (unparsable uri,
// request construction error, transport error) is also reported as true.
func NotFound(uri string) bool {
	const userAgent = `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.7113.093 Safari/537.36`

	if _, parseErr := url.Parse(uri); parseErr != nil {
		return true
	}

	request, err := http.NewRequest(http.MethodHead, uri, nil)
	if err != nil {
		return true
	}
	request.Header.Set("User-Agent", userAgent)

	prober := http.Client{
		Timeout: 10 * time.Second,
		// Hand back the first response instead of chasing redirects.
		CheckRedirect: func(*http.Request, []*http.Request) error {
			return http.ErrUseLastResponse
		},
	}

	response, err := prober.Do(request)
	if err != nil {
		return true
	}
	defer response.Body.Close()

	return response.StatusCode == http.StatusNotFound
}
// strip removes tracking query parameters from link and returns the
// resulting URL. It returns "" when link cannot be parsed.
//
// A parameter is dropped when its name starts with "utm_", "at_custom" or
// "at_medium" (case-sensitive), or equals "weibo_id", "fbclid" or "chksm"
// ignoring case. The remaining query is re-encoded, so surviving
// parameters come out sorted by key.
func strip(link string) string {
	parsed, err := url.Parse(link)
	if err != nil {
		return ""
	}

	prefixes := [...]string{"utm_", "at_custom", "at_medium"}
	exact := [...]string{"weibo_id", "fbclid", "chksm"}

	// isTracker reports whether a query parameter name is a known tracker.
	isTracker := func(name string) bool {
		for _, p := range prefixes {
			if strings.HasPrefix(name, p) {
				return true
			}
		}
		for _, e := range exact {
			if strings.EqualFold(name, e) {
				return true
			}
		}
		return false
	}

	kept := make(url.Values)
	for name, vals := range parsed.Query() {
		if !isTracker(name) {
			kept[name] = vals
		}
	}

	parsed.RawQuery = kept.Encode()
	return parsed.String()
}
// RealURI returns the URL of the request that produced the final response
// to a HEAD request for u, i.e. the address reached after the HTTP client
// has followed any redirects.
//
// It returns nil when u is nil, and returns u unchanged when the request
// fails or the response carries no request URL. The request is bounded by
// a 10-second timeout so an unresponsive host cannot block the caller
// indefinitely.
func RealURI(u *url.URL) *url.URL {
	// Guard first: calling u.String() on a nil *url.URL would panic.
	if u == nil {
		return nil
	}

	client := &http.Client{Timeout: 10 * time.Second}
	resp, err := client.Head(u.String())
	if err != nil {
		return u
	}
	defer resp.Body.Close()

	if resp.Request == nil || resp.Request.URL == nil {
		return u
	}
	return resp.Request.URL
}
// TinyURL asks the tinyurl.com "api-create" endpoint for a shortened form
// of link and returns the response body as the short URL.
//
// It returns "" when link is empty or unparsable, when the request fails
// or does not answer with 200 OK, when the body cannot be read, or when
// the body is empty or the literal "Error".
func TinyURL(link string) string {
	if link == "" {
		return ""
	}
	if _, err := url.Parse(link); err != nil {
		return ""
	}

	// Escape link so characters such as '&', '#' or '+' stay part of the
	// "url" parameter instead of truncating or corrupting it.
	endpoint := "https://tinyurl.com/api-create.php?url=" + url.QueryEscape(link)

	// Bound the call; the default client would wait forever.
	client := &http.Client{Timeout: 10 * time.Second}
	resp, err := client.Get(endpoint)
	if err != nil {
		return ""
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return ""
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return ""
	}

	short := strings.TrimSpace(string(body))
	if short == "" || short == "Error" {
		return ""
	}
	return short
}