-
Notifications
You must be signed in to change notification settings - Fork 1
/
App.java
91 lines (73 loc) · 2.2 KB
/
App.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
package scrapper.scrapperDemo;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.TreeSet;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Minimal single-site crawler.
 *
 * <p>Starting from a fixed base URL, it downloads each page with jsoup, prints the page text to
 * standard output, appends the page URL and text to {@code <domain>.txt} in the working
 * directory, and then queues every same-domain {@code <a href>} link it finds on the page.
 */
public class App {

    /** Number of newline characters written between records in the output file. */
    private static final int SEPARATOR_LINES = 6;

    /**
     * Runs the crawl until the queue of pending URLs is empty.
     *
     * <p>A page that cannot be fetched, parsed or written is reported on standard error and
     * skipped; it does not stop the crawl.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String baseURL = "https://www.soa.ac.in";
        String domain = getCleanedURL(baseURL);
        String fileNameString = domain + ".txt";
        Set<String> processedURLs = new TreeSet<String>();
        // PriorityQueue of strings: pending URLs are visited in natural (lexicographic) order.
        PriorityQueue<String> priorityQueue = new PriorityQueue<String>();
        priorityQueue.add(baseURL);

        while (!priorityQueue.isEmpty()) {
            String currentURL = priorityQueue.remove();
            // The same URL can be queued several times before it is visited; visit it once.
            if (!processedURLs.add(currentURL)) {
                continue;
            }
            try {
                // Fetch before opening the output file so a failed download holds no file handle.
                Document document = Jsoup.connect(currentURL).get();
                String text = document.text();
                System.out.println(text);

                // try-with-resources closes the writer even when a write fails.
                try (FileWriter f = new FileWriter(fileNameString, true)) {
                    f.append(currentURL);
                    breakIT(f);
                    f.append(text);
                    breakIT(f);
                }

                Elements elements = document.getElementsByTag("a");
                for (Element element : elements) {
                    System.out.println(element.attr("href"));
                    // absUrl resolves relative and protocol-relative hrefs against the page's
                    // own base URI instead of blindly concatenating them onto the site root.
                    String hrefLINK = normalizeLink(element.absUrl("href"));
                    if (hrefLINK.isEmpty()) {
                        continue;
                    }
                    // Skip URLs that were already visited so the queue does not keep growing.
                    if (getCleanedURL(hrefLINK).equals(domain) && !processedURLs.contains(hrefLINK)) {
                        priorityQueue.add(hrefLINK);
                    }
                }
            } catch (IOException | RuntimeException e) {
                // Best effort: report the failing page and move on to the next queued URL.
                System.err.println("Failed to process " + currentURL);
                e.printStackTrace();
            }
        }
    }

    /**
     * Reduces an absolute link to the form used for queueing.
     *
     * @param absoluteURL an absolute URL, or an empty string / {@code null} when none is available
     * @return the URL with any {@code #fragment} removed, or an empty string when the input is
     *     {@code null}, empty, or does not start with {@code http://} or {@code https://}
     */
    private static String normalizeLink(String absoluteURL) {
        if (absoluteURL == null) {
            return "";
        }
        // A fragment only addresses a position inside a page, so drop it before de-duplication.
        int fragmentStart = absoluteURL.indexOf('#');
        String link = fragmentStart >= 0 ? absoluteURL.substring(0, fragmentStart) : absoluteURL;
        boolean isHttp = link.regionMatches(true, 0, "http://", 0, 7)
                || link.regionMatches(true, 0, "https://", 0, 8);
        return isHttp ? link : "";
    }

    /**
     * Appends the record separator (a run of newline characters) to the writer.
     *
     * @param f writer to append to
     * @return the same writer, to allow chaining
     * @throws IOException if a write fails
     */
    static FileWriter breakIT(FileWriter f) throws IOException {
        for (int i = 0; i < SEPARATOR_LINES; i++) {
            f.append("\n");
        }
        return f;
    }

    /**
     * Extracts the host part of a URL, without a leading {@code "www."}.
     *
     * <p>The host is taken to start after the first {@code "//"} (or at the beginning of the
     * string when there is none) and to end at the first {@code '/'}, {@code '?'} or {@code '#'}
     * that follows, or at the end of the string.
     *
     * @param url URL to inspect; may be {@code null} or empty
     * @return the host without a leading {@code "www."}; an empty string for {@code null} or
     *     empty input
     */
    static String getCleanedURL(String url) {
        if (url == null) {
            return "";
        }
        int schemeSeparator = url.indexOf("//");
        int start = schemeSeparator >= 0 ? schemeSeparator + 2 : 0;
        int end = url.length();
        for (int i = start; i < end; i++) {
            char c = url.charAt(i);
            if (c == '/' || c == '?' || c == '#') {
                end = i;
                break;
            }
        }
        String host = url.substring(start, end);
        // Only strip the exact "www." label; hosts such as "www2.example.com" are left intact.
        if (host.startsWith("www.")) {
            host = host.substring(4);
        }
        return host;
    }
}