-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.java
93 lines (85 loc) · 3.38 KB
/
main.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import java.io.*;
import java.net.*;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import org.jsoup.Jsoup;
import org.jsoup.nodes.*;
import org.jsoup.select.Elements;
/**
 * Command-line utility that reads a list of page URLs from a text file, downloads each page,
 * extracts the {@code href} of every anchor tag and writes those links to one text file per page.
 *
 * <p>A URL is removed from the input file only after its page was downloaded and its links file
 * was written, so a failed URL stays in the input file and can be retried on the next run.
 */
public class ExtractLinks {
    /** Input file used when no command-line argument is given. */
    private static final String DEFAULT_INPUT = "urls.txt";
    /** Output directory used when no second command-line argument is given. */
    private static final String DEFAULT_OUTPUT_DIR = "/path/to/text";
    /** Upper bound for establishing a connection, so a dead host cannot hang the run. */
    private static final int CONNECT_TIMEOUT_MS = 10_000;
    /** Upper bound for each blocking read, so a stalled server cannot hang the run. */
    private static final int READ_TIMEOUT_MS = 30_000;

    /**
     * Processes every URL listed in the input file.
     *
     * @param args optional: {@code args[0]} is the URL list file (default {@code urls.txt}),
     *             {@code args[1]} is the output directory (default {@code /path/to/text})
     */
    public static void main(String[] args) {
        String input = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;
        String outputDir = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT_DIR;
        ArrayList<String> urls = readUrlsFromFile(input);
        int numCopied = 0;
        for (String url : urls) {
            if (url.isEmpty()) {
                // Blank lines in the list are not URLs; do not try to download them.
                continue;
            }
            String htmlContent = downloadHtml(url);
            if (htmlContent.isEmpty()) {
                // Keep the URL in the input file so the download can be retried later.
                System.err.printf("Skipping %s: nothing was downloaded, it stays in %s%n", url, input);
                continue;
            }
            String links = extractLinks(htmlContent);
            if (!writeLinksFile(links, url, outputDir)) {
                // Do not drop the URL when its result could not be stored.
                System.err.printf("Skipping %s: links could not be saved, it stays in %s%n", url, input);
                continue;
            }
            deleteUrl(url, input);
            numCopied++;
            System.out.printf("Links from %s have been copied%n", url);
        }
        System.out.printf("Total %d links copied and removed from input file%n", numCopied);
    }

    /**
     * Reads a text file and returns its lines, each trimmed of surrounding whitespace.
     * Blank lines are returned as empty strings.
     *
     * @param filename path of the file to read (decoded as UTF-8)
     * @return the trimmed lines; an empty list if the file cannot be read (the error is reported
     *         on standard error)
     */
    public static ArrayList<String> readUrlsFromFile(String filename) {
        try {
            return readLines(filename);
        } catch (IOException e) {
            System.err.printf("Cannot read %s: %s%n", filename, e);
            return new ArrayList<>();
        }
    }

    /**
     * Downloads the body of an HTTP(S) URL as text. Line terminators are normalised to
     * {@code '\n'} rather than dropped, so markup that spans several lines stays intact.
     *
     * @param url the address to fetch
     * @return the response body, or an empty string if the URL is malformed, is not HTTP(S),
     *         or the request fails (the error is reported on standard error)
     */
    public static String downloadHtml(String url) {
        HttpURLConnection conn = null;
        try {
            URLConnection raw = new URL(url).openConnection();
            if (!(raw instanceof HttpURLConnection)) {
                // e.g. file: or ftp: URLs; an unchecked cast would abort the whole run.
                System.err.printf("Unsupported (non-HTTP) URL: %s%n", url);
                return "";
            }
            conn = (HttpURLConnection) raw;
            conn.setRequestMethod("GET");
            conn.setRequestProperty("User-Agent", "Mozilla/5.0");
            conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
            conn.setReadTimeout(READ_TIMEOUT_MS);

            Charset charset = charsetOf(conn.getContentType());
            StringBuilder htmlContent = new StringBuilder();
            try (BufferedReader in =
                    new BufferedReader(new InputStreamReader(conn.getInputStream(), charset))) {
                String inputLine;
                while ((inputLine = in.readLine()) != null) {
                    htmlContent.append(inputLine).append('\n');
                }
            }
            return htmlContent.toString();
        } catch (IOException e) {
            System.err.printf("Cannot download %s: %s%n", url, e);
            return "";
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
    }

    /**
     * Extracts the raw {@code href} attribute of every {@code <a href=...>} element.
     *
     * @param htmlContent HTML markup to parse; {@code null} or empty yields an empty result
     * @return the href values in document order, each followed by {@code '\n'}
     */
    public static String extractLinks(String htmlContent) {
        if (htmlContent == null || htmlContent.isEmpty()) {
            return "";
        }
        StringBuilder links = new StringBuilder();
        Document doc = Jsoup.parse(htmlContent);
        Elements aTags = doc.select("a[href]");
        for (Element aTag : aTags) {
            links.append(aTag.attr("href")).append("\n");
        }
        return links.toString();
    }

    /**
     * Writes the links of one page to
     * {@code <outputDir>/URLs_<pageName>_<url>.txt}, where {@code pageName} is the part of the
     * URL after its last {@code '/'} and characters that are unsafe in file names are replaced
     * by {@code '_'}. Missing parent directories are created. Failures are reported on standard
     * error.
     *
     * @param links     text to store (UTF-8)
     * @param url       URL of the page the links came from
     * @param outputDir directory that receives the file
     */
    public static void saveLinksToFile(String links, String url, String outputDir) {
        writeLinksFile(links, url, outputDir);
    }

    /**
     * Rewrites the input file without the lines equal to {@code url}. Remaining lines are written
     * trimmed. If the file cannot be read it is left untouched (and is not created).
     *
     * @param url   the entry to remove
     * @param input path of the URL list file
     */
    public static void deleteUrl(String url, String input) {
        ArrayList<String> remaining;
        try {
            remaining = readLines(input);
        } catch (IOException e) {
            // Overwriting now would wipe entries that could not be read.
            System.err.printf("Cannot read %s, leaving it untouched: %s%n", input, e);
            return;
        }
        try (PrintWriter writer = new PrintWriter(
                new OutputStreamWriter(new FileOutputStream(input), StandardCharsets.UTF_8))) {
            for (String line : remaining) {
                if (!line.equals(url)) {
                    writer.println(line);
                }
            }
            if (writer.checkError()) {
                System.err.printf("Error while rewriting %s%n", input);
            }
        } catch (IOException e) {
            System.err.printf("Cannot rewrite %s: %s%n", input, e);
        }
    }

    /** Reads all lines of a UTF-8 text file, trimmed; propagates any I/O failure. */
    private static ArrayList<String> readLines(String filename) throws IOException {
        ArrayList<String> lines = new ArrayList<>();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(filename), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line.trim());
            }
        }
        return lines;
    }

    /** Stores the links file for {@code url}; returns whether the file was written completely. */
    private static boolean writeLinksFile(String links, String url, String outputDir) {
        String pageName = url.substring(url.lastIndexOf('/') + 1);
        String filename =
                outputDir + "/URLs_" + sanitize(pageName) + "_" + sanitize(url) + ".txt";
        File file = new File(filename);
        File parent = file.getParentFile();
        if (parent != null) {
            parent.mkdirs();
        }
        try (PrintWriter writer = new PrintWriter(
                new OutputStreamWriter(new FileOutputStream(file), StandardCharsets.UTF_8))) {
            writer.write(links);
            // PrintWriter swallows write errors; checkError() flushes and reports them.
            return !writer.checkError();
        } catch (IOException e) {
            System.err.printf("Cannot write %s: %s%n", filename, e);
            return false;
        }
    }

    /** Replaces path separators, control characters and characters invalid on Windows by '_'. */
    private static String sanitize(String text) {
        return text.replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]", "_");
    }

    /** Picks the charset named in a Content-Type header, falling back to UTF-8. */
    private static Charset charsetOf(String contentType) {
        if (contentType != null) {
            String key = "charset=";
            for (String part : contentType.split(";")) {
                String param = part.trim();
                if (param.regionMatches(true, 0, key, 0, key.length())) {
                    String name = param.substring(key.length()).replace("\"", "").trim();
                    try {
                        return Charset.forName(name);
                    } catch (IllegalArgumentException ignored) {
                        // Unknown or malformed charset name: fall through to the UTF-8 default.
                    }
                }
            }
        }
        return StandardCharsets.UTF_8;
    }
}