Fixes Issues mentioned in IPMC Vote (#1417)
* Excludes BSD w/ Nuclear Option License brought in via Tika Image Parser Package

* Remove references to digitalpebble.com (README, HTML)

* Remove references to digitalpebble.com (sitemaps, src issue refs)

* Replace tripadvisor sitemap example with stormcrawler.apache.org content

* Replace guardian.rss feed with stormcrawler.apache.org
rzo1 authored Nov 22, 2024
1 parent 5ce9006 commit 94bfadc
Showing 53 changed files with 661 additions and 4,222 deletions.
2 changes: 0 additions & 2 deletions README.md
@@ -13,8 +13,6 @@ NOTE: These instructions assume that you have [Apache Maven](https://maven.apach

StormCrawler requires Java 11 or above. To execute tests, it requires you to have a locally installed and working Docker environment.

DigitalPebble's [Ansible-Storm](https://github.com/DigitalPebble/ansible-storm) repository contains resources to install Apache Storm using Ansible. Alternatively, this [stormcrawler-docker](https://github.com/DigitalPebble/stormcrawler-docker) project should help you run Apache Storm on Docker.

Once Storm is installed, the easiest way to get started is to generate a new StormCrawler project following the instructions below:

```shell
4 changes: 0 additions & 4 deletions THIRD-PARTY.txt
@@ -306,10 +306,6 @@ List of third-party dependencies grouped by their license type.
* Protocol Buffers [Core] (com.google.protobuf:protobuf-java:3.22.3 - https://developers.google.com/protocol-buffers/protobuf-java/)
* Protocol Buffers [Core] (com.google.protobuf:protobuf-java:3.25.3 - https://developers.google.com/protocol-buffers/protobuf-java/)

BSD 3-clause License w/nuclear disclaimer

* Java Advanced Imaging Image I/O Tools API core (standalone) (com.github.jai-imageio:jai-imageio-core:1.4.0 - https://github.com/jai-imageio/jai-imageio-core)

BSD License

* curvesapi (com.github.virtuald:curvesapi:1.08 - https://github.com/virtuald/curvesapi)
3 changes: 1 addition & 2 deletions archetype/src/main/resources/archetype-resources/README.md
@@ -3,8 +3,7 @@ Have a look at the code and resources and modify them to your heart's content.

# Prerequisites

You need to install Apache Storm. The instructions on [setting up a Storm cluster](https://storm.apache.org/releases/2.6.2/Setting-up-a-Storm-cluster.html) should help. Alternatively,
the [stormcrawler-docker](https://github.com/DigitalPebble/stormcrawler-docker) project contains resources for running Apache Storm on Docker.
You need to install Apache Storm. The instructions on [setting up a Storm cluster](https://storm.apache.org/releases/2.6.2/Setting-up-a-Storm-cluster.html) should help.

You also need to have an instance of URLFrontier running. See [the URLFrontier README](https://github.com/crawler-commons/url-frontier/tree/master/service); the easiest way is to use Docker, like so:

@@ -509,7 +509,7 @@ public void run() {
metadata = new Metadata();
}

// https://github.com/DigitalPebble/storm-crawler/issues/813
// https://github.com/apache/incubator-stormcrawler/issues/813
metadata.remove("fetch.exception");

boolean asap = false;
@@ -568,7 +568,7 @@ public void run() {
}

// has found sitemaps
// https://github.com/DigitalPebble/storm-crawler/issues/710
// https://github.com/apache/incubator-stormcrawler/issues/710
// note: we don't care if the sitemap URLs were actually
// kept
boolean foundSitemap = (rules.getSitemaps().size() > 0);
@@ -732,7 +732,7 @@ public void run() {
mergedMD.setValue("_redirTo", redirection);
}

// https://github.com/DigitalPebble/storm-crawler/issues/954
// https://github.com/apache/incubator-stormcrawler/issues/954
if (allowRedirs() && StringUtils.isNotBlank(redirection)) {
emitOutlink(fit.t, url, redirection, mergedMD);
}
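For context: the redirect handling referenced by issue 954 records the target under `_redirTo` and, when redirects may be followed, re-emits it as an outlink. A condensed, runnable sketch of that pattern (class, flag, and print-out are illustrative, not StormCrawler's actual API):

```java
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;

public class RedirectDemo {
    // illustrative stand-in for the topology configuration flag
    static boolean allowRedirs = true;

    // condensed version of the pattern in the hunks above (issue 954)
    static void handleRedirect(String url, String redirection, Map<String, String> metadata) {
        if (StringUtils.isNotBlank(redirection)) {
            // record where the page redirected to, for the status stream
            metadata.put("_redirTo", redirection);
            if (allowRedirs) {
                // in StormCrawler this would call emitOutlink(...); printed here instead
                System.out.println("outlink: " + url + " -> " + redirection);
            }
        }
    }

    public static void main(String[] args) {
        handleRedirect("http://stormcrawler.apache.org/old",
                "https://stormcrawler.apache.org/", new HashMap<>());
    }
}
```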
@@ -347,7 +347,7 @@ public void execute(Tuple tuple) {
LOG.info("Found redir in {} to {}", url, redirection);
metadata.setValue("_redirTo", redirection);

// https://github.com/DigitalPebble/storm-crawler/issues/954
// https://github.com/apache/incubator-stormcrawler/issues/954
if (allowRedirs() && StringUtils.isNotBlank(redirection)) {
emitOutlink(tuple, new URL(url), redirection, metadata);
}
@@ -256,7 +256,7 @@ public void execute(Tuple input) {
metadata = new Metadata();
}

// https://github.com/DigitalPebble/storm-crawler/issues/813
// https://github.com/apache/incubator-stormcrawler/issues/813
metadata.remove("fetch.exception");

URL url;
@@ -326,7 +326,7 @@ public void execute(Tuple input) {
}

// has found sitemaps
// https://github.com/DigitalPebble/storm-crawler/issues/710
// https://github.com/apache/incubator-stormcrawler/issues/710
// note: we don't care if the sitemap URLs were actually
// kept
boolean foundSitemap = (rules.getSitemaps().size() > 0);
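The `rules.getSitemaps()` check above comes from crawler-commons' robots rules. A minimal sketch of how sitemap URLs surface from a parsed robots.txt (the exact `parseContent` signature varies across crawler-commons versions, and the sample input is illustrative):

```java
import java.nio.charset.StandardCharsets;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class SitemapDiscoveryDemo {
    public static void main(String[] args) {
        byte[] robotsTxt = ("User-agent: *\n"
                + "Allow: /\n"
                + "Sitemap: https://stormcrawler.apache.org/sitemap.xml\n")
                .getBytes(StandardCharsets.UTF_8);
        BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
                "https://stormcrawler.apache.org/robots.txt",
                robotsTxt, "text/plain", "stormcrawler");
        // same check as in the hunks above: were any sitemaps advertised?
        boolean foundSitemap = rules.getSitemaps().size() > 0;
        System.out.println(foundSitemap + " " + rules.getSitemaps());
    }
}
```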
@@ -50,7 +50,7 @@ public class BasicURLNormalizer extends URLFilter {
/** Nutch 1098 - finds URL encoded parts of the URL */
private static final Pattern unescapeRulePattern = Pattern.compile("%([0-9A-Fa-f]{2})");

/** https://github.com/DigitalPebble/storm-crawler/issues/401 * */
/** https://github.com/apache/incubator-stormcrawler/issues/401 * */
private static final Pattern illegalEscapePattern = Pattern.compile("%u([0-9A-Fa-f]{4})");

// charset used for encoding URLs before escaping
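For context, `illegalEscapePattern` targets the non-standard `%uXXXX` escapes emitted by some legacy encoders, which strict URL decoders reject (issue 401). A self-contained sketch of what the pattern matches (demo class and sample URL are illustrative):

```java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class IllegalEscapeDemo {
    // same pattern as in the hunk above: non-standard %uXXXX escapes
    private static final Pattern ILLEGAL_ESCAPE = Pattern.compile("%u([0-9A-Fa-f]{4})");

    public static void main(String[] args) {
        Matcher m = ILLEGAL_ESCAPE.matcher("http://example.com/?q=%u0041%20b");
        while (m.find()) {
            // %u0041 -> 'A': the four hex digits encode the code point
            char c = (char) Integer.parseInt(m.group(1), 16);
            System.out.println(m.group() + " -> " + c);
        }
    }
}
```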
@@ -112,7 +112,7 @@ public void loadJSONResources(InputStream inputStream)

// if it contains a single object
// jump directly to its content
// https://github.com/DigitalPebble/storm-crawler/issues/1013
// https://github.com/apache/incubator-stormcrawler/issues/1013
if (rootNode.size() == 1 && rootNode.isObject()) {
rootNode = rootNode.fields().next().getValue();
}
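The unwrapping above (issue 1013) lets a JSON resource file nest its rules under a single named root object. A runnable Jackson sketch of the same step, with an illustrative document:

```java
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

public class SingleRootDemo {
    public static void main(String[] args) throws Exception {
        ObjectMapper mapper = new ObjectMapper();
        JsonNode rootNode = mapper.readTree("{\"com.example.Filter\":{\"key\":\"value\"}}");
        // same unwrapping as the hunk above: a root object with exactly one
        // field is replaced by that field's value before further processing
        if (rootNode.size() == 1 && rootNode.isObject()) {
            rootNode = rootNode.fields().next().getValue();
        }
        System.out.println(rootNode); // {"key":"value"}
    }
}
```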
@@ -36,7 +36,7 @@
* </pre>
*
* Will be replaced by <a href=
* "https://github.com/DigitalPebble/storm-crawler/issues/711">MetadataFilter to filter based on
* "https://github.com/apache/incubator-stormcrawler/issues/711">MetadataFilter to filter based on
* multiple key values</a>
*
* @since 1.14
@@ -207,7 +207,7 @@ public void execute(Tuple tuple) {
if (!status.equals(Status.FETCH_ERROR)) {
metadata.remove(Constants.fetchErrorCountParamName);
}
// https://github.com/DigitalPebble/storm-crawler/issues/415
// https://github.com/apache/incubator-stormcrawler/issues/415
// remove error related key values in case of success
if (status.equals(Status.FETCHED) || status.equals(Status.REDIRECTION)) {
metadata.remove(Constants.STATUS_ERROR_CAUSE);
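The hunk above clears error bookkeeping once a URL succeeds (issue 415), so stale counters and causes do not survive a recovery. A condensed sketch using plain maps (key names are illustrative; the real code uses `Constants` fields and the `Metadata` class):

```java
import java.util.HashMap;
import java.util.Map;

public class StatusCleanupDemo {
    enum Status { FETCHED, REDIRECTION, FETCH_ERROR }

    public static void main(String[] args) {
        Map<String, String> metadata = new HashMap<>();
        metadata.put("fetch.errorCount", "3");
        metadata.put("error.cause", "timeout");
        Status status = Status.FETCHED;
        // mirror of the hunk above: a non-error outcome drops the error count,
        // and a successful fetch or redirect drops the recorded cause
        if (!status.equals(Status.FETCH_ERROR)) {
            metadata.remove("fetch.errorCount");
        }
        if (status.equals(Status.FETCHED) || status.equals(Status.REDIRECTION)) {
            metadata.remove("error.cause");
        }
        System.out.println(metadata); // {}
    }
}
```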
@@ -58,7 +58,7 @@ public class ProtocolResponse {

/**
* @since 1.17
* @see <a href="https://github.com/DigitalPebble/storm-crawler/issues/776">Issue 776</a>
* @see <a href="https://github.com/apache/incubator-stormcrawler/issues/776">Issue 776</a>
*/
public static final String PROTOCOL_MD_PREFIX_PARAM = "protocol.md.prefix";

@@ -186,7 +186,7 @@ private static String getCharsetFromMeta(byte buffer[], int maxlength) {
int start = html.indexOf("<meta charset=\"");
if (start != -1) {
int end = html.indexOf('"', start + 15);
// https://github.com/DigitalPebble/storm-crawler/issues/870
// https://github.com/apache/incubator-stormcrawler/issues/870
// try on a slightly larger section of text if it is trimmed
if (end == -1 && ((maxlength + 10) < buffer.length)) {
return getCharsetFromMeta(buffer, maxlength + 10);
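The recursion above handles a `<meta charset="...">` declaration that gets cut off at `maxlength` (issue 870) by retrying on a slightly longer prefix of the buffer. A simplified, self-contained sketch (demo class and bounds handling are illustrative, not the actual method):

```java
import java.nio.charset.StandardCharsets;

public class MetaCharsetDemo {
    // simplified sketch of the retry logic in the hunk above (issue 870)
    static String getCharsetFromMeta(byte[] buffer, int maxlength) {
        int len = Math.min(maxlength, buffer.length);
        String html = new String(buffer, 0, len, StandardCharsets.US_ASCII).toLowerCase();
        int start = html.indexOf("<meta charset=\"");
        if (start != -1) {
            int end = html.indexOf('"', start + 15);
            // declaration trimmed? look again in a slightly longer slice
            if (end == -1 && (maxlength + 10) < buffer.length) {
                return getCharsetFromMeta(buffer, maxlength + 10);
            }
            if (end != -1) {
                return html.substring(start + 15, end);
            }
        }
        return null;
    }

    public static void main(String[] args) {
        byte[] page = "<html><head><meta charset=\"utf-8\"></head></html>"
                .getBytes(StandardCharsets.US_ASCII);
        // 30 bytes cuts the declaration mid-value; the retry recovers it
        System.out.println(getCharsetFromMeta(page, 30)); // utf-8
    }
}
```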
@@ -40,7 +40,7 @@ void setupParserBolt() {
}

private void checkOutput() {
Assertions.assertEquals(170, output.getEmitted(Constants.StatusStreamName).size());
Assertions.assertEquals(7, output.getEmitted(Constants.StatusStreamName).size());
List<Object> fields = output.getEmitted(Constants.StatusStreamName).get(0);
Assertions.assertEquals(3, fields.size());
}
@@ -51,7 +51,7 @@ void testFeedParsing() throws IOException {
Metadata metadata = new Metadata();
// specify that it is a Feed file
metadata.setValue(FeedParserBolt.isFeedKey, "true");
parse("http://www.guardian.com/Feed.xml", "guardian.rss", metadata);
parse("https://stormcrawler.apache.org/rss.xml", "stormcrawler.rss", metadata);
checkOutput();
}

@@ -66,7 +66,7 @@ void testFeedParsingNoMT() throws IOException {
Metadata metadata = new Metadata();
// set mime-type
metadata.setValue("http." + HttpHeaders.CONTENT_TYPE, "application/rss+xml");
parse("http://www.guardian.com/feed.xml", "guardian.rss", metadata);
parse("https://stormcrawler.apache.org/rss.xml", "stormcrawler.rss", metadata);
checkOutput();
}

@@ -78,15 +78,15 @@ void testFeedParsingDetextBytes() throws IOException {
bolt.prepare(
parserConfig, TestUtil.getMockedTopologyContext(), new OutputCollector(output));
Metadata metadata = new Metadata();
parse("http://www.guardian.com/feed.xml", "guardian.rss", metadata);
parse("https://stormcrawler.apache.org/rss.xml", "stormcrawler.rss", metadata);
checkOutput();
}

@Test
void testNonFeedParsing() throws IOException {
prepareParserBolt("test.parsefilters.json");
// do not specify that it is a feed file
parse("http://www.digitalpebble.com", "digitalpebble.com.html", new Metadata());
parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html", new Metadata());
Assertions.assertEquals(1, output.getEmitted().size());
}
}
@@ -119,7 +119,7 @@ void setupParserBolt() {
void testNoScriptInText() throws IOException {
bolt.prepare(
new HashMap(), TestUtil.getMockedTopologyContext(), new OutputCollector(output));
parse("http://www.digitalpebble.com", "digitalpebble.com.html");
parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html");
List<Object> parsedTuple = output.getEmitted().remove(0);
// check in the metadata that the values match
String text = (String) parsedTuple.get(3);
@@ -133,9 +133,9 @@ void testNoScriptInText() throws IOException {
void testNoFollowOutlinks() throws IOException {
bolt.prepare(
new HashMap(), TestUtil.getMockedTopologyContext(), new OutputCollector(output));
parse("http://www.digitalpebble.com", "digitalpebble.com.html");
parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html");
List<List<Object>> statusTuples = output.getEmitted(Constants.StatusStreamName);
Assertions.assertEquals(10, statusTuples.size());
Assertions.assertEquals(25, statusTuples.size());
}

@Test
@@ -144,7 +144,7 @@ void testHTTPRobots() throws IOException {
new HashMap(), TestUtil.getMockedTopologyContext(), new OutputCollector(output));
Metadata metadata = new Metadata();
metadata.setValues("X-Robots-Tag", new String[] {"noindex", "nofollow"});
parse("http://www.digitalpebble.com", "digitalpebble.com.html", metadata);
parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html", metadata);
List<List<Object>> statusTuples = output.getEmitted(Constants.StatusStreamName);
// no outlinks at all
Assertions.assertEquals(0, statusTuples.size());
@@ -170,7 +170,7 @@ void testRobotsMetaProcessor() throws IOException {
new HashMap(), TestUtil.getMockedTopologyContext(), new OutputCollector(output));
for (int i = 0; i < tests.length; i++) {
byte[] bytes = tests[i].getBytes(StandardCharsets.UTF_8);
parse("http://www.digitalpebble.com", bytes, new Metadata());
parse("http://stormcrawler.apache.org", bytes, new Metadata());
Assertions.assertEquals(1, output.getEmitted().size());
List<Object> parsedTuple = output.getEmitted().remove(0);
// check in the metadata that the values match
@@ -205,7 +205,7 @@ void testHTMLRedir() throws IOException {
void testExecuteWithOutlinksLimit() throws IOException {
stormConf.put("parser.emitOutlinks.max.per.page", 5);
bolt.prepare(stormConf, TestUtil.getMockedTopologyContext(), new OutputCollector(output));
parse("http://www.digitalpebble.com", "digitalpebble.com.html");
parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html");
List<List<Object>> statusTuples = output.getEmitted(Constants.StatusStreamName);
// outlinks being limited by property
Assertions.assertEquals(5, statusTuples.size());
@@ -215,10 +215,10 @@ void testExecuteWithOutlinksLimit() throws IOException {
void testExecuteWithOutlinksLimitDisabled() throws IOException {
stormConf.put("parser.emitOutlinks.max.per.page", -1);
bolt.prepare(stormConf, TestUtil.getMockedTopologyContext(), new OutputCollector(output));
parse("http://www.digitalpebble.com", "digitalpebble.com.html");
parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html");
List<List<Object>> statusTuples = output.getEmitted(Constants.StatusStreamName);
// outlinks NOT being limited by property, since it is disabled with -1
Assertions.assertEquals(10, statusTuples.size());
Assertions.assertEquals(25, statusTuples.size());
}

