Fixes Issues mentioned in IPMC Vote (#1417)
* Excludes BSD w/ Nuclear Option License brought in via Tika Image Parser Package

* Remove references to digitalpebble.com (README, HTML)

* Remove references to digitalpebble.com (sitemaps, src issue refs)

* Replace tripadvisor sitemap example with stormcrawler.apache.org content

* Replace guardian.rss feed with stormcrawler.apache.org
rzo1 authored Nov 22, 2024
1 parent 5ce9006 commit 94bfadc
Showing 53 changed files with 661 additions and 4,222 deletions.
2 changes: 0 additions & 2 deletions README.md
@@ -13,8 +13,6 @@ NOTE: These instructions assume that you have [Apache Maven](https://maven.apach

StormCrawler requires Java 11 or above. To execute tests, it requires you to have a locally installed and working Docker environment.

DigitalPebble's [Ansible-Storm](https://github.com/DigitalPebble/ansible-storm) repository contains resources to install Apache Storm using Ansible. Alternatively, this [stormcrawler-docker](https://github.com/DigitalPebble/stormcrawler-docker) project should help you run Apache Storm on Docker.

Once Storm is installed, the easiest way to get started is to generate a new StormCrawler project following the instructions below:

```shell
4 changes: 0 additions & 4 deletions THIRD-PARTY.txt
@@ -306,10 +306,6 @@ List of third-party dependencies grouped by their license type.
* Protocol Buffers [Core] (com.google.protobuf:protobuf-java:3.22.3 - https://developers.google.com/protocol-buffers/protobuf-java/)
* Protocol Buffers [Core] (com.google.protobuf:protobuf-java:3.25.3 - https://developers.google.com/protocol-buffers/protobuf-java/)

BSD 3-clause License w/nuclear disclaimer

* Java Advanced Imaging Image I/O Tools API core (standalone) (com.github.jai-imageio:jai-imageio-core:1.4.0 - https://github.com/jai-imageio/jai-imageio-core)

BSD License

* curvesapi (com.github.virtuald:curvesapi:1.08 - https://github.com/virtuald/curvesapi)
3 changes: 1 addition & 2 deletions archetype/src/main/resources/archetype-resources/README.md
@@ -3,8 +3,7 @@ Have a look at the code and resources and modify them to your heart's content.

# Prerequisites

You need to install Apache Storm. The instructions on [setting up a Storm cluster](https://storm.apache.org/releases/2.6.2/Setting-up-a-Storm-cluster.html) should help. Alternatively,
the [stormcrawler-docker](https://github.com/DigitalPebble/stormcrawler-docker) project contains resources for running Apache Storm on Docker.
You need to install Apache Storm. The instructions on [setting up a Storm cluster](https://storm.apache.org/releases/2.6.2/Setting-up-a-Storm-cluster.html) should help.

You also need to have an instance of URLFrontier running. See [the URLFrontier README](https://github.com/crawler-commons/url-frontier/tree/master/service); the easiest way is to use Docker, like so:

@@ -509,7 +509,7 @@ public void run() {
metadata = new Metadata();
}

// https://github.com/DigitalPebble/storm-crawler/issues/813
// https://github.com/apache/incubator-stormcrawler/issues/813
metadata.remove("fetch.exception");

boolean asap = false;
@@ -568,7 +568,7 @@ public void run() {
}

// has found sitemaps
// https://github.com/DigitalPebble/storm-crawler/issues/710
// https://github.com/apache/incubator-stormcrawler/issues/710
// note: we don't care if the sitemap URLs were actually
// kept
boolean foundSitemap = (rules.getSitemaps().size() > 0);
@@ -732,7 +732,7 @@ public void run() {
mergedMD.setValue("_redirTo", redirection);
}

// https://github.com/DigitalPebble/storm-crawler/issues/954
// https://github.com/apache/incubator-stormcrawler/issues/954
if (allowRedirs() && StringUtils.isNotBlank(redirection)) {
emitOutlink(fit.t, url, redirection, mergedMD);
}
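For context: the redirect handling referenced by issue 954 records the target under `_redirTo` and, when redirects may be followed, re-emits it as an outlink. A condensed, runnable sketch of that pattern (class, flag, and print-out are illustrative, not StormCrawler's actual API):

```java
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;

public class RedirectDemo {
    // illustrative stand-in for the topology configuration flag
    static boolean allowRedirs = true;

    // condensed version of the pattern in the hunks above (issue 954)
    static void handleRedirect(String url, String redirection, Map<String, String> metadata) {
        if (StringUtils.isNotBlank(redirection)) {
            // record where the page redirected to, for the status stream
            metadata.put("_redirTo", redirection);
            if (allowRedirs) {
                // in StormCrawler this would call emitOutlink(...); printed here instead
                System.out.println("outlink: " + url + " -> " + redirection);
            }
        }
    }

    public static void main(String[] args) {
        handleRedirect("http://stormcrawler.apache.org/old",
                "https://stormcrawler.apache.org/", new HashMap<>());
    }
}
```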
@@ -347,7 +347,7 @@ public void execute(Tuple tuple) {
LOG.info("Found redir in {} to {}", url, redirection);
metadata.setValue("_redirTo", redirection);

// https://github.com/DigitalPebble/storm-crawler/issues/954
// https://github.com/apache/incubator-stormcrawler/issues/954
if (allowRedirs() && StringUtils.isNotBlank(redirection)) {
emitOutlink(tuple, new URL(url), redirection, metadata);
}
@@ -256,7 +256,7 @@ public void execute(Tuple input) {
metadata = new Metadata();
}

// https://github.com/DigitalPebble/storm-crawler/issues/813
// https://github.com/apache/incubator-stormcrawler/issues/813
metadata.remove("fetch.exception");

URL url;
@@ -326,7 +326,7 @@ public void execute(Tuple input) {
}

// has found sitemaps
// https://github.com/DigitalPebble/storm-crawler/issues/710
// https://github.com/apache/incubator-stormcrawler/issues/710
// note: we don't care if the sitemap URLs were actually
// kept
boolean foundSitemap = (rules.getSitemaps().size() > 0);
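The `rules.getSitemaps()` check above comes from crawler-commons' robots rules. A minimal sketch of how sitemap URLs surface from a parsed robots.txt (the exact `parseContent` signature varies across crawler-commons versions, and the sample input is illustrative):

```java
import java.nio.charset.StandardCharsets;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class SitemapDiscoveryDemo {
    public static void main(String[] args) {
        byte[] robotsTxt = ("User-agent: *\n"
                + "Allow: /\n"
                + "Sitemap: https://stormcrawler.apache.org/sitemap.xml\n")
                .getBytes(StandardCharsets.UTF_8);
        BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
                "https://stormcrawler.apache.org/robots.txt",
                robotsTxt, "text/plain", "stormcrawler");
        // same check as in the hunks above: were any sitemaps advertised?
        boolean foundSitemap = rules.getSitemaps().size() > 0;
        System.out.println(foundSitemap + " " + rules.getSitemaps());
    }
}
```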
@@ -50,7 +50,7 @@ public class BasicURLNormalizer extends URLFilter {
/** Nutch 1098 - finds URL encoded parts of the URL */
private static final Pattern unescapeRulePattern = Pattern.compile("%([0-9A-Fa-f]{2})");

/** https://github.com/DigitalPebble/storm-crawler/issues/401 * */
/** https://github.com/apache/incubator-stormcrawler/issues/401 * */
private static final Pattern illegalEscapePattern = Pattern.compile("%u([0-9A-Fa-f]{4})");

// charset used for encoding URLs before escaping
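For context, `illegalEscapePattern` targets the non-standard `%uXXXX` escapes emitted by some legacy encoders, which strict URL decoders reject (issue 401). A self-contained sketch of what the pattern matches (demo class and sample URL are illustrative):

```java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class IllegalEscapeDemo {
    // same pattern as in the hunk above: non-standard %uXXXX escapes
    private static final Pattern ILLEGAL_ESCAPE = Pattern.compile("%u([0-9A-Fa-f]{4})");

    public static void main(String[] args) {
        Matcher m = ILLEGAL_ESCAPE.matcher("http://example.com/?q=%u0041%20b");
        while (m.find()) {
            // %u0041 -> 'A': the four hex digits encode the code point
            char c = (char) Integer.parseInt(m.group(1), 16);
            System.out.println(m.group() + " -> " + c);
        }
    }
}
```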
@@ -112,7 +112,7 @@ public void loadJSONResources(InputStream inputStream)

// if it contains a single object
// jump directly to its content
// https://github.com/DigitalPebble/storm-crawler/issues/1013
// https://github.com/apache/incubator-stormcrawler/issues/1013
if (rootNode.size() == 1 && rootNode.isObject()) {
rootNode = rootNode.fields().next().getValue();
}
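The unwrapping above (issue 1013) lets a JSON resource file nest its rules under a single named root object. A runnable Jackson sketch of the same step, with an illustrative document:

```java
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

public class SingleRootDemo {
    public static void main(String[] args) throws Exception {
        ObjectMapper mapper = new ObjectMapper();
        JsonNode rootNode = mapper.readTree("{\"com.example.Filter\":{\"key\":\"value\"}}");
        // same unwrapping as the hunk above: a root object with exactly one
        // field is replaced by that field's value before further processing
        if (rootNode.size() == 1 && rootNode.isObject()) {
            rootNode = rootNode.fields().next().getValue();
        }
        System.out.println(rootNode); // {"key":"value"}
    }
}
```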
@@ -36,7 +36,7 @@
* </pre>
*
* Will be replaced by <a href=
* "https://github.com/DigitalPebble/storm-crawler/issues/711">MetadataFilter to filter based on
* "https://github.com/apache/incubator-stormcrawler/issues/711">MetadataFilter to filter based on
* multiple key values</a>
*
* @since 1.14
@@ -207,7 +207,7 @@ public void execute(Tuple tuple) {
if (!status.equals(Status.FETCH_ERROR)) {
metadata.remove(Constants.fetchErrorCountParamName);
}
// https://github.com/DigitalPebble/storm-crawler/issues/415
// https://github.com/apache/incubator-stormcrawler/issues/415
// remove error related key values in case of success
if (status.equals(Status.FETCHED) || status.equals(Status.REDIRECTION)) {
metadata.remove(Constants.STATUS_ERROR_CAUSE);
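The hunk above clears error bookkeeping once a URL succeeds (issue 415), so stale counters and causes do not survive a recovery. A condensed sketch using plain maps (key names are illustrative; the real code uses `Constants` fields and the `Metadata` class):

```java
import java.util.HashMap;
import java.util.Map;

public class StatusCleanupDemo {
    enum Status { FETCHED, REDIRECTION, FETCH_ERROR }

    public static void main(String[] args) {
        Map<String, String> metadata = new HashMap<>();
        metadata.put("fetch.errorCount", "3");
        metadata.put("error.cause", "timeout");
        Status status = Status.FETCHED;
        // mirror of the hunk above: a non-error outcome drops the error count,
        // and a successful fetch or redirect drops the recorded cause
        if (!status.equals(Status.FETCH_ERROR)) {
            metadata.remove("fetch.errorCount");
        }
        if (status.equals(Status.FETCHED) || status.equals(Status.REDIRECTION)) {
            metadata.remove("error.cause");
        }
        System.out.println(metadata); // {}
    }
}
```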
@@ -58,7 +58,7 @@ public class ProtocolResponse {

/**
* @since 1.17
* @see <a href="https://github.com/DigitalPebble/storm-crawler/issues/776">Issue 776</a>
* @see <a href="https://github.com/apache/incubator-stormcrawler/issues/776">Issue 776</a>
*/
public static final String PROTOCOL_MD_PREFIX_PARAM = "protocol.md.prefix";

@@ -186,7 +186,7 @@ private static String getCharsetFromMeta(byte buffer[], int maxlength) {
int start = html.indexOf("<meta charset=\"");
if (start != -1) {
int end = html.indexOf('"', start + 15);
// https://github.com/DigitalPebble/storm-crawler/issues/870
// https://github.com/apache/incubator-stormcrawler/issues/870
// try on a slightly larger section of text if it is trimmed
if (end == -1 && ((maxlength + 10) < buffer.length)) {
return getCharsetFromMeta(buffer, maxlength + 10);
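The recursion above handles a `<meta charset="...">` declaration that gets cut off at `maxlength` (issue 870) by retrying on a slightly longer prefix of the buffer. A simplified, self-contained sketch (demo class and bounds handling are illustrative, not the actual method):

```java
import java.nio.charset.StandardCharsets;

public class MetaCharsetDemo {
    // simplified sketch of the retry logic in the hunk above (issue 870)
    static String getCharsetFromMeta(byte[] buffer, int maxlength) {
        int len = Math.min(maxlength, buffer.length);
        String html = new String(buffer, 0, len, StandardCharsets.US_ASCII).toLowerCase();
        int start = html.indexOf("<meta charset=\"");
        if (start != -1) {
            int end = html.indexOf('"', start + 15);
            // declaration trimmed? look again in a slightly longer slice
            if (end == -1 && (maxlength + 10) < buffer.length) {
                return getCharsetFromMeta(buffer, maxlength + 10);
            }
            if (end != -1) {
                return html.substring(start + 15, end);
            }
        }
        return null;
    }

    public static void main(String[] args) {
        byte[] page = "<html><head><meta charset=\"utf-8\"></head></html>"
                .getBytes(StandardCharsets.US_ASCII);
        // 30 bytes cuts the declaration mid-value; the retry recovers it
        System.out.println(getCharsetFromMeta(page, 30)); // utf-8
    }
}
```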
@@ -40,7 +40,7 @@ void setupParserBolt() {
}

private void checkOutput() {
Assertions.assertEquals(170, output.getEmitted(Constants.StatusStreamName).size());
Assertions.assertEquals(7, output.getEmitted(Constants.StatusStreamName).size());
List<Object> fields = output.getEmitted(Constants.StatusStreamName).get(0);
Assertions.assertEquals(3, fields.size());
}
@@ -51,7 +51,7 @@ void testFeedParsing() throws IOException {
Metadata metadata = new Metadata();
// specify that it is a Feed file
metadata.setValue(FeedParserBolt.isFeedKey, "true");
parse("http://www.guardian.com/Feed.xml", "guardian.rss", metadata);
parse("https://stormcrawler.apache.org/rss.xml", "stormcrawler.rss", metadata);
checkOutput();
}

@@ -66,7 +66,7 @@ void testFeedParsingNoMT() throws IOException {
Metadata metadata = new Metadata();
// set mime-type
metadata.setValue("http." + HttpHeaders.CONTENT_TYPE, "application/rss+xml");
parse("http://www.guardian.com/feed.xml", "guardian.rss", metadata);
parse("https://stormcrawler.apache.org/rss.xml", "stormcrawler.rss", metadata);
checkOutput();
}

@@ -78,15 +78,15 @@ void testFeedParsingDetextBytes() throws IOException {
bolt.prepare(
parserConfig, TestUtil.getMockedTopologyContext(), new OutputCollector(output));
Metadata metadata = new Metadata();
parse("http://www.guardian.com/feed.xml", "guardian.rss", metadata);
parse("https://stormcrawler.apache.org/rss.xml", "stormcrawler.rss", metadata);
checkOutput();
}

@Test
void testNonFeedParsing() throws IOException {
prepareParserBolt("test.parsefilters.json");
// do not specify that it is a feed file
parse("http://www.digitalpebble.com", "digitalpebble.com.html", new Metadata());
parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html", new Metadata());
Assertions.assertEquals(1, output.getEmitted().size());
}
}
@@ -119,7 +119,7 @@ void setupParserBolt() {
void testNoScriptInText() throws IOException {
bolt.prepare(
new HashMap(), TestUtil.getMockedTopologyContext(), new OutputCollector(output));
parse("http://www.digitalpebble.com", "digitalpebble.com.html");
parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html");
List<Object> parsedTuple = output.getEmitted().remove(0);
// check in the metadata that the values match
String text = (String) parsedTuple.get(3);
@@ -133,9 +133,9 @@ void testNoScriptInText() throws IOException {
void testNoFollowOutlinks() throws IOException {
bolt.prepare(
new HashMap(), TestUtil.getMockedTopologyContext(), new OutputCollector(output));
parse("http://www.digitalpebble.com", "digitalpebble.com.html");
parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html");
List<List<Object>> statusTuples = output.getEmitted(Constants.StatusStreamName);
Assertions.assertEquals(10, statusTuples.size());
Assertions.assertEquals(25, statusTuples.size());
}

@Test
@@ -144,7 +144,7 @@ void testHTTPRobots() throws IOException {
new HashMap(), TestUtil.getMockedTopologyContext(), new OutputCollector(output));
Metadata metadata = new Metadata();
metadata.setValues("X-Robots-Tag", new String[] {"noindex", "nofollow"});
parse("http://www.digitalpebble.com", "digitalpebble.com.html", metadata);
parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html", metadata);
List<List<Object>> statusTuples = output.getEmitted(Constants.StatusStreamName);
// no outlinks at all
Assertions.assertEquals(0, statusTuples.size());
@@ -170,7 +170,7 @@ void testRobotsMetaProcessor() throws IOException {
new HashMap(), TestUtil.getMockedTopologyContext(), new OutputCollector(output));
for (int i = 0; i < tests.length; i++) {
byte[] bytes = tests[i].getBytes(StandardCharsets.UTF_8);
parse("http://www.digitalpebble.com", bytes, new Metadata());
parse("http://stormcrawler.apache.org", bytes, new Metadata());
Assertions.assertEquals(1, output.getEmitted().size());
List<Object> parsedTuple = output.getEmitted().remove(0);
// check in the metadata that the values match
@@ -205,7 +205,7 @@ void testHTMLRedir() throws IOException {
void testExecuteWithOutlinksLimit() throws IOException {
stormConf.put("parser.emitOutlinks.max.per.page", 5);
bolt.prepare(stormConf, TestUtil.getMockedTopologyContext(), new OutputCollector(output));
parse("http://www.digitalpebble.com", "digitalpebble.com.html");
parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html");
List<List<Object>> statusTuples = output.getEmitted(Constants.StatusStreamName);
// outlinks being limited by property
Assertions.assertEquals(5, statusTuples.size());
@@ -215,10 +215,10 @@ void testExecuteWithOutlinksLimit() throws IOException {
void testExecuteWithOutlinksLimitDisabled() throws IOException {
stormConf.put("parser.emitOutlinks.max.per.page", -1);
bolt.prepare(stormConf, TestUtil.getMockedTopologyContext(), new OutputCollector(output));
parse("http://www.digitalpebble.com", "digitalpebble.com.html");
parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html");
List<List<Object>> statusTuples = output.getEmitted(Constants.StatusStreamName);
// outlinks NOT being limited by property, since it is disabled with -1
Assertions.assertEquals(10, statusTuples.size());
Assertions.assertEquals(25, statusTuples.size());
}

