diff --git a/core/src/test/java/org/apache/stormcrawler/bolt/SiteMapParserBoltTest.java b/core/src/test/java/org/apache/stormcrawler/bolt/SiteMapParserBoltTest.java index de8d77784..8e6b4755f 100644 --- a/core/src/test/java/org/apache/stormcrawler/bolt/SiteMapParserBoltTest.java +++ b/core/src/test/java/org/apache/stormcrawler/bolt/SiteMapParserBoltTest.java @@ -68,8 +68,8 @@ void testSitemapIndexParsing() throws IOException { // and its mime-type metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml"); parse( - "http://www.tripadvisor.com/sitemap-index.xml", - "tripadvisor.sitemap.index.xml", + "http://stormcrawler.apache.org/sitemap-index.xml", + "stormcrawler.sitemap.index.xml", metadata); for (List fields : output.getEmitted(Constants.StatusStreamName)) { Metadata parsedMetadata = (Metadata) fields.get(1); @@ -85,8 +85,8 @@ void testGzipSitemapParsing() throws IOException { Metadata metadata = new Metadata(); // specify that it is a sitemap file metadata.setValue(SiteMapParserBolt.isSitemapKey, "true"); - parse("https://www.tripadvisor.com/sitemap.xml.gz", "tripadvisor.sitemap.xml.gz", metadata); - Assertions.assertEquals(50001, output.getEmitted(Constants.StatusStreamName).size()); + parse("https://stormcrawler.apache.org/sitemap.xml.gz", "stormcrawler.sitemap.xml.gz", metadata); + Assertions.assertEquals(7, output.getEmitted(Constants.StatusStreamName).size()); } @Test diff --git a/core/src/test/resources/tripadvisor.sitemap.index.xml b/core/src/test/resources/stormcrawler.sitemap.index.xml similarity index 50% rename from core/src/test/resources/tripadvisor.sitemap.index.xml rename to core/src/test/resources/stormcrawler.sitemap.index.xml index 798cd8fb7..a1d80d80c 100644 --- a/core/src/test/resources/tripadvisor.sitemap.index.xml +++ b/core/src/test/resources/stormcrawler.sitemap.index.xml @@ -18,24 +18,24 @@ specific language governing permissions and limitations under the License. 
--> - <sitemap> - <loc>https://www.tripadvisor.com/sitemap/2/en_US/sitemap-1806509-en_US-hotel_review-1686849999.xml.gz</loc> - <lastmod>2023-06-15T17:26:39Z</lastmod> - </sitemap> - <sitemap> - <loc>https://www.tripadvisor.com/sitemap/2/en_US/sitemap-1806530-en_US-hotel_review-1686850054.xml.gz</loc> - <lastmod>2023-06-15T17:27:34Z</lastmod> - </sitemap> - <sitemap> - <loc>https://www.tripadvisor.com/sitemap/2/en_US/sitemap-1806537-en_US-hotel_review-1686850072.xml.gz</loc> - <lastmod>2023-06-15T17:27:52Z</lastmod> - </sitemap> - <sitemap> - <loc>https://www.tripadvisor.com/sitemap/2/en_US/sitemap-1841024-en_US-hotel_review-1694976638.xml.gz</loc> - <lastmod>2023-09-17T18:50:38Z</lastmod> - </sitemap> - </sitemapindex> \ No newline at end of file + <sitemap> + <loc>https://stormcrawler.apache.org/sitemap-001.xml.gz</loc> + <lastmod>2024-10-19T11:21:53Z</lastmod> + </sitemap> + <sitemap> + <loc>https://stormcrawler.apache.org/sitemap-002.xml.gz</loc> + <lastmod>2024-10-19T11:21:53Z</lastmod> + </sitemap> + <sitemap> + <loc>https://stormcrawler.apache.org/sitemap-003.xml.gz</loc> + <lastmod>2024-10-19T11:21:53Z</lastmod> + </sitemap> + <sitemap> + <loc>https://stormcrawler.apache.org/sitemap-004.xml.gz</loc> + <lastmod>2024-10-19T11:21:53Z</lastmod> + </sitemap> + </sitemapindex> diff --git a/core/src/test/resources/stormcrawler.sitemap.xml.gz b/core/src/test/resources/stormcrawler.sitemap.xml.gz new file mode 100644 index 000000000..fe4ae7abe Binary files /dev/null and b/core/src/test/resources/stormcrawler.sitemap.xml.gz differ diff --git a/core/src/test/resources/tripadvisor.sitemap.xml.gz b/core/src/test/resources/tripadvisor.sitemap.xml.gz deleted file mode 100644 index e6f53717d..000000000 Binary files a/core/src/test/resources/tripadvisor.sitemap.xml.gz and /dev/null differ