Skip to content

Commit

Permalink
Replace tripadvisor sitemap example with stormcrawler.apache.org content
Browse files (browse the repository at this point in the history)
  • Loading branch information
rzo1 committed Nov 22, 2024
1 parent ee50de8 commit fa509a5
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 24 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -68,8 +68,8 @@ void testSitemapIndexParsing() throws IOException {
// and its mime-type
metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml");
parse(
"http://www.tripadvisor.com/sitemap-index.xml",
"tripadvisor.sitemap.index.xml",
"http://stormcrawler.apache.org/sitemap-index.xml",
"stormcrawler.sitemap.index.xml",
metadata);
for (List<Object> fields : output.getEmitted(Constants.StatusStreamName)) {
Metadata parsedMetadata = (Metadata) fields.get(1);
Expand All @@ -85,8 +85,8 @@ void testGzipSitemapParsing() throws IOException {
Metadata metadata = new Metadata();
// specify that it is a sitemap file
metadata.setValue(SiteMapParserBolt.isSitemapKey, "true");
parse("https://www.tripadvisor.com/sitemap.xml.gz", "tripadvisor.sitemap.xml.gz", metadata);
Assertions.assertEquals(50001, output.getEmitted(Constants.StatusStreamName).size());
parse("https://stormcrawler.apache.org/sitemap.xml.gz", "stormcrawler.sitemap.xml.gz", metadata);
Assertions.assertEquals(7, output.getEmitted(Constants.StatusStreamName).size());
}

@Test
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -18,24 +18,24 @@ specific language governing permissions and limitations
under the License.
-->
<sitemapindex
xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd">
<sitemap>
<loc>https://www.tripadvisor.com/sitemap/2/en_US/sitemap-1806509-en_US-hotel_review-1686849999.xml.gz</loc>
<lastmod>2023-06-15T17:26:39Z</lastmod>
</sitemap>
<sitemap>
<loc>https://www.tripadvisor.com/sitemap/2/en_US/sitemap-1806530-en_US-hotel_review-1686850054.xml.gz</loc>
<lastmod>2023-06-15T17:27:34Z</lastmod>
</sitemap>
<sitemap>
<loc>https://www.tripadvisor.com/sitemap/2/en_US/sitemap-1806537-en_US-hotel_review-1686850072.xml.gz</loc>
<lastmod>2023-06-15T17:27:52Z</lastmod>
</sitemap>
<sitemap>
<loc>https://www.tripadvisor.com/sitemap/2/en_US/sitemap-1841024-en_US-hotel_review-1694976638.xml.gz</loc>
<lastmod>2023-09-17T18:50:38Z</lastmod>
</sitemap>
</sitemapindex>
<sitemap>
<loc>https://stormcrawler.apache.org/sitemap-001.xml.gz</loc>
<lastmod>2024-10-19T11:21:53Z</lastmod>
</sitemap>
<sitemap>
<loc>https://stormcrawler.apache.org/sitemap-002.xml.gz</loc>
<lastmod>2024-10-19T11:21:53Z</lastmod>
</sitemap>
<sitemap>
<loc>https://stormcrawler.apache.org/sitemap-003.xml.gz</loc>
<lastmod>2024-10-19T11:21:53Z</lastmod>
</sitemap>
<sitemap>
<loc>https://stormcrawler.apache.org/sitemap-004.xml.gz</loc>
<lastmod>2024-10-19T11:21:53Z</lastmod>
</sitemap>
</sitemapindex>
Binary file added core/src/test/resources/stormcrawler.sitemap.xml.gz
Binary file not shown.
Binary file removed core/src/test/resources/tripadvisor.sitemap.xml.gz
Binary file not shown.

0 comments on commit fa509a5

Please sign in to comment.