Skip to content

Commit

Permalink
Export line-too-long information from the line parser method.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 557406552
  • Loading branch information
garyillyes committed May 22, 2024
1 parent 3f3cf4d commit da0b84f
Show file tree
Hide file tree
Showing 5 changed files with 88 additions and 24 deletions.
18 changes: 6 additions & 12 deletions reporting_robots.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@ static const std::vector<std::string> kUnsupportedTags = {
"clean-param", "crawl-delay", "host", "noarchive", "noindex", "nofollow"};

void RobotsParsingReporter::Digest(int line_num,
RobotsParsedLine::RobotsTagName parsed_tag,
bool is_typo) {
RobotsParsedLine::RobotsTagName parsed_tag) {
if (line_num > last_line_seen_) {
last_line_seen_ = line_num;
}
Expand All @@ -30,7 +29,6 @@ void RobotsParsingReporter::Digest(int line_num,

RobotsParsedLine& line = robots_parse_results_[line_num];
line.line_num = line_num;
line.is_typo = is_typo;
line.tag_name = parsed_tag;
}

Expand All @@ -53,23 +51,19 @@ void RobotsParsingReporter::HandleRobotsStart() {
void RobotsParsingReporter::HandleRobotsEnd() {}
void RobotsParsingReporter::HandleUserAgent(int line_num,
absl::string_view line_value) {
Digest(line_num, RobotsParsedLine::kUserAgent,
false /* typo info not yet available */);
Digest(line_num, RobotsParsedLine::kUserAgent);
}
void RobotsParsingReporter::HandleAllow(int line_num,
absl::string_view line_value) {
Digest(line_num, RobotsParsedLine::kAllow,
false /* we don't accept typos for sallow keys */);
Digest(line_num, RobotsParsedLine::kAllow);
}
void RobotsParsingReporter::HandleDisallow(int line_num,
absl::string_view line_value) {
Digest(line_num, RobotsParsedLine::kDisallow,
false /* typo info not yet available */);
Digest(line_num, RobotsParsedLine::kDisallow);
}
void RobotsParsingReporter::HandleSitemap(int line_num,
absl::string_view line_value) {
Digest(line_num, RobotsParsedLine::kSitemap,
false /* we don't accept typos for sitemap keys */);
Digest(line_num, RobotsParsedLine::kSitemap);
}
void RobotsParsingReporter::HandleUnknownAction(int line_num,
absl::string_view action,
Expand All @@ -79,7 +73,7 @@ void RobotsParsingReporter::HandleUnknownAction(int line_num,
? RobotsParsedLine::kUnused
: RobotsParsedLine::kUnknown;
unused_directives_++;
Digest(line_num, rtn, false /* by definition these can't be typos */);
Digest(line_num, rtn);
}

} // namespace googlebot
7 changes: 2 additions & 5 deletions reporting_robots.h
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
#ifndef THIRD_PARTY_ROBOTSTXT_REPORTING_ROBOTS_H_
#define THIRD_PARTY_ROBOTSTXT_REPORTING_ROBOTS_H_

#include <algorithm>
#include <string>
#include <vector>

#include "absl/container/btree_map.h"
#include "absl/log/check.h"
#include "absl/strings/string_view.h"
#include "robots.h"

namespace googlebot {
Expand Down Expand Up @@ -62,8 +60,7 @@ class RobotsParsingReporter : public googlebot::RobotsParseHandler {
}

private:
void Digest(int line_num, RobotsParsedLine::RobotsTagName parsed_tag,
bool is_typo);
void Digest(int line_num, RobotsParsedLine::RobotsTagName parsed_tag);

// Indexed and sorted by line number.
absl::btree_map<int, RobotsParsedLine> robots_parse_results_;
Expand Down
67 changes: 65 additions & 2 deletions reporting_robots_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ std::string LineMetadataToString(const RobotsParseHandler::LineMetadata& line) {
" has_directive: ", line.has_directive,
" has_comment: ", line.has_comment,
" is_comment: ", line.is_comment,
" is_acceptable_typo: ", line.is_acceptable_typo, " }");
" is_acceptable_typo: ", line.is_acceptable_typo,
" is_line_too_long: ", line.is_line_too_long, " }");
}

std::string TagNameToString(RobotsParsedLine::RobotsTagName tag_name) {
Expand Down Expand Up @@ -77,7 +78,10 @@ bool operator==(const RobotsParseHandler::LineMetadata& lhs,
const RobotsParseHandler::LineMetadata& rhs) {
return lhs.is_empty == rhs.is_empty &&
lhs.has_directive == rhs.has_directive &&
lhs.has_comment == rhs.has_comment && lhs.is_comment == rhs.is_comment;
lhs.has_comment == rhs.has_comment &&
lhs.is_comment == rhs.is_comment &&
lhs.is_acceptable_typo == rhs.is_acceptable_typo &&
lhs.is_line_too_long == rhs.is_line_too_long;
}

bool operator==(const RobotsParsedLine& lhs, const RobotsParsedLine& rhs) {
Expand Down Expand Up @@ -317,3 +321,62 @@ TEST(RobotsUnittest, LinesNumbersAreCountedCorrectly) {
EXPECT_EQ(4, report.valid_directives());
EXPECT_EQ(7, report.last_line_seen());
}

// Checks that an oversized directive line is reported with `is_line_too_long`
// set while the ordinary lines before and after it are not.
TEST(RobotsUnittest, LinesTooLongReportedCorrectly) {
  // Total length of the path placed on the oversized "disallow" line.
  constexpr size_t kLongPathLen = 2084 * 8;
  const std::string path_prefix = "/x/";
  // "/x/" followed by as many 'a' characters as needed to reach kLongPathLen.
  const std::string long_path =
      path_prefix + std::string(kLongPathLen - path_prefix.size(), 'a');

  // Three directive lines; the trailing newline yields an empty fourth line.
  std::string robotstxt = "user-agent: foo\n";
  absl::StrAppend(&robotstxt, "disallow: ", long_path, "\n", "allow: /\n");

  RobotsParsingReporter report;
  googlebot::ParseRobotsTxt(robotstxt, &report);
  EXPECT_EQ(3, report.valid_directives());
  EXPECT_EQ(4, report.last_line_seen());
  EXPECT_EQ(report.parse_results().size(), report.last_line_seen());
  std::vector<absl::string_view> lines = absl::StrSplit(robotstxt, '\n');

  // Builds the expected parse result for a plain directive line (no comment,
  // no typo); only the line number, tag and too-long flag vary between cases.
  const auto expected_directive = [](int number,
                                     RobotsParsedLine::RobotsTagName tag,
                                     bool too_long) {
    return RobotsParsedLine{.line_num = number,
                            .tag_name = tag,
                            .is_typo = false,
                            .metadata = RobotsParseHandler::LineMetadata{
                                .is_empty = false,
                                .has_comment = false,
                                .is_comment = false,
                                .has_directive = true,
                                .is_line_too_long = too_long,
                            }};
  };

  // For line "user-agent: foo\n"  // 1
  expectLineToParseTo(
      lines, report.parse_results(),
      expected_directive(1, RobotsParsedLine::RobotsTagName::kUserAgent,
                         /*too_long=*/false));
  // For line "disallow: /x/a[...]a\n"  // 2
  expectLineToParseTo(
      lines, report.parse_results(),
      expected_directive(2, RobotsParsedLine::RobotsTagName::kDisallow,
                         /*too_long=*/true));
  // For line "allow: /\n"  // 3
  expectLineToParseTo(
      lines, report.parse_results(),
      expected_directive(3, RobotsParsedLine::RobotsTagName::kAllow,
                         /*too_long=*/false));
}
18 changes: 13 additions & 5 deletions robots.cc
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,8 @@ class RobotsTxtParser {
RobotsParseHandler::LineMetadata* metadata);
static void StripWhitespaceSlowly(char ** s);

void ParseAndEmitLine(int current_line, char* line);
void ParseAndEmitLine(int current_line, char* line,
bool* line_too_long_strict);
bool NeedEscapeValueForKey(const Key& key);

absl::string_view robots_body_;
Expand Down Expand Up @@ -382,13 +383,15 @@ void RobotsTxtParser::GetKeyAndValueFrom(
}
}

void RobotsTxtParser::ParseAndEmitLine(int current_line, char* line) {
void RobotsTxtParser::ParseAndEmitLine(int current_line, char* line,
bool* line_too_long_strict) {
char* string_key;
char* value;
RobotsParseHandler::LineMetadata line_metadata;
// Note that `string_key` and `value` are only set when
// `line_metadata.has_directive == true`.
GetKeyAndValueFrom(&string_key, &value, line, &line_metadata);
line_metadata.is_line_too_long = *line_too_long_strict;
if (!line_metadata.has_directive) {
handler_->ReportLineMetadata(current_line, line_metadata);
return;
Expand Down Expand Up @@ -416,12 +419,14 @@ void RobotsTxtParser::Parse() {
// that max url length of 2KB. We want some padding for
// UTF-8 encoding/nulls/etc. but a much smaller bound would be okay as well.
// If so, we can ignore the chars on a line past that.
const int kMaxLineLen = 2083 * 8;
const int kBrowserMaxLineLen = 2083;
const int kMaxLineLen = kBrowserMaxLineLen * 8;
// Allocate a buffer used to process the current line.
char* const line_buffer = new char[kMaxLineLen];
// line_buffer_end is the last writeable pos within the line buffer
// (only a final '\0' may go here).
const char* const line_buffer_end = line_buffer + kMaxLineLen - 1;
bool line_too_long_strict = false;
char* line_pos = line_buffer;
int line_num = 0;
size_t bom_pos = 0;
Expand All @@ -442,6 +447,8 @@ void RobotsTxtParser::Parse() {
// Put in next spot on current line, as long as there's room.
if (line_pos < line_buffer_end) {
*(line_pos++) = ch;
} else {
line_too_long_strict = true;
}
} else { // Line-ending character char case.
*line_pos = '\0';
Expand All @@ -450,15 +457,16 @@ void RobotsTxtParser::Parse() {
const bool is_CRLF_continuation =
(line_pos == line_buffer) && last_was_carriage_return && ch == 0x0A;
if (!is_CRLF_continuation) {
ParseAndEmitLine(++line_num, line_buffer);
ParseAndEmitLine(++line_num, line_buffer, &line_too_long_strict);
line_too_long_strict = false;
}
line_pos = line_buffer;
last_was_carriage_return = (ch == 0x0D);
}
}
}
*line_pos = '\0';
ParseAndEmitLine(++line_num, line_buffer);
ParseAndEmitLine(++line_num, line_buffer, &line_too_long_strict);
handler_->HandleRobotsEnd();
delete [] line_buffer;
}
Expand Down
2 changes: 2 additions & 0 deletions robots.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ class RobotsParseHandler {
// of the directive. See the key functions in ParsedRobotsKey for accepted
// typos.
bool is_acceptable_typo = false;
// Indicates that the line is too long, specifically over 2083 * 8 bytes.
bool is_line_too_long = false;
};

virtual void ReportLineMetadata(int line_num, const LineMetadata& metadata) {}
Expand Down

0 comments on commit da0b84f

Please sign in to comment.