Skip to content

Commit

Permalink
Fix potential integer overflow in string literal parsing
Browse files Browse the repository at this point in the history
When "backtracking", the goal is for the index to be unchanged on the next iteration of the loop. Previously this was done by subtracting in a defer and then unconditionally incrementing in the while loop's continue expression. Now we simply skip the increment when "backtracking", which avoids a possible integer overflow in certain rare scenarios.
  • Loading branch information
squeek502 committed Oct 30, 2023
1 parent bb6023a commit cd4e15c
Showing 1 changed file with 32 additions and 15 deletions.
47 changes: 32 additions & 15 deletions src/literals.zig
Original file line number Diff line number Diff line change
Expand Up @@ -156,13 +156,14 @@ pub const IterativeStringParser = struct {
.wide => 4,
};

while (self.code_page.codepointAt(self.index, self.source)) |codepoint| : (self.index += codepoint.byte_len) {
var backtrack: bool = undefined;
while (self.code_page.codepointAt(self.index, self.source)) |codepoint| : ({
if (!backtrack) self.index += codepoint.byte_len;
}) {
backtrack = false;
const c = codepoint.value;
var backtrack = false;
defer {
if (backtrack) {
self.index -= codepoint.byte_len;
} else {
if (!backtrack) {
if (c == '\t') {
self.column += columnsUntilTabStop(self.column, 8);
} else {
Expand Down Expand Up @@ -213,10 +214,12 @@ pub const IterativeStringParser = struct {
.newline => switch (c) {
'\r', ' ', '\t', '\n', '\x0b', '\x0c', '\xa0' => {},
else => {
// backtrack so that we handle the current char properly
// we intentionally avoid incrementing self.index
// to handle the current char in the next call,
// and we set backtrack so column count is handled correctly
backtrack = true;

// <space><newline>
self.index += codepoint.byte_len;
self.pending_codepoint = '\n';
return .{ .codepoint = ' ' };
},
Expand Down Expand Up @@ -263,9 +266,10 @@ pub const IterativeStringParser = struct {
else => switch (self.declared_string_type) {
.wide => {}, // invalid escape sequences are skipped in wide strings
.ascii => {
// backtrack so that we handle the current char properly
// we intentionally avoid incrementing self.index
// to handle the current char in the next call,
// and we set backtrack so column count is handled correctly
backtrack = true;
self.index += codepoint.byte_len;
return .{ .codepoint = '\\' };
},
},
Expand All @@ -277,9 +281,10 @@ pub const IterativeStringParser = struct {
'\r' => {},
'\n' => state = .escaped_newlines,
else => {
// backtrack so that we handle the current char properly
// we intentionally avoid incrementing self.index
// to handle the current char in the next call,
// and we set backtrack so column count is handled correctly
backtrack = true;
self.index += codepoint.byte_len;
return .{ .codepoint = '\\' };
},
},
Expand All @@ -306,14 +311,16 @@ pub const IterativeStringParser = struct {
}
},
else => {
// backtrack so that we handle the current char properly
// we intentionally avoid incrementing self.index
// to handle the current char in the next call,
// and we set backtrack so column count is handled correctly
backtrack = true;

// write out whatever byte we have parsed so far
const escaped_value = switch (self.declared_string_type) {
.ascii => @as(u8, @truncate(string_escape_n)),
.wide => string_escape_n,
};
self.index += codepoint.byte_len;
return .{ .codepoint = escaped_value, .from_escaped_integer = true };
},
},
Expand All @@ -332,15 +339,17 @@ pub const IterativeStringParser = struct {
}
},
else => {
// backtrack so that we handle the current char properly
// we intentionally avoid incrementing self.index
// to handle the current char in the next call,
// and we set backtrack so column count is handled correctly
backtrack = true;

// write out whatever byte we have parsed so far
// (even with 0 actual digits, \x alone parses to 0)
const escaped_value = switch (self.declared_string_type) {
.ascii => @as(u8, @truncate(string_escape_n)),
.wide => string_escape_n,
};
self.index += codepoint.byte_len;
return .{ .codepoint = escaped_value, .from_escaped_integer = true };
},
},
Expand Down Expand Up @@ -651,6 +660,14 @@ test "parse quoted ascii string with utf8 code page" {
.{ .slice = "\"\xF2\xAF\xBA\xB4\"", .code_page = .utf8 },
.{ .output_code_page = .utf8 },
));

// This used to cause integer overflow when reconsuming the 4-byte long codepoint
// after the escaped CRLF pair.
try std.testing.expectEqualSlices(u8, "\u{10348}", try parseQuotedAsciiString(
arena,
.{ .slice = "\"\\\r\n\u{10348}\"", .code_page = .utf8 },
.{ .output_code_page = .utf8 },
));
}

test "parse quoted wide string" {
Expand Down

0 comments on commit cd4e15c

Please sign in to comment.