Skip to content

Commit

Permalink
Improve sanitization of broken comments.
Browse files Browse the repository at this point in the history
  • Loading branch information
jbaiter committed Jan 19, 2024
1 parent f25c910 commit 36adcfe
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -171,28 +171,19 @@ public int read(char[] cbuf, int off, int len) throws IOException {
}

if (cbuf[startElem + 1] == '!') {
boolean illegal = (
// Comment?
(cbuf[startElem + 2] == '-' && cbuf[startElem + 3] != '-')
// Doctype?
|| ((cbuf[startElem + 2] == 'D' || cbuf[startElem + 2] == 'd')
&& (endElem - startElem) < 12)
// CDATA?
|| (cbuf[startElem + 2] == '[' && (endElem - startElem) < 10));
if (illegal) {
boolean isComment = isLegalComment(cbuf, startElem, endElem);
boolean isDoctype =
(cbuf[startElem + 2] == 'D' || cbuf[startElem + 2] == 'd')
&& ((endElem - startElem) >= 12);
boolean isCdata = (cbuf[startElem + 2] == '[') && ((endElem - startElem) >= 10);
if (!isComment && !isDoctype && !isCdata) {
cbuf[startElem] = '_';
cbuf[endElem] = '-';
continue;
cbuf[endElem] = '_';
}
}
}

if (cbuf[startElem + 1] == '!' && cbuf[startElem + 2] == '-'
&& cbuf[startElem + 3] == '-') {
// Comment, nothing to do
continue;
}
} else if (cbuf[startElem + 1] == '!' && cbuf[startElem + 2] == '-' && cbuf[startElem + 3] == '-') {
// Comment, nothing to do
if (isLegalComment(cbuf, startElem, endElem)) {
continue;
}

Expand Down Expand Up @@ -363,6 +354,13 @@ private static boolean isLegalEntity(char[] cbuf, int startIdx, int endIdx) {
return true;
}

/**
 * Checks whether the markup spanning {@code startIdx} to {@code endIdx} is delimited like an XML
 * comment, i.e. opens with {@code <!--} and closes with {@code -->}.
 *
 * <p>Only the delimiters are inspected; the comment body is not validated. The characters at
 * {@code startIdx} and {@code endIdx} themselves are not checked (presumably the caller has
 * already established that they are {@code '<'} and {@code '>'} -- verify against the caller).
 *
 * @param cbuf buffer holding the markup
 * @param startIdx index of the first character of the element
 * @param endIdx index of the last character of the element
 * @return {@code true} if the span is long enough to hold distinct opening and closing dashes and
 *     both delimiters are present, {@code false} otherwise
 */
private static boolean isLegalComment(char[] cbuf, int startIdx, int endIdx) {
  // The shortest well-formed comment is "<!---->" (endIdx - startIdx == 6). Anything shorter,
  // such as "<!-->" or "<!--->", would have the opening and closing dashes overlap and must not
  // be accepted. The guard also keeps all index accesses below within [startIdx, endIdx].
  if (endIdx - startIdx < 6) {
    return false;
  }
  if (cbuf[startIdx + 1] != '!' || cbuf[startIdx + 2] != '-' || cbuf[startIdx + 3] != '-') {
    return false;
  }
  return cbuf[endIdx - 2] == '-' && cbuf[endIdx - 1] == '-';
}

@Override
public Optional<String> getSource() {
if (this.input instanceof SourceAwareReader) {
Expand Down
29 changes: 28 additions & 1 deletion src/test/java/com/github/dbmdz/solrocr/solr/HocrTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -560,7 +560,6 @@ public void testBrokenEntities() throws IOException {
"contains(.//lst[@name='87372']//arr[@name='snippets']/lst/str[@name='text']/text(), 'dt_HiFi-i?cBßflpedx1ttonI-iii;_;ikW')");
}


public void testBrokenPIs() throws IOException {
Path ocrPath = Paths.get("src/test/resources/data/hocr_broken_pis.html");
assertU(
Expand Down Expand Up @@ -589,6 +588,34 @@ public void testBrokenPIs() throws IOException {
"contains(.//lst[@name='87373']//arr[@name='snippets']/lst/str[@name='text']/text(), '_?«»«?_i_5t»_?».')");
}

/**
 * Indexes the {@code hocr_broken_comment.html} fixture as document {@code 87374}, searches it for
 * {@code eiiigekaufs} with OCR highlighting enabled and asserts that the returned snippet text
 * contains {@code !'_!--,,,_} (presumably the masked form of the malformed comment in the
 * fixture).
 *
 * @throws IOException if the fixture file cannot be read
 */
public void testBrokenComment() throws IOException {
  byte[] rawOcr = Files.readAllBytes(Paths.get("src/test/resources/data/hocr_broken_comment.html"));
  String ocrMarkup = new String(rawOcr, StandardCharsets.UTF_8);

  // Index the fixture and make it visible to searches.
  assertU(adoc("ocr_text_stored", ocrMarkup, "id", "87374"));
  assertU(commit());

  SolrQueryRequest highlightRequest =
      xmlQ(
          "q", "eiiigekaufs",
          "hl.snippets", "4096",
          "hl.weightMatches", "true",
          "hl.ocr.contextSize", "4",
          "df", "ocr_text_stored",
          "hl.ocr.fl", "ocr_text_stored");

  String snippetXPath =
      "contains(.//lst[@name='87374']//arr[@name='snippets']/lst/str[@name='text']/text(), \"!'_!--,,,_\")";
  assertQ(highlightRequest, snippetXPath);
}

public void testHlQParam() {
SolrQueryRequest req = xmlQ("q", "ocr_text:\"nathanael brush\"", "hl.q", "nathanael");
assertQ(
Expand Down
Loading

0 comments on commit 36adcfe

Please sign in to comment.