diff --git a/samples/bugs/Issue479.pdf b/samples/bugs/Issue479.pdf new file mode 100644 index 00000000..b304d677 Binary files /dev/null and b/samples/bugs/Issue479.pdf differ diff --git a/src/Smalot/PdfParser/RawData/RawDataParser.php b/src/Smalot/PdfParser/RawData/RawDataParser.php index 9a647837..86b157da 100644 --- a/src/Smalot/PdfParser/RawData/RawDataParser.php +++ b/src/Smalot/PdfParser/RawData/RawDataParser.php @@ -269,8 +269,11 @@ protected function decodeXrefStream(string $pdfData, int $startxref, array $xref ) { $valid_crs = true; } elseif (('/' == $v[0]) && ('Index' == $v[1]) && (isset($sarr[($k + 1)]))) { - // first object number in the subsection - $index_first = (int) ($sarr[($k + 1)][1][0][1]); + // build the list of /Index pairs: [first object number in the subsection, number of objects] + $index_blocks = []; + for ($m = 0; $m < \count($sarr[($k + 1)][1]); $m += 2) { + $index_blocks[] = [$sarr[($k + 1)][1][$m][1], $sarr[($k + 1)][1][$m + 1][1]]; + } } elseif (('/' == $v[0]) && ('Prev' == $v[1]) && (isset($sarr[($k + 1)]) && ('numeric' == $sarr[($k + 1)][0]))) { // get previous xref offset $prevxref = (int) ($sarr[($k + 1)][1]); @@ -432,8 +435,9 @@ protected function decodeXrefStream(string $pdfData, int $startxref, array $xref } // fill xref - if (isset($index_first)) { - $obj_num = $index_first; + if (isset($index_blocks)) { + // load the first object number of the first /Index entry + $obj_num = $index_blocks[0][0]; } else { $obj_num = 0; } @@ -463,6 +467,21 @@ protected function decodeXrefStream(string $pdfData, int $startxref, array $xref break; } ++$obj_num; + if (isset($index_blocks)) { + // reduce the number of remaining objects + --$index_blocks[0][1]; + if (0 == $index_blocks[0][1]) { + // remove the currently used /Index entry + array_shift($index_blocks); + if (0 < \count($index_blocks)) { + // load the first object number of the following /Index entry + $obj_num = $index_blocks[0][0]; + } else { + // if there are no more entries, remove $index_blocks 
to avoid actions on an empty array + unset($index_blocks); + } + } + } } } // end decoding data if (isset($prevxref)) { diff --git a/tests/Integration/PageTest.php b/tests/Integration/PageTest.php index 3f136f57..8d0670ce 100644 --- a/tests/Integration/PageTest.php +++ b/tests/Integration/PageTest.php @@ -486,12 +486,17 @@ public function testGetPages() $document = $this->getParserInstance()->parseFile($filename); $pages = $document->getPages(); - // This should actually be 3 pages, but as long as the cause for issue #331 - // has not been found and the issue is not fixed, we'll settle for 2 here. - // We still test for the count, so in case the bug should be fixed - // unknowingly, we don't forget to resolve the issue as well and make sure - // this assertion is present. - $this->assertCount(2, $pages); + /* + * The problem of issue #331 is fixed by the pull request for issue #479. + * The original Issue331.pdf was modified, so in the updated version (current + * version) a new xref was added and the valid /Index now has the following value: + * [1 1 3 1 7 1 175 1 178 1 219 2] + * This means that there are 6 pairs containing the values for 'first object id' + * and 'number of objects'. Until now only the first entry was used, so the + * objects of all following entries got a wrong id. + * With the fix for issue #479 the expected number of pages is now counted. + */ + $this->assertCount(3, $pages); foreach ($pages as $page) { $this->assertTrue($page instanceof Page); diff --git a/tests/Integration/RawData/RawDataParserTest.php b/tests/Integration/RawData/RawDataParserTest.php index 7f15870f..f5b16fa9 100644 --- a/tests/Integration/RawData/RawDataParserTest.php +++ b/tests/Integration/RawData/RawDataParserTest.php @@ -119,4 +119,35 @@ public function testDecodeObjectHeaderIssue405() $this->assertStringContainsString('Bug fix: PR #405', $pages[0]->getText()); } + + /** + * Tests buggy behavior of decodeXrefStream. 
+ * + * When a PDF has more than one entry in the /Index area (for example after changing + * the document description), only the first entry was used before the fix. + * Without the fix, the array returned by getDetails() contains only the entry + * with the key 'Pages'. All other entries like 'Author', 'Creator', 'Title', + * 'Subject' (which come from the 'Info' object) are not listed, because the + * 'Info' object gets a wrong object id while the data is parsed into the xref structure. + * So the object id listed at the /Info entry is not valid and the data of the info object + * cannot be loaded while executing Document::buildDetails(). + * + * @see https://github.com/smalot/pdfparser/pull/479 + */ + public function testDecodeXrefStreamIssue479() + { + $filename = $this->rootDir.'/samples/bugs/Issue479.pdf'; + + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $details = $document->getDetails(); + + $this->assertArrayHasKey('Author', $details); + $this->assertArrayHasKey('CreationDate', $details); + $this->assertArrayHasKey('Creator', $details); + $this->assertArrayHasKey('ModDate', $details); + $this->assertArrayHasKey('Producer', $details); + $this->assertArrayHasKey('Subject', $details); + $this->assertArrayHasKey('Title', $details); + } }