diff --git a/samples/bugs/Issue454.pdf b/samples/bugs/Issue454.pdf
new file mode 100644
index 00000000..ec681890
Binary files /dev/null and b/samples/bugs/Issue454.pdf differ
diff --git a/src/Smalot/PdfParser/Page.php b/src/Smalot/PdfParser/Page.php
index 93899bd7..506a9457 100644
--- a/src/Smalot/PdfParser/Page.php
+++ b/src/Smalot/PdfParser/Page.php
@@ -224,8 +224,101 @@ public function getText(self $page = null): string
         return '';
     }
 
+    /**
+     * Return true if the document of this page was generated by FPDF/FPDI (setasign\Fpdi\Fpdi)
+     *
+     * The metadata 'Producer' should have the value of "FPDF" . FPDF_VERSION if the
+     * pdf file was generated by FPDF/FPDI, so any string starting with "FPDF" matches.
+     *
+     * @return bool true if the 'Producer' metadata is a string starting with "FPDF"
+     */
+    public function isFpdf(): bool
+    {
+        $producer = $this->document->getDetails()['Producer'] ?? null;
+        if (!\is_string($producer)) {
+            return false;
+        }
+
+        // strncmp() rather than str_starts_with(), which only exists since PHP 8.0.
+        return 0 === strncmp($producer, 'FPDF', 4);
+    }
+
+    /**
+     * Return the zero-based position of this page within the pages of its document
+     *
+     * @return int index of this page in $this->document->getPages(), or the page count if it is not among them
+     */
+    public function getPageNumber(): int
+    {
+        $pages = $this->document->getPages();
+        $numOfPages = \count($pages);
+        for ($pageNum = 0; $pageNum < $numOfPages; ++$pageNum) {
+            if ($pages[$pageNum] === $this) {
+                return $pageNum;
+            }
+        }
+
+        return $numOfPages; // not found: one past the last valid index
+    }
+
+    /**
+     * Return the XObject holding the content of this page in a FPDF/FPDI document
+     *
+     * The XObjects of the page are looked up with the page number as index, so the
+     * result is only meaningful if an XObject exists at that index.
+     *
+     * @return PDFObject The PDFObject for the page
+     */
+    public function getPDFObjectForFpdf(): PDFObject
+    {
+        $pageNum = $this->getPageNumber();
+        $xObjects = $this->getXObjects();
+
+        return $xObjects[$pageNum];
+    }
+
+    /**
+     * Return a new PDFObject of the document created with FPDF/FPDI
+     *
+     * The new object is built from the document, header, content and config
+     * of the PDFObject returned by getPDFObjectForFpdf()
+     *
+     * @return PDFObject The PDFObject
+     */
+    public function createPDFObjectForFpdf(): PDFObject
+    {
+        $pdfObject = $this->getPDFObjectForFpdf();
+        $new_content = $pdfObject->getContent();
+        $header = $pdfObject->getHeader();
+        $config = $pdfObject->config;
+
+        return new PDFObject($pdfObject->document, $header, $new_content, $config);
+    }
+
+    /**
+     * Return a new page built from the PDFObject returned by getPDFObjectForFpdf()
+     *
+     * @return Page The page
+     */
+    public function createPageForFpdf(): self
+    {
+        $pdfObject = $this->getPDFObjectForFpdf();
+        $new_content = $pdfObject->getContent();
+        $header = $pdfObject->getHeader();
+        $config = $pdfObject->config;
+
+        return new self($pdfObject->document, $header, $new_content, $config);
+    }
+
     public function getTextArray(self $page = null): array
     {
+        if ($this->isFpdf()) {
+            $pdfObject = $this->getPDFObjectForFpdf();
+            $newPdfObject = $this->createPDFObjectForFpdf();
+
+            return $newPdfObject->getTextArray($pdfObject);
+        }
+
         if ($contents = $this->get('Contents')) {
             if ($contents instanceof ElementMissing) {
                 return [];
@@ -300,6 +393,9 @@ public function extractRawData(): array
                 }
             }
         } else {
+            if ($this->isFpdf()) {
+                $content = $this->getPDFObjectForFpdf();
+            }
             $sectionsText = $content->getSectionsText($content->getContent());
             foreach ($sectionsText as $sectionText) {
                 $extractedData[] = ['t' => '', 'o' => 'BT', 'c' => ''];
@@ -329,6 +425,10 @@ public function extractDecodedRawData(array $extractedRawData = null): array
         }
         $currentFont = null; /** @var Font $currentFont */
         $clippedFont = null;
+        $fpdfPage = null;
+        if ($this->isFpdf()) {
+            $fpdfPage = $this->createPageForFpdf();
+        }
         foreach ($extractedRawData as &$command) {
             if ('Tj' == $command['o'] || 'TJ' == $command['o']) {
                 $data = $command['c'];
@@ -371,7 +471,8 @@ public function extractDecodedRawData(array $extractedRawData = null): array
                 }
             } elseif ('Tf' == $command['o'] || 'TF' == $command['o']) {
                 $fontId = explode(' ', $command['c'])[0];
-                $currentFont = $this->getFont($fontId);
+                // For a FPDF/FPDI document the font is looked up on $fpdfPage, the page built by createPageForFpdf()
+                $currentFont = null !== $fpdfPage ? $fpdfPage->getFont($fontId) : $this->getFont($fontId);
                 continue;
             } elseif ('Q' == $command['o']) {
                 $currentFont = $clippedFont;
diff --git a/tests/Integration/PageTest.php b/tests/Integration/PageTest.php
index 2976b3d5..5d7e992b 100644
--- a/tests/Integration/PageTest.php
+++ b/tests/Integration/PageTest.php
@@ -634,4 +634,94 @@ public function testGetDataTmIssue450()
         $this->assertEquals(756.25, $dataTm[0][0][5]);
         $this->assertEquals('{signature:signer505906:Please+Sign+Here}', $dataTm[0][1]);
     }
+
+    public function testIsFpdf(): void
+    {
+        $filename = $this->rootDir.'/samples/Document1_foxitreader.pdf';
+        $parser = $this->getParserInstance();
+        $document = $parser->parseFile($filename);
+        $pages = $document->getPages();
+        $page = $pages[0];
+        $this->assertFalse($page->isFpdf());
+        $filename = $this->rootDir.'/samples/bugs/Issue454.pdf';
+        $document = $parser->parseFile($filename);
+        $pages = $document->getPages();
+        $page = $pages[0];
+        $this->assertTrue($page->isFpdf());
+    }
+
+    public function testGetPageNumber(): void
+    {
+        $filename = $this->rootDir.'/samples/Document1_foxitreader.pdf';
+        $parser = $this->getParserInstance();
+        $document = $parser->parseFile($filename);
+        $pages = $document->getPages();
+        $page = $pages[0];
+        $this->assertSame(0, $page->getPageNumber());
+        $filename = $this->rootDir.'/samples/Document1_pdfcreator.pdf';
+        $document = $parser->parseFile($filename);
+        $pages = $document->getPages();
+        $page = $pages[0];
+        $this->assertSame(0, $page->getPageNumber());
+        $filename = $this->rootDir.'/samples/Document2_pdfcreator_nocompressed.pdf';
+        $document = $parser->parseFile($filename);
+        $pages = $document->getPages();
+        $page = $pages[0];
+        $this->assertSame(0, $page->getPageNumber());
+        $filename = $this->rootDir.'/samples/InternationalChars.pdf';
+        $document = $parser->parseFile($filename);
+        $pages = $document->getPages();
+        $page = $pages[0];
+        $this->assertSame(0, $page->getPageNumber());
+        $filename = $this->rootDir.'/samples/SimpleInvoiceFilledExample1.pdf';
+        $document = $parser->parseFile($filename);
+        $pages = $document->getPages();
+        $page = $pages[0];
+        $this->assertSame(0, $page->getPageNumber());
+        $filename = $this->rootDir.'/samples/bugs/Issue454.pdf';
+        $document = $parser->parseFile($filename);
+        $pages = $document->getPages();
+        $page = $pages[0];
+        $this->assertSame(0, $page->getPageNumber());
+        $page = $pages[1];
+        $this->assertSame(1, $page->getPageNumber());
+        $page = $pages[2];
+        $this->assertSame(2, $page->getPageNumber());
+        $page = $pages[3];
+        $this->assertSame(3, $page->getPageNumber());
+    }
+
+    public function testIssue454(): void
+    {
+        $filename = $this->rootDir.'/samples/bugs/Issue454.pdf';
+        $parser = $this->getParserInstance();
+        $document = $parser->parseFile($filename);
+        $pages = $document->getPages();
+        $page = $pages[0];
+        $dataTm = $page->getDataTm();
+        $this->assertIsArray($dataTm);
+        $this->assertNotEmpty($dataTm);
+        $this->assertIsArray($dataTm[0]);
+        $this->assertCount(2, $dataTm[0]);
+        $this->assertIsArray($dataTm[0][0]);
+        $this->assertCount(6, $dataTm[0][0]);
+        $this->assertEquals(201.96, $dataTm[0][0][4]);
+        $this->assertEquals(720.68, $dataTm[0][0][5]);
+        $this->assertStringContainsString('Document title', $dataTm[0][1]);
+        $textData = $page->getTextXY(201.96, 720.68);
+        $this->assertStringContainsString('Document title', $textData[0][1]);
+        $page = $pages[2];
+        $dataTm = $page->getDataTm();
+        $this->assertIsArray($dataTm);
+        $this->assertNotEmpty($dataTm);
+        $this->assertIsArray($dataTm[0]);
+        $this->assertCount(2, $dataTm[0]);
+        $this->assertIsArray($dataTm[0][0]);
+        $this->assertCount(6, $dataTm[0][0]);
+        $this->assertEquals(67.5, $dataTm[0][0][4]);
+        $this->assertEquals(756.25, $dataTm[0][0][5]);
+        $this->assertStringContainsString('{signature:signer505906:Please+Sign+Here}', $dataTm[0][1]);
+        $textData = $page->getTextXY(67.5, 756.25);
+        $this->assertStringContainsString('{signature:signer505906:Please+Sign+Here}', $textData[0][1]);
+    }
 }