diff --git a/src/Smalot/PdfParser/Document.php b/src/Smalot/PdfParser/Document.php
index 016787af..df0a6402 100644
--- a/src/Smalot/PdfParser/Document.php
+++ b/src/Smalot/PdfParser/Document.php
@@ -287,7 +287,8 @@ public function extractXMPMetadata(string $content): void
             }
 
-            // Only use this metadata if it's referring to a PDF
-            if (isset($metadata['dc:format']) && 'application/pdf' == $metadata['dc:format']) {
+            // Use this metadata unless it explicitly declares a non-PDF
+            // format; packets without a dc:format tag are accepted as well
+            if (!isset($metadata['dc:format']) || 'application/pdf' === $metadata['dc:format']) {
                 // According to the XMP specifications: 'Conflict resolution
                 // for separate packets that describe the same resource is
                 // beyond the scope of this document.' - Section 6.1
diff --git a/tests/PHPUnit/Integration/DocumentTest.php b/tests/PHPUnit/Integration/DocumentTest.php
index 5f19b696..346ba633 100644
--- a/tests/PHPUnit/Integration/DocumentTest.php
+++ b/tests/PHPUnit/Integration/DocumentTest.php
@@ -232,4 +232,42 @@ public function testGetPagesMissingCatalog(): void
         $document = $this->getDocumentInstance();
         $document->getPages();
     }
+
+    /**
+     * XMP metadata must still be extracted when the packet carries no
+     * dc:format tag.
+     *
+     * @see https://github.com/smalot/pdfparser/issues/721
+     */
+    public function testExtractXMPMetadataIssue721(): void
+    {
+        $document = $this->getDocumentInstance();
+
+        // Check that XMP metadata is parsed even if missing a dc:format tag.
+        // NOTE(review): the packet markup below was reconstructed around the
+        // surviving text values; confirm it against the sample from issue 721.
+        $content = '<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
+<x:xmpmeta xmlns:x="adobe:ns:meta/">
+  <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
+    <rdf:Description rdf:about="" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:xmp="http://ns.adobe.com/xap/1.0/">
+      <dc:creator>
+        <rdf:Seq>
+          <rdf:li>PdfParser</rdf:li>
+        </rdf:Seq>
+      </dc:creator>
+      <xmp:CreateDate>2018-02-07T11:51:44-05:00</xmp:CreateDate>
+      <xmp:ModifyDate>2019-10-23T09:56:01-04:00</xmp:ModifyDate>
+    </rdf:Description>
+  </rdf:RDF>
+</x:xmpmeta>';
+
+        $document->extractXMPMetadata($content);
+        $document->init();
+        $details = $document->getDetails();
+
+        // NOTE(review): presumably the fourth entry is added by init() - confirm
+        $this->assertEquals(4, \count($details));
+        $this->assertEquals('PdfParser', $details['dc:creator']);
+        $this->assertEquals('2019-10-23T09:56:01-04:00', $details['xmp:modifydate']);
+    }
 }