Skip to content

Commit

Permalink
Refactor: declare pdf whitespaces in global variable to avoid bug (#411)
Browse files Browse the repository at this point in the history
* Refactor: declare pdf whitespaces in global variable to reduce chance of bug using regular whitespace

* Refactor: move $pdfWhitespaces and $pdfWhitespacesRegex to Config.php

* Bug fix in php 5.6

* Add explicit return types and phpdoc @var

* Merge #409
  • Loading branch information
LucianoHanna authored Apr 26, 2021
1 parent ef0204b commit 57667ee
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 23 deletions.
32 changes: 32 additions & 0 deletions src/Smalot/PdfParser/Config.php
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,18 @@ class Config
{
private $fontSpaceLimit = -50;

/**
* Represents: (NUL, HT, LF, FF, CR, SP)
* @var string
*/
private $pdfWhitespaces = "\0\t\n\f\r ";

/**
* Regex character class matching a single PDF whitespace character: (NUL, HT, LF, FF, CR, SP)
* @var string
*/
private $pdfWhitespacesRegex = '[\0\t\n\f\r ]';

public function getFontSpaceLimit()
{
return $this->fontSpaceLimit;
Expand All @@ -49,4 +61,24 @@ public function setFontSpaceLimit($value)
{
$this->fontSpaceLimit = $value;
}

/**
 * Get the characters treated as PDF whitespace.
 *
 * The default value is "\0\t\n\f\r " (NUL, HT, LF, FF, CR, SP).
 * RawDataParser passes this string to strspn() as the character mask
 * when skipping leading whitespace.
 *
 * @return string the whitespace characters, as a plain (non-regex) string
 */
public function getPdfWhitespaces(): string
{
    return $this->pdfWhitespaces;
}

/**
 * Replace the characters treated as PDF whitespace.
 *
 * The value is stored as given; no validation is performed.
 *
 * @param string $pdfWhitespaces the whitespace characters, as a plain (non-regex) string
 */
public function setPdfWhitespaces(string $pdfWhitespaces): void
{
    $this->pdfWhitespaces = $pdfWhitespaces;
}

/**
 * Get the regex fragment that matches a single PDF whitespace character.
 *
 * The default value is the character class '[\0\t\n\f\r ]'
 * (NUL, HT, LF, FF, CR, SP). It carries no delimiters; callers embed it
 * inside their own pattern.
 *
 * @return string the regex fragment
 */
public function getPdfWhitespacesRegex(): string
{
    return $this->pdfWhitespacesRegex;
}

/**
 * Replace the regex fragment that matches a single PDF whitespace character.
 *
 * The value is stored as given; no validation is performed. Because
 * RawDataParser concatenates it verbatim into a "/"-delimited pattern,
 * it has to be a valid PCRE fragment for that context.
 *
 * @param string $pdfWhitespacesRegex the regex fragment, without delimiters
 */
public function setPdfWhitespacesRegex(string $pdfWhitespacesRegex): void
{
    $this->pdfWhitespacesRegex = $pdfWhitespacesRegex;
}
}
2 changes: 1 addition & 1 deletion src/Smalot/PdfParser/Parser.php
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,8 @@ class Parser

public function __construct($cfg = [], Config $config = null)
{
$this->rawDataParser = new RawDataParser($cfg);
$this->config = $config ?: new Config();
$this->rawDataParser = new RawDataParser($cfg, $this->config);
}

/**
Expand Down
53 changes: 32 additions & 21 deletions src/Smalot/PdfParser/RawData/RawDataParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,15 @@
namespace Smalot\PdfParser\RawData;

use Exception;
use Smalot\PdfParser\Config;

class RawDataParser
{
/**
* @var \Smalot\PdfParser\Config
*/
private $config;

/**
* Configuration array.
*/
Expand All @@ -60,12 +66,13 @@ class RawDataParser
/**
* @param array $cfg Configuration array, default is []
*/
public function __construct($cfg = [])
public function __construct($cfg = [], Config $config = null)
{
// merge given array with default values
$this->cfg = array_merge($this->cfg, $cfg);

$this->filterHelper = new FilterHelper();
$this->config = $config ?: new Config();
}

/**
Expand Down Expand Up @@ -148,8 +155,8 @@ protected function decodeStream($pdfData, $xref, $sdic, $stream)
protected function decodeXref($pdfData, $startxref, $xref = [])
{
$startxref += 4; // 4 is the length of the word 'xref'
// skip initial white space chars: \x00 null (NUL), \x09 horizontal tab (HT), \x0A line feed (LF), \x0C form feed (FF), \x0D carriage return (CR), \x20 space (SP)
$offset = $startxref + strspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20", $startxref);
// skip initial white space chars
$offset = $startxref + strspn($pdfData, $this->config->getPdfWhitespaces(), $startxref);
// initialize object number
$obj_num = 0;
// search for cross-reference entries or subsection
Expand Down Expand Up @@ -463,6 +470,19 @@ protected function decodeXrefStream($pdfData, $startxref, $xref = [])
return $xref;
}

/**
 * Build the regular expression that recognises an indirect object header
 * such as "4 0 obj".
 *
 * Both parts of the object reference are matched literally, and each must be
 * followed by exactly one character from the configured PDF whitespace class.
 * The pattern is not anchored.
 *
 * @param array $objRefArr two-element array holding the parts of the object reference
 *
 * @return string a "/"-delimited PCRE pattern
 */
protected function getObjectHeaderPattern($objRefArr): string
{
    // consider all whitespace character (PDF specifications)
    $whitespace = $this->config->getPdfWhitespacesRegex();

    // The reference parts are literal text (getObjectHeaderLen() measures them
    // with strlen()), so quote them: a malformed reference containing a regex
    // metacharacter or the "/" delimiter must not alter or break the pattern.
    $first = preg_quote((string) $objRefArr[0], '/');
    $second = preg_quote((string) $objRefArr[1], '/');

    return '/'.$first.$whitespace.$second.$whitespace.'obj/';
}

/**
 * Compute the byte length of an indirect object header such as "4 0 obj".
 *
 * @param array $objRefArr two-element array holding the parts of the object reference
 *
 * @return int length of both reference parts plus the fixed separators and keyword
 */
protected function getObjectHeaderLen($objRefArr): int
{
    // fixed portion of "4 0 obj": 2 whitespaces + strlen("obj") = 5
    $fixedLen = 5;
    $firstPartLen = \strlen($objRefArr[0]);
    $secondPartLen = \strlen($objRefArr[1]);

    return $fixedLen + $firstPartLen + $secondPartLen;
}

/**
* Get content of indirect object.
*
Expand All @@ -486,18 +506,17 @@ protected function getIndirectObject($pdfData, $xref, $objRef, $offset = 0, $dec
if (2 !== \count($objRefArr)) {
throw new Exception('Invalid object reference for $obj.');
}
$objHeader = $objRefArr[0].' '.$objRefArr[1].' obj';

$objHeaderLen = $this->getObjectHeaderLen($objRefArr);

/*
* check if we are in position
*/
// ignore whitespace characters at offset (NUL, HT, LF, FF, CR, SP)
$offset += strspn($pdfData, "\0\t\n\f\r ", $offset);
// ignore whitespace characters at offset
$offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);
// ignore leading zeros for object number
$offset += strspn($pdfData, '0', $offset);
// consider all whitespace character (PDF specifications)
$objHeaderPattern = '/'.$objRefArr[0].'[\0\t\n\f\r ]'.$objRefArr[1].'[\0\t\n\f\r ]obj'.'/';
if (0 == preg_match($objHeaderPattern, substr($pdfData, $offset, \strlen($objHeader)))) {
if (0 == preg_match($this->getObjectHeaderPattern($objRefArr), substr($pdfData, $offset, $objHeaderLen))) {
// an indirect reference to an undefined object shall be considered a reference to the null object
return ['null', 'null', $offset];
}
Expand All @@ -506,7 +525,7 @@ protected function getIndirectObject($pdfData, $xref, $objRef, $offset = 0, $dec
* get content
*/
// starting position of object content
$offset += \strlen($objHeader);
$offset += $objHeaderLen;
$objContentArr = [];
$i = 0; // object main index
do {
Expand Down Expand Up @@ -570,16 +589,8 @@ protected function getRawObject($pdfData, $offset = 0)
$objtype = ''; // object type to be returned
$objval = ''; // object value to be returned

/*
* skip initial white space chars:
* \x00 null (NUL)
* \x09 horizontal tab (HT)
* \x0A line feed (LF)
* \x0C form feed (FF)
* \x0D carriage return (CR)
* \x20 space (SP)
*/
$offset += strspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20", $offset);
// skip initial white space chars
$offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);

// get first char
$char = $pdfData[$offset];
Expand Down Expand Up @@ -694,7 +705,7 @@ protected function getRawObject($pdfData, $offset = 0)
);
if (('<' == $char) && 1 == $pregResult) {
// remove white space characters
$objval = strtr($matches[1], "\x09\x0a\x0c\x0d\x20", '');
$objval = strtr($matches[1], $this->config->getPdfWhitespaces(), '');
$offset += \strlen($matches[0]);
} elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
$offset = $endpos + 1;
Expand Down
3 changes: 2 additions & 1 deletion tests/Integration/RawData/RawDataParserTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@

namespace Tests\Smalot\PdfParser\Integration\RawData;

use Smalot\PdfParser\Config;
use Smalot\PdfParser\RawData\RawDataParser;
use Tests\Smalot\PdfParser\TestCase;

Expand All @@ -52,7 +53,7 @@ protected function setUp(): void
{
parent::setUp();

$this->fixture = new RawDataParserHelper();
$this->fixture = new RawDataParserHelper([], new Config());
}

/**
Expand Down

0 comments on commit 57667ee

Please sign in to comment.