PdfParser.php 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435
  1. <?php
  2. /**
  3. * This file is part of FPDI
  4. *
  5. * @package setasign\Fpdi
  6. * @copyright Copyright (c) 2024 Setasign GmbH & Co. KG (https://www.setasign.com)
  7. * @license http://opensource.org/licenses/mit-license The MIT License
  8. */
  9. namespace setasign\Fpdi\PdfParser;
  10. use setasign\Fpdi\PdfParser\CrossReference\CrossReference;
  11. use setasign\Fpdi\PdfParser\CrossReference\CrossReferenceException;
  12. use setasign\Fpdi\PdfParser\Type\PdfArray;
  13. use setasign\Fpdi\PdfParser\Type\PdfBoolean;
  14. use setasign\Fpdi\PdfParser\Type\PdfDictionary;
  15. use setasign\Fpdi\PdfParser\Type\PdfHexString;
  16. use setasign\Fpdi\PdfParser\Type\PdfIndirectObject;
  17. use setasign\Fpdi\PdfParser\Type\PdfIndirectObjectReference;
  18. use setasign\Fpdi\PdfParser\Type\PdfName;
  19. use setasign\Fpdi\PdfParser\Type\PdfNull;
  20. use setasign\Fpdi\PdfParser\Type\PdfNumeric;
  21. use setasign\Fpdi\PdfParser\Type\PdfStream;
  22. use setasign\Fpdi\PdfParser\Type\PdfString;
  23. use setasign\Fpdi\PdfParser\Type\PdfToken;
  24. use setasign\Fpdi\PdfParser\Type\PdfType;
  25. use setasign\Fpdi\PdfParser\Type\PdfTypeException;
  26. /**
  27. * A PDF parser class
  28. */
  29. class PdfParser
  30. {
  31. /**
  32. * @var StreamReader
  33. */
  34. protected $streamReader;
  35. /**
  36. * @var Tokenizer
  37. */
  38. protected $tokenizer;
  39. /**
  40. * The file header.
  41. *
  42. * @var string
  43. */
  44. protected $fileHeader;
  45. /**
  46. * The offset to the file header.
  47. *
  48. * @var int
  49. */
  50. protected $fileHeaderOffset;
  51. /**
  52. * @var CrossReference|null
  53. */
  54. protected $xref;
  55. /**
  56. * All read objects.
  57. *
  58. * @var array
  59. */
  60. protected $objects = [];
  61. /**
  62. * PdfParser constructor.
  63. *
  64. * @param StreamReader $streamReader
  65. */
  66. public function __construct(StreamReader $streamReader)
  67. {
  68. $this->streamReader = $streamReader;
  69. $this->tokenizer = new Tokenizer($streamReader);
  70. }
  71. /**
  72. * Removes cycled references.
  73. *
  74. * @internal
  75. */
  76. public function cleanUp()
  77. {
  78. $this->xref = null;
  79. }
  80. /**
  81. * Get the stream reader instance.
  82. *
  83. * @return StreamReader
  84. */
  85. public function getStreamReader()
  86. {
  87. return $this->streamReader;
  88. }
  89. /**
  90. * Get the tokenizer instance.
  91. *
  92. * @return Tokenizer
  93. */
  94. public function getTokenizer()
  95. {
  96. return $this->tokenizer;
  97. }
  98. /**
  99. * Resolves the file header.
  100. *
  101. * @throws PdfParserException
  102. * @return int
  103. */
  104. protected function resolveFileHeader()
  105. {
  106. if ($this->fileHeader) {
  107. return $this->fileHeaderOffset;
  108. }
  109. $this->streamReader->reset(0);
  110. $maxIterations = 1000;
  111. while (true) {
  112. $buffer = $this->streamReader->getBuffer(false);
  113. $offset = \strpos($buffer, '%PDF-');
  114. if ($offset === false) {
  115. if (!$this->streamReader->increaseLength(100) || (--$maxIterations === 0)) {
  116. throw new PdfParserException(
  117. 'Unable to find PDF file header.',
  118. PdfParserException::FILE_HEADER_NOT_FOUND
  119. );
  120. }
  121. continue;
  122. }
  123. break;
  124. }
  125. $this->fileHeaderOffset = $offset;
  126. $this->streamReader->setOffset($offset);
  127. $this->fileHeader = \trim($this->streamReader->readLine());
  128. return $this->fileHeaderOffset;
  129. }
  130. /**
  131. * Get the cross-reference instance.
  132. *
  133. * @return CrossReference
  134. * @throws CrossReferenceException
  135. * @throws PdfParserException
  136. */
  137. public function getCrossReference()
  138. {
  139. if ($this->xref === null) {
  140. $this->xref = new CrossReference($this, $this->resolveFileHeader());
  141. }
  142. return $this->xref;
  143. }
  144. /**
  145. * Get the PDF version.
  146. *
  147. * @return int[] An array of major and minor version.
  148. * @throws PdfParserException
  149. */
  150. public function getPdfVersion()
  151. {
  152. $this->resolveFileHeader();
  153. if (\preg_match('/%PDF-(\d)\.(\d)/', $this->fileHeader, $result) === 0) {
  154. throw new PdfParserException(
  155. 'Unable to extract PDF version from file header.',
  156. PdfParserException::PDF_VERSION_NOT_FOUND
  157. );
  158. }
  159. list(, $major, $minor) = $result;
  160. $catalog = $this->getCatalog();
  161. if (isset($catalog->value['Version'])) {
  162. $versionParts = \explode(
  163. '.',
  164. PdfName::unescape(PdfType::resolve($catalog->value['Version'], $this)->value)
  165. );
  166. if (count($versionParts) === 2) {
  167. list($major, $minor) = $versionParts;
  168. }
  169. }
  170. return [(int) $major, (int) $minor];
  171. }
  172. /**
  173. * Get the catalog dictionary.
  174. *
  175. * @return PdfDictionary
  176. * @throws Type\PdfTypeException
  177. * @throws CrossReferenceException
  178. * @throws PdfParserException
  179. */
  180. public function getCatalog()
  181. {
  182. $trailer = $this->getCrossReference()->getTrailer();
  183. $catalog = PdfType::resolve(PdfDictionary::get($trailer, 'Root'), $this);
  184. return PdfDictionary::ensure($catalog);
  185. }
  186. /**
  187. * Get an indirect object by its object number.
  188. *
  189. * @param int $objectNumber
  190. * @param bool $cache
  191. * @return PdfIndirectObject
  192. * @throws CrossReferenceException
  193. * @throws PdfParserException
  194. */
  195. public function getIndirectObject($objectNumber, $cache = false)
  196. {
  197. $objectNumber = (int) $objectNumber;
  198. if (isset($this->objects[$objectNumber])) {
  199. return $this->objects[$objectNumber];
  200. }
  201. $object = $this->getCrossReference()->getIndirectObject($objectNumber);
  202. if ($cache) {
  203. $this->objects[$objectNumber] = $object;
  204. }
  205. return $object;
  206. }
  207. /**
  208. * Read a PDF value.
  209. *
  210. * @param null|bool|string $token
  211. * @param null|string $expectedType
  212. * @return false|PdfArray|PdfBoolean|PdfDictionary|PdfHexString|PdfIndirectObject|PdfIndirectObjectReference|PdfName|PdfNull|PdfNumeric|PdfStream|PdfString|PdfToken
  213. * @throws Type\PdfTypeException
  214. */
  215. public function readValue($token = null, $expectedType = null)
  216. {
  217. if ($token === null) {
  218. $token = $this->tokenizer->getNextToken();
  219. }
  220. if ($token === false) {
  221. if ($expectedType !== null) {
  222. throw new Type\PdfTypeException('Got unexpected token type.', Type\PdfTypeException::INVALID_DATA_TYPE);
  223. }
  224. return false;
  225. }
  226. switch ($token) {
  227. case '(':
  228. $this->ensureExpectedType($token, $expectedType);
  229. return $this->parsePdfString();
  230. case '<':
  231. if ($this->streamReader->getByte() === '<') {
  232. $this->ensureExpectedType('<<', $expectedType);
  233. $this->streamReader->addOffset(1);
  234. return $this->parsePdfDictionary();
  235. }
  236. $this->ensureExpectedType($token, $expectedType);
  237. return $this->parsePdfHexString();
  238. case '/':
  239. $this->ensureExpectedType($token, $expectedType);
  240. return $this->parsePdfName();
  241. case '[':
  242. $this->ensureExpectedType($token, $expectedType);
  243. return $this->parsePdfArray();
  244. default:
  245. if (\is_numeric($token)) {
  246. if (($token2 = $this->tokenizer->getNextToken()) !== false) {
  247. if (\is_numeric($token2) && ($token3 = $this->tokenizer->getNextToken()) !== false) {
  248. switch ($token3) {
  249. case 'obj':
  250. if ($expectedType !== null && $expectedType !== PdfIndirectObject::class) {
  251. throw new Type\PdfTypeException(
  252. 'Got unexpected token type.',
  253. Type\PdfTypeException::INVALID_DATA_TYPE
  254. );
  255. }
  256. return $this->parsePdfIndirectObject((int)$token, (int)$token2);
  257. case 'R':
  258. if (
  259. $expectedType !== null &&
  260. $expectedType !== PdfIndirectObjectReference::class
  261. ) {
  262. throw new Type\PdfTypeException(
  263. 'Got unexpected token type.',
  264. Type\PdfTypeException::INVALID_DATA_TYPE
  265. );
  266. }
  267. return PdfIndirectObjectReference::create((int)$token, (int)$token2);
  268. }
  269. $this->tokenizer->pushStack($token3);
  270. }
  271. $this->tokenizer->pushStack($token2);
  272. }
  273. if ($expectedType !== null && $expectedType !== PdfNumeric::class) {
  274. throw new Type\PdfTypeException(
  275. 'Got unexpected token type.',
  276. Type\PdfTypeException::INVALID_DATA_TYPE
  277. );
  278. }
  279. return PdfNumeric::create($token + 0);
  280. }
  281. if ($token === 'true' || $token === 'false') {
  282. $this->ensureExpectedType($token, $expectedType);
  283. return PdfBoolean::create($token === 'true');
  284. }
  285. if ($token === 'null') {
  286. $this->ensureExpectedType($token, $expectedType);
  287. return new PdfNull();
  288. }
  289. if ($expectedType !== null && $expectedType !== PdfToken::class) {
  290. throw new Type\PdfTypeException(
  291. 'Got unexpected token type.',
  292. Type\PdfTypeException::INVALID_DATA_TYPE
  293. );
  294. }
  295. $v = new PdfToken();
  296. $v->value = $token;
  297. return $v;
  298. }
  299. }
  300. /**
  301. * @return PdfString
  302. */
  303. protected function parsePdfString()
  304. {
  305. return PdfString::parse($this->streamReader);
  306. }
  307. /**
  308. * @return false|PdfHexString
  309. */
  310. protected function parsePdfHexString()
  311. {
  312. return PdfHexString::parse($this->streamReader);
  313. }
  314. /**
  315. * @return bool|PdfDictionary
  316. * @throws PdfTypeException
  317. */
  318. protected function parsePdfDictionary()
  319. {
  320. return PdfDictionary::parse($this->tokenizer, $this->streamReader, $this);
  321. }
  322. /**
  323. * @return PdfName
  324. */
  325. protected function parsePdfName()
  326. {
  327. return PdfName::parse($this->tokenizer, $this->streamReader);
  328. }
  329. /**
  330. * @return false|PdfArray
  331. * @throws PdfTypeException
  332. */
  333. protected function parsePdfArray()
  334. {
  335. return PdfArray::parse($this->tokenizer, $this);
  336. }
  337. /**
  338. * @param int $objectNumber
  339. * @param int $generationNumber
  340. * @return false|PdfIndirectObject
  341. * @throws Type\PdfTypeException
  342. */
  343. protected function parsePdfIndirectObject($objectNumber, $generationNumber)
  344. {
  345. return PdfIndirectObject::parse(
  346. $objectNumber,
  347. $generationNumber,
  348. $this,
  349. $this->tokenizer,
  350. $this->streamReader
  351. );
  352. }
  353. /**
  354. * Ensures that the token will evaluate to an expected object type (or not).
  355. *
  356. * @param string $token
  357. * @param string|null $expectedType
  358. * @return bool
  359. * @throws Type\PdfTypeException
  360. */
  361. protected function ensureExpectedType($token, $expectedType)
  362. {
  363. static $mapping = [
  364. '(' => PdfString::class,
  365. '<' => PdfHexString::class,
  366. '<<' => PdfDictionary::class,
  367. '/' => PdfName::class,
  368. '[' => PdfArray::class,
  369. 'true' => PdfBoolean::class,
  370. 'false' => PdfBoolean::class,
  371. 'null' => PdfNull::class
  372. ];
  373. if ($expectedType === null || $mapping[$token] === $expectedType) {
  374. return true;
  375. }
  376. throw new Type\PdfTypeException('Got unexpected token type.', Type\PdfTypeException::INVALID_DATA_TYPE);
  377. }
  378. }