<?php /** * Zend Framework * * LICENSE * * This source file is subject to the new BSD license that is bundled * with this package in the file LICENSE.txt. * It is also available through the world-wide-web at this URL: * http://framework.zend.com/license/new-bsd * If you did not receive a copy of the license and are unable to * obtain it through the world-wide-web, please send an email * to license@zend.com so we can send you a copy immediately. * * @category Zend * @package Zend_Pdf * @subpackage FileParser * @copyright Copyright (c) 2005-2014 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License * @version $Id$ */ /** * Abstract utility class for parsing binary files. * * Provides a library of methods to quickly navigate and extract various data * types (signed and unsigned integers, floating- and fixed-point numbers, * strings, etc.) from the file. * * File access is managed via a {@link Zend_Pdf_FileParserDataSource} object. * This allows the same parser code to work with many different data sources: * in-memory objects, filesystem files, etc. * * @package Zend_Pdf * @subpackage FileParser * @copyright Copyright (c) 2005-2014 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ abstract class Zend_Pdf_FileParser { /**** Class Constants ****/ /** * Little-endian byte order (0x04 0x03 0x02 0x01). */ const BYTE_ORDER_LITTLE_ENDIAN = 0; /** * Big-endian byte order (0x01 0x02 0x03 0x04). */ const BYTE_ORDER_BIG_ENDIAN = 1; /**** Instance Variables ****/ /** * Flag indicating that the file has passed a cursory validation check. * @var boolean */ protected $_isScreened = false; /** * Flag indicating that the file has been sucessfully parsed. * @var boolean */ protected $_isParsed = false; /** * Object representing the data source to be parsed. * @var Zend_Pdf_FileParserDataSource */ protected $_dataSource = null; /**** Public Interface ****/ /* Abstract Methods */ /** * Performs a cursory check to verify that the binary file is in the expected * format. Intended to quickly weed out obviously bogus files. * * Must set $this->_isScreened to true if successful. * * @throws Zend_Pdf_Exception */ abstract public function screen(); /** * Reads and parses the complete binary file. * * Must set $this->_isParsed to true if successful. * * @throws Zend_Pdf_Exception */ abstract public function parse(); /* Object Lifecycle */ /** * Object constructor. * * Verifies that the data source has been properly initialized. * * @param Zend_Pdf_FileParserDataSource $dataSource * @throws Zend_Pdf_Exception */ public function __construct(Zend_Pdf_FileParserDataSource $dataSource) { if ($dataSource->getSize() == 0) { require_once 'Zend/Pdf/Exception.php'; throw new Zend_Pdf_Exception('The data source has not been properly initialized', Zend_Pdf_Exception::BAD_DATA_SOURCE); } $this->_dataSource = $dataSource; } /** * Object destructor. * * Discards the data source object. */ public function __destruct() { $this->_dataSource = null; } /* Accessors */ /** * Returns true if the file has passed a cursory validation check. * * @return boolean */ public function isScreened() { return $this->_isScreened; } /** * Returns true if the file has been successfully parsed. * * @return boolean */ public function isParsed() { return $this->_isParsed; } /** * Returns the data source object representing the file being parsed. * * @return Zend_Pdf_FileParserDataSource */ public function getDataSource() { return $this->_dataSource; } /* Primitive Methods */ /** * Convenience wrapper for the data source object's moveToOffset() method. * * @param integer $offset Destination byte offset. * @throws Zend_Pdf_Exception */ public function moveToOffset($offset) { $this->_dataSource->moveToOffset($offset); } public function getOffset() { return $this->_dataSource->getOffset(); } public function getSize() { return $this->_dataSource->getSize(); } /** * Convenience wrapper for the data source object's readBytes() method. * * @param integer $byteCount Number of bytes to read. * @return string * @throws Zend_Pdf_Exception */ public function readBytes($byteCount) { return $this->_dataSource->readBytes($byteCount); } /** * Convenience wrapper for the data source object's skipBytes() method. * * @param integer $byteCount Number of bytes to skip. * @throws Zend_Pdf_Exception */ public function skipBytes($byteCount) { $this->_dataSource->skipBytes($byteCount); } /* Parser Methods */ /** * Reads the signed integer value from the binary file at the current byte * offset. * * Advances the offset by the number of bytes read. Throws an exception if * an error occurs. * * @param integer $size Size of integer in bytes: 1-4 * @param integer $byteOrder (optional) Big- or little-endian byte order. * Use the BYTE_ORDER_ constants defined in {@link Zend_Pdf_FileParser}. * If omitted, uses big-endian. * @return integer * @throws Zend_Pdf_Exception */ public function readInt($size, $byteOrder = Zend_Pdf_FileParser::BYTE_ORDER_BIG_ENDIAN) { if (($size < 1) || ($size > 4)) { require_once 'Zend/Pdf/Exception.php'; throw new Zend_Pdf_Exception("Invalid signed integer size: $size", Zend_Pdf_Exception::INVALID_INTEGER_SIZE); } $bytes = $this->_dataSource->readBytes($size); /* unpack() will not work for this method because it always works in * the host byte order for signed integers. It also does not allow for * variable integer sizes. */ if ($byteOrder == Zend_Pdf_FileParser::BYTE_ORDER_BIG_ENDIAN) { $number = ord($bytes[0]); if (($number & 0x80) == 0x80) { /* This number is negative. Extract the positive equivalent. */ $number = (~ $number) & 0xff; for ($i = 1; $i < $size; $i++) { $number = ($number << 8) | ((~ ord($bytes[$i])) & 0xff); } /* Now turn this back into a negative number by taking the * two's complement (we didn't add one above so won't * subtract it below). This works reliably on both 32- and * 64-bit systems. */ $number = ~$number; } else { for ($i = 1; $i < $size; $i++) { $number = ($number << 8) | ord($bytes[$i]); } } } else if ($byteOrder == Zend_Pdf_FileParser::BYTE_ORDER_LITTLE_ENDIAN) { $number = ord($bytes[$size - 1]); if (($number & 0x80) == 0x80) { /* Negative number. See discussion above. */ $number = 0; for ($i = --$size; $i >= 0; $i--) { $number |= ((~ ord($bytes[$i])) & 0xff) << ($i * 8); } $number = ~$number; } else { $number = 0; for ($i = --$size; $i >= 0; $i--) { $number |= ord($bytes[$i]) << ($i * 8); } } } else { require_once 'Zend/Pdf/Exception.php'; throw new Zend_Pdf_Exception("Invalid byte order: $byteOrder", Zend_Pdf_Exception::INVALID_BYTE_ORDER); } return $number; } /** * Reads the unsigned integer value from the binary file at the current byte * offset. * * Advances the offset by the number of bytes read. Throws an exception if * an error occurs. * * NOTE: If you ask for a 4-byte unsigned integer on a 32-bit machine, the * resulting value WILL BE SIGNED because PHP uses signed integers internally * for everything. To guarantee portability, be sure to use bitwise operators * operators on large unsigned integers! * * @param integer $size Size of integer in bytes: 1-4 * @param integer $byteOrder (optional) Big- or little-endian byte order. * Use the BYTE_ORDER_ constants defined in {@link Zend_Pdf_FileParser}. * If omitted, uses big-endian. * @return integer * @throws Zend_Pdf_Exception */ public function readUInt($size, $byteOrder = Zend_Pdf_FileParser::BYTE_ORDER_BIG_ENDIAN) { if (($size < 1) || ($size > 4)) { require_once 'Zend/Pdf/Exception.php'; throw new Zend_Pdf_Exception("Invalid unsigned integer size: $size", Zend_Pdf_Exception::INVALID_INTEGER_SIZE); } $bytes = $this->_dataSource->readBytes($size); /* unpack() is a bit heavyweight for this simple conversion. Just * work the bytes directly. */ if ($byteOrder == Zend_Pdf_FileParser::BYTE_ORDER_BIG_ENDIAN) { $number = ord($bytes[0]); for ($i = 1; $i < $size; $i++) { $number = ($number << 8) | ord($bytes[$i]); } } else if ($byteOrder == Zend_Pdf_FileParser::BYTE_ORDER_LITTLE_ENDIAN) { $number = 0; for ($i = --$size; $i >= 0; $i--) { $number |= ord($bytes[$i]) << ($i * 8); } } else { require_once 'Zend/Pdf/Exception.php'; throw new Zend_Pdf_Exception("Invalid byte order: $byteOrder", Zend_Pdf_Exception::INVALID_BYTE_ORDER); } return $number; } /** * Returns true if the specified bit is set in the integer bitfield. * * @param integer $bit Bit number to test (i.e. - 0-31) * @param integer $bitField * @return boolean */ public function isBitSet($bit, $bitField) { $bitMask = 1 << $bit; $isSet = (($bitField & $bitMask) == $bitMask); return $isSet; } /** * Reads the signed fixed-point number from the binary file at the current * byte offset. * * Common fixed-point sizes are 2.14 and 16.16. * * Advances the offset by the number of bytes read. Throws an exception if * an error occurs. * * @param integer $mantissaBits Number of bits in the mantissa * @param integer $fractionBits Number of bits in the fraction * @param integer $byteOrder (optional) Big- or little-endian byte order. * Use the BYTE_ORDER_ constants defined in {@link Zend_Pdf_FileParser}. * If omitted, uses big-endian. * @return float * @throws Zend_Pdf_Exception */ public function readFixed($mantissaBits, $fractionBits, $byteOrder = Zend_Pdf_FileParser::BYTE_ORDER_BIG_ENDIAN) { $bitsToRead = $mantissaBits + $fractionBits; if (($bitsToRead % 8) !== 0) { require_once 'Zend/Pdf/Exception.php'; throw new Zend_Pdf_Exception('Fixed-point numbers are whole bytes', Zend_Pdf_Exception::BAD_FIXED_POINT_SIZE); } $number = $this->readInt(($bitsToRead >> 3), $byteOrder) / (1 << $fractionBits); return $number; } /** * Reads the Unicode UTF-16-encoded string from the binary file at the * current byte offset. * * The byte order of the UTF-16 string must be specified. You must also * supply the desired resulting character set. * * Advances the offset by the number of bytes read. Throws an exception if * an error occurs. * * @todo Consider changing $byteCount to a character count. They are not * always equivalent (in the case of surrogates). * @todo Make $byteOrder optional if there is a byte-order mark (BOM) in the * string being extracted. * * @param integer $byteCount Number of bytes (characters * 2) to return. * @param integer $byteOrder (optional) Big- or little-endian byte order. * Use the BYTE_ORDER_ constants defined in {@link Zend_Pdf_FileParser}. * If omitted, uses big-endian. * @param string $characterSet (optional) Desired resulting character set. * You may use any character set supported by {@link iconv()}. If omitted, * uses 'current locale'. * @return string * @throws Zend_Pdf_Exception */ public function readStringUTF16($byteCount, $byteOrder = Zend_Pdf_FileParser::BYTE_ORDER_BIG_ENDIAN, $characterSet = '') { if ($byteCount == 0) { return ''; } $bytes = $this->_dataSource->readBytes($byteCount); if ($byteOrder == Zend_Pdf_FileParser::BYTE_ORDER_BIG_ENDIAN) { if ($characterSet == 'UTF-16BE') { return $bytes; } return iconv('UTF-16BE', $characterSet, $bytes); } else if ($byteOrder == Zend_Pdf_FileParser::BYTE_ORDER_LITTLE_ENDIAN) { if ($characterSet == 'UTF-16LE') { return $bytes; } return iconv('UTF-16LE', $characterSet, $bytes); } else { require_once 'Zend/Pdf/Exception.php'; throw new Zend_Pdf_Exception("Invalid byte order: $byteOrder", Zend_Pdf_Exception::INVALID_BYTE_ORDER); } } /** * Reads the Mac Roman-encoded string from the binary file at the current * byte offset. * * You must supply the desired resulting character set. * * Advances the offset by the number of bytes read. Throws an exception if * an error occurs. * * @param integer $byteCount Number of bytes (characters) to return. * @param string $characterSet (optional) Desired resulting character set. * You may use any character set supported by {@link iconv()}. If omitted, * uses 'current locale'. * @return string * @throws Zend_Pdf_Exception */ public function readStringMacRoman($byteCount, $characterSet = '') { if ($byteCount == 0) { return ''; } $bytes = $this->_dataSource->readBytes($byteCount); if ($characterSet == 'MacRoman') { return $bytes; } return iconv('MacRoman', $characterSet, $bytes); } /** * Reads the Pascal string from the binary file at the current byte offset. * * The length of the Pascal string is determined by reading the length bytes * which preceed the character data. You must supply the desired resulting * character set. * * Advances the offset by the number of bytes read. Throws an exception if * an error occurs. * * @param string $characterSet (optional) Desired resulting character set. * You may use any character set supported by {@link iconv()}. If omitted, * uses 'current locale'. * @param integer $lengthBytes (optional) Number of bytes that make up the * length. Default is 1. * @return string * @throws Zend_Pdf_Exception */ public function readStringPascal($characterSet = '', $lengthBytes = 1) { $byteCount = $this->readUInt($lengthBytes); if ($byteCount == 0) { return ''; } $bytes = $this->_dataSource->readBytes($byteCount); if ($characterSet == 'ASCII') { return $bytes; } return iconv('ASCII', $characterSet, $bytes); } }