diff --git a/README.md b/README.md index 48044ea..ffe4cf6 100644 --- a/README.md +++ b/README.md @@ -281,6 +281,8 @@ pdf2zh example.pdf -t 1 - [ ] Support multiple language with [Noto Font](https://fonts.google.com/noto), [Ubuntu Font](https://design.ubuntu.com/font) +- [ ] Retry except KeyboardInterrupt +

Acknowledgements

- Document merging: [PyMuPDF](https://github.com/pymupdf/PyMuPDF) diff --git a/docs/licenses/LICENSE.pdfminer.six b/docs/licenses/LICENSE.pdfminer.six deleted file mode 100644 index 3940067..0000000 --- a/docs/licenses/LICENSE.pdfminer.six +++ /dev/null @@ -1,22 +0,0 @@ -Copyright (c) 2004-2016 Yusuke Shinyama - -Permission is hereby granted, free of charge, to any person -obtaining a copy of this software and associated documentation -files (the "Software"), to deal in the Software without -restriction, including without limitation the rights to use, -copy, modify, merge, publish, distribute, sublicense, and/or -sell copies of the Software, and to permit persons to whom the -Software is furnished to do so, subject to the following -conditions: - -The above copyright notice and this permission notice shall be -included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY -KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR -PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR -OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/docs/licenses/LICENSE.pyHanko b/docs/licenses/LICENSE.pyHanko deleted file mode 100644 index b0e3a00..0000000 --- a/docs/licenses/LICENSE.pyHanko +++ /dev/null @@ -1,23 +0,0 @@ -This package contains various elements based on code from the pyHanko project, of which we reproduce the license below. - -MIT License - -Copyright (c) 2020 Matthias Valvekens - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. \ No newline at end of file diff --git a/pdf2zh/_saslprep.py b/pdf2zh/_saslprep.py deleted file mode 100644 index 7f31716..0000000 --- a/pdf2zh/_saslprep.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright 2016-present MongoDB, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Some changes copyright 2021-present Matthias Valvekens, -# licensed under the license of the pyHanko project (see LICENSE file). - - -"""An implementation of RFC4013 SASLprep.""" - -__all__ = ["saslprep"] - -import stringprep -import unicodedata -from typing import Callable, Tuple - -from pdf2zh.pdfexceptions import PDFValueError - -# RFC4013 section 2.3 prohibited output. -_PROHIBITED: Tuple[Callable[[str], bool], ...] = ( - # A strict reading of RFC 4013 requires table c12 here, but - # characters from it are mapped to SPACE in the Map step. Can - # normalization reintroduce them somehow? - stringprep.in_table_c12, - stringprep.in_table_c21_c22, - stringprep.in_table_c3, - stringprep.in_table_c4, - stringprep.in_table_c5, - stringprep.in_table_c6, - stringprep.in_table_c7, - stringprep.in_table_c8, - stringprep.in_table_c9, -) - - -def saslprep(data: str, prohibit_unassigned_code_points: bool = True) -> str: - """An implementation of RFC4013 SASLprep. - :param data: - The string to SASLprep. - :param prohibit_unassigned_code_points: - RFC 3454 and RFCs for various SASL mechanisms distinguish between - `queries` (unassigned code points allowed) and - `stored strings` (unassigned code points prohibited). Defaults - to ``True`` (unassigned code points are prohibited). - :return: The SASLprep'ed version of `data`. - """ - if prohibit_unassigned_code_points: - prohibited = _PROHIBITED + (stringprep.in_table_a1,) - else: - prohibited = _PROHIBITED - - # RFC3454 section 2, step 1 - Map - # RFC4013 section 2.1 mappings - # Map Non-ASCII space characters to SPACE (U+0020). Map - # commonly mapped to nothing characters to, well, nothing. - in_table_c12 = stringprep.in_table_c12 - in_table_b1 = stringprep.in_table_b1 - data = "".join( - [ - "\u0020" if in_table_c12(elt) else elt - for elt in data - if not in_table_b1(elt) - ], - ) - - # RFC3454 section 2, step 2 - Normalize - # RFC4013 section 2.2 normalization - data = unicodedata.ucd_3_2_0.normalize("NFKC", data) - - in_table_d1 = stringprep.in_table_d1 - if in_table_d1(data[0]): - if not in_table_d1(data[-1]): - # RFC3454, Section 6, #3. If a string contains any - # RandALCat character, the first and last characters - # MUST be RandALCat characters. - raise PDFValueError("SASLprep: failed bidirectional check") - # RFC3454, Section 6, #2. If a string contains any RandALCat - # character, it MUST NOT contain any LCat character. - prohibited = prohibited + (stringprep.in_table_d2,) - else: - # RFC3454, Section 6, #3. Following the logic of #3, if - # the first character is not a RandALCat, no other character - # can be either. - prohibited = prohibited + (in_table_d1,) - - # RFC3454 section 2, step 3 and 4 - Prohibit and check bidi - for char in data: - if any(in_table(char) for in_table in prohibited): - raise PDFValueError("SASLprep: failed prohibited character check") - - return data diff --git a/pdf2zh/arcfour.py b/pdf2zh/arcfour.py deleted file mode 100644 index cc78e36..0000000 --- a/pdf2zh/arcfour.py +++ /dev/null @@ -1,35 +0,0 @@ -"""Python implementation of Arcfour encryption algorithm. -See https://en.wikipedia.org/wiki/RC4 -This code is in the public domain. - -""" - -from typing import Sequence - - -class Arcfour: - def __init__(self, key: Sequence[int]) -> None: - # because Py3 range is not indexable - s = [i for i in range(256)] - j = 0 - klen = len(key) - for i in range(256): - j = (j + s[i] + key[i % klen]) % 256 - (s[i], s[j]) = (s[j], s[i]) - self.s = s - (self.i, self.j) = (0, 0) - - def process(self, data: bytes) -> bytes: - (i, j) = (self.i, self.j) - s = self.s - r = b"" - for c in iter(data): - i = (i + 1) % 256 - j = (j + s[i]) % 256 - (s[i], s[j]) = (s[j], s[i]) - k = s[(s[i] + s[j]) % 256] - r += bytes((c ^ k,)) - (self.i, self.j) = (i, j) - return r - - encrypt = decrypt = process diff --git a/pdf2zh/ascii85.py b/pdf2zh/ascii85.py deleted file mode 100644 index 233bc74..0000000 --- a/pdf2zh/ascii85.py +++ /dev/null @@ -1,70 +0,0 @@ -"""Python implementation of ASCII85/ASCIIHex decoder (Adobe version). - -This code is in the public domain. - -""" - -import re -import struct - - -# ascii85decode(data) -def ascii85decode(data: bytes) -> bytes: - """In ASCII85 encoding, every four bytes are encoded with five ASCII - letters, using 85 different types of characters (as 256**4 < 85**5). - When the length of the original bytes is not a multiple of 4, a special - rule is used for round up. - - The Adobe's ASCII85 implementation is slightly different from - its original in handling the last characters. - - """ - n = b = 0 - out = b"" - for i in iter(data): - c = bytes((i,)) - if c >= b"!" and c <= b"u": - n += 1 - b = b * 85 + (ord(c) - 33) - if n == 5: - out += struct.pack(">L", b) - n = b = 0 - elif c == b"z": - assert n == 0, str(n) - out += b"\0\0\0\0" - elif c == b"~": - if n: - for _ in range(5 - n): - b = b * 85 + 84 - out += struct.pack(">L", b)[: n - 1] - break - return out - - -# asciihexdecode(data) -hex_re = re.compile(rb"([a-f\d]{2})", re.IGNORECASE) -trail_re = re.compile(rb"^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$", re.IGNORECASE) - - -def asciihexdecode(data: bytes) -> bytes: - """ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1 - For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the - ASCIIHexDecode filter produces one byte of binary data. All white-space - characters are ignored. A right angle bracket character (>) indicates - EOD. Any other characters will cause an error. If the filter encounters - the EOD marker after reading an odd number of hexadecimal digits, it - will behave as if a 0 followed the last digit. - """ - - def decode(x: bytes) -> bytes: - i = int(x, 16) - return bytes((i,)) - - out = b"" - for x in hex_re.findall(data): - out += decode(x) - - m = trail_re.search(data) - if m: - out += decode(m.group(1) + b"0") - return out diff --git a/pdf2zh/casting.py b/pdf2zh/casting.py deleted file mode 100644 index ac6bac5..0000000 --- a/pdf2zh/casting.py +++ /dev/null @@ -1,15 +0,0 @@ -from typing import Any, Optional - - -def safe_int(o: Any) -> Optional[int]: - try: - return int(o) - except (TypeError, ValueError): - return None - - -def safe_float(o: Any) -> Optional[float]: - try: - return float(o) - except (TypeError, ValueError): - return None diff --git a/pdf2zh/ccitt.py b/pdf2zh/ccitt.py deleted file mode 100644 index b7527c5..0000000 --- a/pdf2zh/ccitt.py +++ /dev/null @@ -1,614 +0,0 @@ -# CCITT Fax decoder -# -# Bugs: uncompressed mode untested. -# -# cf. -# ITU-T Recommendation T.4 -# "Standardization of Group 3 facsimile terminals -# for document transmission" -# ITU-T Recommendation T.6 -# "FACSIMILE CODING SCHEMES AND CODING CONTROL FUNCTIONS -# FOR GROUP 4 FACSIMILE APPARATUS" - - -import array -from typing import ( - Any, - Callable, - Dict, - Iterator, - List, - MutableSequence, - Optional, - Sequence, - Union, - cast, -) - -from pdf2zh.pdfexceptions import PDFException, PDFValueError - - -def get_bytes(data: bytes) -> Iterator[int]: - yield from data - - -# Workaround https://github.com/python/mypy/issues/731 -BitParserState = MutableSequence[Any] -# A better definition (not supported by mypy) would be: -# BitParserState = MutableSequence[Union["BitParserState", int, str, None]] - - -class BitParser: - _state: BitParserState - - # _accept is declared Optional solely as a workaround for - # https://github.com/python/mypy/issues/708 - _accept: Optional[Callable[[Any], BitParserState]] - - def __init__(self) -> None: - self._pos = 0 - - @classmethod - def add(cls, root: BitParserState, v: Union[int, str], bits: str) -> None: - p: BitParserState = root - b = None - for i in range(len(bits)): - if i > 0: - assert b is not None - if p[b] is None: - p[b] = [None, None] - p = p[b] - if bits[i] == "1": - b = 1 - else: - b = 0 - assert b is not None - p[b] = v - - def feedbytes(self, data: bytes) -> None: - for byte in get_bytes(data): - for m in (128, 64, 32, 16, 8, 4, 2, 1): - self._parse_bit(byte & m) - - def _parse_bit(self, x: object) -> None: - if x: - v = self._state[1] - else: - v = self._state[0] - self._pos += 1 - if isinstance(v, list): - self._state = v - else: - assert self._accept is not None - self._state = self._accept(v) - - -class CCITTG4Parser(BitParser): - MODE = [None, None] - BitParser.add(MODE, 0, "1") - BitParser.add(MODE, +1, "011") - BitParser.add(MODE, -1, "010") - BitParser.add(MODE, "h", "001") - BitParser.add(MODE, "p", "0001") - BitParser.add(MODE, +2, "000011") - BitParser.add(MODE, -2, "000010") - BitParser.add(MODE, +3, "0000011") - BitParser.add(MODE, -3, "0000010") - BitParser.add(MODE, "u", "0000001111") - BitParser.add(MODE, "x1", "0000001000") - BitParser.add(MODE, "x2", "0000001001") - BitParser.add(MODE, "x3", "0000001010") - BitParser.add(MODE, "x4", "0000001011") - BitParser.add(MODE, "x5", "0000001100") - BitParser.add(MODE, "x6", "0000001101") - BitParser.add(MODE, "x7", "0000001110") - BitParser.add(MODE, "e", "000000000001000000000001") - - WHITE = [None, None] - BitParser.add(WHITE, 0, "00110101") - BitParser.add(WHITE, 1, "000111") - BitParser.add(WHITE, 2, "0111") - BitParser.add(WHITE, 3, "1000") - BitParser.add(WHITE, 4, "1011") - BitParser.add(WHITE, 5, "1100") - BitParser.add(WHITE, 6, "1110") - BitParser.add(WHITE, 7, "1111") - BitParser.add(WHITE, 8, "10011") - BitParser.add(WHITE, 9, "10100") - BitParser.add(WHITE, 10, "00111") - BitParser.add(WHITE, 11, "01000") - BitParser.add(WHITE, 12, "001000") - BitParser.add(WHITE, 13, "000011") - BitParser.add(WHITE, 14, "110100") - BitParser.add(WHITE, 15, "110101") - BitParser.add(WHITE, 16, "101010") - BitParser.add(WHITE, 17, "101011") - BitParser.add(WHITE, 18, "0100111") - BitParser.add(WHITE, 19, "0001100") - BitParser.add(WHITE, 20, "0001000") - BitParser.add(WHITE, 21, "0010111") - BitParser.add(WHITE, 22, "0000011") - BitParser.add(WHITE, 23, "0000100") - BitParser.add(WHITE, 24, "0101000") - BitParser.add(WHITE, 25, "0101011") - BitParser.add(WHITE, 26, "0010011") - BitParser.add(WHITE, 27, "0100100") - BitParser.add(WHITE, 28, "0011000") - BitParser.add(WHITE, 29, "00000010") - BitParser.add(WHITE, 30, "00000011") - BitParser.add(WHITE, 31, "00011010") - BitParser.add(WHITE, 32, "00011011") - BitParser.add(WHITE, 33, "00010010") - BitParser.add(WHITE, 34, "00010011") - BitParser.add(WHITE, 35, "00010100") - BitParser.add(WHITE, 36, "00010101") - BitParser.add(WHITE, 37, "00010110") - BitParser.add(WHITE, 38, "00010111") - BitParser.add(WHITE, 39, "00101000") - BitParser.add(WHITE, 40, "00101001") - BitParser.add(WHITE, 41, "00101010") - BitParser.add(WHITE, 42, "00101011") - BitParser.add(WHITE, 43, "00101100") - BitParser.add(WHITE, 44, "00101101") - BitParser.add(WHITE, 45, "00000100") - BitParser.add(WHITE, 46, "00000101") - BitParser.add(WHITE, 47, "00001010") - BitParser.add(WHITE, 48, "00001011") - BitParser.add(WHITE, 49, "01010010") - BitParser.add(WHITE, 50, "01010011") - BitParser.add(WHITE, 51, "01010100") - BitParser.add(WHITE, 52, "01010101") - BitParser.add(WHITE, 53, "00100100") - BitParser.add(WHITE, 54, "00100101") - BitParser.add(WHITE, 55, "01011000") - BitParser.add(WHITE, 56, "01011001") - BitParser.add(WHITE, 57, "01011010") - BitParser.add(WHITE, 58, "01011011") - BitParser.add(WHITE, 59, "01001010") - BitParser.add(WHITE, 60, "01001011") - BitParser.add(WHITE, 61, "00110010") - BitParser.add(WHITE, 62, "00110011") - BitParser.add(WHITE, 63, "00110100") - BitParser.add(WHITE, 64, "11011") - BitParser.add(WHITE, 128, "10010") - BitParser.add(WHITE, 192, "010111") - BitParser.add(WHITE, 256, "0110111") - BitParser.add(WHITE, 320, "00110110") - BitParser.add(WHITE, 384, "00110111") - BitParser.add(WHITE, 448, "01100100") - BitParser.add(WHITE, 512, "01100101") - BitParser.add(WHITE, 576, "01101000") - BitParser.add(WHITE, 640, "01100111") - BitParser.add(WHITE, 704, "011001100") - BitParser.add(WHITE, 768, "011001101") - BitParser.add(WHITE, 832, "011010010") - BitParser.add(WHITE, 896, "011010011") - BitParser.add(WHITE, 960, "011010100") - BitParser.add(WHITE, 1024, "011010101") - BitParser.add(WHITE, 1088, "011010110") - BitParser.add(WHITE, 1152, "011010111") - BitParser.add(WHITE, 1216, "011011000") - BitParser.add(WHITE, 1280, "011011001") - BitParser.add(WHITE, 1344, "011011010") - BitParser.add(WHITE, 1408, "011011011") - BitParser.add(WHITE, 1472, "010011000") - BitParser.add(WHITE, 1536, "010011001") - BitParser.add(WHITE, 1600, "010011010") - BitParser.add(WHITE, 1664, "011000") - BitParser.add(WHITE, 1728, "010011011") - BitParser.add(WHITE, 1792, "00000001000") - BitParser.add(WHITE, 1856, "00000001100") - BitParser.add(WHITE, 1920, "00000001101") - BitParser.add(WHITE, 1984, "000000010010") - BitParser.add(WHITE, 2048, "000000010011") - BitParser.add(WHITE, 2112, "000000010100") - BitParser.add(WHITE, 2176, "000000010101") - BitParser.add(WHITE, 2240, "000000010110") - BitParser.add(WHITE, 2304, "000000010111") - BitParser.add(WHITE, 2368, "000000011100") - BitParser.add(WHITE, 2432, "000000011101") - BitParser.add(WHITE, 2496, "000000011110") - BitParser.add(WHITE, 2560, "000000011111") - - BLACK = [None, None] - BitParser.add(BLACK, 0, "0000110111") - BitParser.add(BLACK, 1, "010") - BitParser.add(BLACK, 2, "11") - BitParser.add(BLACK, 3, "10") - BitParser.add(BLACK, 4, "011") - BitParser.add(BLACK, 5, "0011") - BitParser.add(BLACK, 6, "0010") - BitParser.add(BLACK, 7, "00011") - BitParser.add(BLACK, 8, "000101") - BitParser.add(BLACK, 9, "000100") - BitParser.add(BLACK, 10, "0000100") - BitParser.add(BLACK, 11, "0000101") - BitParser.add(BLACK, 12, "0000111") - BitParser.add(BLACK, 13, "00000100") - BitParser.add(BLACK, 14, "00000111") - BitParser.add(BLACK, 15, "000011000") - BitParser.add(BLACK, 16, "0000010111") - BitParser.add(BLACK, 17, "0000011000") - BitParser.add(BLACK, 18, "0000001000") - BitParser.add(BLACK, 19, "00001100111") - BitParser.add(BLACK, 20, "00001101000") - BitParser.add(BLACK, 21, "00001101100") - BitParser.add(BLACK, 22, "00000110111") - BitParser.add(BLACK, 23, "00000101000") - BitParser.add(BLACK, 24, "00000010111") - BitParser.add(BLACK, 25, "00000011000") - BitParser.add(BLACK, 26, "000011001010") - BitParser.add(BLACK, 27, "000011001011") - BitParser.add(BLACK, 28, "000011001100") - BitParser.add(BLACK, 29, "000011001101") - BitParser.add(BLACK, 30, "000001101000") - BitParser.add(BLACK, 31, "000001101001") - BitParser.add(BLACK, 32, "000001101010") - BitParser.add(BLACK, 33, "000001101011") - BitParser.add(BLACK, 34, "000011010010") - BitParser.add(BLACK, 35, "000011010011") - BitParser.add(BLACK, 36, "000011010100") - BitParser.add(BLACK, 37, "000011010101") - BitParser.add(BLACK, 38, "000011010110") - BitParser.add(BLACK, 39, "000011010111") - BitParser.add(BLACK, 40, "000001101100") - BitParser.add(BLACK, 41, "000001101101") - BitParser.add(BLACK, 42, "000011011010") - BitParser.add(BLACK, 43, "000011011011") - BitParser.add(BLACK, 44, "000001010100") - BitParser.add(BLACK, 45, "000001010101") - BitParser.add(BLACK, 46, "000001010110") - BitParser.add(BLACK, 47, "000001010111") - BitParser.add(BLACK, 48, "000001100100") - BitParser.add(BLACK, 49, "000001100101") - BitParser.add(BLACK, 50, "000001010010") - BitParser.add(BLACK, 51, "000001010011") - BitParser.add(BLACK, 52, "000000100100") - BitParser.add(BLACK, 53, "000000110111") - BitParser.add(BLACK, 54, "000000111000") - BitParser.add(BLACK, 55, "000000100111") - BitParser.add(BLACK, 56, "000000101000") - BitParser.add(BLACK, 57, "000001011000") - BitParser.add(BLACK, 58, "000001011001") - BitParser.add(BLACK, 59, "000000101011") - BitParser.add(BLACK, 60, "000000101100") - BitParser.add(BLACK, 61, "000001011010") - BitParser.add(BLACK, 62, "000001100110") - BitParser.add(BLACK, 63, "000001100111") - BitParser.add(BLACK, 64, "0000001111") - BitParser.add(BLACK, 128, "000011001000") - BitParser.add(BLACK, 192, "000011001001") - BitParser.add(BLACK, 256, "000001011011") - BitParser.add(BLACK, 320, "000000110011") - BitParser.add(BLACK, 384, "000000110100") - BitParser.add(BLACK, 448, "000000110101") - BitParser.add(BLACK, 512, "0000001101100") - BitParser.add(BLACK, 576, "0000001101101") - BitParser.add(BLACK, 640, "0000001001010") - BitParser.add(BLACK, 704, "0000001001011") - BitParser.add(BLACK, 768, "0000001001100") - BitParser.add(BLACK, 832, "0000001001101") - BitParser.add(BLACK, 896, "0000001110010") - BitParser.add(BLACK, 960, "0000001110011") - BitParser.add(BLACK, 1024, "0000001110100") - BitParser.add(BLACK, 1088, "0000001110101") - BitParser.add(BLACK, 1152, "0000001110110") - BitParser.add(BLACK, 1216, "0000001110111") - BitParser.add(BLACK, 1280, "0000001010010") - BitParser.add(BLACK, 1344, "0000001010011") - BitParser.add(BLACK, 1408, "0000001010100") - BitParser.add(BLACK, 1472, "0000001010101") - BitParser.add(BLACK, 1536, "0000001011010") - BitParser.add(BLACK, 1600, "0000001011011") - BitParser.add(BLACK, 1664, "0000001100100") - BitParser.add(BLACK, 1728, "0000001100101") - BitParser.add(BLACK, 1792, "00000001000") - BitParser.add(BLACK, 1856, "00000001100") - BitParser.add(BLACK, 1920, "00000001101") - BitParser.add(BLACK, 1984, "000000010010") - BitParser.add(BLACK, 2048, "000000010011") - BitParser.add(BLACK, 2112, "000000010100") - BitParser.add(BLACK, 2176, "000000010101") - BitParser.add(BLACK, 2240, "000000010110") - BitParser.add(BLACK, 2304, "000000010111") - BitParser.add(BLACK, 2368, "000000011100") - BitParser.add(BLACK, 2432, "000000011101") - BitParser.add(BLACK, 2496, "000000011110") - BitParser.add(BLACK, 2560, "000000011111") - - UNCOMPRESSED = [None, None] - BitParser.add(UNCOMPRESSED, "1", "1") - BitParser.add(UNCOMPRESSED, "01", "01") - BitParser.add(UNCOMPRESSED, "001", "001") - BitParser.add(UNCOMPRESSED, "0001", "0001") - BitParser.add(UNCOMPRESSED, "00001", "00001") - BitParser.add(UNCOMPRESSED, "00000", "000001") - BitParser.add(UNCOMPRESSED, "T00", "00000011") - BitParser.add(UNCOMPRESSED, "T10", "00000010") - BitParser.add(UNCOMPRESSED, "T000", "000000011") - BitParser.add(UNCOMPRESSED, "T100", "000000010") - BitParser.add(UNCOMPRESSED, "T0000", "0000000011") - BitParser.add(UNCOMPRESSED, "T1000", "0000000010") - BitParser.add(UNCOMPRESSED, "T00000", "00000000011") - BitParser.add(UNCOMPRESSED, "T10000", "00000000010") - - class CCITTException(PDFException): - pass - - class EOFB(CCITTException): - pass - - class InvalidData(CCITTException): - pass - - class ByteSkip(CCITTException): - pass - - _color: int - - def __init__(self, width: int, bytealign: bool = False) -> None: - BitParser.__init__(self) - self.width = width - self.bytealign = bytealign - self.reset() - - def feedbytes(self, data: bytes) -> None: - for byte in get_bytes(data): - try: - for m in (128, 64, 32, 16, 8, 4, 2, 1): - self._parse_bit(byte & m) - except self.ByteSkip: - self._accept = self._parse_mode - self._state = self.MODE - except self.EOFB: - break - - def _parse_mode(self, mode: object) -> BitParserState: - if mode == "p": - self._do_pass() - self._flush_line() - return self.MODE - elif mode == "h": - self._n1 = 0 - self._accept = self._parse_horiz1 - if self._color: - return self.WHITE - else: - return self.BLACK - elif mode == "u": - self._accept = self._parse_uncompressed - return self.UNCOMPRESSED - elif mode == "e": - raise self.EOFB - elif isinstance(mode, int): - self._do_vertical(mode) - self._flush_line() - return self.MODE - else: - raise self.InvalidData(mode) - - def _parse_horiz1(self, n: Any) -> BitParserState: - if n is None: - raise self.InvalidData - self._n1 += n - if n < 64: - self._n2 = 0 - self._color = 1 - self._color - self._accept = self._parse_horiz2 - if self._color: - return self.WHITE - else: - return self.BLACK - - def _parse_horiz2(self, n: Any) -> BitParserState: - if n is None: - raise self.InvalidData - self._n2 += n - if n < 64: - self._color = 1 - self._color - self._accept = self._parse_mode - self._do_horizontal(self._n1, self._n2) - self._flush_line() - return self.MODE - elif self._color: - return self.WHITE - else: - return self.BLACK - - def _parse_uncompressed(self, bits: Optional[str]) -> BitParserState: - if not bits: - raise self.InvalidData - if bits.startswith("T"): - self._accept = self._parse_mode - self._color = int(bits[1]) - self._do_uncompressed(bits[2:]) - return self.MODE - else: - self._do_uncompressed(bits) - return self.UNCOMPRESSED - - def _get_bits(self) -> str: - return "".join(str(b) for b in self._curline[: self._curpos]) - - def _get_refline(self, i: int) -> str: - if i < 0: - return "[]" + "".join(str(b) for b in self._refline) - elif len(self._refline) <= i: - return "".join(str(b) for b in self._refline) + "[]" - else: - return ( - "".join(str(b) for b in self._refline[:i]) - + "[" - + str(self._refline[i]) - + "]" - + "".join(str(b) for b in self._refline[i + 1 :]) - ) - - def reset(self) -> None: - self._y = 0 - self._curline = array.array("b", [1] * self.width) - self._reset_line() - self._accept = self._parse_mode - self._state = self.MODE - - def output_line(self, y: int, bits: Sequence[int]) -> None: - print(y, "".join(str(b) for b in bits)) - - def _reset_line(self) -> None: - self._refline = self._curline - self._curline = array.array("b", [1] * self.width) - self._curpos = -1 - self._color = 1 - - def _flush_line(self) -> None: - if self.width <= self._curpos: - self.output_line(self._y, self._curline) - self._y += 1 - self._reset_line() - if self.bytealign: - raise self.ByteSkip - - def _do_vertical(self, dx: int) -> None: - x1 = self._curpos + 1 - while 1: - if x1 == 0: - if self._color == 1 and self._refline[x1] != self._color: - break - elif x1 == len(self._refline) or ( - self._refline[x1 - 1] == self._color - and self._refline[x1] != self._color - ): - break - x1 += 1 - x1 += dx - x0 = max(0, self._curpos) - x1 = max(0, min(self.width, x1)) - if x1 < x0: - for x in range(x1, x0): - self._curline[x] = self._color - elif x0 < x1: - for x in range(x0, x1): - self._curline[x] = self._color - self._curpos = x1 - self._color = 1 - self._color - - def _do_pass(self) -> None: - x1 = self._curpos + 1 - while 1: - if x1 == 0: - if self._color == 1 and self._refline[x1] != self._color: - break - elif x1 == len(self._refline) or ( - self._refline[x1 - 1] == self._color - and self._refline[x1] != self._color - ): - break - x1 += 1 - while 1: - if x1 == 0: - if self._color == 0 and self._refline[x1] == self._color: - break - elif x1 == len(self._refline) or ( - self._refline[x1 - 1] != self._color - and self._refline[x1] == self._color - ): - break - x1 += 1 - for x in range(self._curpos, x1): - self._curline[x] = self._color - self._curpos = x1 - - def _do_horizontal(self, n1: int, n2: int) -> None: - if self._curpos < 0: - self._curpos = 0 - x = self._curpos - for _ in range(n1): - if len(self._curline) <= x: - break - self._curline[x] = self._color - x += 1 - for _ in range(n2): - if len(self._curline) <= x: - break - self._curline[x] = 1 - self._color - x += 1 - self._curpos = x - - def _do_uncompressed(self, bits: str) -> None: - for c in bits: - self._curline[self._curpos] = int(c) - self._curpos += 1 - self._flush_line() - - -class CCITTFaxDecoder(CCITTG4Parser): - def __init__( - self, - width: int, - bytealign: bool = False, - reversed: bool = False, - ) -> None: - CCITTG4Parser.__init__(self, width, bytealign=bytealign) - self.reversed = reversed - self._buf = b"" - - def close(self) -> bytes: - return self._buf - - def output_line(self, y: int, bits: Sequence[int]) -> None: - arr = array.array("B", [0] * ((len(bits) + 7) // 8)) - if self.reversed: - bits = [1 - b for b in bits] - for i, b in enumerate(bits): - if b: - arr[i // 8] += (128, 64, 32, 16, 8, 4, 2, 1)[i % 8] - self._buf += arr.tobytes() - - -def ccittfaxdecode(data: bytes, params: Dict[str, object]) -> bytes: - K = params.get("K") - if K == -1: - cols = cast(int, params.get("Columns")) - bytealign = cast(bool, params.get("EncodedByteAlign")) - reversed = cast(bool, params.get("BlackIs1")) - parser = CCITTFaxDecoder(cols, bytealign=bytealign, reversed=reversed) - else: - raise PDFValueError(K) - parser.feedbytes(data) - return parser.close() - - -# test -def main(argv: List[str]) -> None: - if not argv[1:]: - import unittest - - unittest.main() - return - - class Parser(CCITTG4Parser): - def __init__(self, width: int, bytealign: bool = False) -> None: - import pygame # type: ignore[import] - - CCITTG4Parser.__init__(self, width, bytealign=bytealign) - self.img = pygame.Surface((self.width, 1000)) - - def output_line(self, y: int, bits: Sequence[int]) -> None: - for x, b in enumerate(bits): - if b: - self.img.set_at((x, y), (255, 255, 255)) - else: - self.img.set_at((x, y), (0, 0, 0)) - - def close(self) -> None: - import pygame - - pygame.image.save(self.img, "out.bmp") - - for path in argv[1:]: - fp = open(path, "rb") - (_, _, k, w, h, _) = path.split(".") - parser = Parser(int(w)) - parser.feedbytes(fp.read()) - parser.close() - fp.close() diff --git a/pdf2zh/cmapdb.py b/pdf2zh/cmapdb.py deleted file mode 100644 index 21a0f3f..0000000 --- a/pdf2zh/cmapdb.py +++ /dev/null @@ -1,471 +0,0 @@ -"""Adobe character mapping (CMap) support. - -CMaps provide the mapping between character codes and Unicode -code-points to character ids (CIDs). - -More information is available on: - - https://github.com/adobe-type-tools/cmap-resources - -""" - -import gzip -import logging -import os -import os.path -import pickle as pickle -import struct -import sys -from typing import ( - Any, - BinaryIO, - Dict, - Iterable, - Iterator, - List, - MutableMapping, - Optional, - Set, - TextIO, - Tuple, - Union, - cast, -) - -from pdf2zh.encodingdb import name2unicode -from pdf2zh.pdfexceptions import PDFException, PDFTypeError -from pdf2zh.psexceptions import PSEOF, PSSyntaxError -from pdf2zh.psparser import KWD, PSKeyword, PSLiteral, PSStackParser, literal_name -from pdf2zh.utils import choplist, nunpack - -log = logging.getLogger(__name__) - - -class CMapError(PDFException): - pass - - -class CMapBase: - debug = 0 - - def __init__(self, **kwargs: object) -> None: - self.attrs: MutableMapping[str, object] = kwargs.copy() - - def is_vertical(self) -> bool: - return self.attrs.get("WMode", 0) != 0 - - def set_attr(self, k: str, v: object) -> None: - self.attrs[k] = v - - def add_code2cid(self, code: str, cid: int) -> None: - pass - - def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None: - pass - - def use_cmap(self, cmap: "CMapBase") -> None: - pass - - def decode(self, code: bytes) -> Iterable[int]: - raise NotImplementedError - - -class CMap(CMapBase): - def __init__(self, **kwargs: Union[str, int]) -> None: - CMapBase.__init__(self, **kwargs) - self.code2cid: Dict[int, object] = {} - - def __repr__(self) -> str: - return "" % self.attrs.get("CMapName") - - def use_cmap(self, cmap: CMapBase) -> None: - assert isinstance(cmap, CMap), str(type(cmap)) - - def copy(dst: Dict[int, object], src: Dict[int, object]) -> None: - for k, v in src.items(): - if isinstance(v, dict): - d: Dict[int, object] = {} - dst[k] = d - copy(d, v) - else: - dst[k] = v - - copy(self.code2cid, cmap.code2cid) - - def decode(self, code: bytes) -> Iterator[int]: - # log.debug("decode: %r, %r", self, code) - d = self.code2cid - for i in iter(code): - if i in d: - x = d[i] - if isinstance(x, int): - yield x - d = self.code2cid - else: - d = cast(Dict[int, object], x) - else: - d = self.code2cid - - def dump( - self, - out: TextIO = sys.stdout, - code2cid: Optional[Dict[int, object]] = None, - code: Tuple[int, ...] = (), - ) -> None: - if code2cid is None: - code2cid = self.code2cid - code = () - for k, v in sorted(code2cid.items()): - c = code + (k,) - if isinstance(v, int): - out.write("code %r = cid %d\n" % (c, v)) - else: - self.dump(out=out, code2cid=cast(Dict[int, object], v), code=c) - - -class IdentityCMap(CMapBase): - def decode(self, code: bytes) -> Tuple[int, ...]: - n = len(code) // 2 - if n: - return struct.unpack(">%dH" % n, code) - else: - return () - - -class IdentityCMapByte(IdentityCMap): - def decode(self, code: bytes) -> Tuple[int, ...]: - n = len(code) - if n: - return struct.unpack(">%dB" % n, code) - else: - return () - - -class UnicodeMap(CMapBase): - def __init__(self, **kwargs: Union[str, int]) -> None: - CMapBase.__init__(self, **kwargs) - self.cid2unichr: Dict[int, str] = {} - - def __repr__(self) -> str: - return "" % self.attrs.get("CMapName") - - def get_unichr(self, cid: int) -> str: - # log.debug("get_unichr: %r, %r", self, cid) - return self.cid2unichr[cid] - - def dump(self, out: TextIO = sys.stdout) -> None: - for k, v in sorted(self.cid2unichr.items()): - out.write("cid %d = unicode %r\n" % (k, v)) - - -class IdentityUnicodeMap(UnicodeMap): - def get_unichr(self, cid: int) -> str: - """Interpret character id as unicode codepoint""" - # log.debug("get_unichr: %r, %r", self, cid) - return chr(cid) - - -class FileCMap(CMap): - def add_code2cid(self, code: str, cid: int) -> None: - assert isinstance(code, str) and isinstance(cid, int), str( - (type(code), type(cid)), - ) - d = self.code2cid - for c in code[:-1]: - ci = ord(c) - if ci in d: - d = cast(Dict[int, object], d[ci]) - else: - t: Dict[int, object] = {} - d[ci] = t - d = t - ci = ord(code[-1]) - d[ci] = cid - - -class FileUnicodeMap(UnicodeMap): - def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None: - assert isinstance(cid, int), str(type(cid)) - if isinstance(code, PSLiteral): - # Interpret as an Adobe glyph name. - assert isinstance(code.name, str) - unichr = name2unicode(code.name) - elif isinstance(code, bytes): - # Interpret as UTF-16BE. - unichr = code.decode("UTF-16BE", "ignore") - elif isinstance(code, int): - unichr = chr(code) - else: - raise PDFTypeError(code) - - # A0 = non-breaking space, some weird fonts can have a collision on a cid here. - if unichr == "\u00a0" and self.cid2unichr.get(cid) == " ": - return - self.cid2unichr[cid] = unichr - - -class PyCMap(CMap): - def __init__(self, name: str, module: Any) -> None: - super().__init__(CMapName=name) - self.code2cid = module.CODE2CID - if module.IS_VERTICAL: - self.attrs["WMode"] = 1 - - -class PyUnicodeMap(UnicodeMap): - def __init__(self, name: str, module: Any, vertical: bool) -> None: - super().__init__(CMapName=name) - if vertical: - self.cid2unichr = module.CID2UNICHR_V - self.attrs["WMode"] = 1 - else: - self.cid2unichr = module.CID2UNICHR_H - - -class CMapDB: - _cmap_cache: Dict[str, PyCMap] = {} - _umap_cache: Dict[str, List[PyUnicodeMap]] = {} - - class CMapNotFound(CMapError): - pass - - @classmethod - def _load_data(cls, name: str) -> Any: - name = name.replace("\0", "") - filename = "%s.pickle.gz" % name - # log.debug("loading: %r", name) - cmap_paths = ( - os.environ.get("CMAP_PATH", "/usr/share/pdf2zh/"), - os.path.join(os.path.dirname(__file__), "cmap"), - ) - for directory in cmap_paths: - path = os.path.join(directory, filename) - if os.path.exists(path): - gzfile = gzip.open(path) - try: - return type(str(name), (), pickle.loads(gzfile.read())) - finally: - gzfile.close() - raise CMapDB.CMapNotFound(name) - - @classmethod - def get_cmap(cls, name: str) -> CMapBase: - if name == "Identity-H": - return IdentityCMap(WMode=0) - elif name == "Identity-V": - return IdentityCMap(WMode=1) - elif name == "OneByteIdentityH": - return IdentityCMapByte(WMode=0) - elif name == "OneByteIdentityV": - return IdentityCMapByte(WMode=1) - try: - return cls._cmap_cache[name] - except KeyError: - pass - data = cls._load_data(name) - cls._cmap_cache[name] = cmap = PyCMap(name, data) - return cmap - - @classmethod - def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap: - try: - return cls._umap_cache[name][vertical] - except KeyError: - pass - data = cls._load_data("to-unicode-%s" % name) - cls._umap_cache[name] = [PyUnicodeMap(name, data, v) for v in (False, True)] - return cls._umap_cache[name][vertical] - - -class CMapParser(PSStackParser[PSKeyword]): - def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None: - PSStackParser.__init__(self, fp) - self.cmap = cmap - # some ToUnicode maps don't have "begincmap" keyword. - self._in_cmap = True - self._warnings: Set[str] = set() - - def run(self) -> None: - try: - self.nextobject() - except PSEOF: - pass - - KEYWORD_BEGINCMAP = KWD(b"begincmap") - KEYWORD_ENDCMAP = KWD(b"endcmap") - KEYWORD_USECMAP = KWD(b"usecmap") - KEYWORD_DEF = KWD(b"def") - KEYWORD_BEGINCODESPACERANGE = KWD(b"begincodespacerange") - KEYWORD_ENDCODESPACERANGE = KWD(b"endcodespacerange") - KEYWORD_BEGINCIDRANGE = KWD(b"begincidrange") - KEYWORD_ENDCIDRANGE = KWD(b"endcidrange") - KEYWORD_BEGINCIDCHAR = KWD(b"begincidchar") - KEYWORD_ENDCIDCHAR = KWD(b"endcidchar") - KEYWORD_BEGINBFRANGE = KWD(b"beginbfrange") - KEYWORD_ENDBFRANGE = KWD(b"endbfrange") - KEYWORD_BEGINBFCHAR = KWD(b"beginbfchar") - KEYWORD_ENDBFCHAR = KWD(b"endbfchar") - KEYWORD_BEGINNOTDEFRANGE = KWD(b"beginnotdefrange") - KEYWORD_ENDNOTDEFRANGE = KWD(b"endnotdefrange") - - def do_keyword(self, pos: int, token: PSKeyword) -> None: - """ToUnicode CMaps - - See Section 5.9.2 - ToUnicode CMaps of the PDF Reference. - """ - if token is self.KEYWORD_BEGINCMAP: - self._in_cmap = True - self.popall() - return - - elif token is self.KEYWORD_ENDCMAP: - self._in_cmap = False - return - - if not self._in_cmap: - return - - if token is self.KEYWORD_DEF: - try: - ((_, k), (_, v)) = self.pop(2) - self.cmap.set_attr(literal_name(k), v) - except PSSyntaxError: - pass - return - - if token is self.KEYWORD_USECMAP: - try: - ((_, cmapname),) = self.pop(1) - self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname))) - except PSSyntaxError: - pass - except CMapDB.CMapNotFound: - pass - return - - if token is self.KEYWORD_BEGINCODESPACERANGE: - self.popall() - return - if token is self.KEYWORD_ENDCODESPACERANGE: - self.popall() - return - - if token is self.KEYWORD_BEGINCIDRANGE: - self.popall() - return - - if token is self.KEYWORD_ENDCIDRANGE: - objs = [obj for (__, obj) in self.popall()] - for start_byte, end_byte, cid in choplist(3, objs): - if not isinstance(start_byte, bytes): - self._warn_once("The start object of begincidrange is not a byte.") - continue - if not isinstance(end_byte, bytes): - self._warn_once("The end object of begincidrange is not a byte.") - continue - if not isinstance(cid, int): - self._warn_once("The cid object of begincidrange is not a byte.") - continue - if len(start_byte) != len(end_byte): - self._warn_once( - "The start and end byte of begincidrange have " - "different lengths.", - ) - continue - start_prefix = start_byte[:-4] - end_prefix = end_byte[:-4] - if start_prefix != end_prefix: - self._warn_once( - "The prefix of the start and end byte of " - "begincidrange are not the same.", - ) - continue - svar = start_byte[-4:] - evar = end_byte[-4:] - start = nunpack(svar) - end = nunpack(evar) - vlen = len(svar) - for i in range(end - start + 1): - x = start_prefix + struct.pack(">L", start + i)[-vlen:] - self.cmap.add_cid2unichr(cid + i, x) - return - - if token is self.KEYWORD_BEGINCIDCHAR: - self.popall() - return - - if token is self.KEYWORD_ENDCIDCHAR: - objs = [obj for (__, obj) in self.popall()] - for cid, code in choplist(2, objs): - if isinstance(code, bytes) and isinstance(cid, int): - self.cmap.add_cid2unichr(cid, code) - return - - if token is self.KEYWORD_BEGINBFRANGE: - self.popall() - return - - if token is self.KEYWORD_ENDBFRANGE: - objs = [obj for (__, obj) in self.popall()] - for start_byte, end_byte, code in choplist(3, objs): - if not isinstance(start_byte, bytes): - self._warn_once("The start object is not a byte.") - continue - if not isinstance(end_byte, bytes): - self._warn_once("The end object is not a byte.") - continue - if len(start_byte) != len(end_byte): - self._warn_once("The start and end byte have different lengths.") - continue - start = nunpack(start_byte) - end = nunpack(end_byte) - if isinstance(code, list): - if len(code) != end - start + 1: - self._warn_once( - "The difference between the start and end " - "offsets does not match the code length.", - ) - for cid, unicode_value in zip(range(start, end + 1), code): - self.cmap.add_cid2unichr(cid, unicode_value) - else: - assert isinstance(code, bytes) - var = code[-4:] - base = nunpack(var) - prefix = code[:-4] - vlen = len(var) - for i in range(end - start + 1): - x = prefix + struct.pack(">L", base + i)[-vlen:] - self.cmap.add_cid2unichr(start + i, x) - return - - if token is self.KEYWORD_BEGINBFCHAR: - self.popall() - return - - if token is self.KEYWORD_ENDBFCHAR: - objs = [obj for (__, obj) in self.popall()] - for cid, code in choplist(2, objs): - if isinstance(cid, bytes) and isinstance(code, bytes): - self.cmap.add_cid2unichr(nunpack(cid), code) - return - - if token is self.KEYWORD_BEGINNOTDEFRANGE: - self.popall() - return - - if token is self.KEYWORD_ENDNOTDEFRANGE: - self.popall() - return - - self.push((pos, token)) - - def _warn_once(self, msg: str) -> None: - """Warn once for each unique message""" - if msg not in self._warnings: - self._warnings.add(msg) - base_msg = ( - "Ignoring (part of) ToUnicode map because the PDF data " - "does not conform to the format. This could result in " - "(cid) values in the output. " - ) - log.warning(base_msg + msg) diff --git a/pdf2zh/converter.py b/pdf2zh/converter.py index 4f93269..ec54792 100644 --- a/pdf2zh/converter.py +++ b/pdf2zh/converter.py @@ -1,61 +1,16 @@ -from pdf2zh.utils import ( - AnyIO, - Matrix, - PathSegment, - Point, - Rect, - apply_matrix_pt, - bbox2str, - enc, - make_compat_str, - mult_matrix, - matrix_scale, -) -from pdf2zh.pdftypes import PDFStream -from pdf2zh.pdfpage import PDFPage -from pdf2zh.pdfinterp import PDFGraphicState, PDFResourceManager -from pdf2zh.pdffont import PDFFont, PDFUnicodeNotDefined, PDFCIDFont -from pdf2zh.pdfexceptions import PDFValueError -from pdf2zh.pdfdevice import PDFTextDevice -from pdf2zh.pdfcolor import PDFColorSpace -from pdf2zh.layout import ( - LAParams, - LTAnno, +from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager +from pdfminer.pdffont import PDFCIDFont +from pdfminer.converter import PDFConverter +from pdfminer.pdffont import PDFUnicodeNotDefined +from pdfminer.utils import apply_matrix_pt, mult_matrix +from pdfminer.layout import ( LTChar, - LTComponent, - LTCurve, LTFigure, - LTImage, - LTItem, - LTLayoutContainer, LTLine, LTPage, - LTRect, - LTText, - LTTextBox, - LTTextBoxVertical, - LTTextGroup, - LTTextLine, - TextGroupElement, ) -from pdf2zh.image import ImageWriter -from pdf2zh import utils -import io import logging import re -from typing import ( - BinaryIO, - Dict, - Generic, - List, - Optional, - Sequence, - TextIO, - Tuple, - TypeVar, - Union, - cast, -) import concurrent.futures import numpy as np import unicodedata @@ -72,47 +27,27 @@ TencentTranslator, ) - -def remove_control_characters(s): - return "".join(ch for ch in s if unicodedata.category(ch)[0] != "C") - - log = logging.getLogger(__name__) -class PDFLayoutAnalyzer(PDFTextDevice): - cur_item: LTLayoutContainer - ctm: Matrix - +class PDFConverterEx(PDFConverter): def __init__( self, rsrcmgr: PDFResourceManager, - pageno: int = 1, - laparams: Optional[LAParams] = None, ) -> None: - PDFTextDevice.__init__(self, rsrcmgr) - self.pageno = pageno - self.laparams = laparams - self._stack: List[LTLayoutContainer] = [] + PDFConverter.__init__(self, rsrcmgr, None, "utf-8", 1, None) - def begin_page(self, page: PDFPage, ctm: Matrix) -> None: - # (x0, y0, x1, y1) = page.mediabox + def begin_page(self, page, ctm) -> None: (x0, y0, x1, y1) = page.cropbox (x0, y0) = apply_matrix_pt(ctm, (x0, y0)) (x1, y1) = apply_matrix_pt(ctm, (x1, y1)) mediabox = (0, 0, abs(x0 - x1), abs(y0 - y1)) self.cur_item = LTPage(page.pageno, mediabox) - def end_page(self, page: PDFPage): - assert not self._stack, str(len(self._stack)) - assert isinstance(self.cur_item, LTPage), str(type(self.cur_item)) - # 取消默认排版分析 - # if self.laparams is not None: - # self.cur_item.analyze(self.laparams) - self.pageno += 1 + def end_page(self, page): return self.receive_layout(self.cur_item) - def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None: + def begin_figure(self, name, bbox, matrix) -> None: self._stack.append(self.cur_item) self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm)) self.cur_item.pageid = self._stack[-1].pageid @@ -124,142 +59,15 @@ def end_figure(self, _: str) -> None: self.cur_item.add(fig) return self.receive_layout(fig) - def render_image(self, name: str, stream: PDFStream) -> None: - assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item)) - item = LTImage( - name, - stream, - (self.cur_item.x0, self.cur_item.y0, self.cur_item.x1, self.cur_item.y1), - ) - self.cur_item.add(item) - - def paint_path( - self, - gstate: PDFGraphicState, - stroke: bool, - fill: bool, - evenodd: bool, - path: Sequence[PathSegment], - ) -> None: - """Paint paths described in section 4.4 of the PDF reference manual""" - shape = "".join(x[0] for x in path) - - if shape[:1] != "m": - # Per PDF Reference Section 4.4.1, "path construction operators may - # be invoked in any sequence, but the first one invoked must be m - # or re to begin a new subpath." Since pdf2zh.six already - # converts all `re` (rectangle) operators to their equivelent - # `mlllh` representation, paths ingested by `.paint_path(...)` that - # do not begin with the `m` operator are invalid. - pass - - elif shape.count("m") > 1: - # recurse if there are multiple m's in this shape - for m in re.finditer(r"m[^m]+", shape): - subpath = path[m.start(0) : m.end(0)] - self.paint_path(gstate, stroke, fill, evenodd, subpath) - - else: - # Although the 'h' command does not not literally provide a - # point-position, its position is (by definition) equal to the - # subpath's starting point. - # - # And, per Section 4.4's Table 4.9, all other path commands place - # their point-position in their final two arguments. (Any preceding - # arguments represent control points on Bézier curves.) - raw_pts = [ - cast(Point, p[-2:] if p[0] != "h" else path[0][-2:]) for p in path - ] - pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts] - - operators = [str(operation[0]) for operation in path] - transformed_points = [ - [ - apply_matrix_pt(self.ctm, (float(operand1), float(operand2))) - for operand1, operand2 in zip(operation[1::2], operation[2::2]) - ] - for operation in path - ] - transformed_path = [ - cast(PathSegment, (o, *p)) - for o, p in zip(operators, transformed_points) - ] - - if shape in {"mlh", "ml"}: - # single line segment - # - # Note: 'ml', in conditional above, is a frequent anomaly - # that we want to support. - line = LTLine( - gstate.linewidth * matrix_scale(self.ctm), - pts[0], - pts[1], - stroke, - fill, - evenodd, - gstate.scolor, - gstate.ncolor, - original_path=transformed_path, - dashing_style=gstate.dash, - ) - self.cur_item.add(line) - - elif shape in {"mlllh", "mllll"}: - (x0, y0), (x1, y1), (x2, y2), (x3, y3), _ = pts - - is_closed_loop = pts[0] == pts[4] - has_square_coordinates = ( - x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0 - ) or (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0) - if is_closed_loop and has_square_coordinates: - rect = LTRect( - gstate.linewidth * matrix_scale(self.ctm), - (*pts[0], *pts[2]), - stroke, - fill, - evenodd, - gstate.scolor, - gstate.ncolor, - transformed_path, - gstate.dash, - ) - self.cur_item.add(rect) - else: - curve = LTCurve( - gstate.linewidth * matrix_scale(self.ctm), - pts, - stroke, - fill, - evenodd, - gstate.scolor, - gstate.ncolor, - transformed_path, - gstate.dash, - ) - self.cur_item.add(curve) - else: - curve = LTCurve( - gstate.linewidth * matrix_scale(self.ctm), - pts, - stroke, - fill, - evenodd, - gstate.scolor, - gstate.ncolor, - transformed_path, - gstate.dash, - ) - self.cur_item.add(curve) - def render_char( self, - matrix: Matrix, - font: PDFFont, + matrix, + font, fontsize: float, scaling: float, rise: float, cid: int, - ncs: PDFColorSpace, + ncs, graphicstate: PDFGraphicState, ) -> float: try: @@ -283,78 +91,14 @@ def render_char( ) self.cur_item.add(item) item.cid = cid # hack 插入原字符编码 + item.font = font # hack 插入原字符字体 return item.adv - def handle_undefined_char(self, font: PDFFont, cid: int) -> str: - # log.debug("undefined: %r, %r", font, cid) - return "(cid:%d)" % cid - - def receive_layout(self, ltpage: LTPage) -> None: - pass - - -class PDFPageAggregator(PDFLayoutAnalyzer): - def __init__( - self, - rsrcmgr: PDFResourceManager, - pageno: int = 1, - laparams: Optional[LAParams] = None, - ) -> None: - PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams) - self.result: Optional[LTPage] = None - - def receive_layout(self, ltpage: LTPage) -> None: - self.result = ltpage - - def get_result(self) -> LTPage: - assert self.result is not None - return self.result - - -# Some PDFConverter children support only binary I/O -IOType = TypeVar("IOType", TextIO, BinaryIO, AnyIO) - - -class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]): - def __init__( - self, - rsrcmgr: PDFResourceManager, - outfp: IOType, - codec: str = "utf-8", - pageno: int = 1, - laparams: Optional[LAParams] = None, - ) -> None: - PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams) - self.outfp: IOType = outfp - self.codec = codec - self.outfp_binary = self._is_binary_stream(self.outfp) - - @staticmethod - def _is_binary_stream(outfp: AnyIO) -> bool: - """Test if an stream is binary or not""" - if "b" in getattr(outfp, "mode", ""): - return True - elif hasattr(outfp, "mode"): - # output stream has a mode, but it does not contain 'b' - return False - elif isinstance(outfp, io.BytesIO): - return True - elif isinstance(outfp, io.StringIO) or isinstance(outfp, io.TextIOBase): - return False - - return True - -class TextConverter(PDFConverter[AnyIO]): +class TranslateConverter(PDFConverterEx): def __init__( self, - rsrcmgr: PDFResourceManager, - outfp: AnyIO, - codec: str = "utf-8", - pageno: int = 1, - laparams: Optional[LAParams] = None, - showpageno: bool = False, - imagewriter: Optional[ImageWriter] = None, + rsrcmgr, vfont: str = None, vchar: str = None, thread: int = 0, @@ -363,9 +107,7 @@ def __init__( lang_out: str = "", service: str = "", ) -> None: - super().__init__(rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams) - self.showpageno = showpageno - self.imagewriter = imagewriter + super().__init__(rsrcmgr) self.vfont = vfont self.vchar = vchar self.thread = thread @@ -402,13 +144,6 @@ def __init__( else: raise ValueError("Unsupported translation service") - def write_text(self, text: str) -> None: - text = utils.compatible_encode_method(text, self.codec, "ignore") - if self.outfp_binary: - cast(BinaryIO, self.outfp).write(text.encode()) - else: - cast(TextIO, self.outfp).write(text) - # fmt: off def receive_layout(self, ltpage: LTPage): xt = None # 上一个字符 @@ -589,7 +324,6 @@ def worker(s): # 多线程翻译 new = cache.load_paragraph(hash_key, hash_key_paragraph) # 查询缓存 if new is None: new = self.translator.translate(s) - new = remove_control_characters(new) cache.write_paragraph(hash_key, hash_key_paragraph, new) return new except BaseException as e: @@ -708,677 +442,3 @@ def raw_string(fcur, cstk): # 编码字符串 ops += f"ET q 1 0 0 1 {l.pts[0][0]:f} {l.pts[0][1]:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT " ops = f"BT {ops}ET " return ops - - # Some dummy functions to save memory/CPU when all that is wanted - # is text. This stops all the image and drawing output from being - # recorded and taking up RAM. - def render_image(self, name: str, stream: PDFStream) -> None: - if self.imagewriter is not None: - PDFConverter.render_image(self, name, stream) - - # def paint_path( - # self, - # gstate: PDFGraphicState, - # stroke: bool, - # fill: bool, - # evenodd: bool, - # path: Sequence[PathSegment], - # ) -> None: - # pass - - -class HTMLConverter(PDFConverter[AnyIO]): - RECT_COLORS = { - "figure": "yellow", - "textline": "magenta", - "textbox": "cyan", - "textgroup": "red", - "curve": "black", - "page": "gray", - } - - TEXT_COLORS = { - "textbox": "blue", - "char": "black", - } - - def __init__( - self, - rsrcmgr: PDFResourceManager, - outfp: AnyIO, - codec: str = "utf-8", - pageno: int = 1, - laparams: Optional[LAParams] = None, - scale: float = 1, - fontscale: float = 1.0, - layoutmode: str = "normal", - showpageno: bool = True, - pagemargin: int = 50, - imagewriter: Optional[ImageWriter] = None, - debug: int = 0, - rect_colors: Optional[Dict[str, str]] = None, - text_colors: Optional[Dict[str, str]] = None, - ) -> None: - PDFConverter.__init__( - self, - rsrcmgr, - outfp, - codec=codec, - pageno=pageno, - laparams=laparams, - ) - - # write() assumes a codec for binary I/O, or no codec for text I/O. - if self.outfp_binary and not self.codec: - raise PDFValueError("Codec is required for a binary I/O output") - if not self.outfp_binary and self.codec: - raise PDFValueError("Codec must not be specified for a text I/O output") - - if text_colors is None: - text_colors = {"char": "black"} - if rect_colors is None: - rect_colors = {"curve": "black", "page": "gray"} - - self.scale = scale - self.fontscale = fontscale - self.layoutmode = layoutmode - self.showpageno = showpageno - self.pagemargin = pagemargin - self.imagewriter = imagewriter - self.rect_colors = rect_colors - self.text_colors = text_colors - if debug: - self.rect_colors.update(self.RECT_COLORS) - self.text_colors.update(self.TEXT_COLORS) - self._yoffset: float = self.pagemargin - self._font: Optional[Tuple[str, float]] = None - self._fontstack: List[Optional[Tuple[str, float]]] = [] - self.write_header() - - def write(self, text: str) -> None: - if self.codec: - cast(BinaryIO, self.outfp).write(text.encode(self.codec)) - else: - cast(TextIO, self.outfp).write(text) - - def write_header(self) -> None: - self.write("\n") - if self.codec: - s = ( - '\n' % self.codec - ) - else: - s = '\n' - self.write(s) - self.write("\n") - - def write_footer(self) -> None: - page_links = [f'{i}' for i in range(1, self.pageno)] - s = '
Page: %s
\n' % ", ".join( - page_links, - ) - self.write(s) - self.write("\n") - - def write_text(self, text: str) -> None: - self.write(enc(text)) - - def place_rect( - self, - color: str, - borderwidth: int, - x: float, - y: float, - w: float, - h: float, - ) -> None: - color2 = self.rect_colors.get(color) - if color2 is not None: - s = ( - '\n' - % ( - color2, - borderwidth, - x * self.scale, - (self._yoffset - y) * self.scale, - w * self.scale, - h * self.scale, - ) - ) - self.write(s) - - def place_border(self, color: str, borderwidth: int, item: LTComponent) -> None: - self.place_rect(color, borderwidth, item.x0, item.y1, item.width, item.height) - - def place_image( - self, - item: LTImage, - borderwidth: int, - x: float, - y: float, - w: float, - h: float, - ) -> None: - if self.imagewriter is not None: - name = self.imagewriter.export_image(item) - s = ( - '\n' - % ( - enc(name), - borderwidth, - x * self.scale, - (self._yoffset - y) * self.scale, - w * self.scale, - h * self.scale, - ) - ) - self.write(s) - - def place_text( - self, - color: str, - text: str, - x: float, - y: float, - size: float, - ) -> None: - color2 = self.text_colors.get(color) - if color2 is not None: - s = ( - '' - % ( - color2, - x * self.scale, - (self._yoffset - y) * self.scale, - size * self.scale * self.fontscale, - ) - ) - self.write(s) - self.write_text(text) - self.write("\n") - - def begin_div( - self, - color: str, - borderwidth: int, - x: float, - y: float, - w: float, - h: float, - writing_mode: str = "False", - ) -> None: - self._fontstack.append(self._font) - self._font = None - s = ( - '
' - % ( - color, - borderwidth, - writing_mode, - x * self.scale, - (self._yoffset - y) * self.scale, - w * self.scale, - h * self.scale, - ) - ) - self.write(s) - - def end_div(self, color: str) -> None: - if self._font is not None: - self.write("") - self._font = self._fontstack.pop() - self.write("
") - - def put_text(self, text: str, fontname: str, fontsize: float) -> None: - font = (fontname, fontsize) - if font != self._font: - if self._font is not None: - self.write("") - # Remove subset tag from fontname, see PDF Reference 5.5.3 - fontname_without_subset_tag = fontname.split("+")[-1] - self.write( - '' - % (fontname_without_subset_tag, fontsize * self.scale * self.fontscale), - ) - self._font = font - self.write_text(text) - - def put_newline(self) -> None: - self.write("
") - - def receive_layout(self, ltpage: LTPage) -> None: - def show_group(item: Union[LTTextGroup, TextGroupElement]) -> None: - if isinstance(item, LTTextGroup): - self.place_border("textgroup", 1, item) - for child in item: - show_group(child) - - def render(item: LTItem) -> None: - child: LTItem - if isinstance(item, LTPage): - self._yoffset += item.y1 - self.place_border("page", 1, item) - if self.showpageno: - self.write( - '
' - % ((self._yoffset - item.y1) * self.scale), - ) - self.write( - f'Page {item.pageid}
\n', - ) - for child in item: - render(child) - if item.groups is not None: - for group in item.groups: - show_group(group) - elif isinstance(item, LTCurve): - self.place_border("curve", 1, item) - elif isinstance(item, LTFigure): - self.begin_div("figure", 1, item.x0, item.y1, item.width, item.height) - for child in item: - render(child) - self.end_div("figure") - elif isinstance(item, LTImage): - self.place_image(item, 1, item.x0, item.y1, item.width, item.height) - elif self.layoutmode == "exact": - if isinstance(item, LTTextLine): - self.place_border("textline", 1, item) - for child in item: - render(child) - elif isinstance(item, LTTextBox): - self.place_border("textbox", 1, item) - self.place_text( - "textbox", - str(item.index + 1), - item.x0, - item.y1, - 20, - ) - for child in item: - render(child) - elif isinstance(item, LTChar): - self.place_border("char", 1, item) - self.place_text( - "char", - item.get_text(), - item.x0, - item.y1, - item.size, - ) - elif isinstance(item, LTTextLine): - for child in item: - render(child) - if self.layoutmode != "loose": - self.put_newline() - elif isinstance(item, LTTextBox): - self.begin_div( - "textbox", - 1, - item.x0, - item.y1, - item.width, - item.height, - item.get_writing_mode(), - ) - for child in item: - render(child) - self.end_div("textbox") - elif isinstance(item, LTChar): - fontname = make_compat_str(item.fontname) - self.put_text(item.get_text(), fontname, item.size) - elif isinstance(item, LTText): - self.write_text(item.get_text()) - - render(ltpage) - self._yoffset += self.pagemargin - - def close(self) -> None: - self.write_footer() - - -class XMLConverter(PDFConverter[AnyIO]): - CONTROL = re.compile("[\x00-\x08\x0b-\x0c\x0e-\x1f]") - - def __init__( - self, - rsrcmgr: PDFResourceManager, - outfp: AnyIO, - codec: str = "utf-8", - pageno: int = 1, - laparams: Optional[LAParams] = None, - imagewriter: Optional[ImageWriter] = None, - stripcontrol: bool = False, - ) -> None: - PDFConverter.__init__( - self, - rsrcmgr, - outfp, - codec=codec, - pageno=pageno, - laparams=laparams, - ) - - # write() assumes a codec for binary I/O, or no codec for text I/O. - if self.outfp_binary == (not self.codec): - raise PDFValueError("Codec is required for a binary I/O output") - - self.imagewriter = imagewriter - self.stripcontrol = stripcontrol - self.write_header() - - def write(self, text: str) -> None: - if self.codec: - cast(BinaryIO, self.outfp).write(text.encode(self.codec)) - else: - cast(TextIO, self.outfp).write(text) - - def write_header(self) -> None: - if self.codec: - self.write('\n' % self.codec) - else: - self.write('\n') - self.write("\n") - - def write_footer(self) -> None: - self.write("\n") - - def write_text(self, text: str) -> None: - if self.stripcontrol: - text = self.CONTROL.sub("", text) - self.write(enc(text)) - - def receive_layout(self, ltpage: LTPage) -> None: - def show_group(item: LTItem) -> None: - if isinstance(item, LTTextBox): - self.write( - '\n' - % (item.index, bbox2str(item.bbox)), - ) - elif isinstance(item, LTTextGroup): - self.write('\n' % bbox2str(item.bbox)) - for child in item: - show_group(child) - self.write("\n") - - def render(item: LTItem) -> None: - child: LTItem - if isinstance(item, LTPage): - s = '\n' % ( - item.pageid, - bbox2str(item.bbox), - item.rotate, - ) - self.write(s) - for child in item: - render(child) - if item.groups is not None: - self.write("\n") - for group in item.groups: - show_group(group) - self.write("\n") - self.write("\n") - elif isinstance(item, LTLine): - s = '\n' % ( - item.linewidth, - bbox2str(item.bbox), - ) - self.write(s) - elif isinstance(item, LTRect): - s = '\n' % ( - item.linewidth, - bbox2str(item.bbox), - ) - self.write(s) - elif isinstance(item, LTCurve): - s = '\n' % ( - item.linewidth, - bbox2str(item.bbox), - item.get_pts(), - ) - self.write(s) - elif isinstance(item, LTFigure): - s = f'
\n' - self.write(s) - for child in item: - render(child) - self.write("
\n") - elif isinstance(item, LTTextLine): - self.write('\n' % bbox2str(item.bbox)) - for child in item: - render(child) - self.write("\n") - elif isinstance(item, LTTextBox): - wmode = "" - if isinstance(item, LTTextBoxVertical): - wmode = ' wmode="vertical"' - s = '\n' % ( - item.index, - bbox2str(item.bbox), - wmode, - ) - self.write(s) - for child in item: - render(child) - self.write("\n") - elif isinstance(item, LTChar): - s = ( - '' - % ( - enc(item.fontname), - bbox2str(item.bbox), - item.ncs.name, - item.graphicstate.ncolor, - item.size, - ) - ) - self.write(s) - self.write_text(item.get_text()) - self.write("\n") - elif isinstance(item, LTText): - self.write("%s\n" % item.get_text()) - elif isinstance(item, LTImage): - if self.imagewriter is not None: - name = self.imagewriter.export_image(item) - self.write( - '\n' - % (enc(name), item.width, item.height), - ) - else: - self.write( - '\n' - % (item.width, item.height), - ) - else: - assert False, str(("Unhandled", item)) - - render(ltpage) - - def close(self) -> None: - self.write_footer() - - -class HOCRConverter(PDFConverter[AnyIO]): - """Extract an hOCR representation from explicit text information within a PDF.""" - - # Where text is being extracted from a variety of types of PDF within a - # business process, those PDFs where the text is only present in image - # form will need to be analysed using an OCR tool which will typically - # output hOCR. This converter extracts the explicit text information from - # those PDFs that do have it and uses it to genxerate a basic hOCR - # representation that is designed to be used in conjunction with the image - # of the PDF in the same way as genuine OCR output would be, but without the - # inevitable OCR errors. - - # The converter does not handle images, diagrams or text colors. - - # In the examples processed by the contributor it was necessary to set - # LAParams.all_texts to True. - - CONTROL = re.compile(r"[\x00-\x08\x0b-\x0c\x0e-\x1f]") - - def __init__( - self, - rsrcmgr: PDFResourceManager, - outfp: AnyIO, - codec: str = "utf8", - pageno: int = 1, - laparams: Optional[LAParams] = None, - stripcontrol: bool = False, - ): - PDFConverter.__init__( - self, - rsrcmgr, - outfp, - codec=codec, - pageno=pageno, - laparams=laparams, - ) - self.stripcontrol = stripcontrol - self.within_chars = False - self.write_header() - - def bbox_repr(self, bbox: Rect) -> str: - (in_x0, in_y0, in_x1, in_y1) = bbox - # PDF y-coordinates are the other way round from hOCR coordinates - out_x0 = int(in_x0) - out_y0 = int(self.page_bbox[3] - in_y1) - out_x1 = int(in_x1) - out_y1 = int(self.page_bbox[3] - in_y0) - return f"bbox {out_x0} {out_y0} {out_x1} {out_y1}" - - def write(self, text: str) -> None: - if self.codec: - encoded_text = text.encode(self.codec) - cast(BinaryIO, self.outfp).write(encoded_text) - else: - cast(TextIO, self.outfp).write(text) - - def write_header(self) -> None: - if self.codec: - self.write( - "\n" % self.codec, - ) - else: - self.write( - "\n", - ) - self.write("\n") - self.write("\n") - self.write( - "\n", - ) - self.write( - "\n", - ) - self.write( - " \n", - ) - self.write("\n") - self.write("\n") - - def write_footer(self) -> None: - self.write("\n") - self.write( - "\n", - ) - - def write_text(self, text: str) -> None: - if self.stripcontrol: - text = self.CONTROL.sub("", text) - self.write(text) - - def write_word(self) -> None: - if len(self.working_text) > 0: - bold_and_italic_styles = "" - if "Italic" in self.working_font: - bold_and_italic_styles = "font-style: italic; " - if "Bold" in self.working_font: - bold_and_italic_styles += "font-weight: bold; " - self.write( - "%s" - % ( - ( - self.working_font, - self.working_size, - bold_and_italic_styles, - self.bbox_repr(self.working_bbox), - self.working_font, - self.working_size, - self.working_text.strip(), - ) - ), - ) - self.within_chars = False - - def receive_layout(self, ltpage: LTPage) -> None: - def render(item: LTItem) -> None: - if self.within_chars and isinstance(item, LTAnno): - self.write_word() - if isinstance(item, LTPage): - self.page_bbox = item.bbox - self.write( - "
\n" - % (item.pageid, self.bbox_repr(item.bbox)), - ) - for child in item: - render(child) - self.write("
\n") - elif isinstance(item, LTTextLine): - self.write( - "" % (self.bbox_repr(item.bbox)), - ) - for child_line in item: - render(child_line) - self.write("\n") - elif isinstance(item, LTTextBox): - self.write( - "
\n" - % (item.index, self.bbox_repr(item.bbox)), - ) - for child in item: - render(child) - self.write("
\n") - elif isinstance(item, LTChar): - if not self.within_chars: - self.within_chars = True - self.working_text = item.get_text() - self.working_bbox = item.bbox - self.working_font = item.fontname - self.working_size = item.size - elif len(item.get_text().strip()) == 0: - self.write_word() - self.write(item.get_text()) - else: - if ( - self.working_bbox[1] != item.bbox[1] - or self.working_font != item.fontname - or self.working_size != item.size - ): - self.write_word() - self.working_bbox = item.bbox - self.working_font = item.fontname - self.working_size = item.size - self.working_text += item.get_text() - self.working_bbox = ( - self.working_bbox[0], - self.working_bbox[1], - item.bbox[2], - self.working_bbox[3], - ) - - render(ltpage) - - def close(self) -> None: - self.write_footer() diff --git a/pdf2zh/data_structures.py b/pdf2zh/data_structures.py deleted file mode 100644 index cbce5e3..0000000 --- a/pdf2zh/data_structures.py +++ /dev/null @@ -1,52 +0,0 @@ -from typing import Any, Iterable, List, Optional, Tuple - -from pdf2zh import settings -from pdf2zh.pdfparser import PDFSyntaxError -from pdf2zh.pdftypes import dict_value, int_value, list_value -from pdf2zh.utils import choplist - - -class NumberTree: - """A PDF number tree. - - See Section 3.8.6 of the PDF Reference. - """ - - def __init__(self, obj: Any): - self._obj = dict_value(obj) - self.nums: Optional[Iterable[Any]] = None - self.kids: Optional[Iterable[Any]] = None - self.limits: Optional[Iterable[Any]] = None - - if "Nums" in self._obj: - self.nums = list_value(self._obj["Nums"]) - if "Kids" in self._obj: - self.kids = list_value(self._obj["Kids"]) - if "Limits" in self._obj: - self.limits = list_value(self._obj["Limits"]) - - def _parse(self) -> List[Tuple[int, Any]]: - items = [] - if self.nums: # Leaf node - for k, v in choplist(2, self.nums): - items.append((int_value(k), v)) - - if self.kids: # Root or intermediate node - for child_ref in self.kids: - items += NumberTree(child_ref)._parse() - - return items - - values: List[Tuple[int, Any]] # workaround decorators unsupported by mypy - - @property # type: ignore[no-redef,misc] - def values(self) -> List[Tuple[int, Any]]: - values = self._parse() - - if settings.STRICT: - if not all(a[0] <= b[0] for a, b in zip(values, values[1:])): - raise PDFSyntaxError("Number tree elements are out of order") - else: - values.sort(key=lambda t: t[0]) - - return values diff --git a/pdf2zh/encodingdb.py b/pdf2zh/encodingdb.py deleted file mode 100644 index ee6a106..0000000 --- a/pdf2zh/encodingdb.py +++ /dev/null @@ -1,127 +0,0 @@ -import logging -import re -from typing import Dict, Iterable, Optional, cast - -from pdf2zh.glyphlist import glyphname2unicode -from pdf2zh.latin_enc import ENCODING -from pdf2zh.pdfexceptions import PDFKeyError -from pdf2zh.psparser import PSLiteral - -HEXADECIMAL = re.compile(r"[0-9a-fA-F]+") - -log = logging.getLogger(__name__) - - -def name2unicode(name: str) -> str: - """Converts Adobe glyph names to Unicode numbers. - - In contrast to the specification, this raises a KeyError instead of return - an empty string when the key is unknown. - This way the caller must explicitly define what to do - when there is not a match. - - Reference: - https://github.com/adobe-type-tools/agl-specification#2-the-mapping - - :returns unicode character if name resembles something, - otherwise a KeyError - """ - if not isinstance(name, str): - raise PDFKeyError( - 'Could not convert unicode name "%s" to character because ' - "it should be of type str but is of type %s" % (name, type(name)), - ) - - name = name.split(".")[0] - components = name.split("_") - - if len(components) > 1: - return "".join(map(name2unicode, components)) - - elif name in glyphname2unicode: - return glyphname2unicode[name] - - elif name.startswith("uni"): - name_without_uni = name.strip("uni") - - if HEXADECIMAL.match(name_without_uni) and len(name_without_uni) % 4 == 0: - unicode_digits = [ - int(name_without_uni[i : i + 4], base=16) - for i in range(0, len(name_without_uni), 4) - ] - for digit in unicode_digits: - raise_key_error_for_invalid_unicode(digit) - characters = map(chr, unicode_digits) - return "".join(characters) - - elif name.startswith("u"): - name_without_u = name.strip("u") - - if HEXADECIMAL.match(name_without_u) and 4 <= len(name_without_u) <= 6: - unicode_digit = int(name_without_u, base=16) - raise_key_error_for_invalid_unicode(unicode_digit) - return chr(unicode_digit) - - raise PDFKeyError( - 'Could not convert unicode name "%s" to character because ' - "it does not match specification" % name, - ) - - -def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None: - """Unicode values should not be in the range D800 through DFFF because - that is used for surrogate pairs in UTF-16 - - :raises KeyError if unicode digit is invalid - """ - if 55295 < unicode_digit < 57344: - raise PDFKeyError( - "Unicode digit %d is invalid because " - "it is in the range D800 through DFFF" % unicode_digit, - ) - - -class EncodingDB: - std2unicode: Dict[int, str] = {} - mac2unicode: Dict[int, str] = {} - win2unicode: Dict[int, str] = {} - pdf2unicode: Dict[int, str] = {} - for name, std, mac, win, pdf in ENCODING: - c = name2unicode(name) - if std: - std2unicode[std] = c - if mac: - mac2unicode[mac] = c - if win: - win2unicode[win] = c - if pdf: - pdf2unicode[pdf] = c - - encodings = { - "StandardEncoding": std2unicode, - "MacRomanEncoding": mac2unicode, - "WinAnsiEncoding": win2unicode, - "PDFDocEncoding": pdf2unicode, - } - - @classmethod - def get_encoding( - cls, - name: str, - diff: Optional[Iterable[object]] = None, - ) -> Dict[int, str]: - cid2unicode = cls.encodings.get(name, cls.std2unicode) - if diff: - cid2unicode = cid2unicode.copy() - cid = 0 - for x in diff: - if isinstance(x, int): - cid = x - elif isinstance(x, PSLiteral): - try: - cid2unicode[cid] = name2unicode(cast(str, x.name)) - except (KeyError, ValueError): - # log.debug(str(e)) - pass - cid += 1 - return cid2unicode diff --git a/pdf2zh/fontmetrics.py b/pdf2zh/fontmetrics.py deleted file mode 100644 index c95c1c1..0000000 --- a/pdf2zh/fontmetrics.py +++ /dev/null @@ -1,4464 +0,0 @@ -"""Font metrics for the Adobe core 14 fonts. - -Font metrics are used to compute the boundary of each character -written with a proportional font. - -The following data were extracted from the AFM files: - - http://www.ctan.org/tex-archive/fonts/adobe/afm/ - -""" - -# BEGIN Verbatim copy of the license part - -# -# Adobe Core 35 AFM Files with 314 Glyph Entries - ReadMe -# -# This file and the 35 PostScript(R) AFM files it accompanies may be -# used, copied, and distributed for any purpose and without charge, -# with or without modification, provided that all copyright notices -# are retained; that the AFM files are not distributed without this -# file; that all modifications to this file or any of the AFM files -# are prominently noted in the modified file(s); and that this -# paragraph is not modified. Adobe Systems has no responsibility or -# obligation to support the use of the AFM files. -# - -# END Verbatim copy of the license part - -# flake8: noqa -from typing import Dict - - -def convert_font_metrics(path: str) -> None: - """Convert an AFM file to a mapping of font metrics. - - See below for the output. - """ - fonts = {} - with open(path) as fileinput: - for line in fileinput.readlines(): - f = line.strip().split(" ") - if not f: - continue - k = f[0] - if k == "FontName": - fontname = f[1] - props = {"FontName": fontname, "Flags": 0} - chars: Dict[int, int] = {} - fonts[fontname] = (props, chars) - elif k == "C": - cid = int(f[1]) - if 0 <= cid and cid <= 255: - width = int(f[4]) - chars[cid] = width - elif k in ("CapHeight", "XHeight", "ItalicAngle", "Ascender", "Descender"): - k = {"Ascender": "Ascent", "Descender": "Descent"}.get(k, k) - props[k] = float(f[1]) - elif k in ("FontName", "FamilyName", "Weight"): - k = {"FamilyName": "FontFamily", "Weight": "FontWeight"}.get(k, k) - props[k] = f[1] - elif k == "IsFixedPitch": - if f[1].lower() == "true": - props["Flags"] = 64 - elif k == "FontBBox": - props[k] = tuple(map(float, f[1:5])) - print("# -*- python -*-") - print("FONT_METRICS = {") - for fontname, (props, chars) in fonts.items(): - print(f" {fontname!r}: {(props, chars)!r},") - print("}") - - -FONT_METRICS = { - "Courier": ( - { - "FontName": "Courier", - "Descent": -194.0, - "FontBBox": (-6.0, -249.0, 639.0, 803.0), - "FontWeight": "Medium", - "CapHeight": 572.0, - "FontFamily": "Courier", - "Flags": 64, - "XHeight": 434.0, - "ItalicAngle": 0.0, - "Ascent": 627.0, - }, - { - " ": 600, - "!": 600, - '"': 600, - "#": 600, - "$": 600, - "%": 600, - "&": 600, - "'": 600, - "(": 600, - ")": 600, - "*": 600, - "+": 600, - ",": 600, - "-": 600, - ".": 600, - "/": 600, - "0": 600, - "1": 600, - "2": 600, - "3": 600, - "4": 600, - "5": 600, - "6": 600, - "7": 600, - "8": 600, - "9": 600, - ":": 600, - ";": 600, - "<": 600, - "=": 600, - ">": 600, - "?": 600, - "@": 600, - "A": 600, - "B": 600, - "C": 600, - "D": 600, - "E": 600, - "F": 600, - "G": 600, - "H": 600, - "I": 600, - "J": 600, - "K": 600, - "L": 600, - "M": 600, - "N": 600, - "O": 600, - "P": 600, - "Q": 600, - "R": 600, - "S": 600, - "T": 600, - "U": 600, - "V": 600, - "W": 600, - "X": 600, - "Y": 600, - "Z": 600, - "[": 600, - "\\": 600, - "]": 600, - "^": 600, - "_": 600, - "`": 600, - "a": 600, - "b": 600, - "c": 600, - "d": 600, - "e": 600, - "f": 600, - "g": 600, - "h": 600, - "i": 600, - "j": 600, - "k": 600, - "l": 600, - "m": 600, - "n": 600, - "o": 600, - "p": 600, - "q": 600, - "r": 600, - "s": 600, - "t": 600, - "u": 600, - "v": 600, - "w": 600, - "x": 600, - "y": 600, - "z": 600, - "{": 600, - "|": 600, - "}": 600, - "~": 600, - "\xa1": 600, - "\xa2": 600, - "\xa3": 600, - "\xa4": 600, - "\xa5": 600, - "\xa6": 600, - "\xa7": 600, - "\xa8": 600, - "\xa9": 600, - "\xaa": 600, - "\xab": 600, - "\xac": 600, - "\xae": 600, - "\xaf": 600, - "\xb0": 600, - "\xb1": 600, - "\xb2": 600, - "\xb3": 600, - "\xb4": 600, - "\xb5": 600, - "\xb6": 600, - "\xb7": 600, - "\xb8": 600, - "\xb9": 600, - "\xba": 600, - "\xbb": 600, - "\xbc": 600, - "\xbd": 600, - "\xbe": 600, - "\xbf": 600, - "\xc0": 600, - "\xc1": 600, - "\xc2": 600, - "\xc3": 600, - "\xc4": 600, - "\xc5": 600, - "\xc6": 600, - "\xc7": 600, - "\xc8": 600, - "\xc9": 600, - "\xca": 600, - "\xcb": 600, - "\xcc": 600, - "\xcd": 600, - "\xce": 600, - "\xcf": 600, - "\xd0": 600, - "\xd1": 600, - "\xd2": 600, - "\xd3": 600, - "\xd4": 600, - "\xd5": 600, - "\xd6": 600, - "\xd7": 600, - "\xd8": 600, - "\xd9": 600, - "\xda": 600, - "\xdb": 600, - "\xdc": 600, - "\xdd": 600, - "\xde": 600, - "\xdf": 600, - "\xe0": 600, - "\xe1": 600, - "\xe2": 600, - "\xe3": 600, - "\xe4": 600, - "\xe5": 600, - "\xe6": 600, - "\xe7": 600, - "\xe8": 600, - "\xe9": 600, - "\xea": 600, - "\xeb": 600, - "\xec": 600, - "\xed": 600, - "\xee": 600, - "\xef": 600, - "\xf0": 600, - "\xf1": 600, - "\xf2": 600, - "\xf3": 600, - "\xf4": 600, - "\xf5": 600, - "\xf6": 600, - "\xf7": 600, - "\xf8": 600, - "\xf9": 600, - "\xfa": 600, - "\xfb": 600, - "\xfc": 600, - "\xfd": 600, - "\xfe": 600, - "\xff": 600, - "\u0100": 600, - "\u0101": 600, - "\u0102": 600, - "\u0103": 600, - "\u0104": 600, - "\u0105": 600, - "\u0106": 600, - "\u0107": 600, - "\u010c": 600, - "\u010d": 600, - "\u010e": 600, - "\u010f": 600, - "\u0110": 600, - "\u0111": 600, - "\u0112": 600, - "\u0113": 600, - "\u0116": 600, - "\u0117": 600, - "\u0118": 600, - "\u0119": 600, - "\u011a": 600, - "\u011b": 600, - "\u011e": 600, - "\u011f": 600, - "\u0122": 600, - "\u0123": 600, - "\u012a": 600, - "\u012b": 600, - "\u012e": 600, - "\u012f": 600, - "\u0130": 600, - "\u0131": 600, - "\u0136": 600, - "\u0137": 600, - "\u0139": 600, - "\u013a": 600, - "\u013b": 600, - "\u013c": 600, - "\u013d": 600, - "\u013e": 600, - "\u0141": 600, - "\u0142": 600, - "\u0143": 600, - "\u0144": 600, - "\u0145": 600, - "\u0146": 600, - "\u0147": 600, - "\u0148": 600, - "\u014c": 600, - "\u014d": 600, - "\u0150": 600, - "\u0151": 600, - "\u0152": 600, - "\u0153": 600, - "\u0154": 600, - "\u0155": 600, - "\u0156": 600, - "\u0157": 600, - "\u0158": 600, - "\u0159": 600, - "\u015a": 600, - "\u015b": 600, - "\u015e": 600, - "\u015f": 600, - "\u0160": 600, - "\u0161": 600, - "\u0162": 600, - "\u0163": 600, - "\u0164": 600, - "\u0165": 600, - "\u016a": 600, - "\u016b": 600, - "\u016e": 600, - "\u016f": 600, - "\u0170": 600, - "\u0171": 600, - "\u0172": 600, - "\u0173": 600, - "\u0178": 600, - "\u0179": 600, - "\u017a": 600, - "\u017b": 600, - "\u017c": 600, - "\u017d": 600, - "\u017e": 600, - "\u0192": 600, - "\u0218": 600, - "\u0219": 600, - "\u02c6": 600, - "\u02c7": 600, - "\u02d8": 600, - "\u02d9": 600, - "\u02da": 600, - "\u02db": 600, - "\u02dc": 600, - "\u02dd": 600, - "\u2013": 600, - "\u2014": 600, - "\u2018": 600, - "\u2019": 600, - "\u201a": 600, - "\u201c": 600, - "\u201d": 600, - "\u201e": 600, - "\u2020": 600, - "\u2021": 600, - "\u2022": 600, - "\u2026": 600, - "\u2030": 600, - "\u2039": 600, - "\u203a": 600, - "\u2044": 600, - "\u2122": 600, - "\u2202": 600, - "\u2206": 600, - "\u2211": 600, - "\u2212": 600, - "\u221a": 600, - "\u2260": 600, - "\u2264": 600, - "\u2265": 600, - "\u25ca": 600, - "\uf6c3": 600, - "\ufb01": 600, - "\ufb02": 600, - }, - ), - "Courier-Bold": ( - { - "FontName": "Courier-Bold", - "Descent": -194.0, - "FontBBox": (-88.0, -249.0, 697.0, 811.0), - "FontWeight": "Bold", - "CapHeight": 572.0, - "FontFamily": "Courier", - "Flags": 64, - "XHeight": 434.0, - "ItalicAngle": 0.0, - "Ascent": 627.0, - }, - { - " ": 600, - "!": 600, - '"': 600, - "#": 600, - "$": 600, - "%": 600, - "&": 600, - "'": 600, - "(": 600, - ")": 600, - "*": 600, - "+": 600, - ",": 600, - "-": 600, - ".": 600, - "/": 600, - "0": 600, - "1": 600, - "2": 600, - "3": 600, - "4": 600, - "5": 600, - "6": 600, - "7": 600, - "8": 600, - "9": 600, - ":": 600, - ";": 600, - "<": 600, - "=": 600, - ">": 600, - "?": 600, - "@": 600, - "A": 600, - "B": 600, - "C": 600, - "D": 600, - "E": 600, - "F": 600, - "G": 600, - "H": 600, - "I": 600, - "J": 600, - "K": 600, - "L": 600, - "M": 600, - "N": 600, - "O": 600, - "P": 600, - "Q": 600, - "R": 600, - "S": 600, - "T": 600, - "U": 600, - "V": 600, - "W": 600, - "X": 600, - "Y": 600, - "Z": 600, - "[": 600, - "\\": 600, - "]": 600, - "^": 600, - "_": 600, - "`": 600, - "a": 600, - "b": 600, - "c": 600, - "d": 600, - "e": 600, - "f": 600, - "g": 600, - "h": 600, - "i": 600, - "j": 600, - "k": 600, - "l": 600, - "m": 600, - "n": 600, - "o": 600, - "p": 600, - "q": 600, - "r": 600, - "s": 600, - "t": 600, - "u": 600, - "v": 600, - "w": 600, - "x": 600, - "y": 600, - "z": 600, - "{": 600, - "|": 600, - "}": 600, - "~": 600, - "\xa1": 600, - "\xa2": 600, - "\xa3": 600, - "\xa4": 600, - "\xa5": 600, - "\xa6": 600, - "\xa7": 600, - "\xa8": 600, - "\xa9": 600, - "\xaa": 600, - "\xab": 600, - "\xac": 600, - "\xae": 600, - "\xaf": 600, - "\xb0": 600, - "\xb1": 600, - "\xb2": 600, - "\xb3": 600, - "\xb4": 600, - "\xb5": 600, - "\xb6": 600, - "\xb7": 600, - "\xb8": 600, - "\xb9": 600, - "\xba": 600, - "\xbb": 600, - "\xbc": 600, - "\xbd": 600, - "\xbe": 600, - "\xbf": 600, - "\xc0": 600, - "\xc1": 600, - "\xc2": 600, - "\xc3": 600, - "\xc4": 600, - "\xc5": 600, - "\xc6": 600, - "\xc7": 600, - "\xc8": 600, - "\xc9": 600, - "\xca": 600, - "\xcb": 600, - "\xcc": 600, - "\xcd": 600, - "\xce": 600, - "\xcf": 600, - "\xd0": 600, - "\xd1": 600, - "\xd2": 600, - "\xd3": 600, - "\xd4": 600, - "\xd5": 600, - "\xd6": 600, - "\xd7": 600, - "\xd8": 600, - "\xd9": 600, - "\xda": 600, - "\xdb": 600, - "\xdc": 600, - "\xdd": 600, - "\xde": 600, - "\xdf": 600, - "\xe0": 600, - "\xe1": 600, - "\xe2": 600, - "\xe3": 600, - "\xe4": 600, - "\xe5": 600, - "\xe6": 600, - "\xe7": 600, - "\xe8": 600, - "\xe9": 600, - "\xea": 600, - "\xeb": 600, - "\xec": 600, - "\xed": 600, - "\xee": 600, - "\xef": 600, - "\xf0": 600, - "\xf1": 600, - "\xf2": 600, - "\xf3": 600, - "\xf4": 600, - "\xf5": 600, - "\xf6": 600, - "\xf7": 600, - "\xf8": 600, - "\xf9": 600, - "\xfa": 600, - "\xfb": 600, - "\xfc": 600, - "\xfd": 600, - "\xfe": 600, - "\xff": 600, - "\u0100": 600, - "\u0101": 600, - "\u0102": 600, - "\u0103": 600, - "\u0104": 600, - "\u0105": 600, - "\u0106": 600, - "\u0107": 600, - "\u010c": 600, - "\u010d": 600, - "\u010e": 600, - "\u010f": 600, - "\u0110": 600, - "\u0111": 600, - "\u0112": 600, - "\u0113": 600, - "\u0116": 600, - "\u0117": 600, - "\u0118": 600, - "\u0119": 600, - "\u011a": 600, - "\u011b": 600, - "\u011e": 600, - "\u011f": 600, - "\u0122": 600, - "\u0123": 600, - "\u012a": 600, - "\u012b": 600, - "\u012e": 600, - "\u012f": 600, - "\u0130": 600, - "\u0131": 600, - "\u0136": 600, - "\u0137": 600, - "\u0139": 600, - "\u013a": 600, - "\u013b": 600, - "\u013c": 600, - "\u013d": 600, - "\u013e": 600, - "\u0141": 600, - "\u0142": 600, - "\u0143": 600, - "\u0144": 600, - "\u0145": 600, - "\u0146": 600, - "\u0147": 600, - "\u0148": 600, - "\u014c": 600, - "\u014d": 600, - "\u0150": 600, - "\u0151": 600, - "\u0152": 600, - "\u0153": 600, - "\u0154": 600, - "\u0155": 600, - "\u0156": 600, - "\u0157": 600, - "\u0158": 600, - "\u0159": 600, - "\u015a": 600, - "\u015b": 600, - "\u015e": 600, - "\u015f": 600, - "\u0160": 600, - "\u0161": 600, - "\u0162": 600, - "\u0163": 600, - "\u0164": 600, - "\u0165": 600, - "\u016a": 600, - "\u016b": 600, - "\u016e": 600, - "\u016f": 600, - "\u0170": 600, - "\u0171": 600, - "\u0172": 600, - "\u0173": 600, - "\u0178": 600, - "\u0179": 600, - "\u017a": 600, - "\u017b": 600, - "\u017c": 600, - "\u017d": 600, - "\u017e": 600, - "\u0192": 600, - "\u0218": 600, - "\u0219": 600, - "\u02c6": 600, - "\u02c7": 600, - "\u02d8": 600, - "\u02d9": 600, - "\u02da": 600, - "\u02db": 600, - "\u02dc": 600, - "\u02dd": 600, - "\u2013": 600, - "\u2014": 600, - "\u2018": 600, - "\u2019": 600, - "\u201a": 600, - "\u201c": 600, - "\u201d": 600, - "\u201e": 600, - "\u2020": 600, - "\u2021": 600, - "\u2022": 600, - "\u2026": 600, - "\u2030": 600, - "\u2039": 600, - "\u203a": 600, - "\u2044": 600, - "\u2122": 600, - "\u2202": 600, - "\u2206": 600, - "\u2211": 600, - "\u2212": 600, - "\u221a": 600, - "\u2260": 600, - "\u2264": 600, - "\u2265": 600, - "\u25ca": 600, - "\uf6c3": 600, - "\ufb01": 600, - "\ufb02": 600, - }, - ), - "Courier-BoldOblique": ( - { - "FontName": "Courier-BoldOblique", - "Descent": -194.0, - "FontBBox": (-49.0, -249.0, 758.0, 811.0), - "FontWeight": "Bold", - "CapHeight": 572.0, - "FontFamily": "Courier", - "Flags": 64, - "XHeight": 434.0, - "ItalicAngle": -11.0, - "Ascent": 627.0, - }, - { - " ": 600, - "!": 600, - '"': 600, - "#": 600, - "$": 600, - "%": 600, - "&": 600, - "'": 600, - "(": 600, - ")": 600, - "*": 600, - "+": 600, - ",": 600, - "-": 600, - ".": 600, - "/": 600, - "0": 600, - "1": 600, - "2": 600, - "3": 600, - "4": 600, - "5": 600, - "6": 600, - "7": 600, - "8": 600, - "9": 600, - ":": 600, - ";": 600, - "<": 600, - "=": 600, - ">": 600, - "?": 600, - "@": 600, - "A": 600, - "B": 600, - "C": 600, - "D": 600, - "E": 600, - "F": 600, - "G": 600, - "H": 600, - "I": 600, - "J": 600, - "K": 600, - "L": 600, - "M": 600, - "N": 600, - "O": 600, - "P": 600, - "Q": 600, - "R": 600, - "S": 600, - "T": 600, - "U": 600, - "V": 600, - "W": 600, - "X": 600, - "Y": 600, - "Z": 600, - "[": 600, - "\\": 600, - "]": 600, - "^": 600, - "_": 600, - "`": 600, - "a": 600, - "b": 600, - "c": 600, - "d": 600, - "e": 600, - "f": 600, - "g": 600, - "h": 600, - "i": 600, - "j": 600, - "k": 600, - "l": 600, - "m": 600, - "n": 600, - "o": 600, - "p": 600, - "q": 600, - "r": 600, - "s": 600, - "t": 600, - "u": 600, - "v": 600, - "w": 600, - "x": 600, - "y": 600, - "z": 600, - "{": 600, - "|": 600, - "}": 600, - "~": 600, - "\xa1": 600, - "\xa2": 600, - "\xa3": 600, - "\xa4": 600, - "\xa5": 600, - "\xa6": 600, - "\xa7": 600, - "\xa8": 600, - "\xa9": 600, - "\xaa": 600, - "\xab": 600, - "\xac": 600, - "\xae": 600, - "\xaf": 600, - "\xb0": 600, - "\xb1": 600, - "\xb2": 600, - "\xb3": 600, - "\xb4": 600, - "\xb5": 600, - "\xb6": 600, - "\xb7": 600, - "\xb8": 600, - "\xb9": 600, - "\xba": 600, - "\xbb": 600, - "\xbc": 600, - "\xbd": 600, - "\xbe": 600, - "\xbf": 600, - "\xc0": 600, - "\xc1": 600, - "\xc2": 600, - "\xc3": 600, - "\xc4": 600, - "\xc5": 600, - "\xc6": 600, - "\xc7": 600, - "\xc8": 600, - "\xc9": 600, - "\xca": 600, - "\xcb": 600, - "\xcc": 600, - "\xcd": 600, - "\xce": 600, - "\xcf": 600, - "\xd0": 600, - "\xd1": 600, - "\xd2": 600, - "\xd3": 600, - "\xd4": 600, - "\xd5": 600, - "\xd6": 600, - "\xd7": 600, - "\xd8": 600, - "\xd9": 600, - "\xda": 600, - "\xdb": 600, - "\xdc": 600, - "\xdd": 600, - "\xde": 600, - "\xdf": 600, - "\xe0": 600, - "\xe1": 600, - "\xe2": 600, - "\xe3": 600, - "\xe4": 600, - "\xe5": 600, - "\xe6": 600, - "\xe7": 600, - "\xe8": 600, - "\xe9": 600, - "\xea": 600, - "\xeb": 600, - "\xec": 600, - "\xed": 600, - "\xee": 600, - "\xef": 600, - "\xf0": 600, - "\xf1": 600, - "\xf2": 600, - "\xf3": 600, - "\xf4": 600, - "\xf5": 600, - "\xf6": 600, - "\xf7": 600, - "\xf8": 600, - "\xf9": 600, - "\xfa": 600, - "\xfb": 600, - "\xfc": 600, - "\xfd": 600, - "\xfe": 600, - "\xff": 600, - "\u0100": 600, - "\u0101": 600, - "\u0102": 600, - "\u0103": 600, - "\u0104": 600, - "\u0105": 600, - "\u0106": 600, - "\u0107": 600, - "\u010c": 600, - "\u010d": 600, - "\u010e": 600, - "\u010f": 600, - "\u0110": 600, - "\u0111": 600, - "\u0112": 600, - "\u0113": 600, - "\u0116": 600, - "\u0117": 600, - "\u0118": 600, - "\u0119": 600, - "\u011a": 600, - "\u011b": 600, - "\u011e": 600, - "\u011f": 600, - "\u0122": 600, - "\u0123": 600, - "\u012a": 600, - "\u012b": 600, - "\u012e": 600, - "\u012f": 600, - "\u0130": 600, - "\u0131": 600, - "\u0136": 600, - "\u0137": 600, - "\u0139": 600, - "\u013a": 600, - "\u013b": 600, - "\u013c": 600, - "\u013d": 600, - "\u013e": 600, - "\u0141": 600, - "\u0142": 600, - "\u0143": 600, - "\u0144": 600, - "\u0145": 600, - "\u0146": 600, - "\u0147": 600, - "\u0148": 600, - "\u014c": 600, - "\u014d": 600, - "\u0150": 600, - "\u0151": 600, - "\u0152": 600, - "\u0153": 600, - "\u0154": 600, - "\u0155": 600, - "\u0156": 600, - "\u0157": 600, - "\u0158": 600, - "\u0159": 600, - "\u015a": 600, - "\u015b": 600, - "\u015e": 600, - "\u015f": 600, - "\u0160": 600, - "\u0161": 600, - "\u0162": 600, - "\u0163": 600, - "\u0164": 600, - "\u0165": 600, - "\u016a": 600, - "\u016b": 600, - "\u016e": 600, - "\u016f": 600, - "\u0170": 600, - "\u0171": 600, - "\u0172": 600, - "\u0173": 600, - "\u0178": 600, - "\u0179": 600, - "\u017a": 600, - "\u017b": 600, - "\u017c": 600, - "\u017d": 600, - "\u017e": 600, - "\u0192": 600, - "\u0218": 600, - "\u0219": 600, - "\u02c6": 600, - "\u02c7": 600, - "\u02d8": 600, - "\u02d9": 600, - "\u02da": 600, - "\u02db": 600, - "\u02dc": 600, - "\u02dd": 600, - "\u2013": 600, - "\u2014": 600, - "\u2018": 600, - "\u2019": 600, - "\u201a": 600, - "\u201c": 600, - "\u201d": 600, - "\u201e": 600, - "\u2020": 600, - "\u2021": 600, - "\u2022": 600, - "\u2026": 600, - "\u2030": 600, - "\u2039": 600, - "\u203a": 600, - "\u2044": 600, - "\u2122": 600, - "\u2202": 600, - "\u2206": 600, - "\u2211": 600, - "\u2212": 600, - "\u221a": 600, - "\u2260": 600, - "\u2264": 600, - "\u2265": 600, - "\u25ca": 600, - "\uf6c3": 600, - "\ufb01": 600, - "\ufb02": 600, - }, - ), - "Courier-Oblique": ( - { - "FontName": "Courier-Oblique", - "Descent": -194.0, - "FontBBox": (-49.0, -249.0, 749.0, 803.0), - "FontWeight": "Medium", - "CapHeight": 572.0, - "FontFamily": "Courier", - "Flags": 64, - "XHeight": 434.0, - "ItalicAngle": -11.0, - "Ascent": 627.0, - }, - { - " ": 600, - "!": 600, - '"': 600, - "#": 600, - "$": 600, - "%": 600, - "&": 600, - "'": 600, - "(": 600, - ")": 600, - "*": 600, - "+": 600, - ",": 600, - "-": 600, - ".": 600, - "/": 600, - "0": 600, - "1": 600, - "2": 600, - "3": 600, - "4": 600, - "5": 600, - "6": 600, - "7": 600, - "8": 600, - "9": 600, - ":": 600, - ";": 600, - "<": 600, - "=": 600, - ">": 600, - "?": 600, - "@": 600, - "A": 600, - "B": 600, - "C": 600, - "D": 600, - "E": 600, - "F": 600, - "G": 600, - "H": 600, - "I": 600, - "J": 600, - "K": 600, - "L": 600, - "M": 600, - "N": 600, - "O": 600, - "P": 600, - "Q": 600, - "R": 600, - "S": 600, - "T": 600, - "U": 600, - "V": 600, - "W": 600, - "X": 600, - "Y": 600, - "Z": 600, - "[": 600, - "\\": 600, - "]": 600, - "^": 600, - "_": 600, - "`": 600, - "a": 600, - "b": 600, - "c": 600, - "d": 600, - "e": 600, - "f": 600, - "g": 600, - "h": 600, - "i": 600, - "j": 600, - "k": 600, - "l": 600, - "m": 600, - "n": 600, - "o": 600, - "p": 600, - "q": 600, - "r": 600, - "s": 600, - "t": 600, - "u": 600, - "v": 600, - "w": 600, - "x": 600, - "y": 600, - "z": 600, - "{": 600, - "|": 600, - "}": 600, - "~": 600, - "\xa1": 600, - "\xa2": 600, - "\xa3": 600, - "\xa4": 600, - "\xa5": 600, - "\xa6": 600, - "\xa7": 600, - "\xa8": 600, - "\xa9": 600, - "\xaa": 600, - "\xab": 600, - "\xac": 600, - "\xae": 600, - "\xaf": 600, - "\xb0": 600, - "\xb1": 600, - "\xb2": 600, - "\xb3": 600, - "\xb4": 600, - "\xb5": 600, - "\xb6": 600, - "\xb7": 600, - "\xb8": 600, - "\xb9": 600, - "\xba": 600, - "\xbb": 600, - "\xbc": 600, - "\xbd": 600, - "\xbe": 600, - "\xbf": 600, - "\xc0": 600, - "\xc1": 600, - "\xc2": 600, - "\xc3": 600, - "\xc4": 600, - "\xc5": 600, - "\xc6": 600, - "\xc7": 600, - "\xc8": 600, - "\xc9": 600, - "\xca": 600, - "\xcb": 600, - "\xcc": 600, - "\xcd": 600, - "\xce": 600, - "\xcf": 600, - "\xd0": 600, - "\xd1": 600, - "\xd2": 600, - "\xd3": 600, - "\xd4": 600, - "\xd5": 600, - "\xd6": 600, - "\xd7": 600, - "\xd8": 600, - "\xd9": 600, - "\xda": 600, - "\xdb": 600, - "\xdc": 600, - "\xdd": 600, - "\xde": 600, - "\xdf": 600, - "\xe0": 600, - "\xe1": 600, - "\xe2": 600, - "\xe3": 600, - "\xe4": 600, - "\xe5": 600, - "\xe6": 600, - "\xe7": 600, - "\xe8": 600, - "\xe9": 600, - "\xea": 600, - "\xeb": 600, - "\xec": 600, - "\xed": 600, - "\xee": 600, - "\xef": 600, - "\xf0": 600, - "\xf1": 600, - "\xf2": 600, - "\xf3": 600, - "\xf4": 600, - "\xf5": 600, - "\xf6": 600, - "\xf7": 600, - "\xf8": 600, - "\xf9": 600, - "\xfa": 600, - "\xfb": 600, - "\xfc": 600, - "\xfd": 600, - "\xfe": 600, - "\xff": 600, - "\u0100": 600, - "\u0101": 600, - "\u0102": 600, - "\u0103": 600, - "\u0104": 600, - "\u0105": 600, - "\u0106": 600, - "\u0107": 600, - "\u010c": 600, - "\u010d": 600, - "\u010e": 600, - "\u010f": 600, - "\u0110": 600, - "\u0111": 600, - "\u0112": 600, - "\u0113": 600, - "\u0116": 600, - "\u0117": 600, - "\u0118": 600, - "\u0119": 600, - "\u011a": 600, - "\u011b": 600, - "\u011e": 600, - "\u011f": 600, - "\u0122": 600, - "\u0123": 600, - "\u012a": 600, - "\u012b": 600, - "\u012e": 600, - "\u012f": 600, - "\u0130": 600, - "\u0131": 600, - "\u0136": 600, - "\u0137": 600, - "\u0139": 600, - "\u013a": 600, - "\u013b": 600, - "\u013c": 600, - "\u013d": 600, - "\u013e": 600, - "\u0141": 600, - "\u0142": 600, - "\u0143": 600, - "\u0144": 600, - "\u0145": 600, - "\u0146": 600, - "\u0147": 600, - "\u0148": 600, - "\u014c": 600, - "\u014d": 600, - "\u0150": 600, - "\u0151": 600, - "\u0152": 600, - "\u0153": 600, - "\u0154": 600, - "\u0155": 600, - "\u0156": 600, - "\u0157": 600, - "\u0158": 600, - "\u0159": 600, - "\u015a": 600, - "\u015b": 600, - "\u015e": 600, - "\u015f": 600, - "\u0160": 600, - "\u0161": 600, - "\u0162": 600, - "\u0163": 600, - "\u0164": 600, - "\u0165": 600, - "\u016a": 600, - "\u016b": 600, - "\u016e": 600, - "\u016f": 600, - "\u0170": 600, - "\u0171": 600, - "\u0172": 600, - "\u0173": 600, - "\u0178": 600, - "\u0179": 600, - "\u017a": 600, - "\u017b": 600, - "\u017c": 600, - "\u017d": 600, - "\u017e": 600, - "\u0192": 600, - "\u0218": 600, - "\u0219": 600, - "\u02c6": 600, - "\u02c7": 600, - "\u02d8": 600, - "\u02d9": 600, - "\u02da": 600, - "\u02db": 600, - "\u02dc": 600, - "\u02dd": 600, - "\u2013": 600, - "\u2014": 600, - "\u2018": 600, - "\u2019": 600, - "\u201a": 600, - "\u201c": 600, - "\u201d": 600, - "\u201e": 600, - "\u2020": 600, - "\u2021": 600, - "\u2022": 600, - "\u2026": 600, - "\u2030": 600, - "\u2039": 600, - "\u203a": 600, - "\u2044": 600, - "\u2122": 600, - "\u2202": 600, - "\u2206": 600, - "\u2211": 600, - "\u2212": 600, - "\u221a": 600, - "\u2260": 600, - "\u2264": 600, - "\u2265": 600, - "\u25ca": 600, - "\uf6c3": 600, - "\ufb01": 600, - "\ufb02": 600, - }, - ), - "Helvetica": ( - { - "FontName": "Helvetica", - "Descent": -207.0, - "FontBBox": (-166.0, -225.0, 1000.0, 931.0), - "FontWeight": "Medium", - "CapHeight": 718.0, - "FontFamily": "Helvetica", - "Flags": 0, - "XHeight": 523.0, - "ItalicAngle": 0.0, - "Ascent": 718.0, - }, - { - " ": 278, - "!": 278, - '"': 355, - "#": 556, - "$": 556, - "%": 889, - "&": 667, - "'": 191, - "(": 333, - ")": 333, - "*": 389, - "+": 584, - ",": 278, - "-": 333, - ".": 278, - "/": 278, - "0": 556, - "1": 556, - "2": 556, - "3": 556, - "4": 556, - "5": 556, - "6": 556, - "7": 556, - "8": 556, - "9": 556, - ":": 278, - ";": 278, - "<": 584, - "=": 584, - ">": 584, - "?": 556, - "@": 1015, - "A": 667, - "B": 667, - "C": 722, - "D": 722, - "E": 667, - "F": 611, - "G": 778, - "H": 722, - "I": 278, - "J": 500, - "K": 667, - "L": 556, - "M": 833, - "N": 722, - "O": 778, - "P": 667, - "Q": 778, - "R": 722, - "S": 667, - "T": 611, - "U": 722, - "V": 667, - "W": 944, - "X": 667, - "Y": 667, - "Z": 611, - "[": 278, - "\\": 278, - "]": 278, - "^": 469, - "_": 556, - "`": 333, - "a": 556, - "b": 556, - "c": 500, - "d": 556, - "e": 556, - "f": 278, - "g": 556, - "h": 556, - "i": 222, - "j": 222, - "k": 500, - "l": 222, - "m": 833, - "n": 556, - "o": 556, - "p": 556, - "q": 556, - "r": 333, - "s": 500, - "t": 278, - "u": 556, - "v": 500, - "w": 722, - "x": 500, - "y": 500, - "z": 500, - "{": 334, - "|": 260, - "}": 334, - "~": 584, - "\xa1": 333, - "\xa2": 556, - "\xa3": 556, - "\xa4": 556, - "\xa5": 556, - "\xa6": 260, - "\xa7": 556, - "\xa8": 333, - "\xa9": 737, - "\xaa": 370, - "\xab": 556, - "\xac": 584, - "\xae": 737, - "\xaf": 333, - "\xb0": 400, - "\xb1": 584, - "\xb2": 333, - "\xb3": 333, - "\xb4": 333, - "\xb5": 556, - "\xb6": 537, - "\xb7": 278, - "\xb8": 333, - "\xb9": 333, - "\xba": 365, - "\xbb": 556, - "\xbc": 834, - "\xbd": 834, - "\xbe": 834, - "\xbf": 611, - "\xc0": 667, - "\xc1": 667, - "\xc2": 667, - "\xc3": 667, - "\xc4": 667, - "\xc5": 667, - "\xc6": 1000, - "\xc7": 722, - "\xc8": 667, - "\xc9": 667, - "\xca": 667, - "\xcb": 667, - "\xcc": 278, - "\xcd": 278, - "\xce": 278, - "\xcf": 278, - "\xd0": 722, - "\xd1": 722, - "\xd2": 778, - "\xd3": 778, - "\xd4": 778, - "\xd5": 778, - "\xd6": 778, - "\xd7": 584, - "\xd8": 778, - "\xd9": 722, - "\xda": 722, - "\xdb": 722, - "\xdc": 722, - "\xdd": 667, - "\xde": 667, - "\xdf": 611, - "\xe0": 556, - "\xe1": 556, - "\xe2": 556, - "\xe3": 556, - "\xe4": 556, - "\xe5": 556, - "\xe6": 889, - "\xe7": 500, - "\xe8": 556, - "\xe9": 556, - "\xea": 556, - "\xeb": 556, - "\xec": 278, - "\xed": 278, - "\xee": 278, - "\xef": 278, - "\xf0": 556, - "\xf1": 556, - "\xf2": 556, - "\xf3": 556, - "\xf4": 556, - "\xf5": 556, - "\xf6": 556, - "\xf7": 584, - "\xf8": 611, - "\xf9": 556, - "\xfa": 556, - "\xfb": 556, - "\xfc": 556, - "\xfd": 500, - "\xfe": 556, - "\xff": 500, - "\u0100": 667, - "\u0101": 556, - "\u0102": 667, - "\u0103": 556, - "\u0104": 667, - "\u0105": 556, - "\u0106": 722, - "\u0107": 500, - "\u010c": 722, - "\u010d": 500, - "\u010e": 722, - "\u010f": 643, - "\u0110": 722, - "\u0111": 556, - "\u0112": 667, - "\u0113": 556, - "\u0116": 667, - "\u0117": 556, - "\u0118": 667, - "\u0119": 556, - "\u011a": 667, - "\u011b": 556, - "\u011e": 778, - "\u011f": 556, - "\u0122": 778, - "\u0123": 556, - "\u012a": 278, - "\u012b": 278, - "\u012e": 278, - "\u012f": 222, - "\u0130": 278, - "\u0131": 278, - "\u0136": 667, - "\u0137": 500, - "\u0139": 556, - "\u013a": 222, - "\u013b": 556, - "\u013c": 222, - "\u013d": 556, - "\u013e": 299, - "\u0141": 556, - "\u0142": 222, - "\u0143": 722, - "\u0144": 556, - "\u0145": 722, - "\u0146": 556, - "\u0147": 722, - "\u0148": 556, - "\u014c": 778, - "\u014d": 556, - "\u0150": 778, - "\u0151": 556, - "\u0152": 1000, - "\u0153": 944, - "\u0154": 722, - "\u0155": 333, - "\u0156": 722, - "\u0157": 333, - "\u0158": 722, - "\u0159": 333, - "\u015a": 667, - "\u015b": 500, - "\u015e": 667, - "\u015f": 500, - "\u0160": 667, - "\u0161": 500, - "\u0162": 611, - "\u0163": 278, - "\u0164": 611, - "\u0165": 317, - "\u016a": 722, - "\u016b": 556, - "\u016e": 722, - "\u016f": 556, - "\u0170": 722, - "\u0171": 556, - "\u0172": 722, - "\u0173": 556, - "\u0178": 667, - "\u0179": 611, - "\u017a": 500, - "\u017b": 611, - "\u017c": 500, - "\u017d": 611, - "\u017e": 500, - "\u0192": 556, - "\u0218": 667, - "\u0219": 500, - "\u02c6": 333, - "\u02c7": 333, - "\u02d8": 333, - "\u02d9": 333, - "\u02da": 333, - "\u02db": 333, - "\u02dc": 333, - "\u02dd": 333, - "\u2013": 556, - "\u2014": 1000, - "\u2018": 222, - "\u2019": 222, - "\u201a": 222, - "\u201c": 333, - "\u201d": 333, - "\u201e": 333, - "\u2020": 556, - "\u2021": 556, - "\u2022": 350, - "\u2026": 1000, - "\u2030": 1000, - "\u2039": 333, - "\u203a": 333, - "\u2044": 167, - "\u2122": 1000, - "\u2202": 476, - "\u2206": 612, - "\u2211": 600, - "\u2212": 584, - "\u221a": 453, - "\u2260": 549, - "\u2264": 549, - "\u2265": 549, - "\u25ca": 471, - "\uf6c3": 250, - "\ufb01": 500, - "\ufb02": 500, - }, - ), - "Helvetica-Bold": ( - { - "FontName": "Helvetica-Bold", - "Descent": -207.0, - "FontBBox": (-170.0, -228.0, 1003.0, 962.0), - "FontWeight": "Bold", - "CapHeight": 718.0, - "FontFamily": "Helvetica", - "Flags": 0, - "XHeight": 532.0, - "ItalicAngle": 0.0, - "Ascent": 718.0, - }, - { - " ": 278, - "!": 333, - '"': 474, - "#": 556, - "$": 556, - "%": 889, - "&": 722, - "'": 238, - "(": 333, - ")": 333, - "*": 389, - "+": 584, - ",": 278, - "-": 333, - ".": 278, - "/": 278, - "0": 556, - "1": 556, - "2": 556, - "3": 556, - "4": 556, - "5": 556, - "6": 556, - "7": 556, - "8": 556, - "9": 556, - ":": 333, - ";": 333, - "<": 584, - "=": 584, - ">": 584, - "?": 611, - "@": 975, - "A": 722, - "B": 722, - "C": 722, - "D": 722, - "E": 667, - "F": 611, - "G": 778, - "H": 722, - "I": 278, - "J": 556, - "K": 722, - "L": 611, - "M": 833, - "N": 722, - "O": 778, - "P": 667, - "Q": 778, - "R": 722, - "S": 667, - "T": 611, - "U": 722, - "V": 667, - "W": 944, - "X": 667, - "Y": 667, - "Z": 611, - "[": 333, - "\\": 278, - "]": 333, - "^": 584, - "_": 556, - "`": 333, - "a": 556, - "b": 611, - "c": 556, - "d": 611, - "e": 556, - "f": 333, - "g": 611, - "h": 611, - "i": 278, - "j": 278, - "k": 556, - "l": 278, - "m": 889, - "n": 611, - "o": 611, - "p": 611, - "q": 611, - "r": 389, - "s": 556, - "t": 333, - "u": 611, - "v": 556, - "w": 778, - "x": 556, - "y": 556, - "z": 500, - "{": 389, - "|": 280, - "}": 389, - "~": 584, - "\xa1": 333, - "\xa2": 556, - "\xa3": 556, - "\xa4": 556, - "\xa5": 556, - "\xa6": 280, - "\xa7": 556, - "\xa8": 333, - "\xa9": 737, - "\xaa": 370, - "\xab": 556, - "\xac": 584, - "\xae": 737, - "\xaf": 333, - "\xb0": 400, - "\xb1": 584, - "\xb2": 333, - "\xb3": 333, - "\xb4": 333, - "\xb5": 611, - "\xb6": 556, - "\xb7": 278, - "\xb8": 333, - "\xb9": 333, - "\xba": 365, - "\xbb": 556, - "\xbc": 834, - "\xbd": 834, - "\xbe": 834, - "\xbf": 611, - "\xc0": 722, - "\xc1": 722, - "\xc2": 722, - "\xc3": 722, - "\xc4": 722, - "\xc5": 722, - "\xc6": 1000, - "\xc7": 722, - "\xc8": 667, - "\xc9": 667, - "\xca": 667, - "\xcb": 667, - "\xcc": 278, - "\xcd": 278, - "\xce": 278, - "\xcf": 278, - "\xd0": 722, - "\xd1": 722, - "\xd2": 778, - "\xd3": 778, - "\xd4": 778, - "\xd5": 778, - "\xd6": 778, - "\xd7": 584, - "\xd8": 778, - "\xd9": 722, - "\xda": 722, - "\xdb": 722, - "\xdc": 722, - "\xdd": 667, - "\xde": 667, - "\xdf": 611, - "\xe0": 556, - "\xe1": 556, - "\xe2": 556, - "\xe3": 556, - "\xe4": 556, - "\xe5": 556, - "\xe6": 889, - "\xe7": 556, - "\xe8": 556, - "\xe9": 556, - "\xea": 556, - "\xeb": 556, - "\xec": 278, - "\xed": 278, - "\xee": 278, - "\xef": 278, - "\xf0": 611, - "\xf1": 611, - "\xf2": 611, - "\xf3": 611, - "\xf4": 611, - "\xf5": 611, - "\xf6": 611, - "\xf7": 584, - "\xf8": 611, - "\xf9": 611, - "\xfa": 611, - "\xfb": 611, - "\xfc": 611, - "\xfd": 556, - "\xfe": 611, - "\xff": 556, - "\u0100": 722, - "\u0101": 556, - "\u0102": 722, - "\u0103": 556, - "\u0104": 722, - "\u0105": 556, - "\u0106": 722, - "\u0107": 556, - "\u010c": 722, - "\u010d": 556, - "\u010e": 722, - "\u010f": 743, - "\u0110": 722, - "\u0111": 611, - "\u0112": 667, - "\u0113": 556, - "\u0116": 667, - "\u0117": 556, - "\u0118": 667, - "\u0119": 556, - "\u011a": 667, - "\u011b": 556, - "\u011e": 778, - "\u011f": 611, - "\u0122": 778, - "\u0123": 611, - "\u012a": 278, - "\u012b": 278, - "\u012e": 278, - "\u012f": 278, - "\u0130": 278, - "\u0131": 278, - "\u0136": 722, - "\u0137": 556, - "\u0139": 611, - "\u013a": 278, - "\u013b": 611, - "\u013c": 278, - "\u013d": 611, - "\u013e": 400, - "\u0141": 611, - "\u0142": 278, - "\u0143": 722, - "\u0144": 611, - "\u0145": 722, - "\u0146": 611, - "\u0147": 722, - "\u0148": 611, - "\u014c": 778, - "\u014d": 611, - "\u0150": 778, - "\u0151": 611, - "\u0152": 1000, - "\u0153": 944, - "\u0154": 722, - "\u0155": 389, - "\u0156": 722, - "\u0157": 389, - "\u0158": 722, - "\u0159": 389, - "\u015a": 667, - "\u015b": 556, - "\u015e": 667, - "\u015f": 556, - "\u0160": 667, - "\u0161": 556, - "\u0162": 611, - "\u0163": 333, - "\u0164": 611, - "\u0165": 389, - "\u016a": 722, - "\u016b": 611, - "\u016e": 722, - "\u016f": 611, - "\u0170": 722, - "\u0171": 611, - "\u0172": 722, - "\u0173": 611, - "\u0178": 667, - "\u0179": 611, - "\u017a": 500, - "\u017b": 611, - "\u017c": 500, - "\u017d": 611, - "\u017e": 500, - "\u0192": 556, - "\u0218": 667, - "\u0219": 556, - "\u02c6": 333, - "\u02c7": 333, - "\u02d8": 333, - "\u02d9": 333, - "\u02da": 333, - "\u02db": 333, - "\u02dc": 333, - "\u02dd": 333, - "\u2013": 556, - "\u2014": 1000, - "\u2018": 278, - "\u2019": 278, - "\u201a": 278, - "\u201c": 500, - "\u201d": 500, - "\u201e": 500, - "\u2020": 556, - "\u2021": 556, - "\u2022": 350, - "\u2026": 1000, - "\u2030": 1000, - "\u2039": 333, - "\u203a": 333, - "\u2044": 167, - "\u2122": 1000, - "\u2202": 494, - "\u2206": 612, - "\u2211": 600, - "\u2212": 584, - "\u221a": 549, - "\u2260": 549, - "\u2264": 549, - "\u2265": 549, - "\u25ca": 494, - "\uf6c3": 250, - "\ufb01": 611, - "\ufb02": 611, - }, - ), - "Helvetica-BoldOblique": ( - { - "FontName": "Helvetica-BoldOblique", - "Descent": -207.0, - "FontBBox": (-175.0, -228.0, 1114.0, 962.0), - "FontWeight": "Bold", - "CapHeight": 718.0, - "FontFamily": "Helvetica", - "Flags": 0, - "XHeight": 532.0, - "ItalicAngle": -12.0, - "Ascent": 718.0, - }, - { - " ": 278, - "!": 333, - '"': 474, - "#": 556, - "$": 556, - "%": 889, - "&": 722, - "'": 238, - "(": 333, - ")": 333, - "*": 389, - "+": 584, - ",": 278, - "-": 333, - ".": 278, - "/": 278, - "0": 556, - "1": 556, - "2": 556, - "3": 556, - "4": 556, - "5": 556, - "6": 556, - "7": 556, - "8": 556, - "9": 556, - ":": 333, - ";": 333, - "<": 584, - "=": 584, - ">": 584, - "?": 611, - "@": 975, - "A": 722, - "B": 722, - "C": 722, - "D": 722, - "E": 667, - "F": 611, - "G": 778, - "H": 722, - "I": 278, - "J": 556, - "K": 722, - "L": 611, - "M": 833, - "N": 722, - "O": 778, - "P": 667, - "Q": 778, - "R": 722, - "S": 667, - "T": 611, - "U": 722, - "V": 667, - "W": 944, - "X": 667, - "Y": 667, - "Z": 611, - "[": 333, - "\\": 278, - "]": 333, - "^": 584, - "_": 556, - "`": 333, - "a": 556, - "b": 611, - "c": 556, - "d": 611, - "e": 556, - "f": 333, - "g": 611, - "h": 611, - "i": 278, - "j": 278, - "k": 556, - "l": 278, - "m": 889, - "n": 611, - "o": 611, - "p": 611, - "q": 611, - "r": 389, - "s": 556, - "t": 333, - "u": 611, - "v": 556, - "w": 778, - "x": 556, - "y": 556, - "z": 500, - "{": 389, - "|": 280, - "}": 389, - "~": 584, - "\xa1": 333, - "\xa2": 556, - "\xa3": 556, - "\xa4": 556, - "\xa5": 556, - "\xa6": 280, - "\xa7": 556, - "\xa8": 333, - "\xa9": 737, - "\xaa": 370, - "\xab": 556, - "\xac": 584, - "\xae": 737, - "\xaf": 333, - "\xb0": 400, - "\xb1": 584, - "\xb2": 333, - "\xb3": 333, - "\xb4": 333, - "\xb5": 611, - "\xb6": 556, - "\xb7": 278, - "\xb8": 333, - "\xb9": 333, - "\xba": 365, - "\xbb": 556, - "\xbc": 834, - "\xbd": 834, - "\xbe": 834, - "\xbf": 611, - "\xc0": 722, - "\xc1": 722, - "\xc2": 722, - "\xc3": 722, - "\xc4": 722, - "\xc5": 722, - "\xc6": 1000, - "\xc7": 722, - "\xc8": 667, - "\xc9": 667, - "\xca": 667, - "\xcb": 667, - "\xcc": 278, - "\xcd": 278, - "\xce": 278, - "\xcf": 278, - "\xd0": 722, - "\xd1": 722, - "\xd2": 778, - "\xd3": 778, - "\xd4": 778, - "\xd5": 778, - "\xd6": 778, - "\xd7": 584, - "\xd8": 778, - "\xd9": 722, - "\xda": 722, - "\xdb": 722, - "\xdc": 722, - "\xdd": 667, - "\xde": 667, - "\xdf": 611, - "\xe0": 556, - "\xe1": 556, - "\xe2": 556, - "\xe3": 556, - "\xe4": 556, - "\xe5": 556, - "\xe6": 889, - "\xe7": 556, - "\xe8": 556, - "\xe9": 556, - "\xea": 556, - "\xeb": 556, - "\xec": 278, - "\xed": 278, - "\xee": 278, - "\xef": 278, - "\xf0": 611, - "\xf1": 611, - "\xf2": 611, - "\xf3": 611, - "\xf4": 611, - "\xf5": 611, - "\xf6": 611, - "\xf7": 584, - "\xf8": 611, - "\xf9": 611, - "\xfa": 611, - "\xfb": 611, - "\xfc": 611, - "\xfd": 556, - "\xfe": 611, - "\xff": 556, - "\u0100": 722, - "\u0101": 556, - "\u0102": 722, - "\u0103": 556, - "\u0104": 722, - "\u0105": 556, - "\u0106": 722, - "\u0107": 556, - "\u010c": 722, - "\u010d": 556, - "\u010e": 722, - "\u010f": 743, - "\u0110": 722, - "\u0111": 611, - "\u0112": 667, - "\u0113": 556, - "\u0116": 667, - "\u0117": 556, - "\u0118": 667, - "\u0119": 556, - "\u011a": 667, - "\u011b": 556, - "\u011e": 778, - "\u011f": 611, - "\u0122": 778, - "\u0123": 611, - "\u012a": 278, - "\u012b": 278, - "\u012e": 278, - "\u012f": 278, - "\u0130": 278, - "\u0131": 278, - "\u0136": 722, - "\u0137": 556, - "\u0139": 611, - "\u013a": 278, - "\u013b": 611, - "\u013c": 278, - "\u013d": 611, - "\u013e": 400, - "\u0141": 611, - "\u0142": 278, - "\u0143": 722, - "\u0144": 611, - "\u0145": 722, - "\u0146": 611, - "\u0147": 722, - "\u0148": 611, - "\u014c": 778, - "\u014d": 611, - "\u0150": 778, - "\u0151": 611, - "\u0152": 1000, - "\u0153": 944, - "\u0154": 722, - "\u0155": 389, - "\u0156": 722, - "\u0157": 389, - "\u0158": 722, - "\u0159": 389, - "\u015a": 667, - "\u015b": 556, - "\u015e": 667, - "\u015f": 556, - "\u0160": 667, - "\u0161": 556, - "\u0162": 611, - "\u0163": 333, - "\u0164": 611, - "\u0165": 389, - "\u016a": 722, - "\u016b": 611, - "\u016e": 722, - "\u016f": 611, - "\u0170": 722, - "\u0171": 611, - "\u0172": 722, - "\u0173": 611, - "\u0178": 667, - "\u0179": 611, - "\u017a": 500, - "\u017b": 611, - "\u017c": 500, - "\u017d": 611, - "\u017e": 500, - "\u0192": 556, - "\u0218": 667, - "\u0219": 556, - "\u02c6": 333, - "\u02c7": 333, - "\u02d8": 333, - "\u02d9": 333, - "\u02da": 333, - "\u02db": 333, - "\u02dc": 333, - "\u02dd": 333, - "\u2013": 556, - "\u2014": 1000, - "\u2018": 278, - "\u2019": 278, - "\u201a": 278, - "\u201c": 500, - "\u201d": 500, - "\u201e": 500, - "\u2020": 556, - "\u2021": 556, - "\u2022": 350, - "\u2026": 1000, - "\u2030": 1000, - "\u2039": 333, - "\u203a": 333, - "\u2044": 167, - "\u2122": 1000, - "\u2202": 494, - "\u2206": 612, - "\u2211": 600, - "\u2212": 584, - "\u221a": 549, - "\u2260": 549, - "\u2264": 549, - "\u2265": 549, - "\u25ca": 494, - "\uf6c3": 250, - "\ufb01": 611, - "\ufb02": 611, - }, - ), - "Helvetica-Oblique": ( - { - "FontName": "Helvetica-Oblique", - "Descent": -207.0, - "FontBBox": (-171.0, -225.0, 1116.0, 931.0), - "FontWeight": "Medium", - "CapHeight": 718.0, - "FontFamily": "Helvetica", - "Flags": 0, - "XHeight": 523.0, - "ItalicAngle": -12.0, - "Ascent": 718.0, - }, - { - " ": 278, - "!": 278, - '"': 355, - "#": 556, - "$": 556, - "%": 889, - "&": 667, - "'": 191, - "(": 333, - ")": 333, - "*": 389, - "+": 584, - ",": 278, - "-": 333, - ".": 278, - "/": 278, - "0": 556, - "1": 556, - "2": 556, - "3": 556, - "4": 556, - "5": 556, - "6": 556, - "7": 556, - "8": 556, - "9": 556, - ":": 278, - ";": 278, - "<": 584, - "=": 584, - ">": 584, - "?": 556, - "@": 1015, - "A": 667, - "B": 667, - "C": 722, - "D": 722, - "E": 667, - "F": 611, - "G": 778, - "H": 722, - "I": 278, - "J": 500, - "K": 667, - "L": 556, - "M": 833, - "N": 722, - "O": 778, - "P": 667, - "Q": 778, - "R": 722, - "S": 667, - "T": 611, - "U": 722, - "V": 667, - "W": 944, - "X": 667, - "Y": 667, - "Z": 611, - "[": 278, - "\\": 278, - "]": 278, - "^": 469, - "_": 556, - "`": 333, - "a": 556, - "b": 556, - "c": 500, - "d": 556, - "e": 556, - "f": 278, - "g": 556, - "h": 556, - "i": 222, - "j": 222, - "k": 500, - "l": 222, - "m": 833, - "n": 556, - "o": 556, - "p": 556, - "q": 556, - "r": 333, - "s": 500, - "t": 278, - "u": 556, - "v": 500, - "w": 722, - "x": 500, - "y": 500, - "z": 500, - "{": 334, - "|": 260, - "}": 334, - "~": 584, - "\xa1": 333, - "\xa2": 556, - "\xa3": 556, - "\xa4": 556, - "\xa5": 556, - "\xa6": 260, - "\xa7": 556, - "\xa8": 333, - "\xa9": 737, - "\xaa": 370, - "\xab": 556, - "\xac": 584, - "\xae": 737, - "\xaf": 333, - "\xb0": 400, - "\xb1": 584, - "\xb2": 333, - "\xb3": 333, - "\xb4": 333, - "\xb5": 556, - "\xb6": 537, - "\xb7": 278, - "\xb8": 333, - "\xb9": 333, - "\xba": 365, - "\xbb": 556, - "\xbc": 834, - "\xbd": 834, - "\xbe": 834, - "\xbf": 611, - "\xc0": 667, - "\xc1": 667, - "\xc2": 667, - "\xc3": 667, - "\xc4": 667, - "\xc5": 667, - "\xc6": 1000, - "\xc7": 722, - "\xc8": 667, - "\xc9": 667, - "\xca": 667, - "\xcb": 667, - "\xcc": 278, - "\xcd": 278, - "\xce": 278, - "\xcf": 278, - "\xd0": 722, - "\xd1": 722, - "\xd2": 778, - "\xd3": 778, - "\xd4": 778, - "\xd5": 778, - "\xd6": 778, - "\xd7": 584, - "\xd8": 778, - "\xd9": 722, - "\xda": 722, - "\xdb": 722, - "\xdc": 722, - "\xdd": 667, - "\xde": 667, - "\xdf": 611, - "\xe0": 556, - "\xe1": 556, - "\xe2": 556, - "\xe3": 556, - "\xe4": 556, - "\xe5": 556, - "\xe6": 889, - "\xe7": 500, - "\xe8": 556, - "\xe9": 556, - "\xea": 556, - "\xeb": 556, - "\xec": 278, - "\xed": 278, - "\xee": 278, - "\xef": 278, - "\xf0": 556, - "\xf1": 556, - "\xf2": 556, - "\xf3": 556, - "\xf4": 556, - "\xf5": 556, - "\xf6": 556, - "\xf7": 584, - "\xf8": 611, - "\xf9": 556, - "\xfa": 556, - "\xfb": 556, - "\xfc": 556, - "\xfd": 500, - "\xfe": 556, - "\xff": 500, - "\u0100": 667, - "\u0101": 556, - "\u0102": 667, - "\u0103": 556, - "\u0104": 667, - "\u0105": 556, - "\u0106": 722, - "\u0107": 500, - "\u010c": 722, - "\u010d": 500, - "\u010e": 722, - "\u010f": 643, - "\u0110": 722, - "\u0111": 556, - "\u0112": 667, - "\u0113": 556, - "\u0116": 667, - "\u0117": 556, - "\u0118": 667, - "\u0119": 556, - "\u011a": 667, - "\u011b": 556, - "\u011e": 778, - "\u011f": 556, - "\u0122": 778, - "\u0123": 556, - "\u012a": 278, - "\u012b": 278, - "\u012e": 278, - "\u012f": 222, - "\u0130": 278, - "\u0131": 278, - "\u0136": 667, - "\u0137": 500, - "\u0139": 556, - "\u013a": 222, - "\u013b": 556, - "\u013c": 222, - "\u013d": 556, - "\u013e": 299, - "\u0141": 556, - "\u0142": 222, - "\u0143": 722, - "\u0144": 556, - "\u0145": 722, - "\u0146": 556, - "\u0147": 722, - "\u0148": 556, - "\u014c": 778, - "\u014d": 556, - "\u0150": 778, - "\u0151": 556, - "\u0152": 1000, - "\u0153": 944, - "\u0154": 722, - "\u0155": 333, - "\u0156": 722, - "\u0157": 333, - "\u0158": 722, - "\u0159": 333, - "\u015a": 667, - "\u015b": 500, - "\u015e": 667, - "\u015f": 500, - "\u0160": 667, - "\u0161": 500, - "\u0162": 611, - "\u0163": 278, - "\u0164": 611, - "\u0165": 317, - "\u016a": 722, - "\u016b": 556, - "\u016e": 722, - "\u016f": 556, - "\u0170": 722, - "\u0171": 556, - "\u0172": 722, - "\u0173": 556, - "\u0178": 667, - "\u0179": 611, - "\u017a": 500, - "\u017b": 611, - "\u017c": 500, - "\u017d": 611, - "\u017e": 500, - "\u0192": 556, - "\u0218": 667, - "\u0219": 500, - "\u02c6": 333, - "\u02c7": 333, - "\u02d8": 333, - "\u02d9": 333, - "\u02da": 333, - "\u02db": 333, - "\u02dc": 333, - "\u02dd": 333, - "\u2013": 556, - "\u2014": 1000, - "\u2018": 222, - "\u2019": 222, - "\u201a": 222, - "\u201c": 333, - "\u201d": 333, - "\u201e": 333, - "\u2020": 556, - "\u2021": 556, - "\u2022": 350, - "\u2026": 1000, - "\u2030": 1000, - "\u2039": 333, - "\u203a": 333, - "\u2044": 167, - "\u2122": 1000, - "\u2202": 476, - "\u2206": 612, - "\u2211": 600, - "\u2212": 584, - "\u221a": 453, - "\u2260": 549, - "\u2264": 549, - "\u2265": 549, - "\u25ca": 471, - "\uf6c3": 250, - "\ufb01": 500, - "\ufb02": 500, - }, - ), - "Symbol": ( - { - "FontName": "Symbol", - "FontBBox": (-180.0, -293.0, 1090.0, 1010.0), - "FontWeight": "Medium", - "FontFamily": "Symbol", - "Flags": 0, - "ItalicAngle": 0.0, - }, - { - " ": 250, - "!": 333, - "#": 500, - "%": 833, - "&": 778, - "(": 333, - ")": 333, - "+": 549, - ",": 250, - ".": 250, - "/": 278, - "0": 500, - "1": 500, - "2": 500, - "3": 500, - "4": 500, - "5": 500, - "6": 500, - "7": 500, - "8": 500, - "9": 500, - ":": 278, - ";": 278, - "<": 549, - "=": 549, - ">": 549, - "?": 444, - "[": 333, - "]": 333, - "_": 500, - "{": 480, - "|": 200, - "}": 480, - "\xac": 713, - "\xb0": 400, - "\xb1": 549, - "\xb5": 576, - "\xd7": 549, - "\xf7": 549, - "\u0192": 500, - "\u0391": 722, - "\u0392": 667, - "\u0393": 603, - "\u0395": 611, - "\u0396": 611, - "\u0397": 722, - "\u0398": 741, - "\u0399": 333, - "\u039a": 722, - "\u039b": 686, - "\u039c": 889, - "\u039d": 722, - "\u039e": 645, - "\u039f": 722, - "\u03a0": 768, - "\u03a1": 556, - "\u03a3": 592, - "\u03a4": 611, - "\u03a5": 690, - "\u03a6": 763, - "\u03a7": 722, - "\u03a8": 795, - "\u03b1": 631, - "\u03b2": 549, - "\u03b3": 411, - "\u03b4": 494, - "\u03b5": 439, - "\u03b6": 494, - "\u03b7": 603, - "\u03b8": 521, - "\u03b9": 329, - "\u03ba": 549, - "\u03bb": 549, - "\u03bd": 521, - "\u03be": 493, - "\u03bf": 549, - "\u03c0": 549, - "\u03c1": 549, - "\u03c2": 439, - "\u03c3": 603, - "\u03c4": 439, - "\u03c5": 576, - "\u03c6": 521, - "\u03c7": 549, - "\u03c8": 686, - "\u03c9": 686, - "\u03d1": 631, - "\u03d2": 620, - "\u03d5": 603, - "\u03d6": 713, - "\u2022": 460, - "\u2026": 1000, - "\u2032": 247, - "\u2033": 411, - "\u2044": 167, - "\u20ac": 750, - "\u2111": 686, - "\u2118": 987, - "\u211c": 795, - "\u2126": 768, - "\u2135": 823, - "\u2190": 987, - "\u2191": 603, - "\u2192": 987, - "\u2193": 603, - "\u2194": 1042, - "\u21b5": 658, - "\u21d0": 987, - "\u21d1": 603, - "\u21d2": 987, - "\u21d3": 603, - "\u21d4": 1042, - "\u2200": 713, - "\u2202": 494, - "\u2203": 549, - "\u2205": 823, - "\u2206": 612, - "\u2207": 713, - "\u2208": 713, - "\u2209": 713, - "\u220b": 439, - "\u220f": 823, - "\u2211": 713, - "\u2212": 549, - "\u2217": 500, - "\u221a": 549, - "\u221d": 713, - "\u221e": 713, - "\u2220": 768, - "\u2227": 603, - "\u2228": 603, - "\u2229": 768, - "\u222a": 768, - "\u222b": 274, - "\u2234": 863, - "\u223c": 549, - "\u2245": 549, - "\u2248": 549, - "\u2260": 549, - "\u2261": 549, - "\u2264": 549, - "\u2265": 549, - "\u2282": 713, - "\u2283": 713, - "\u2284": 713, - "\u2286": 713, - "\u2287": 713, - "\u2295": 768, - "\u2297": 768, - "\u22a5": 658, - "\u22c5": 250, - "\u2320": 686, - "\u2321": 686, - "\u2329": 329, - "\u232a": 329, - "\u25ca": 494, - "\u2660": 753, - "\u2663": 753, - "\u2665": 753, - "\u2666": 753, - "\uf6d9": 790, - "\uf6da": 790, - "\uf6db": 890, - "\uf8e5": 500, - "\uf8e6": 603, - "\uf8e7": 1000, - "\uf8e8": 790, - "\uf8e9": 790, - "\uf8ea": 786, - "\uf8eb": 384, - "\uf8ec": 384, - "\uf8ed": 384, - "\uf8ee": 384, - "\uf8ef": 384, - "\uf8f0": 384, - "\uf8f1": 494, - "\uf8f2": 494, - "\uf8f3": 494, - "\uf8f4": 494, - "\uf8f5": 686, - "\uf8f6": 384, - "\uf8f7": 384, - "\uf8f8": 384, - "\uf8f9": 384, - "\uf8fa": 384, - "\uf8fb": 384, - "\uf8fc": 494, - "\uf8fd": 494, - "\uf8fe": 494, - "\uf8ff": 790, - }, - ), - "Times-Bold": ( - { - "FontName": "Times-Bold", - "Descent": -217.0, - "FontBBox": (-168.0, -218.0, 1000.0, 935.0), - "FontWeight": "Bold", - "CapHeight": 676.0, - "FontFamily": "Times", - "Flags": 0, - "XHeight": 461.0, - "ItalicAngle": 0.0, - "Ascent": 683.0, - }, - { - " ": 250, - "!": 333, - '"': 555, - "#": 500, - "$": 500, - "%": 1000, - "&": 833, - "'": 278, - "(": 333, - ")": 333, - "*": 500, - "+": 570, - ",": 250, - "-": 333, - ".": 250, - "/": 278, - "0": 500, - "1": 500, - "2": 500, - "3": 500, - "4": 500, - "5": 500, - "6": 500, - "7": 500, - "8": 500, - "9": 500, - ":": 333, - ";": 333, - "<": 570, - "=": 570, - ">": 570, - "?": 500, - "@": 930, - "A": 722, - "B": 667, - "C": 722, - "D": 722, - "E": 667, - "F": 611, - "G": 778, - "H": 778, - "I": 389, - "J": 500, - "K": 778, - "L": 667, - "M": 944, - "N": 722, - "O": 778, - "P": 611, - "Q": 778, - "R": 722, - "S": 556, - "T": 667, - "U": 722, - "V": 722, - "W": 1000, - "X": 722, - "Y": 722, - "Z": 667, - "[": 333, - "\\": 278, - "]": 333, - "^": 581, - "_": 500, - "`": 333, - "a": 500, - "b": 556, - "c": 444, - "d": 556, - "e": 444, - "f": 333, - "g": 500, - "h": 556, - "i": 278, - "j": 333, - "k": 556, - "l": 278, - "m": 833, - "n": 556, - "o": 500, - "p": 556, - "q": 556, - "r": 444, - "s": 389, - "t": 333, - "u": 556, - "v": 500, - "w": 722, - "x": 500, - "y": 500, - "z": 444, - "{": 394, - "|": 220, - "}": 394, - "~": 520, - "\xa1": 333, - "\xa2": 500, - "\xa3": 500, - "\xa4": 500, - "\xa5": 500, - "\xa6": 220, - "\xa7": 500, - "\xa8": 333, - "\xa9": 747, - "\xaa": 300, - "\xab": 500, - "\xac": 570, - "\xae": 747, - "\xaf": 333, - "\xb0": 400, - "\xb1": 570, - "\xb2": 300, - "\xb3": 300, - "\xb4": 333, - "\xb5": 556, - "\xb6": 540, - "\xb7": 250, - "\xb8": 333, - "\xb9": 300, - "\xba": 330, - "\xbb": 500, - "\xbc": 750, - "\xbd": 750, - "\xbe": 750, - "\xbf": 500, - "\xc0": 722, - "\xc1": 722, - "\xc2": 722, - "\xc3": 722, - "\xc4": 722, - "\xc5": 722, - "\xc6": 1000, - "\xc7": 722, - "\xc8": 667, - "\xc9": 667, - "\xca": 667, - "\xcb": 667, - "\xcc": 389, - "\xcd": 389, - "\xce": 389, - "\xcf": 389, - "\xd0": 722, - "\xd1": 722, - "\xd2": 778, - "\xd3": 778, - "\xd4": 778, - "\xd5": 778, - "\xd6": 778, - "\xd7": 570, - "\xd8": 778, - "\xd9": 722, - "\xda": 722, - "\xdb": 722, - "\xdc": 722, - "\xdd": 722, - "\xde": 611, - "\xdf": 556, - "\xe0": 500, - "\xe1": 500, - "\xe2": 500, - "\xe3": 500, - "\xe4": 500, - "\xe5": 500, - "\xe6": 722, - "\xe7": 444, - "\xe8": 444, - "\xe9": 444, - "\xea": 444, - "\xeb": 444, - "\xec": 278, - "\xed": 278, - "\xee": 278, - "\xef": 278, - "\xf0": 500, - "\xf1": 556, - "\xf2": 500, - "\xf3": 500, - "\xf4": 500, - "\xf5": 500, - "\xf6": 500, - "\xf7": 570, - "\xf8": 500, - "\xf9": 556, - "\xfa": 556, - "\xfb": 556, - "\xfc": 556, - "\xfd": 500, - "\xfe": 556, - "\xff": 500, - "\u0100": 722, - "\u0101": 500, - "\u0102": 722, - "\u0103": 500, - "\u0104": 722, - "\u0105": 500, - "\u0106": 722, - "\u0107": 444, - "\u010c": 722, - "\u010d": 444, - "\u010e": 722, - "\u010f": 672, - "\u0110": 722, - "\u0111": 556, - "\u0112": 667, - "\u0113": 444, - "\u0116": 667, - "\u0117": 444, - "\u0118": 667, - "\u0119": 444, - "\u011a": 667, - "\u011b": 444, - "\u011e": 778, - "\u011f": 500, - "\u0122": 778, - "\u0123": 500, - "\u012a": 389, - "\u012b": 278, - "\u012e": 389, - "\u012f": 278, - "\u0130": 389, - "\u0131": 278, - "\u0136": 778, - "\u0137": 556, - "\u0139": 667, - "\u013a": 278, - "\u013b": 667, - "\u013c": 278, - "\u013d": 667, - "\u013e": 394, - "\u0141": 667, - "\u0142": 278, - "\u0143": 722, - "\u0144": 556, - "\u0145": 722, - "\u0146": 556, - "\u0147": 722, - "\u0148": 556, - "\u014c": 778, - "\u014d": 500, - "\u0150": 778, - "\u0151": 500, - "\u0152": 1000, - "\u0153": 722, - "\u0154": 722, - "\u0155": 444, - "\u0156": 722, - "\u0157": 444, - "\u0158": 722, - "\u0159": 444, - "\u015a": 556, - "\u015b": 389, - "\u015e": 556, - "\u015f": 389, - "\u0160": 556, - "\u0161": 389, - "\u0162": 667, - "\u0163": 333, - "\u0164": 667, - "\u0165": 416, - "\u016a": 722, - "\u016b": 556, - "\u016e": 722, - "\u016f": 556, - "\u0170": 722, - "\u0171": 556, - "\u0172": 722, - "\u0173": 556, - "\u0178": 722, - "\u0179": 667, - "\u017a": 444, - "\u017b": 667, - "\u017c": 444, - "\u017d": 667, - "\u017e": 444, - "\u0192": 500, - "\u0218": 556, - "\u0219": 389, - "\u02c6": 333, - "\u02c7": 333, - "\u02d8": 333, - "\u02d9": 333, - "\u02da": 333, - "\u02db": 333, - "\u02dc": 333, - "\u02dd": 333, - "\u2013": 500, - "\u2014": 1000, - "\u2018": 333, - "\u2019": 333, - "\u201a": 333, - "\u201c": 500, - "\u201d": 500, - "\u201e": 500, - "\u2020": 500, - "\u2021": 500, - "\u2022": 350, - "\u2026": 1000, - "\u2030": 1000, - "\u2039": 333, - "\u203a": 333, - "\u2044": 167, - "\u2122": 1000, - "\u2202": 494, - "\u2206": 612, - "\u2211": 600, - "\u2212": 570, - "\u221a": 549, - "\u2260": 549, - "\u2264": 549, - "\u2265": 549, - "\u25ca": 494, - "\uf6c3": 250, - "\ufb01": 556, - "\ufb02": 556, - }, - ), - "Times-BoldItalic": ( - { - "FontName": "Times-BoldItalic", - "Descent": -217.0, - "FontBBox": (-200.0, -218.0, 996.0, 921.0), - "FontWeight": "Bold", - "CapHeight": 669.0, - "FontFamily": "Times", - "Flags": 0, - "XHeight": 462.0, - "ItalicAngle": -15.0, - "Ascent": 683.0, - }, - { - " ": 250, - "!": 389, - '"': 555, - "#": 500, - "$": 500, - "%": 833, - "&": 778, - "'": 278, - "(": 333, - ")": 333, - "*": 500, - "+": 570, - ",": 250, - "-": 333, - ".": 250, - "/": 278, - "0": 500, - "1": 500, - "2": 500, - "3": 500, - "4": 500, - "5": 500, - "6": 500, - "7": 500, - "8": 500, - "9": 500, - ":": 333, - ";": 333, - "<": 570, - "=": 570, - ">": 570, - "?": 500, - "@": 832, - "A": 667, - "B": 667, - "C": 667, - "D": 722, - "E": 667, - "F": 667, - "G": 722, - "H": 778, - "I": 389, - "J": 500, - "K": 667, - "L": 611, - "M": 889, - "N": 722, - "O": 722, - "P": 611, - "Q": 722, - "R": 667, - "S": 556, - "T": 611, - "U": 722, - "V": 667, - "W": 889, - "X": 667, - "Y": 611, - "Z": 611, - "[": 333, - "\\": 278, - "]": 333, - "^": 570, - "_": 500, - "`": 333, - "a": 500, - "b": 500, - "c": 444, - "d": 500, - "e": 444, - "f": 333, - "g": 500, - "h": 556, - "i": 278, - "j": 278, - "k": 500, - "l": 278, - "m": 778, - "n": 556, - "o": 500, - "p": 500, - "q": 500, - "r": 389, - "s": 389, - "t": 278, - "u": 556, - "v": 444, - "w": 667, - "x": 500, - "y": 444, - "z": 389, - "{": 348, - "|": 220, - "}": 348, - "~": 570, - "\xa1": 389, - "\xa2": 500, - "\xa3": 500, - "\xa4": 500, - "\xa5": 500, - "\xa6": 220, - "\xa7": 500, - "\xa8": 333, - "\xa9": 747, - "\xaa": 266, - "\xab": 500, - "\xac": 606, - "\xae": 747, - "\xaf": 333, - "\xb0": 400, - "\xb1": 570, - "\xb2": 300, - "\xb3": 300, - "\xb4": 333, - "\xb5": 576, - "\xb6": 500, - "\xb7": 250, - "\xb8": 333, - "\xb9": 300, - "\xba": 300, - "\xbb": 500, - "\xbc": 750, - "\xbd": 750, - "\xbe": 750, - "\xbf": 500, - "\xc0": 667, - "\xc1": 667, - "\xc2": 667, - "\xc3": 667, - "\xc4": 667, - "\xc5": 667, - "\xc6": 944, - "\xc7": 667, - "\xc8": 667, - "\xc9": 667, - "\xca": 667, - "\xcb": 667, - "\xcc": 389, - "\xcd": 389, - "\xce": 389, - "\xcf": 389, - "\xd0": 722, - "\xd1": 722, - "\xd2": 722, - "\xd3": 722, - "\xd4": 722, - "\xd5": 722, - "\xd6": 722, - "\xd7": 570, - "\xd8": 722, - "\xd9": 722, - "\xda": 722, - "\xdb": 722, - "\xdc": 722, - "\xdd": 611, - "\xde": 611, - "\xdf": 500, - "\xe0": 500, - "\xe1": 500, - "\xe2": 500, - "\xe3": 500, - "\xe4": 500, - "\xe5": 500, - "\xe6": 722, - "\xe7": 444, - "\xe8": 444, - "\xe9": 444, - "\xea": 444, - "\xeb": 444, - "\xec": 278, - "\xed": 278, - "\xee": 278, - "\xef": 278, - "\xf0": 500, - "\xf1": 556, - "\xf2": 500, - "\xf3": 500, - "\xf4": 500, - "\xf5": 500, - "\xf6": 500, - "\xf7": 570, - "\xf8": 500, - "\xf9": 556, - "\xfa": 556, - "\xfb": 556, - "\xfc": 556, - "\xfd": 444, - "\xfe": 500, - "\xff": 444, - "\u0100": 667, - "\u0101": 500, - "\u0102": 667, - "\u0103": 500, - "\u0104": 667, - "\u0105": 500, - "\u0106": 667, - "\u0107": 444, - "\u010c": 667, - "\u010d": 444, - "\u010e": 722, - "\u010f": 608, - "\u0110": 722, - "\u0111": 500, - "\u0112": 667, - "\u0113": 444, - "\u0116": 667, - "\u0117": 444, - "\u0118": 667, - "\u0119": 444, - "\u011a": 667, - "\u011b": 444, - "\u011e": 722, - "\u011f": 500, - "\u0122": 722, - "\u0123": 500, - "\u012a": 389, - "\u012b": 278, - "\u012e": 389, - "\u012f": 278, - "\u0130": 389, - "\u0131": 278, - "\u0136": 667, - "\u0137": 500, - "\u0139": 611, - "\u013a": 278, - "\u013b": 611, - "\u013c": 278, - "\u013d": 611, - "\u013e": 382, - "\u0141": 611, - "\u0142": 278, - "\u0143": 722, - "\u0144": 556, - "\u0145": 722, - "\u0146": 556, - "\u0147": 722, - "\u0148": 556, - "\u014c": 722, - "\u014d": 500, - "\u0150": 722, - "\u0151": 500, - "\u0152": 944, - "\u0153": 722, - "\u0154": 667, - "\u0155": 389, - "\u0156": 667, - "\u0157": 389, - "\u0158": 667, - "\u0159": 389, - "\u015a": 556, - "\u015b": 389, - "\u015e": 556, - "\u015f": 389, - "\u0160": 556, - "\u0161": 389, - "\u0162": 611, - "\u0163": 278, - "\u0164": 611, - "\u0165": 366, - "\u016a": 722, - "\u016b": 556, - "\u016e": 722, - "\u016f": 556, - "\u0170": 722, - "\u0171": 556, - "\u0172": 722, - "\u0173": 556, - "\u0178": 611, - "\u0179": 611, - "\u017a": 389, - "\u017b": 611, - "\u017c": 389, - "\u017d": 611, - "\u017e": 389, - "\u0192": 500, - "\u0218": 556, - "\u0219": 389, - "\u02c6": 333, - "\u02c7": 333, - "\u02d8": 333, - "\u02d9": 333, - "\u02da": 333, - "\u02db": 333, - "\u02dc": 333, - "\u02dd": 333, - "\u2013": 500, - "\u2014": 1000, - "\u2018": 333, - "\u2019": 333, - "\u201a": 333, - "\u201c": 500, - "\u201d": 500, - "\u201e": 500, - "\u2020": 500, - "\u2021": 500, - "\u2022": 350, - "\u2026": 1000, - "\u2030": 1000, - "\u2039": 333, - "\u203a": 333, - "\u2044": 167, - "\u2122": 1000, - "\u2202": 494, - "\u2206": 612, - "\u2211": 600, - "\u2212": 606, - "\u221a": 549, - "\u2260": 549, - "\u2264": 549, - "\u2265": 549, - "\u25ca": 494, - "\uf6c3": 250, - "\ufb01": 556, - "\ufb02": 556, - }, - ), - "Times-Italic": ( - { - "FontName": "Times-Italic", - "Descent": -217.0, - "FontBBox": (-169.0, -217.0, 1010.0, 883.0), - "FontWeight": "Medium", - "CapHeight": 653.0, - "FontFamily": "Times", - "Flags": 0, - "XHeight": 441.0, - "ItalicAngle": -15.5, - "Ascent": 683.0, - }, - { - " ": 250, - "!": 333, - '"': 420, - "#": 500, - "$": 500, - "%": 833, - "&": 778, - "'": 214, - "(": 333, - ")": 333, - "*": 500, - "+": 675, - ",": 250, - "-": 333, - ".": 250, - "/": 278, - "0": 500, - "1": 500, - "2": 500, - "3": 500, - "4": 500, - "5": 500, - "6": 500, - "7": 500, - "8": 500, - "9": 500, - ":": 333, - ";": 333, - "<": 675, - "=": 675, - ">": 675, - "?": 500, - "@": 920, - "A": 611, - "B": 611, - "C": 667, - "D": 722, - "E": 611, - "F": 611, - "G": 722, - "H": 722, - "I": 333, - "J": 444, - "K": 667, - "L": 556, - "M": 833, - "N": 667, - "O": 722, - "P": 611, - "Q": 722, - "R": 611, - "S": 500, - "T": 556, - "U": 722, - "V": 611, - "W": 833, - "X": 611, - "Y": 556, - "Z": 556, - "[": 389, - "\\": 278, - "]": 389, - "^": 422, - "_": 500, - "`": 333, - "a": 500, - "b": 500, - "c": 444, - "d": 500, - "e": 444, - "f": 278, - "g": 500, - "h": 500, - "i": 278, - "j": 278, - "k": 444, - "l": 278, - "m": 722, - "n": 500, - "o": 500, - "p": 500, - "q": 500, - "r": 389, - "s": 389, - "t": 278, - "u": 500, - "v": 444, - "w": 667, - "x": 444, - "y": 444, - "z": 389, - "{": 400, - "|": 275, - "}": 400, - "~": 541, - "\xa1": 389, - "\xa2": 500, - "\xa3": 500, - "\xa4": 500, - "\xa5": 500, - "\xa6": 275, - "\xa7": 500, - "\xa8": 333, - "\xa9": 760, - "\xaa": 276, - "\xab": 500, - "\xac": 675, - "\xae": 760, - "\xaf": 333, - "\xb0": 400, - "\xb1": 675, - "\xb2": 300, - "\xb3": 300, - "\xb4": 333, - "\xb5": 500, - "\xb6": 523, - "\xb7": 250, - "\xb8": 333, - "\xb9": 300, - "\xba": 310, - "\xbb": 500, - "\xbc": 750, - "\xbd": 750, - "\xbe": 750, - "\xbf": 500, - "\xc0": 611, - "\xc1": 611, - "\xc2": 611, - "\xc3": 611, - "\xc4": 611, - "\xc5": 611, - "\xc6": 889, - "\xc7": 667, - "\xc8": 611, - "\xc9": 611, - "\xca": 611, - "\xcb": 611, - "\xcc": 333, - "\xcd": 333, - "\xce": 333, - "\xcf": 333, - "\xd0": 722, - "\xd1": 667, - "\xd2": 722, - "\xd3": 722, - "\xd4": 722, - "\xd5": 722, - "\xd6": 722, - "\xd7": 675, - "\xd8": 722, - "\xd9": 722, - "\xda": 722, - "\xdb": 722, - "\xdc": 722, - "\xdd": 556, - "\xde": 611, - "\xdf": 500, - "\xe0": 500, - "\xe1": 500, - "\xe2": 500, - "\xe3": 500, - "\xe4": 500, - "\xe5": 500, - "\xe6": 667, - "\xe7": 444, - "\xe8": 444, - "\xe9": 444, - "\xea": 444, - "\xeb": 444, - "\xec": 278, - "\xed": 278, - "\xee": 278, - "\xef": 278, - "\xf0": 500, - "\xf1": 500, - "\xf2": 500, - "\xf3": 500, - "\xf4": 500, - "\xf5": 500, - "\xf6": 500, - "\xf7": 675, - "\xf8": 500, - "\xf9": 500, - "\xfa": 500, - "\xfb": 500, - "\xfc": 500, - "\xfd": 444, - "\xfe": 500, - "\xff": 444, - "\u0100": 611, - "\u0101": 500, - "\u0102": 611, - "\u0103": 500, - "\u0104": 611, - "\u0105": 500, - "\u0106": 667, - "\u0107": 444, - "\u010c": 667, - "\u010d": 444, - "\u010e": 722, - "\u010f": 544, - "\u0110": 722, - "\u0111": 500, - "\u0112": 611, - "\u0113": 444, - "\u0116": 611, - "\u0117": 444, - "\u0118": 611, - "\u0119": 444, - "\u011a": 611, - "\u011b": 444, - "\u011e": 722, - "\u011f": 500, - "\u0122": 722, - "\u0123": 500, - "\u012a": 333, - "\u012b": 278, - "\u012e": 333, - "\u012f": 278, - "\u0130": 333, - "\u0131": 278, - "\u0136": 667, - "\u0137": 444, - "\u0139": 556, - "\u013a": 278, - "\u013b": 556, - "\u013c": 278, - "\u013d": 611, - "\u013e": 300, - "\u0141": 556, - "\u0142": 278, - "\u0143": 667, - "\u0144": 500, - "\u0145": 667, - "\u0146": 500, - "\u0147": 667, - "\u0148": 500, - "\u014c": 722, - "\u014d": 500, - "\u0150": 722, - "\u0151": 500, - "\u0152": 944, - "\u0153": 667, - "\u0154": 611, - "\u0155": 389, - "\u0156": 611, - "\u0157": 389, - "\u0158": 611, - "\u0159": 389, - "\u015a": 500, - "\u015b": 389, - "\u015e": 500, - "\u015f": 389, - "\u0160": 500, - "\u0161": 389, - "\u0162": 556, - "\u0163": 278, - "\u0164": 556, - "\u0165": 300, - "\u016a": 722, - "\u016b": 500, - "\u016e": 722, - "\u016f": 500, - "\u0170": 722, - "\u0171": 500, - "\u0172": 722, - "\u0173": 500, - "\u0178": 556, - "\u0179": 556, - "\u017a": 389, - "\u017b": 556, - "\u017c": 389, - "\u017d": 556, - "\u017e": 389, - "\u0192": 500, - "\u0218": 500, - "\u0219": 389, - "\u02c6": 333, - "\u02c7": 333, - "\u02d8": 333, - "\u02d9": 333, - "\u02da": 333, - "\u02db": 333, - "\u02dc": 333, - "\u02dd": 333, - "\u2013": 500, - "\u2014": 889, - "\u2018": 333, - "\u2019": 333, - "\u201a": 333, - "\u201c": 556, - "\u201d": 556, - "\u201e": 556, - "\u2020": 500, - "\u2021": 500, - "\u2022": 350, - "\u2026": 889, - "\u2030": 1000, - "\u2039": 333, - "\u203a": 333, - "\u2044": 167, - "\u2122": 980, - "\u2202": 476, - "\u2206": 612, - "\u2211": 600, - "\u2212": 675, - "\u221a": 453, - "\u2260": 549, - "\u2264": 549, - "\u2265": 549, - "\u25ca": 471, - "\uf6c3": 250, - "\ufb01": 500, - "\ufb02": 500, - }, - ), - "Times-Roman": ( - { - "FontName": "Times-Roman", - "Descent": -217.0, - "FontBBox": (-168.0, -218.0, 1000.0, 898.0), - "FontWeight": "Roman", - "CapHeight": 662.0, - "FontFamily": "Times", - "Flags": 0, - "XHeight": 450.0, - "ItalicAngle": 0.0, - "Ascent": 683.0, - }, - { - " ": 250, - "!": 333, - '"': 408, - "#": 500, - "$": 500, - "%": 833, - "&": 778, - "'": 180, - "(": 333, - ")": 333, - "*": 500, - "+": 564, - ",": 250, - "-": 333, - ".": 250, - "/": 278, - "0": 500, - "1": 500, - "2": 500, - "3": 500, - "4": 500, - "5": 500, - "6": 500, - "7": 500, - "8": 500, - "9": 500, - ":": 278, - ";": 278, - "<": 564, - "=": 564, - ">": 564, - "?": 444, - "@": 921, - "A": 722, - "B": 667, - "C": 667, - "D": 722, - "E": 611, - "F": 556, - "G": 722, - "H": 722, - "I": 333, - "J": 389, - "K": 722, - "L": 611, - "M": 889, - "N": 722, - "O": 722, - "P": 556, - "Q": 722, - "R": 667, - "S": 556, - "T": 611, - "U": 722, - "V": 722, - "W": 944, - "X": 722, - "Y": 722, - "Z": 611, - "[": 333, - "\\": 278, - "]": 333, - "^": 469, - "_": 500, - "`": 333, - "a": 444, - "b": 500, - "c": 444, - "d": 500, - "e": 444, - "f": 333, - "g": 500, - "h": 500, - "i": 278, - "j": 278, - "k": 500, - "l": 278, - "m": 778, - "n": 500, - "o": 500, - "p": 500, - "q": 500, - "r": 333, - "s": 389, - "t": 278, - "u": 500, - "v": 500, - "w": 722, - "x": 500, - "y": 500, - "z": 444, - "{": 480, - "|": 200, - "}": 480, - "~": 541, - "\xa1": 333, - "\xa2": 500, - "\xa3": 500, - "\xa4": 500, - "\xa5": 500, - "\xa6": 200, - "\xa7": 500, - "\xa8": 333, - "\xa9": 760, - "\xaa": 276, - "\xab": 500, - "\xac": 564, - "\xae": 760, - "\xaf": 333, - "\xb0": 400, - "\xb1": 564, - "\xb2": 300, - "\xb3": 300, - "\xb4": 333, - "\xb5": 500, - "\xb6": 453, - "\xb7": 250, - "\xb8": 333, - "\xb9": 300, - "\xba": 310, - "\xbb": 500, - "\xbc": 750, - "\xbd": 750, - "\xbe": 750, - "\xbf": 444, - "\xc0": 722, - "\xc1": 722, - "\xc2": 722, - "\xc3": 722, - "\xc4": 722, - "\xc5": 722, - "\xc6": 889, - "\xc7": 667, - "\xc8": 611, - "\xc9": 611, - "\xca": 611, - "\xcb": 611, - "\xcc": 333, - "\xcd": 333, - "\xce": 333, - "\xcf": 333, - "\xd0": 722, - "\xd1": 722, - "\xd2": 722, - "\xd3": 722, - "\xd4": 722, - "\xd5": 722, - "\xd6": 722, - "\xd7": 564, - "\xd8": 722, - "\xd9": 722, - "\xda": 722, - "\xdb": 722, - "\xdc": 722, - "\xdd": 722, - "\xde": 556, - "\xdf": 500, - "\xe0": 444, - "\xe1": 444, - "\xe2": 444, - "\xe3": 444, - "\xe4": 444, - "\xe5": 444, - "\xe6": 667, - "\xe7": 444, - "\xe8": 444, - "\xe9": 444, - "\xea": 444, - "\xeb": 444, - "\xec": 278, - "\xed": 278, - "\xee": 278, - "\xef": 278, - "\xf0": 500, - "\xf1": 500, - "\xf2": 500, - "\xf3": 500, - "\xf4": 500, - "\xf5": 500, - "\xf6": 500, - "\xf7": 564, - "\xf8": 500, - "\xf9": 500, - "\xfa": 500, - "\xfb": 500, - "\xfc": 500, - "\xfd": 500, - "\xfe": 500, - "\xff": 500, - "\u0100": 722, - "\u0101": 444, - "\u0102": 722, - "\u0103": 444, - "\u0104": 722, - "\u0105": 444, - "\u0106": 667, - "\u0107": 444, - "\u010c": 667, - "\u010d": 444, - "\u010e": 722, - "\u010f": 588, - "\u0110": 722, - "\u0111": 500, - "\u0112": 611, - "\u0113": 444, - "\u0116": 611, - "\u0117": 444, - "\u0118": 611, - "\u0119": 444, - "\u011a": 611, - "\u011b": 444, - "\u011e": 722, - "\u011f": 500, - "\u0122": 722, - "\u0123": 500, - "\u012a": 333, - "\u012b": 278, - "\u012e": 333, - "\u012f": 278, - "\u0130": 333, - "\u0131": 278, - "\u0136": 722, - "\u0137": 500, - "\u0139": 611, - "\u013a": 278, - "\u013b": 611, - "\u013c": 278, - "\u013d": 611, - "\u013e": 344, - "\u0141": 611, - "\u0142": 278, - "\u0143": 722, - "\u0144": 500, - "\u0145": 722, - "\u0146": 500, - "\u0147": 722, - "\u0148": 500, - "\u014c": 722, - "\u014d": 500, - "\u0150": 722, - "\u0151": 500, - "\u0152": 889, - "\u0153": 722, - "\u0154": 667, - "\u0155": 333, - "\u0156": 667, - "\u0157": 333, - "\u0158": 667, - "\u0159": 333, - "\u015a": 556, - "\u015b": 389, - "\u015e": 556, - "\u015f": 389, - "\u0160": 556, - "\u0161": 389, - "\u0162": 611, - "\u0163": 278, - "\u0164": 611, - "\u0165": 326, - "\u016a": 722, - "\u016b": 500, - "\u016e": 722, - "\u016f": 500, - "\u0170": 722, - "\u0171": 500, - "\u0172": 722, - "\u0173": 500, - "\u0178": 722, - "\u0179": 611, - "\u017a": 444, - "\u017b": 611, - "\u017c": 444, - "\u017d": 611, - "\u017e": 444, - "\u0192": 500, - "\u0218": 556, - "\u0219": 389, - "\u02c6": 333, - "\u02c7": 333, - "\u02d8": 333, - "\u02d9": 333, - "\u02da": 333, - "\u02db": 333, - "\u02dc": 333, - "\u02dd": 333, - "\u2013": 500, - "\u2014": 1000, - "\u2018": 333, - "\u2019": 333, - "\u201a": 333, - "\u201c": 444, - "\u201d": 444, - "\u201e": 444, - "\u2020": 500, - "\u2021": 500, - "\u2022": 350, - "\u2026": 1000, - "\u2030": 1000, - "\u2039": 333, - "\u203a": 333, - "\u2044": 167, - "\u2122": 980, - "\u2202": 476, - "\u2206": 612, - "\u2211": 600, - "\u2212": 564, - "\u221a": 453, - "\u2260": 549, - "\u2264": 549, - "\u2265": 549, - "\u25ca": 471, - "\uf6c3": 250, - "\ufb01": 556, - "\ufb02": 556, - }, - ), - "ZapfDingbats": ( - { - "FontName": "ZapfDingbats", - "FontBBox": (-1.0, -143.0, 981.0, 820.0), - "FontWeight": "Medium", - "FontFamily": "ITC", - "Flags": 0, - "ItalicAngle": 0.0, - }, - { - "\x01": 974, - "\x02": 961, - "\x03": 980, - "\x04": 719, - "\x05": 789, - "\x06": 494, - "\x07": 552, - "\x08": 537, - "\t": 577, - "\n": 692, - "\x0b": 960, - "\x0c": 939, - "\r": 549, - "\x0e": 855, - "\x0f": 911, - "\x10": 933, - "\x11": 945, - "\x12": 974, - "\x13": 755, - "\x14": 846, - "\x15": 762, - "\x16": 761, - "\x17": 571, - "\x18": 677, - "\x19": 763, - "\x1a": 760, - "\x1b": 759, - "\x1c": 754, - "\x1d": 786, - "\x1e": 788, - "\x1f": 788, - " ": 790, - "!": 793, - '"': 794, - "#": 816, - "$": 823, - "%": 789, - "&": 841, - "'": 823, - "(": 833, - ")": 816, - "*": 831, - "+": 923, - ",": 744, - "-": 723, - ".": 749, - "/": 790, - "0": 792, - "1": 695, - "2": 776, - "3": 768, - "4": 792, - "5": 759, - "6": 707, - "7": 708, - "8": 682, - "9": 701, - ":": 826, - ";": 815, - "<": 789, - "=": 789, - ">": 707, - "?": 687, - "@": 696, - "A": 689, - "B": 786, - "C": 787, - "D": 713, - "E": 791, - "F": 785, - "G": 791, - "H": 873, - "I": 761, - "J": 762, - "K": 759, - "L": 892, - "M": 892, - "N": 788, - "O": 784, - "Q": 438, - "R": 138, - "S": 277, - "T": 415, - "U": 509, - "V": 410, - "W": 234, - "X": 234, - "Y": 390, - "Z": 390, - "[": 276, - "\\": 276, - "]": 317, - "^": 317, - "_": 334, - "`": 334, - "a": 392, - "b": 392, - "c": 668, - "d": 668, - "e": 732, - "f": 544, - "g": 544, - "h": 910, - "i": 911, - "j": 667, - "k": 760, - "l": 760, - "m": 626, - "n": 694, - "o": 595, - "p": 776, - "u": 690, - "v": 791, - "w": 790, - "x": 788, - "y": 788, - "z": 788, - "{": 788, - "|": 788, - "}": 788, - "~": 788, - "\x7f": 788, - "\x80": 788, - "\x81": 788, - "\x82": 788, - "\x83": 788, - "\x84": 788, - "\x85": 788, - "\x86": 788, - "\x87": 788, - "\x88": 788, - "\x89": 788, - "\x8a": 788, - "\x8b": 788, - "\x8c": 788, - "\x8d": 788, - "\x8e": 788, - "\x8f": 788, - "\x90": 788, - "\x91": 788, - "\x92": 788, - "\x93": 788, - "\x94": 788, - "\x95": 788, - "\x96": 788, - "\x97": 788, - "\x98": 788, - "\x99": 788, - "\x9a": 788, - "\x9b": 788, - "\x9c": 788, - "\x9d": 788, - "\x9e": 788, - "\x9f": 788, - "\xa0": 894, - "\xa1": 838, - "\xa2": 924, - "\xa3": 1016, - "\xa4": 458, - "\xa5": 924, - "\xa6": 918, - "\xa7": 927, - "\xa8": 928, - "\xa9": 928, - "\xaa": 834, - "\xab": 873, - "\xac": 828, - "\xad": 924, - "\xae": 917, - "\xaf": 930, - "\xb0": 931, - "\xb1": 463, - "\xb2": 883, - "\xb3": 836, - "\xb4": 867, - "\xb5": 696, - "\xb6": 874, - "\xb7": 760, - "\xb8": 946, - "\xb9": 865, - "\xba": 967, - "\xbb": 831, - "\xbc": 873, - "\xbd": 927, - "\xbe": 970, - "\xbf": 918, - "\xc0": 748, - "\xc1": 836, - "\xc2": 771, - "\xc3": 888, - "\xc4": 748, - "\xc5": 771, - "\xc6": 888, - "\xc7": 867, - "\xc8": 696, - "\xc9": 874, - "\xca": 974, - "\xcb": 762, - "\xcc": 759, - "\xcd": 509, - "\xce": 410, - }, - ), -} - -# Aliases defined in implementation note 62 in Appecix H. related to section 5.5.1 -# (Type 1 Fonts) in the PDF Reference. -FONT_METRICS["Arial"] = FONT_METRICS["Helvetica"] -FONT_METRICS["Arial,Italic"] = FONT_METRICS["Helvetica-Oblique"] -FONT_METRICS["Arial,Bold"] = FONT_METRICS["Helvetica-Bold"] -FONT_METRICS["Arial,BoldItalic"] = FONT_METRICS["Helvetica-BoldOblique"] -FONT_METRICS["CourierNew"] = FONT_METRICS["Courier"] -FONT_METRICS["CourierNew,Italic"] = FONT_METRICS["Courier-Oblique"] -FONT_METRICS["CourierNew,Bold"] = FONT_METRICS["Courier-Bold"] -FONT_METRICS["CourierNew,BoldItalic"] = FONT_METRICS["Courier-BoldOblique"] -FONT_METRICS["TimesNewRoman"] = FONT_METRICS["Times-Roman"] -FONT_METRICS["TimesNewRoman,Italic"] = FONT_METRICS["Times-Italic"] -FONT_METRICS["TimesNewRoman,Bold"] = FONT_METRICS["Times-Bold"] -FONT_METRICS["TimesNewRoman,BoldItalic"] = FONT_METRICS["Times-BoldItalic"] diff --git a/pdf2zh/glyphlist.py b/pdf2zh/glyphlist.py deleted file mode 100644 index 2ee11a5..0000000 --- a/pdf2zh/glyphlist.py +++ /dev/null @@ -1,4366 +0,0 @@ -"""Mappings from Adobe glyph names to Unicode characters. - -In some CMap tables, Adobe glyph names are used for specifying -Unicode characters instead of using decimal/hex character code. - -The following data was taken by - - $ wget https://partners.adobe.com/public/developer/en/opentype/glyphlist.txt - -```python -from pdf2zh.glyphlist import convert_glyphlist - -convert_glyphlist("glyphlist.txt") -""" - -# ################################################################################### -# Copyright (c) 1997,1998,2002,2007 Adobe Systems Incorporated -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this documentation file to use, copy, publish, distribute, -# sublicense, and/or sell copies of the documentation, and to permit -# others to do the same, provided that: -# - No modification, editing or other alteration of this document is -# allowed; and -# - The above copyright notice and this permission notice shall be -# included in all copies of the documentation. -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this documentation file, to create their own derivative works -# from the content of this document to use, copy, publish, distribute, -# sublicense, and/or sell the derivative works, and to permit others to do -# the same, provided that the derived work is not represented as being a -# copy or version of this document. -# -# Adobe shall not be liable to any party for any loss of revenue or profit -# or for indirect, incidental, special, consequential, or other similar -# damages, whether based on tort (including without limitation negligence -# or strict liability), contract or other legal or equitable grounds even -# if Adobe has been advised or had reason to know of the possibility of -# such damages. The Adobe materials are provided on an "AS IS" basis. -# Adobe specifically disclaims all express, statutory, or implied -# warranties relating to the Adobe materials, including but not limited to -# those concerning merchantability or fitness for a particular purpose or -# non-infringement of any third party rights regarding the Adobe -# materials. -# ################################################################################### -# Name: Adobe Glyph List -# Table version: 2.0 -# Date: September 20, 2002 -# -# See http://partners.adobe.com/asn/developer/typeforum/unicodegn.html -# -# Format: Semicolon-delimited fields: -# (1) glyph name -# (2) Unicode scalar value - - -def convert_glyphlist(path: str) -> None: - """Convert a glyph list into a python representation. - - See output below. - """ - state = 0 - with open(path) as fileinput: - for line in fileinput.readlines(): - line = line.strip() - if not line or line.startswith("#"): - if state == 1: - state = 2 - print("}\n") - print(line) - continue - if state == 0: - print("\nglyphname2unicode = {") - state = 1 - (name, x) = line.split(";") - codes = x.split(" ") - print( - " {!r}: u'{}',".format(name, "".join("\\u%s" % code for code in codes)), - ) - - -glyphname2unicode = { - "A": "\u0041", - "AE": "\u00c6", - "AEacute": "\u01fc", - "AEmacron": "\u01e2", - "AEsmall": "\uf7e6", - "Aacute": "\u00c1", - "Aacutesmall": "\uf7e1", - "Abreve": "\u0102", - "Abreveacute": "\u1eae", - "Abrevecyrillic": "\u04d0", - "Abrevedotbelow": "\u1eb6", - "Abrevegrave": "\u1eb0", - "Abrevehookabove": "\u1eb2", - "Abrevetilde": "\u1eb4", - "Acaron": "\u01cd", - "Acircle": "\u24b6", - "Acircumflex": "\u00c2", - "Acircumflexacute": "\u1ea4", - "Acircumflexdotbelow": "\u1eac", - "Acircumflexgrave": "\u1ea6", - "Acircumflexhookabove": "\u1ea8", - "Acircumflexsmall": "\uf7e2", - "Acircumflextilde": "\u1eaa", - "Acute": "\uf6c9", - "Acutesmall": "\uf7b4", - "Acyrillic": "\u0410", - "Adblgrave": "\u0200", - "Adieresis": "\u00c4", - "Adieresiscyrillic": "\u04d2", - "Adieresismacron": "\u01de", - "Adieresissmall": "\uf7e4", - "Adotbelow": "\u1ea0", - "Adotmacron": "\u01e0", - "Agrave": "\u00c0", - "Agravesmall": "\uf7e0", - "Ahookabove": "\u1ea2", - "Aiecyrillic": "\u04d4", - "Ainvertedbreve": "\u0202", - "Alpha": "\u0391", - "Alphatonos": "\u0386", - "Amacron": "\u0100", - "Amonospace": "\uff21", - "Aogonek": "\u0104", - "Aring": "\u00c5", - "Aringacute": "\u01fa", - "Aringbelow": "\u1e00", - "Aringsmall": "\uf7e5", - "Asmall": "\uf761", - "Atilde": "\u00c3", - "Atildesmall": "\uf7e3", - "Aybarmenian": "\u0531", - "B": "\u0042", - "Bcircle": "\u24b7", - "Bdotaccent": "\u1e02", - "Bdotbelow": "\u1e04", - "Becyrillic": "\u0411", - "Benarmenian": "\u0532", - "Beta": "\u0392", - "Bhook": "\u0181", - "Blinebelow": "\u1e06", - "Bmonospace": "\uff22", - "Brevesmall": "\uf6f4", - "Bsmall": "\uf762", - "Btopbar": "\u0182", - "C": "\u0043", - "Caarmenian": "\u053e", - "Cacute": "\u0106", - "Caron": "\uf6ca", - "Caronsmall": "\uf6f5", - "Ccaron": "\u010c", - "Ccedilla": "\u00c7", - "Ccedillaacute": "\u1e08", - "Ccedillasmall": "\uf7e7", - "Ccircle": "\u24b8", - "Ccircumflex": "\u0108", - "Cdot": "\u010a", - "Cdotaccent": "\u010a", - "Cedillasmall": "\uf7b8", - "Chaarmenian": "\u0549", - "Cheabkhasiancyrillic": "\u04bc", - "Checyrillic": "\u0427", - "Chedescenderabkhasiancyrillic": "\u04be", - "Chedescendercyrillic": "\u04b6", - "Chedieresiscyrillic": "\u04f4", - "Cheharmenian": "\u0543", - "Chekhakassiancyrillic": "\u04cb", - "Cheverticalstrokecyrillic": "\u04b8", - "Chi": "\u03a7", - "Chook": "\u0187", - "Circumflexsmall": "\uf6f6", - "Cmonospace": "\uff23", - "Coarmenian": "\u0551", - "Csmall": "\uf763", - "D": "\u0044", - "DZ": "\u01f1", - "DZcaron": "\u01c4", - "Daarmenian": "\u0534", - "Dafrican": "\u0189", - "Dcaron": "\u010e", - "Dcedilla": "\u1e10", - "Dcircle": "\u24b9", - "Dcircumflexbelow": "\u1e12", - "Dcroat": "\u0110", - "Ddotaccent": "\u1e0a", - "Ddotbelow": "\u1e0c", - "Decyrillic": "\u0414", - "Deicoptic": "\u03ee", - "Delta": "\u2206", - "Deltagreek": "\u0394", - "Dhook": "\u018a", - "Dieresis": "\uf6cb", - "DieresisAcute": "\uf6cc", - "DieresisGrave": "\uf6cd", - "Dieresissmall": "\uf7a8", - "Digammagreek": "\u03dc", - "Djecyrillic": "\u0402", - "Dlinebelow": "\u1e0e", - "Dmonospace": "\uff24", - "Dotaccentsmall": "\uf6f7", - "Dslash": "\u0110", - "Dsmall": "\uf764", - "Dtopbar": "\u018b", - "Dz": "\u01f2", - "Dzcaron": "\u01c5", - "Dzeabkhasiancyrillic": "\u04e0", - "Dzecyrillic": "\u0405", - "Dzhecyrillic": "\u040f", - "E": "\u0045", - "Eacute": "\u00c9", - "Eacutesmall": "\uf7e9", - "Ebreve": "\u0114", - "Ecaron": "\u011a", - "Ecedillabreve": "\u1e1c", - "Echarmenian": "\u0535", - "Ecircle": "\u24ba", - "Ecircumflex": "\u00ca", - "Ecircumflexacute": "\u1ebe", - "Ecircumflexbelow": "\u1e18", - "Ecircumflexdotbelow": "\u1ec6", - "Ecircumflexgrave": "\u1ec0", - "Ecircumflexhookabove": "\u1ec2", - "Ecircumflexsmall": "\uf7ea", - "Ecircumflextilde": "\u1ec4", - "Ecyrillic": "\u0404", - "Edblgrave": "\u0204", - "Edieresis": "\u00cb", - "Edieresissmall": "\uf7eb", - "Edot": "\u0116", - "Edotaccent": "\u0116", - "Edotbelow": "\u1eb8", - "Efcyrillic": "\u0424", - "Egrave": "\u00c8", - "Egravesmall": "\uf7e8", - "Eharmenian": "\u0537", - "Ehookabove": "\u1eba", - "Eightroman": "\u2167", - "Einvertedbreve": "\u0206", - "Eiotifiedcyrillic": "\u0464", - "Elcyrillic": "\u041b", - "Elevenroman": "\u216a", - "Emacron": "\u0112", - "Emacronacute": "\u1e16", - "Emacrongrave": "\u1e14", - "Emcyrillic": "\u041c", - "Emonospace": "\uff25", - "Encyrillic": "\u041d", - "Endescendercyrillic": "\u04a2", - "Eng": "\u014a", - "Enghecyrillic": "\u04a4", - "Enhookcyrillic": "\u04c7", - "Eogonek": "\u0118", - "Eopen": "\u0190", - "Epsilon": "\u0395", - "Epsilontonos": "\u0388", - "Ercyrillic": "\u0420", - "Ereversed": "\u018e", - "Ereversedcyrillic": "\u042d", - "Escyrillic": "\u0421", - "Esdescendercyrillic": "\u04aa", - "Esh": "\u01a9", - "Esmall": "\uf765", - "Eta": "\u0397", - "Etarmenian": "\u0538", - "Etatonos": "\u0389", - "Eth": "\u00d0", - "Ethsmall": "\uf7f0", - "Etilde": "\u1ebc", - "Etildebelow": "\u1e1a", - "Euro": "\u20ac", - "Ezh": "\u01b7", - "Ezhcaron": "\u01ee", - "Ezhreversed": "\u01b8", - "F": "\u0046", - "Fcircle": "\u24bb", - "Fdotaccent": "\u1e1e", - "Feharmenian": "\u0556", - "Feicoptic": "\u03e4", - "Fhook": "\u0191", - "Fitacyrillic": "\u0472", - "Fiveroman": "\u2164", - "Fmonospace": "\uff26", - "Fourroman": "\u2163", - "Fsmall": "\uf766", - "G": "\u0047", - "GBsquare": "\u3387", - "Gacute": "\u01f4", - "Gamma": "\u0393", - "Gammaafrican": "\u0194", - "Gangiacoptic": "\u03ea", - "Gbreve": "\u011e", - "Gcaron": "\u01e6", - "Gcedilla": "\u0122", - "Gcircle": "\u24bc", - "Gcircumflex": "\u011c", - "Gcommaaccent": "\u0122", - "Gdot": "\u0120", - "Gdotaccent": "\u0120", - "Gecyrillic": "\u0413", - "Ghadarmenian": "\u0542", - "Ghemiddlehookcyrillic": "\u0494", - "Ghestrokecyrillic": "\u0492", - "Gheupturncyrillic": "\u0490", - "Ghook": "\u0193", - "Gimarmenian": "\u0533", - "Gjecyrillic": "\u0403", - "Gmacron": "\u1e20", - "Gmonospace": "\uff27", - "Grave": "\uf6ce", - "Gravesmall": "\uf760", - "Gsmall": "\uf767", - "Gsmallhook": "\u029b", - "Gstroke": "\u01e4", - "H": "\u0048", - "H18533": "\u25cf", - "H18543": "\u25aa", - "H18551": "\u25ab", - "H22073": "\u25a1", - "HPsquare": "\u33cb", - "Haabkhasiancyrillic": "\u04a8", - "Hadescendercyrillic": "\u04b2", - "Hardsigncyrillic": "\u042a", - "Hbar": "\u0126", - "Hbrevebelow": "\u1e2a", - "Hcedilla": "\u1e28", - "Hcircle": "\u24bd", - "Hcircumflex": "\u0124", - "Hdieresis": "\u1e26", - "Hdotaccent": "\u1e22", - "Hdotbelow": "\u1e24", - "Hmonospace": "\uff28", - "Hoarmenian": "\u0540", - "Horicoptic": "\u03e8", - "Hsmall": "\uf768", - "Hungarumlaut": "\uf6cf", - "Hungarumlautsmall": "\uf6f8", - "Hzsquare": "\u3390", - "I": "\u0049", - "IAcyrillic": "\u042f", - "IJ": "\u0132", - "IUcyrillic": "\u042e", - "Iacute": "\u00cd", - "Iacutesmall": "\uf7ed", - "Ibreve": "\u012c", - "Icaron": "\u01cf", - "Icircle": "\u24be", - "Icircumflex": "\u00ce", - "Icircumflexsmall": "\uf7ee", - "Icyrillic": "\u0406", - "Idblgrave": "\u0208", - "Idieresis": "\u00cf", - "Idieresisacute": "\u1e2e", - "Idieresiscyrillic": "\u04e4", - "Idieresissmall": "\uf7ef", - "Idot": "\u0130", - "Idotaccent": "\u0130", - "Idotbelow": "\u1eca", - "Iebrevecyrillic": "\u04d6", - "Iecyrillic": "\u0415", - "Ifraktur": "\u2111", - "Igrave": "\u00cc", - "Igravesmall": "\uf7ec", - "Ihookabove": "\u1ec8", - "Iicyrillic": "\u0418", - "Iinvertedbreve": "\u020a", - "Iishortcyrillic": "\u0419", - "Imacron": "\u012a", - "Imacroncyrillic": "\u04e2", - "Imonospace": "\uff29", - "Iniarmenian": "\u053b", - "Iocyrillic": "\u0401", - "Iogonek": "\u012e", - "Iota": "\u0399", - "Iotaafrican": "\u0196", - "Iotadieresis": "\u03aa", - "Iotatonos": "\u038a", - "Ismall": "\uf769", - "Istroke": "\u0197", - "Itilde": "\u0128", - "Itildebelow": "\u1e2c", - "Izhitsacyrillic": "\u0474", - "Izhitsadblgravecyrillic": "\u0476", - "J": "\u004a", - "Jaarmenian": "\u0541", - "Jcircle": "\u24bf", - "Jcircumflex": "\u0134", - "Jecyrillic": "\u0408", - "Jheharmenian": "\u054b", - "Jmonospace": "\uff2a", - "Jsmall": "\uf76a", - "K": "\u004b", - "KBsquare": "\u3385", - "KKsquare": "\u33cd", - "Kabashkircyrillic": "\u04a0", - "Kacute": "\u1e30", - "Kacyrillic": "\u041a", - "Kadescendercyrillic": "\u049a", - "Kahookcyrillic": "\u04c3", - "Kappa": "\u039a", - "Kastrokecyrillic": "\u049e", - "Kaverticalstrokecyrillic": "\u049c", - "Kcaron": "\u01e8", - "Kcedilla": "\u0136", - "Kcircle": "\u24c0", - "Kcommaaccent": "\u0136", - "Kdotbelow": "\u1e32", - "Keharmenian": "\u0554", - "Kenarmenian": "\u053f", - "Khacyrillic": "\u0425", - "Kheicoptic": "\u03e6", - "Khook": "\u0198", - "Kjecyrillic": "\u040c", - "Klinebelow": "\u1e34", - "Kmonospace": "\uff2b", - "Koppacyrillic": "\u0480", - "Koppagreek": "\u03de", - "Ksicyrillic": "\u046e", - "Ksmall": "\uf76b", - "L": "\u004c", - "LJ": "\u01c7", - "LL": "\uf6bf", - "Lacute": "\u0139", - "Lambda": "\u039b", - "Lcaron": "\u013d", - "Lcedilla": "\u013b", - "Lcircle": "\u24c1", - "Lcircumflexbelow": "\u1e3c", - "Lcommaaccent": "\u013b", - "Ldot": "\u013f", - "Ldotaccent": "\u013f", - "Ldotbelow": "\u1e36", - "Ldotbelowmacron": "\u1e38", - "Liwnarmenian": "\u053c", - "Lj": "\u01c8", - "Ljecyrillic": "\u0409", - "Llinebelow": "\u1e3a", - "Lmonospace": "\uff2c", - "Lslash": "\u0141", - "Lslashsmall": "\uf6f9", - "Lsmall": "\uf76c", - "M": "\u004d", - "MBsquare": "\u3386", - "Macron": "\uf6d0", - "Macronsmall": "\uf7af", - "Macute": "\u1e3e", - "Mcircle": "\u24c2", - "Mdotaccent": "\u1e40", - "Mdotbelow": "\u1e42", - "Menarmenian": "\u0544", - "Mmonospace": "\uff2d", - "Msmall": "\uf76d", - "Mturned": "\u019c", - "Mu": "\u039c", - "N": "\u004e", - "NJ": "\u01ca", - "Nacute": "\u0143", - "Ncaron": "\u0147", - "Ncedilla": "\u0145", - "Ncircle": "\u24c3", - "Ncircumflexbelow": "\u1e4a", - "Ncommaaccent": "\u0145", - "Ndotaccent": "\u1e44", - "Ndotbelow": "\u1e46", - "Nhookleft": "\u019d", - "Nineroman": "\u2168", - "Nj": "\u01cb", - "Njecyrillic": "\u040a", - "Nlinebelow": "\u1e48", - "Nmonospace": "\uff2e", - "Nowarmenian": "\u0546", - "Nsmall": "\uf76e", - "Ntilde": "\u00d1", - "Ntildesmall": "\uf7f1", - "Nu": "\u039d", - "O": "\u004f", - "OE": "\u0152", - "OEsmall": "\uf6fa", - "Oacute": "\u00d3", - "Oacutesmall": "\uf7f3", - "Obarredcyrillic": "\u04e8", - "Obarreddieresiscyrillic": "\u04ea", - "Obreve": "\u014e", - "Ocaron": "\u01d1", - "Ocenteredtilde": "\u019f", - "Ocircle": "\u24c4", - "Ocircumflex": "\u00d4", - "Ocircumflexacute": "\u1ed0", - "Ocircumflexdotbelow": "\u1ed8", - "Ocircumflexgrave": "\u1ed2", - "Ocircumflexhookabove": "\u1ed4", - "Ocircumflexsmall": "\uf7f4", - "Ocircumflextilde": "\u1ed6", - "Ocyrillic": "\u041e", - "Odblacute": "\u0150", - "Odblgrave": "\u020c", - "Odieresis": "\u00d6", - "Odieresiscyrillic": "\u04e6", - "Odieresissmall": "\uf7f6", - "Odotbelow": "\u1ecc", - "Ogoneksmall": "\uf6fb", - "Ograve": "\u00d2", - "Ogravesmall": "\uf7f2", - "Oharmenian": "\u0555", - "Ohm": "\u2126", - "Ohookabove": "\u1ece", - "Ohorn": "\u01a0", - "Ohornacute": "\u1eda", - "Ohorndotbelow": "\u1ee2", - "Ohorngrave": "\u1edc", - "Ohornhookabove": "\u1ede", - "Ohorntilde": "\u1ee0", - "Ohungarumlaut": "\u0150", - "Oi": "\u01a2", - "Oinvertedbreve": "\u020e", - "Omacron": "\u014c", - "Omacronacute": "\u1e52", - "Omacrongrave": "\u1e50", - "Omega": "\u2126", - "Omegacyrillic": "\u0460", - "Omegagreek": "\u03a9", - "Omegaroundcyrillic": "\u047a", - "Omegatitlocyrillic": "\u047c", - "Omegatonos": "\u038f", - "Omicron": "\u039f", - "Omicrontonos": "\u038c", - "Omonospace": "\uff2f", - "Oneroman": "\u2160", - "Oogonek": "\u01ea", - "Oogonekmacron": "\u01ec", - "Oopen": "\u0186", - "Oslash": "\u00d8", - "Oslashacute": "\u01fe", - "Oslashsmall": "\uf7f8", - "Osmall": "\uf76f", - "Ostrokeacute": "\u01fe", - "Otcyrillic": "\u047e", - "Otilde": "\u00d5", - "Otildeacute": "\u1e4c", - "Otildedieresis": "\u1e4e", - "Otildesmall": "\uf7f5", - "P": "\u0050", - "Pacute": "\u1e54", - "Pcircle": "\u24c5", - "Pdotaccent": "\u1e56", - "Pecyrillic": "\u041f", - "Peharmenian": "\u054a", - "Pemiddlehookcyrillic": "\u04a6", - "Phi": "\u03a6", - "Phook": "\u01a4", - "Pi": "\u03a0", - "Piwrarmenian": "\u0553", - "Pmonospace": "\uff30", - "Psi": "\u03a8", - "Psicyrillic": "\u0470", - "Psmall": "\uf770", - "Q": "\u0051", - "Qcircle": "\u24c6", - "Qmonospace": "\uff31", - "Qsmall": "\uf771", - "R": "\u0052", - "Raarmenian": "\u054c", - "Racute": "\u0154", - "Rcaron": "\u0158", - "Rcedilla": "\u0156", - "Rcircle": "\u24c7", - "Rcommaaccent": "\u0156", - "Rdblgrave": "\u0210", - "Rdotaccent": "\u1e58", - "Rdotbelow": "\u1e5a", - "Rdotbelowmacron": "\u1e5c", - "Reharmenian": "\u0550", - "Rfraktur": "\u211c", - "Rho": "\u03a1", - "Ringsmall": "\uf6fc", - "Rinvertedbreve": "\u0212", - "Rlinebelow": "\u1e5e", - "Rmonospace": "\uff32", - "Rsmall": "\uf772", - "Rsmallinverted": "\u0281", - "Rsmallinvertedsuperior": "\u02b6", - "S": "\u0053", - "SF010000": "\u250c", - "SF020000": "\u2514", - "SF030000": "\u2510", - "SF040000": "\u2518", - "SF050000": "\u253c", - "SF060000": "\u252c", - "SF070000": "\u2534", - "SF080000": "\u251c", - "SF090000": "\u2524", - "SF100000": "\u2500", - "SF110000": "\u2502", - "SF190000": "\u2561", - "SF200000": "\u2562", - "SF210000": "\u2556", - "SF220000": "\u2555", - "SF230000": "\u2563", - "SF240000": "\u2551", - "SF250000": "\u2557", - "SF260000": "\u255d", - "SF270000": "\u255c", - "SF280000": "\u255b", - "SF360000": "\u255e", - "SF370000": "\u255f", - "SF380000": "\u255a", - "SF390000": "\u2554", - "SF400000": "\u2569", - "SF410000": "\u2566", - "SF420000": "\u2560", - "SF430000": "\u2550", - "SF440000": "\u256c", - "SF450000": "\u2567", - "SF460000": "\u2568", - "SF470000": "\u2564", - "SF480000": "\u2565", - "SF490000": "\u2559", - "SF500000": "\u2558", - "SF510000": "\u2552", - "SF520000": "\u2553", - "SF530000": "\u256b", - "SF540000": "\u256a", - "Sacute": "\u015a", - "Sacutedotaccent": "\u1e64", - "Sampigreek": "\u03e0", - "Scaron": "\u0160", - "Scarondotaccent": "\u1e66", - "Scaronsmall": "\uf6fd", - "Scedilla": "\u015e", - "Schwa": "\u018f", - "Schwacyrillic": "\u04d8", - "Schwadieresiscyrillic": "\u04da", - "Scircle": "\u24c8", - "Scircumflex": "\u015c", - "Scommaaccent": "\u0218", - "Sdotaccent": "\u1e60", - "Sdotbelow": "\u1e62", - "Sdotbelowdotaccent": "\u1e68", - "Seharmenian": "\u054d", - "Sevenroman": "\u2166", - "Shaarmenian": "\u0547", - "Shacyrillic": "\u0428", - "Shchacyrillic": "\u0429", - "Sheicoptic": "\u03e2", - "Shhacyrillic": "\u04ba", - "Shimacoptic": "\u03ec", - "Sigma": "\u03a3", - "Sixroman": "\u2165", - "Smonospace": "\uff33", - "Softsigncyrillic": "\u042c", - "Ssmall": "\uf773", - "Stigmagreek": "\u03da", - "T": "\u0054", - "Tau": "\u03a4", - "Tbar": "\u0166", - "Tcaron": "\u0164", - "Tcedilla": "\u0162", - "Tcircle": "\u24c9", - "Tcircumflexbelow": "\u1e70", - "Tcommaaccent": "\u0162", - "Tdotaccent": "\u1e6a", - "Tdotbelow": "\u1e6c", - "Tecyrillic": "\u0422", - "Tedescendercyrillic": "\u04ac", - "Tenroman": "\u2169", - "Tetsecyrillic": "\u04b4", - "Theta": "\u0398", - "Thook": "\u01ac", - "Thorn": "\u00de", - "Thornsmall": "\uf7fe", - "Threeroman": "\u2162", - "Tildesmall": "\uf6fe", - "Tiwnarmenian": "\u054f", - "Tlinebelow": "\u1e6e", - "Tmonospace": "\uff34", - "Toarmenian": "\u0539", - "Tonefive": "\u01bc", - "Tonesix": "\u0184", - "Tonetwo": "\u01a7", - "Tretroflexhook": "\u01ae", - "Tsecyrillic": "\u0426", - "Tshecyrillic": "\u040b", - "Tsmall": "\uf774", - "Twelveroman": "\u216b", - "Tworoman": "\u2161", - "U": "\u0055", - "Uacute": "\u00da", - "Uacutesmall": "\uf7fa", - "Ubreve": "\u016c", - "Ucaron": "\u01d3", - "Ucircle": "\u24ca", - "Ucircumflex": "\u00db", - "Ucircumflexbelow": "\u1e76", - "Ucircumflexsmall": "\uf7fb", - "Ucyrillic": "\u0423", - "Udblacute": "\u0170", - "Udblgrave": "\u0214", - "Udieresis": "\u00dc", - "Udieresisacute": "\u01d7", - "Udieresisbelow": "\u1e72", - "Udieresiscaron": "\u01d9", - "Udieresiscyrillic": "\u04f0", - "Udieresisgrave": "\u01db", - "Udieresismacron": "\u01d5", - "Udieresissmall": "\uf7fc", - "Udotbelow": "\u1ee4", - "Ugrave": "\u00d9", - "Ugravesmall": "\uf7f9", - "Uhookabove": "\u1ee6", - "Uhorn": "\u01af", - "Uhornacute": "\u1ee8", - "Uhorndotbelow": "\u1ef0", - "Uhorngrave": "\u1eea", - "Uhornhookabove": "\u1eec", - "Uhorntilde": "\u1eee", - "Uhungarumlaut": "\u0170", - "Uhungarumlautcyrillic": "\u04f2", - "Uinvertedbreve": "\u0216", - "Ukcyrillic": "\u0478", - "Umacron": "\u016a", - "Umacroncyrillic": "\u04ee", - "Umacrondieresis": "\u1e7a", - "Umonospace": "\uff35", - "Uogonek": "\u0172", - "Upsilon": "\u03a5", - "Upsilon1": "\u03d2", - "Upsilonacutehooksymbolgreek": "\u03d3", - "Upsilonafrican": "\u01b1", - "Upsilondieresis": "\u03ab", - "Upsilondieresishooksymbolgreek": "\u03d4", - "Upsilonhooksymbol": "\u03d2", - "Upsilontonos": "\u038e", - "Uring": "\u016e", - "Ushortcyrillic": "\u040e", - "Usmall": "\uf775", - "Ustraightcyrillic": "\u04ae", - "Ustraightstrokecyrillic": "\u04b0", - "Utilde": "\u0168", - "Utildeacute": "\u1e78", - "Utildebelow": "\u1e74", - "V": "\u0056", - "Vcircle": "\u24cb", - "Vdotbelow": "\u1e7e", - "Vecyrillic": "\u0412", - "Vewarmenian": "\u054e", - "Vhook": "\u01b2", - "Vmonospace": "\uff36", - "Voarmenian": "\u0548", - "Vsmall": "\uf776", - "Vtilde": "\u1e7c", - "W": "\u0057", - "Wacute": "\u1e82", - "Wcircle": "\u24cc", - "Wcircumflex": "\u0174", - "Wdieresis": "\u1e84", - "Wdotaccent": "\u1e86", - "Wdotbelow": "\u1e88", - "Wgrave": "\u1e80", - "Wmonospace": "\uff37", - "Wsmall": "\uf777", - "X": "\u0058", - "Xcircle": "\u24cd", - "Xdieresis": "\u1e8c", - "Xdotaccent": "\u1e8a", - "Xeharmenian": "\u053d", - "Xi": "\u039e", - "Xmonospace": "\uff38", - "Xsmall": "\uf778", - "Y": "\u0059", - "Yacute": "\u00dd", - "Yacutesmall": "\uf7fd", - "Yatcyrillic": "\u0462", - "Ycircle": "\u24ce", - "Ycircumflex": "\u0176", - "Ydieresis": "\u0178", - "Ydieresissmall": "\uf7ff", - "Ydotaccent": "\u1e8e", - "Ydotbelow": "\u1ef4", - "Yericyrillic": "\u042b", - "Yerudieresiscyrillic": "\u04f8", - "Ygrave": "\u1ef2", - "Yhook": "\u01b3", - "Yhookabove": "\u1ef6", - "Yiarmenian": "\u0545", - "Yicyrillic": "\u0407", - "Yiwnarmenian": "\u0552", - "Ymonospace": "\uff39", - "Ysmall": "\uf779", - "Ytilde": "\u1ef8", - "Yusbigcyrillic": "\u046a", - "Yusbigiotifiedcyrillic": "\u046c", - "Yuslittlecyrillic": "\u0466", - "Yuslittleiotifiedcyrillic": "\u0468", - "Z": "\u005a", - "Zaarmenian": "\u0536", - "Zacute": "\u0179", - "Zcaron": "\u017d", - "Zcaronsmall": "\uf6ff", - "Zcircle": "\u24cf", - "Zcircumflex": "\u1e90", - "Zdot": "\u017b", - "Zdotaccent": "\u017b", - "Zdotbelow": "\u1e92", - "Zecyrillic": "\u0417", - "Zedescendercyrillic": "\u0498", - "Zedieresiscyrillic": "\u04de", - "Zeta": "\u0396", - "Zhearmenian": "\u053a", - "Zhebrevecyrillic": "\u04c1", - "Zhecyrillic": "\u0416", - "Zhedescendercyrillic": "\u0496", - "Zhedieresiscyrillic": "\u04dc", - "Zlinebelow": "\u1e94", - "Zmonospace": "\uff3a", - "Zsmall": "\uf77a", - "Zstroke": "\u01b5", - "a": "\u0061", - "aabengali": "\u0986", - "aacute": "\u00e1", - "aadeva": "\u0906", - "aagujarati": "\u0a86", - "aagurmukhi": "\u0a06", - "aamatragurmukhi": "\u0a3e", - "aarusquare": "\u3303", - "aavowelsignbengali": "\u09be", - "aavowelsigndeva": "\u093e", - "aavowelsigngujarati": "\u0abe", - "abbreviationmarkarmenian": "\u055f", - "abbreviationsigndeva": "\u0970", - "abengali": "\u0985", - "abopomofo": "\u311a", - "abreve": "\u0103", - "abreveacute": "\u1eaf", - "abrevecyrillic": "\u04d1", - "abrevedotbelow": "\u1eb7", - "abrevegrave": "\u1eb1", - "abrevehookabove": "\u1eb3", - "abrevetilde": "\u1eb5", - "acaron": "\u01ce", - "acircle": "\u24d0", - "acircumflex": "\u00e2", - "acircumflexacute": "\u1ea5", - "acircumflexdotbelow": "\u1ead", - "acircumflexgrave": "\u1ea7", - "acircumflexhookabove": "\u1ea9", - "acircumflextilde": "\u1eab", - "acute": "\u00b4", - "acutebelowcmb": "\u0317", - "acutecmb": "\u0301", - "acutecomb": "\u0301", - "acutedeva": "\u0954", - "acutelowmod": "\u02cf", - "acutetonecmb": "\u0341", - "acyrillic": "\u0430", - "adblgrave": "\u0201", - "addakgurmukhi": "\u0a71", - "adeva": "\u0905", - "adieresis": "\u00e4", - "adieresiscyrillic": "\u04d3", - "adieresismacron": "\u01df", - "adotbelow": "\u1ea1", - "adotmacron": "\u01e1", - "ae": "\u00e6", - "aeacute": "\u01fd", - "aekorean": "\u3150", - "aemacron": "\u01e3", - "afii00208": "\u2015", - "afii08941": "\u20a4", - "afii10017": "\u0410", - "afii10018": "\u0411", - "afii10019": "\u0412", - "afii10020": "\u0413", - "afii10021": "\u0414", - "afii10022": "\u0415", - "afii10023": "\u0401", - "afii10024": "\u0416", - "afii10025": "\u0417", - "afii10026": "\u0418", - "afii10027": "\u0419", - "afii10028": "\u041a", - "afii10029": "\u041b", - "afii10030": "\u041c", - "afii10031": "\u041d", - "afii10032": "\u041e", - "afii10033": "\u041f", - "afii10034": "\u0420", - "afii10035": "\u0421", - "afii10036": "\u0422", - "afii10037": "\u0423", - "afii10038": "\u0424", - "afii10039": "\u0425", - "afii10040": "\u0426", - "afii10041": "\u0427", - "afii10042": "\u0428", - "afii10043": "\u0429", - "afii10044": "\u042a", - "afii10045": "\u042b", - "afii10046": "\u042c", - "afii10047": "\u042d", - "afii10048": "\u042e", - "afii10049": "\u042f", - "afii10050": "\u0490", - "afii10051": "\u0402", - "afii10052": "\u0403", - "afii10053": "\u0404", - "afii10054": "\u0405", - "afii10055": "\u0406", - "afii10056": "\u0407", - "afii10057": "\u0408", - "afii10058": "\u0409", - "afii10059": "\u040a", - "afii10060": "\u040b", - "afii10061": "\u040c", - "afii10062": "\u040e", - "afii10063": "\uf6c4", - "afii10064": "\uf6c5", - "afii10065": "\u0430", - "afii10066": "\u0431", - "afii10067": "\u0432", - "afii10068": "\u0433", - "afii10069": "\u0434", - "afii10070": "\u0435", - "afii10071": "\u0451", - "afii10072": "\u0436", - "afii10073": "\u0437", - "afii10074": "\u0438", - "afii10075": "\u0439", - "afii10076": "\u043a", - "afii10077": "\u043b", - "afii10078": "\u043c", - "afii10079": "\u043d", - "afii10080": "\u043e", - "afii10081": "\u043f", - "afii10082": "\u0440", - "afii10083": "\u0441", - "afii10084": "\u0442", - "afii10085": "\u0443", - "afii10086": "\u0444", - "afii10087": "\u0445", - "afii10088": "\u0446", - "afii10089": "\u0447", - "afii10090": "\u0448", - "afii10091": "\u0449", - "afii10092": "\u044a", - "afii10093": "\u044b", - "afii10094": "\u044c", - "afii10095": "\u044d", - "afii10096": "\u044e", - "afii10097": "\u044f", - "afii10098": "\u0491", - "afii10099": "\u0452", - "afii10100": "\u0453", - "afii10101": "\u0454", - "afii10102": "\u0455", - "afii10103": "\u0456", - "afii10104": "\u0457", - "afii10105": "\u0458", - "afii10106": "\u0459", - "afii10107": "\u045a", - "afii10108": "\u045b", - "afii10109": "\u045c", - "afii10110": "\u045e", - "afii10145": "\u040f", - "afii10146": "\u0462", - "afii10147": "\u0472", - "afii10148": "\u0474", - "afii10192": "\uf6c6", - "afii10193": "\u045f", - "afii10194": "\u0463", - "afii10195": "\u0473", - "afii10196": "\u0475", - "afii10831": "\uf6c7", - "afii10832": "\uf6c8", - "afii10846": "\u04d9", - "afii299": "\u200e", - "afii300": "\u200f", - "afii301": "\u200d", - "afii57381": "\u066a", - "afii57388": "\u060c", - "afii57392": "\u0660", - "afii57393": "\u0661", - "afii57394": "\u0662", - "afii57395": "\u0663", - "afii57396": "\u0664", - "afii57397": "\u0665", - "afii57398": "\u0666", - "afii57399": "\u0667", - "afii57400": "\u0668", - "afii57401": "\u0669", - "afii57403": "\u061b", - "afii57407": "\u061f", - "afii57409": "\u0621", - "afii57410": "\u0622", - "afii57411": "\u0623", - "afii57412": "\u0624", - "afii57413": "\u0625", - "afii57414": "\u0626", - "afii57415": "\u0627", - "afii57416": "\u0628", - "afii57417": "\u0629", - "afii57418": "\u062a", - "afii57419": "\u062b", - "afii57420": "\u062c", - "afii57421": "\u062d", - "afii57422": "\u062e", - "afii57423": "\u062f", - "afii57424": "\u0630", - "afii57425": "\u0631", - "afii57426": "\u0632", - "afii57427": "\u0633", - "afii57428": "\u0634", - "afii57429": "\u0635", - "afii57430": "\u0636", - "afii57431": "\u0637", - "afii57432": "\u0638", - "afii57433": "\u0639", - "afii57434": "\u063a", - "afii57440": "\u0640", - "afii57441": "\u0641", - "afii57442": "\u0642", - "afii57443": "\u0643", - "afii57444": "\u0644", - "afii57445": "\u0645", - "afii57446": "\u0646", - "afii57448": "\u0648", - "afii57449": "\u0649", - "afii57450": "\u064a", - "afii57451": "\u064b", - "afii57452": "\u064c", - "afii57453": "\u064d", - "afii57454": "\u064e", - "afii57455": "\u064f", - "afii57456": "\u0650", - "afii57457": "\u0651", - "afii57458": "\u0652", - "afii57470": "\u0647", - "afii57505": "\u06a4", - "afii57506": "\u067e", - "afii57507": "\u0686", - "afii57508": "\u0698", - "afii57509": "\u06af", - "afii57511": "\u0679", - "afii57512": "\u0688", - "afii57513": "\u0691", - "afii57514": "\u06ba", - "afii57519": "\u06d2", - "afii57534": "\u06d5", - "afii57636": "\u20aa", - "afii57645": "\u05be", - "afii57658": "\u05c3", - "afii57664": "\u05d0", - "afii57665": "\u05d1", - "afii57666": "\u05d2", - "afii57667": "\u05d3", - "afii57668": "\u05d4", - "afii57669": "\u05d5", - "afii57670": "\u05d6", - "afii57671": "\u05d7", - "afii57672": "\u05d8", - "afii57673": "\u05d9", - "afii57674": "\u05da", - "afii57675": "\u05db", - "afii57676": "\u05dc", - "afii57677": "\u05dd", - "afii57678": "\u05de", - "afii57679": "\u05df", - "afii57680": "\u05e0", - "afii57681": "\u05e1", - "afii57682": "\u05e2", - "afii57683": "\u05e3", - "afii57684": "\u05e4", - "afii57685": "\u05e5", - "afii57686": "\u05e6", - "afii57687": "\u05e7", - "afii57688": "\u05e8", - "afii57689": "\u05e9", - "afii57690": "\u05ea", - "afii57694": "\ufb2a", - "afii57695": "\ufb2b", - "afii57700": "\ufb4b", - "afii57705": "\ufb1f", - "afii57716": "\u05f0", - "afii57717": "\u05f1", - "afii57718": "\u05f2", - "afii57723": "\ufb35", - "afii57793": "\u05b4", - "afii57794": "\u05b5", - "afii57795": "\u05b6", - "afii57796": "\u05bb", - "afii57797": "\u05b8", - "afii57798": "\u05b7", - "afii57799": "\u05b0", - "afii57800": "\u05b2", - "afii57801": "\u05b1", - "afii57802": "\u05b3", - "afii57803": "\u05c2", - "afii57804": "\u05c1", - "afii57806": "\u05b9", - "afii57807": "\u05bc", - "afii57839": "\u05bd", - "afii57841": "\u05bf", - "afii57842": "\u05c0", - "afii57929": "\u02bc", - "afii61248": "\u2105", - "afii61289": "\u2113", - "afii61352": "\u2116", - "afii61573": "\u202c", - "afii61574": "\u202d", - "afii61575": "\u202e", - "afii61664": "\u200c", - "afii63167": "\u066d", - "afii64937": "\u02bd", - "agrave": "\u00e0", - "agujarati": "\u0a85", - "agurmukhi": "\u0a05", - "ahiragana": "\u3042", - "ahookabove": "\u1ea3", - "aibengali": "\u0990", - "aibopomofo": "\u311e", - "aideva": "\u0910", - "aiecyrillic": "\u04d5", - "aigujarati": "\u0a90", - "aigurmukhi": "\u0a10", - "aimatragurmukhi": "\u0a48", - "ainarabic": "\u0639", - "ainfinalarabic": "\ufeca", - "aininitialarabic": "\ufecb", - "ainmedialarabic": "\ufecc", - "ainvertedbreve": "\u0203", - "aivowelsignbengali": "\u09c8", - "aivowelsigndeva": "\u0948", - "aivowelsigngujarati": "\u0ac8", - "akatakana": "\u30a2", - "akatakanahalfwidth": "\uff71", - "akorean": "\u314f", - "alef": "\u05d0", - "alefarabic": "\u0627", - "alefdageshhebrew": "\ufb30", - "aleffinalarabic": "\ufe8e", - "alefhamzaabovearabic": "\u0623", - "alefhamzaabovefinalarabic": "\ufe84", - "alefhamzabelowarabic": "\u0625", - "alefhamzabelowfinalarabic": "\ufe88", - "alefhebrew": "\u05d0", - "aleflamedhebrew": "\ufb4f", - "alefmaddaabovearabic": "\u0622", - "alefmaddaabovefinalarabic": "\ufe82", - "alefmaksuraarabic": "\u0649", - "alefmaksurafinalarabic": "\ufef0", - "alefmaksurainitialarabic": "\ufef3", - "alefmaksuramedialarabic": "\ufef4", - "alefpatahhebrew": "\ufb2e", - "alefqamatshebrew": "\ufb2f", - "aleph": "\u2135", - "allequal": "\u224c", - "alpha": "\u03b1", - "alphatonos": "\u03ac", - "amacron": "\u0101", - "amonospace": "\uff41", - "ampersand": "\u0026", - "ampersandmonospace": "\uff06", - "ampersandsmall": "\uf726", - "amsquare": "\u33c2", - "anbopomofo": "\u3122", - "angbopomofo": "\u3124", - "angkhankhuthai": "\u0e5a", - "angle": "\u2220", - "anglebracketleft": "\u3008", - "anglebracketleftvertical": "\ufe3f", - "anglebracketright": "\u3009", - "anglebracketrightvertical": "\ufe40", - "angleleft": "\u2329", - "angleright": "\u232a", - "angstrom": "\u212b", - "anoteleia": "\u0387", - "anudattadeva": "\u0952", - "anusvarabengali": "\u0982", - "anusvaradeva": "\u0902", - "anusvaragujarati": "\u0a82", - "aogonek": "\u0105", - "apaatosquare": "\u3300", - "aparen": "\u249c", - "apostrophearmenian": "\u055a", - "apostrophemod": "\u02bc", - "apple": "\uf8ff", - "approaches": "\u2250", - "approxequal": "\u2248", - "approxequalorimage": "\u2252", - "approximatelyequal": "\u2245", - "araeaekorean": "\u318e", - "araeakorean": "\u318d", - "arc": "\u2312", - "arighthalfring": "\u1e9a", - "aring": "\u00e5", - "aringacute": "\u01fb", - "aringbelow": "\u1e01", - "arrowboth": "\u2194", - "arrowdashdown": "\u21e3", - "arrowdashleft": "\u21e0", - "arrowdashright": "\u21e2", - "arrowdashup": "\u21e1", - "arrowdblboth": "\u21d4", - "arrowdbldown": "\u21d3", - "arrowdblleft": "\u21d0", - "arrowdblright": "\u21d2", - "arrowdblup": "\u21d1", - "arrowdown": "\u2193", - "arrowdownleft": "\u2199", - "arrowdownright": "\u2198", - "arrowdownwhite": "\u21e9", - "arrowheaddownmod": "\u02c5", - "arrowheadleftmod": "\u02c2", - "arrowheadrightmod": "\u02c3", - "arrowheadupmod": "\u02c4", - "arrowhorizex": "\uf8e7", - "arrowleft": "\u2190", - "arrowleftdbl": "\u21d0", - "arrowleftdblstroke": "\u21cd", - "arrowleftoverright": "\u21c6", - "arrowleftwhite": "\u21e6", - "arrowright": "\u2192", - "arrowrightdblstroke": "\u21cf", - "arrowrightheavy": "\u279e", - "arrowrightoverleft": "\u21c4", - "arrowrightwhite": "\u21e8", - "arrowtableft": "\u21e4", - "arrowtabright": "\u21e5", - "arrowup": "\u2191", - "arrowupdn": "\u2195", - "arrowupdnbse": "\u21a8", - "arrowupdownbase": "\u21a8", - "arrowupleft": "\u2196", - "arrowupleftofdown": "\u21c5", - "arrowupright": "\u2197", - "arrowupwhite": "\u21e7", - "arrowvertex": "\uf8e6", - "asciicircum": "\u005e", - "asciicircummonospace": "\uff3e", - "asciitilde": "\u007e", - "asciitildemonospace": "\uff5e", - "ascript": "\u0251", - "ascriptturned": "\u0252", - "asmallhiragana": "\u3041", - "asmallkatakana": "\u30a1", - "asmallkatakanahalfwidth": "\uff67", - "asterisk": "\u002a", - "asteriskaltonearabic": "\u066d", - "asteriskarabic": "\u066d", - "asteriskmath": "\u2217", - "asteriskmonospace": "\uff0a", - "asterisksmall": "\ufe61", - "asterism": "\u2042", - "asuperior": "\uf6e9", - "asymptoticallyequal": "\u2243", - "at": "\u0040", - "atilde": "\u00e3", - "atmonospace": "\uff20", - "atsmall": "\ufe6b", - "aturned": "\u0250", - "aubengali": "\u0994", - "aubopomofo": "\u3120", - "audeva": "\u0914", - "augujarati": "\u0a94", - "augurmukhi": "\u0a14", - "aulengthmarkbengali": "\u09d7", - "aumatragurmukhi": "\u0a4c", - "auvowelsignbengali": "\u09cc", - "auvowelsigndeva": "\u094c", - "auvowelsigngujarati": "\u0acc", - "avagrahadeva": "\u093d", - "aybarmenian": "\u0561", - "ayin": "\u05e2", - "ayinaltonehebrew": "\ufb20", - "ayinhebrew": "\u05e2", - "b": "\u0062", - "babengali": "\u09ac", - "backslash": "\u005c", - "backslashmonospace": "\uff3c", - "badeva": "\u092c", - "bagujarati": "\u0aac", - "bagurmukhi": "\u0a2c", - "bahiragana": "\u3070", - "bahtthai": "\u0e3f", - "bakatakana": "\u30d0", - "bar": "\u007c", - "barmonospace": "\uff5c", - "bbopomofo": "\u3105", - "bcircle": "\u24d1", - "bdotaccent": "\u1e03", - "bdotbelow": "\u1e05", - "beamedsixteenthnotes": "\u266c", - "because": "\u2235", - "becyrillic": "\u0431", - "beharabic": "\u0628", - "behfinalarabic": "\ufe90", - "behinitialarabic": "\ufe91", - "behiragana": "\u3079", - "behmedialarabic": "\ufe92", - "behmeeminitialarabic": "\ufc9f", - "behmeemisolatedarabic": "\ufc08", - "behnoonfinalarabic": "\ufc6d", - "bekatakana": "\u30d9", - "benarmenian": "\u0562", - "bet": "\u05d1", - "beta": "\u03b2", - "betasymbolgreek": "\u03d0", - "betdagesh": "\ufb31", - "betdageshhebrew": "\ufb31", - "bethebrew": "\u05d1", - "betrafehebrew": "\ufb4c", - "bhabengali": "\u09ad", - "bhadeva": "\u092d", - "bhagujarati": "\u0aad", - "bhagurmukhi": "\u0a2d", - "bhook": "\u0253", - "bihiragana": "\u3073", - "bikatakana": "\u30d3", - "bilabialclick": "\u0298", - "bindigurmukhi": "\u0a02", - "birusquare": "\u3331", - "blackcircle": "\u25cf", - "blackdiamond": "\u25c6", - "blackdownpointingtriangle": "\u25bc", - "blackleftpointingpointer": "\u25c4", - "blackleftpointingtriangle": "\u25c0", - "blacklenticularbracketleft": "\u3010", - "blacklenticularbracketleftvertical": "\ufe3b", - "blacklenticularbracketright": "\u3011", - "blacklenticularbracketrightvertical": "\ufe3c", - "blacklowerlefttriangle": "\u25e3", - "blacklowerrighttriangle": "\u25e2", - "blackrectangle": "\u25ac", - "blackrightpointingpointer": "\u25ba", - "blackrightpointingtriangle": "\u25b6", - "blacksmallsquare": "\u25aa", - "blacksmilingface": "\u263b", - "blacksquare": "\u25a0", - "blackstar": "\u2605", - "blackupperlefttriangle": "\u25e4", - "blackupperrighttriangle": "\u25e5", - "blackuppointingsmalltriangle": "\u25b4", - "blackuppointingtriangle": "\u25b2", - "blank": "\u2423", - "blinebelow": "\u1e07", - "block": "\u2588", - "bmonospace": "\uff42", - "bobaimaithai": "\u0e1a", - "bohiragana": "\u307c", - "bokatakana": "\u30dc", - "bparen": "\u249d", - "bqsquare": "\u33c3", - "braceex": "\uf8f4", - "braceleft": "\u007b", - "braceleftbt": "\uf8f3", - "braceleftmid": "\uf8f2", - "braceleftmonospace": "\uff5b", - "braceleftsmall": "\ufe5b", - "bracelefttp": "\uf8f1", - "braceleftvertical": "\ufe37", - "braceright": "\u007d", - "bracerightbt": "\uf8fe", - "bracerightmid": "\uf8fd", - "bracerightmonospace": "\uff5d", - "bracerightsmall": "\ufe5c", - "bracerighttp": "\uf8fc", - "bracerightvertical": "\ufe38", - "bracketleft": "\u005b", - "bracketleftbt": "\uf8f0", - "bracketleftex": "\uf8ef", - "bracketleftmonospace": "\uff3b", - "bracketlefttp": "\uf8ee", - "bracketright": "\u005d", - "bracketrightbt": "\uf8fb", - "bracketrightex": "\uf8fa", - "bracketrightmonospace": "\uff3d", - "bracketrighttp": "\uf8f9", - "breve": "\u02d8", - "brevebelowcmb": "\u032e", - "brevecmb": "\u0306", - "breveinvertedbelowcmb": "\u032f", - "breveinvertedcmb": "\u0311", - "breveinverteddoublecmb": "\u0361", - "bridgebelowcmb": "\u032a", - "bridgeinvertedbelowcmb": "\u033a", - "brokenbar": "\u00a6", - "bstroke": "\u0180", - "bsuperior": "\uf6ea", - "btopbar": "\u0183", - "buhiragana": "\u3076", - "bukatakana": "\u30d6", - "bullet": "\u2022", - "bulletinverse": "\u25d8", - "bulletoperator": "\u2219", - "bullseye": "\u25ce", - "c": "\u0063", - "caarmenian": "\u056e", - "cabengali": "\u099a", - "cacute": "\u0107", - "cadeva": "\u091a", - "cagujarati": "\u0a9a", - "cagurmukhi": "\u0a1a", - "calsquare": "\u3388", - "candrabindubengali": "\u0981", - "candrabinducmb": "\u0310", - "candrabindudeva": "\u0901", - "candrabindugujarati": "\u0a81", - "capslock": "\u21ea", - "careof": "\u2105", - "caron": "\u02c7", - "caronbelowcmb": "\u032c", - "caroncmb": "\u030c", - "carriagereturn": "\u21b5", - "cbopomofo": "\u3118", - "ccaron": "\u010d", - "ccedilla": "\u00e7", - "ccedillaacute": "\u1e09", - "ccircle": "\u24d2", - "ccircumflex": "\u0109", - "ccurl": "\u0255", - "cdot": "\u010b", - "cdotaccent": "\u010b", - "cdsquare": "\u33c5", - "cedilla": "\u00b8", - "cedillacmb": "\u0327", - "cent": "\u00a2", - "centigrade": "\u2103", - "centinferior": "\uf6df", - "centmonospace": "\uffe0", - "centoldstyle": "\uf7a2", - "centsuperior": "\uf6e0", - "chaarmenian": "\u0579", - "chabengali": "\u099b", - "chadeva": "\u091b", - "chagujarati": "\u0a9b", - "chagurmukhi": "\u0a1b", - "chbopomofo": "\u3114", - "cheabkhasiancyrillic": "\u04bd", - "checkmark": "\u2713", - "checyrillic": "\u0447", - "chedescenderabkhasiancyrillic": "\u04bf", - "chedescendercyrillic": "\u04b7", - "chedieresiscyrillic": "\u04f5", - "cheharmenian": "\u0573", - "chekhakassiancyrillic": "\u04cc", - "cheverticalstrokecyrillic": "\u04b9", - "chi": "\u03c7", - "chieuchacirclekorean": "\u3277", - "chieuchaparenkorean": "\u3217", - "chieuchcirclekorean": "\u3269", - "chieuchkorean": "\u314a", - "chieuchparenkorean": "\u3209", - "chochangthai": "\u0e0a", - "chochanthai": "\u0e08", - "chochingthai": "\u0e09", - "chochoethai": "\u0e0c", - "chook": "\u0188", - "cieucacirclekorean": "\u3276", - "cieucaparenkorean": "\u3216", - "cieuccirclekorean": "\u3268", - "cieuckorean": "\u3148", - "cieucparenkorean": "\u3208", - "cieucuparenkorean": "\u321c", - "circle": "\u25cb", - "circlemultiply": "\u2297", - "circleot": "\u2299", - "circleplus": "\u2295", - "circlepostalmark": "\u3036", - "circlewithlefthalfblack": "\u25d0", - "circlewithrighthalfblack": "\u25d1", - "circumflex": "\u02c6", - "circumflexbelowcmb": "\u032d", - "circumflexcmb": "\u0302", - "clear": "\u2327", - "clickalveolar": "\u01c2", - "clickdental": "\u01c0", - "clicklateral": "\u01c1", - "clickretroflex": "\u01c3", - "club": "\u2663", - "clubsuitblack": "\u2663", - "clubsuitwhite": "\u2667", - "cmcubedsquare": "\u33a4", - "cmonospace": "\uff43", - "cmsquaredsquare": "\u33a0", - "coarmenian": "\u0581", - "colon": "\u003a", - "colonmonetary": "\u20a1", - "colonmonospace": "\uff1a", - "colonsign": "\u20a1", - "colonsmall": "\ufe55", - "colontriangularhalfmod": "\u02d1", - "colontriangularmod": "\u02d0", - "comma": "\u002c", - "commaabovecmb": "\u0313", - "commaaboverightcmb": "\u0315", - "commaaccent": "\uf6c3", - "commaarabic": "\u060c", - "commaarmenian": "\u055d", - "commainferior": "\uf6e1", - "commamonospace": "\uff0c", - "commareversedabovecmb": "\u0314", - "commareversedmod": "\u02bd", - "commasmall": "\ufe50", - "commasuperior": "\uf6e2", - "commaturnedabovecmb": "\u0312", - "commaturnedmod": "\u02bb", - "compass": "\u263c", - "congruent": "\u2245", - "contourintegral": "\u222e", - "control": "\u2303", - "controlACK": "\u0006", - "controlBEL": "\u0007", - "controlBS": "\u0008", - "controlCAN": "\u0018", - "controlCR": "\u000d", - "controlDC1": "\u0011", - "controlDC2": "\u0012", - "controlDC3": "\u0013", - "controlDC4": "\u0014", - "controlDEL": "\u007f", - "controlDLE": "\u0010", - "controlEM": "\u0019", - "controlENQ": "\u0005", - "controlEOT": "\u0004", - "controlESC": "\u001b", - "controlETB": "\u0017", - "controlETX": "\u0003", - "controlFF": "\u000c", - "controlFS": "\u001c", - "controlGS": "\u001d", - "controlHT": "\u0009", - "controlLF": "\u000a", - "controlNAK": "\u0015", - "controlRS": "\u001e", - "controlSI": "\u000f", - "controlSO": "\u000e", - "controlSOT": "\u0002", - "controlSTX": "\u0001", - "controlSUB": "\u001a", - "controlSYN": "\u0016", - "controlUS": "\u001f", - "controlVT": "\u000b", - "copyright": "\u00a9", - "copyrightsans": "\uf8e9", - "copyrightserif": "\uf6d9", - "cornerbracketleft": "\u300c", - "cornerbracketlefthalfwidth": "\uff62", - "cornerbracketleftvertical": "\ufe41", - "cornerbracketright": "\u300d", - "cornerbracketrighthalfwidth": "\uff63", - "cornerbracketrightvertical": "\ufe42", - "corporationsquare": "\u337f", - "cosquare": "\u33c7", - "coverkgsquare": "\u33c6", - "cparen": "\u249e", - "cruzeiro": "\u20a2", - "cstretched": "\u0297", - "curlyand": "\u22cf", - "curlyor": "\u22ce", - "currency": "\u00a4", - "cyrBreve": "\uf6d1", - "cyrFlex": "\uf6d2", - "cyrbreve": "\uf6d4", - "cyrflex": "\uf6d5", - "d": "\u0064", - "daarmenian": "\u0564", - "dabengali": "\u09a6", - "dadarabic": "\u0636", - "dadeva": "\u0926", - "dadfinalarabic": "\ufebe", - "dadinitialarabic": "\ufebf", - "dadmedialarabic": "\ufec0", - "dagesh": "\u05bc", - "dageshhebrew": "\u05bc", - "dagger": "\u2020", - "daggerdbl": "\u2021", - "dagujarati": "\u0aa6", - "dagurmukhi": "\u0a26", - "dahiragana": "\u3060", - "dakatakana": "\u30c0", - "dalarabic": "\u062f", - "dalet": "\u05d3", - "daletdagesh": "\ufb33", - "daletdageshhebrew": "\ufb33", - "dalethatafpatah": "\u05d3\u05b2", - "dalethatafpatahhebrew": "\u05d3\u05b2", - "dalethatafsegol": "\u05d3\u05b1", - "dalethatafsegolhebrew": "\u05d3\u05b1", - "dalethebrew": "\u05d3", - "dalethiriq": "\u05d3\u05b4", - "dalethiriqhebrew": "\u05d3\u05b4", - "daletholam": "\u05d3\u05b9", - "daletholamhebrew": "\u05d3\u05b9", - "daletpatah": "\u05d3\u05b7", - "daletpatahhebrew": "\u05d3\u05b7", - "daletqamats": "\u05d3\u05b8", - "daletqamatshebrew": "\u05d3\u05b8", - "daletqubuts": "\u05d3\u05bb", - "daletqubutshebrew": "\u05d3\u05bb", - "daletsegol": "\u05d3\u05b6", - "daletsegolhebrew": "\u05d3\u05b6", - "daletsheva": "\u05d3\u05b0", - "daletshevahebrew": "\u05d3\u05b0", - "dalettsere": "\u05d3\u05b5", - "dalettserehebrew": "\u05d3\u05b5", - "dalfinalarabic": "\ufeaa", - "dammaarabic": "\u064f", - "dammalowarabic": "\u064f", - "dammatanaltonearabic": "\u064c", - "dammatanarabic": "\u064c", - "danda": "\u0964", - "dargahebrew": "\u05a7", - "dargalefthebrew": "\u05a7", - "dasiapneumatacyrilliccmb": "\u0485", - "dblGrave": "\uf6d3", - "dblanglebracketleft": "\u300a", - "dblanglebracketleftvertical": "\ufe3d", - "dblanglebracketright": "\u300b", - "dblanglebracketrightvertical": "\ufe3e", - "dblarchinvertedbelowcmb": "\u032b", - "dblarrowleft": "\u21d4", - "dblarrowright": "\u21d2", - "dbldanda": "\u0965", - "dblgrave": "\uf6d6", - "dblgravecmb": "\u030f", - "dblintegral": "\u222c", - "dbllowline": "\u2017", - "dbllowlinecmb": "\u0333", - "dbloverlinecmb": "\u033f", - "dblprimemod": "\u02ba", - "dblverticalbar": "\u2016", - "dblverticallineabovecmb": "\u030e", - "dbopomofo": "\u3109", - "dbsquare": "\u33c8", - "dcaron": "\u010f", - "dcedilla": "\u1e11", - "dcircle": "\u24d3", - "dcircumflexbelow": "\u1e13", - "dcroat": "\u0111", - "ddabengali": "\u09a1", - "ddadeva": "\u0921", - "ddagujarati": "\u0aa1", - "ddagurmukhi": "\u0a21", - "ddalarabic": "\u0688", - "ddalfinalarabic": "\ufb89", - "dddhadeva": "\u095c", - "ddhabengali": "\u09a2", - "ddhadeva": "\u0922", - "ddhagujarati": "\u0aa2", - "ddhagurmukhi": "\u0a22", - "ddotaccent": "\u1e0b", - "ddotbelow": "\u1e0d", - "decimalseparatorarabic": "\u066b", - "decimalseparatorpersian": "\u066b", - "decyrillic": "\u0434", - "degree": "\u00b0", - "dehihebrew": "\u05ad", - "dehiragana": "\u3067", - "deicoptic": "\u03ef", - "dekatakana": "\u30c7", - "deleteleft": "\u232b", - "deleteright": "\u2326", - "delta": "\u03b4", - "deltaturned": "\u018d", - "denominatorminusonenumeratorbengali": "\u09f8", - "dezh": "\u02a4", - "dhabengali": "\u09a7", - "dhadeva": "\u0927", - "dhagujarati": "\u0aa7", - "dhagurmukhi": "\u0a27", - "dhook": "\u0257", - "dialytikatonos": "\u0385", - "dialytikatonoscmb": "\u0344", - "diamond": "\u2666", - "diamondsuitwhite": "\u2662", - "dieresis": "\u00a8", - "dieresisacute": "\uf6d7", - "dieresisbelowcmb": "\u0324", - "dieresiscmb": "\u0308", - "dieresisgrave": "\uf6d8", - "dieresistonos": "\u0385", - "dihiragana": "\u3062", - "dikatakana": "\u30c2", - "dittomark": "\u3003", - "divide": "\u00f7", - "divides": "\u2223", - "divisionslash": "\u2215", - "djecyrillic": "\u0452", - "dkshade": "\u2593", - "dlinebelow": "\u1e0f", - "dlsquare": "\u3397", - "dmacron": "\u0111", - "dmonospace": "\uff44", - "dnblock": "\u2584", - "dochadathai": "\u0e0e", - "dodekthai": "\u0e14", - "dohiragana": "\u3069", - "dokatakana": "\u30c9", - "dollar": "\u0024", - "dollarinferior": "\uf6e3", - "dollarmonospace": "\uff04", - "dollaroldstyle": "\uf724", - "dollarsmall": "\ufe69", - "dollarsuperior": "\uf6e4", - "dong": "\u20ab", - "dorusquare": "\u3326", - "dotaccent": "\u02d9", - "dotaccentcmb": "\u0307", - "dotbelowcmb": "\u0323", - "dotbelowcomb": "\u0323", - "dotkatakana": "\u30fb", - "dotlessi": "\u0131", - "dotlessj": "\uf6be", - "dotlessjstrokehook": "\u0284", - "dotmath": "\u22c5", - "dottedcircle": "\u25cc", - "doubleyodpatah": "\ufb1f", - "doubleyodpatahhebrew": "\ufb1f", - "downtackbelowcmb": "\u031e", - "downtackmod": "\u02d5", - "dparen": "\u249f", - "dsuperior": "\uf6eb", - "dtail": "\u0256", - "dtopbar": "\u018c", - "duhiragana": "\u3065", - "dukatakana": "\u30c5", - "dz": "\u01f3", - "dzaltone": "\u02a3", - "dzcaron": "\u01c6", - "dzcurl": "\u02a5", - "dzeabkhasiancyrillic": "\u04e1", - "dzecyrillic": "\u0455", - "dzhecyrillic": "\u045f", - "e": "\u0065", - "eacute": "\u00e9", - "earth": "\u2641", - "ebengali": "\u098f", - "ebopomofo": "\u311c", - "ebreve": "\u0115", - "ecandradeva": "\u090d", - "ecandragujarati": "\u0a8d", - "ecandravowelsigndeva": "\u0945", - "ecandravowelsigngujarati": "\u0ac5", - "ecaron": "\u011b", - "ecedillabreve": "\u1e1d", - "echarmenian": "\u0565", - "echyiwnarmenian": "\u0587", - "ecircle": "\u24d4", - "ecircumflex": "\u00ea", - "ecircumflexacute": "\u1ebf", - "ecircumflexbelow": "\u1e19", - "ecircumflexdotbelow": "\u1ec7", - "ecircumflexgrave": "\u1ec1", - "ecircumflexhookabove": "\u1ec3", - "ecircumflextilde": "\u1ec5", - "ecyrillic": "\u0454", - "edblgrave": "\u0205", - "edeva": "\u090f", - "edieresis": "\u00eb", - "edot": "\u0117", - "edotaccent": "\u0117", - "edotbelow": "\u1eb9", - "eegurmukhi": "\u0a0f", - "eematragurmukhi": "\u0a47", - "efcyrillic": "\u0444", - "egrave": "\u00e8", - "egujarati": "\u0a8f", - "eharmenian": "\u0567", - "ehbopomofo": "\u311d", - "ehiragana": "\u3048", - "ehookabove": "\u1ebb", - "eibopomofo": "\u311f", - "eight": "\u0038", - "eightarabic": "\u0668", - "eightbengali": "\u09ee", - "eightcircle": "\u2467", - "eightcircleinversesansserif": "\u2791", - "eightdeva": "\u096e", - "eighteencircle": "\u2471", - "eighteenparen": "\u2485", - "eighteenperiod": "\u2499", - "eightgujarati": "\u0aee", - "eightgurmukhi": "\u0a6e", - "eighthackarabic": "\u0668", - "eighthangzhou": "\u3028", - "eighthnotebeamed": "\u266b", - "eightideographicparen": "\u3227", - "eightinferior": "\u2088", - "eightmonospace": "\uff18", - "eightoldstyle": "\uf738", - "eightparen": "\u247b", - "eightperiod": "\u248f", - "eightpersian": "\u06f8", - "eightroman": "\u2177", - "eightsuperior": "\u2078", - "eightthai": "\u0e58", - "einvertedbreve": "\u0207", - "eiotifiedcyrillic": "\u0465", - "ekatakana": "\u30a8", - "ekatakanahalfwidth": "\uff74", - "ekonkargurmukhi": "\u0a74", - "ekorean": "\u3154", - "elcyrillic": "\u043b", - "element": "\u2208", - "elevencircle": "\u246a", - "elevenparen": "\u247e", - "elevenperiod": "\u2492", - "elevenroman": "\u217a", - "ellipsis": "\u2026", - "ellipsisvertical": "\u22ee", - "emacron": "\u0113", - "emacronacute": "\u1e17", - "emacrongrave": "\u1e15", - "emcyrillic": "\u043c", - "emdash": "\u2014", - "emdashvertical": "\ufe31", - "emonospace": "\uff45", - "emphasismarkarmenian": "\u055b", - "emptyset": "\u2205", - "enbopomofo": "\u3123", - "encyrillic": "\u043d", - "endash": "\u2013", - "endashvertical": "\ufe32", - "endescendercyrillic": "\u04a3", - "eng": "\u014b", - "engbopomofo": "\u3125", - "enghecyrillic": "\u04a5", - "enhookcyrillic": "\u04c8", - "enspace": "\u2002", - "eogonek": "\u0119", - "eokorean": "\u3153", - "eopen": "\u025b", - "eopenclosed": "\u029a", - "eopenreversed": "\u025c", - "eopenreversedclosed": "\u025e", - "eopenreversedhook": "\u025d", - "eparen": "\u24a0", - "epsilon": "\u03b5", - "epsilontonos": "\u03ad", - "equal": "\u003d", - "equalmonospace": "\uff1d", - "equalsmall": "\ufe66", - "equalsuperior": "\u207c", - "equivalence": "\u2261", - "erbopomofo": "\u3126", - "ercyrillic": "\u0440", - "ereversed": "\u0258", - "ereversedcyrillic": "\u044d", - "escyrillic": "\u0441", - "esdescendercyrillic": "\u04ab", - "esh": "\u0283", - "eshcurl": "\u0286", - "eshortdeva": "\u090e", - "eshortvowelsigndeva": "\u0946", - "eshreversedloop": "\u01aa", - "eshsquatreversed": "\u0285", - "esmallhiragana": "\u3047", - "esmallkatakana": "\u30a7", - "esmallkatakanahalfwidth": "\uff6a", - "estimated": "\u212e", - "esuperior": "\uf6ec", - "eta": "\u03b7", - "etarmenian": "\u0568", - "etatonos": "\u03ae", - "eth": "\u00f0", - "etilde": "\u1ebd", - "etildebelow": "\u1e1b", - "etnahtafoukhhebrew": "\u0591", - "etnahtafoukhlefthebrew": "\u0591", - "etnahtahebrew": "\u0591", - "etnahtalefthebrew": "\u0591", - "eturned": "\u01dd", - "eukorean": "\u3161", - "euro": "\u20ac", - "evowelsignbengali": "\u09c7", - "evowelsigndeva": "\u0947", - "evowelsigngujarati": "\u0ac7", - "exclam": "\u0021", - "exclamarmenian": "\u055c", - "exclamdbl": "\u203c", - "exclamdown": "\u00a1", - "exclamdownsmall": "\uf7a1", - "exclammonospace": "\uff01", - "exclamsmall": "\uf721", - "existential": "\u2203", - "ezh": "\u0292", - "ezhcaron": "\u01ef", - "ezhcurl": "\u0293", - "ezhreversed": "\u01b9", - "ezhtail": "\u01ba", - "f": "\u0066", - "fadeva": "\u095e", - "fagurmukhi": "\u0a5e", - "fahrenheit": "\u2109", - "fathaarabic": "\u064e", - "fathalowarabic": "\u064e", - "fathatanarabic": "\u064b", - "fbopomofo": "\u3108", - "fcircle": "\u24d5", - "fdotaccent": "\u1e1f", - "feharabic": "\u0641", - "feharmenian": "\u0586", - "fehfinalarabic": "\ufed2", - "fehinitialarabic": "\ufed3", - "fehmedialarabic": "\ufed4", - "feicoptic": "\u03e5", - "female": "\u2640", - "ff": "\ufb00", - "ffi": "\ufb03", - "ffl": "\ufb04", - "fi": "\ufb01", - "fifteencircle": "\u246e", - "fifteenparen": "\u2482", - "fifteenperiod": "\u2496", - "figuredash": "\u2012", - "filledbox": "\u25a0", - "filledrect": "\u25ac", - "finalkaf": "\u05da", - "finalkafdagesh": "\ufb3a", - "finalkafdageshhebrew": "\ufb3a", - "finalkafhebrew": "\u05da", - "finalkafqamats": "\u05da\u05b8", - "finalkafqamatshebrew": "\u05da\u05b8", - "finalkafsheva": "\u05da\u05b0", - "finalkafshevahebrew": "\u05da\u05b0", - "finalmem": "\u05dd", - "finalmemhebrew": "\u05dd", - "finalnun": "\u05df", - "finalnunhebrew": "\u05df", - "finalpe": "\u05e3", - "finalpehebrew": "\u05e3", - "finaltsadi": "\u05e5", - "finaltsadihebrew": "\u05e5", - "firsttonechinese": "\u02c9", - "fisheye": "\u25c9", - "fitacyrillic": "\u0473", - "five": "\u0035", - "fivearabic": "\u0665", - "fivebengali": "\u09eb", - "fivecircle": "\u2464", - "fivecircleinversesansserif": "\u278e", - "fivedeva": "\u096b", - "fiveeighths": "\u215d", - "fivegujarati": "\u0aeb", - "fivegurmukhi": "\u0a6b", - "fivehackarabic": "\u0665", - "fivehangzhou": "\u3025", - "fiveideographicparen": "\u3224", - "fiveinferior": "\u2085", - "fivemonospace": "\uff15", - "fiveoldstyle": "\uf735", - "fiveparen": "\u2478", - "fiveperiod": "\u248c", - "fivepersian": "\u06f5", - "fiveroman": "\u2174", - "fivesuperior": "\u2075", - "fivethai": "\u0e55", - "fl": "\ufb02", - "florin": "\u0192", - "fmonospace": "\uff46", - "fmsquare": "\u3399", - "fofanthai": "\u0e1f", - "fofathai": "\u0e1d", - "fongmanthai": "\u0e4f", - "forall": "\u2200", - "four": "\u0034", - "fourarabic": "\u0664", - "fourbengali": "\u09ea", - "fourcircle": "\u2463", - "fourcircleinversesansserif": "\u278d", - "fourdeva": "\u096a", - "fourgujarati": "\u0aea", - "fourgurmukhi": "\u0a6a", - "fourhackarabic": "\u0664", - "fourhangzhou": "\u3024", - "fourideographicparen": "\u3223", - "fourinferior": "\u2084", - "fourmonospace": "\uff14", - "fournumeratorbengali": "\u09f7", - "fouroldstyle": "\uf734", - "fourparen": "\u2477", - "fourperiod": "\u248b", - "fourpersian": "\u06f4", - "fourroman": "\u2173", - "foursuperior": "\u2074", - "fourteencircle": "\u246d", - "fourteenparen": "\u2481", - "fourteenperiod": "\u2495", - "fourthai": "\u0e54", - "fourthtonechinese": "\u02cb", - "fparen": "\u24a1", - "fraction": "\u2044", - "franc": "\u20a3", - "g": "\u0067", - "gabengali": "\u0997", - "gacute": "\u01f5", - "gadeva": "\u0917", - "gafarabic": "\u06af", - "gaffinalarabic": "\ufb93", - "gafinitialarabic": "\ufb94", - "gafmedialarabic": "\ufb95", - "gagujarati": "\u0a97", - "gagurmukhi": "\u0a17", - "gahiragana": "\u304c", - "gakatakana": "\u30ac", - "gamma": "\u03b3", - "gammalatinsmall": "\u0263", - "gammasuperior": "\u02e0", - "gangiacoptic": "\u03eb", - "gbopomofo": "\u310d", - "gbreve": "\u011f", - "gcaron": "\u01e7", - "gcedilla": "\u0123", - "gcircle": "\u24d6", - "gcircumflex": "\u011d", - "gcommaaccent": "\u0123", - "gdot": "\u0121", - "gdotaccent": "\u0121", - "gecyrillic": "\u0433", - "gehiragana": "\u3052", - "gekatakana": "\u30b2", - "geometricallyequal": "\u2251", - "gereshaccenthebrew": "\u059c", - "gereshhebrew": "\u05f3", - "gereshmuqdamhebrew": "\u059d", - "germandbls": "\u00df", - "gershayimaccenthebrew": "\u059e", - "gershayimhebrew": "\u05f4", - "getamark": "\u3013", - "ghabengali": "\u0998", - "ghadarmenian": "\u0572", - "ghadeva": "\u0918", - "ghagujarati": "\u0a98", - "ghagurmukhi": "\u0a18", - "ghainarabic": "\u063a", - "ghainfinalarabic": "\ufece", - "ghaininitialarabic": "\ufecf", - "ghainmedialarabic": "\ufed0", - "ghemiddlehookcyrillic": "\u0495", - "ghestrokecyrillic": "\u0493", - "gheupturncyrillic": "\u0491", - "ghhadeva": "\u095a", - "ghhagurmukhi": "\u0a5a", - "ghook": "\u0260", - "ghzsquare": "\u3393", - "gihiragana": "\u304e", - "gikatakana": "\u30ae", - "gimarmenian": "\u0563", - "gimel": "\u05d2", - "gimeldagesh": "\ufb32", - "gimeldageshhebrew": "\ufb32", - "gimelhebrew": "\u05d2", - "gjecyrillic": "\u0453", - "glottalinvertedstroke": "\u01be", - "glottalstop": "\u0294", - "glottalstopinverted": "\u0296", - "glottalstopmod": "\u02c0", - "glottalstopreversed": "\u0295", - "glottalstopreversedmod": "\u02c1", - "glottalstopreversedsuperior": "\u02e4", - "glottalstopstroke": "\u02a1", - "glottalstopstrokereversed": "\u02a2", - "gmacron": "\u1e21", - "gmonospace": "\uff47", - "gohiragana": "\u3054", - "gokatakana": "\u30b4", - "gparen": "\u24a2", - "gpasquare": "\u33ac", - "gradient": "\u2207", - "grave": "\u0060", - "gravebelowcmb": "\u0316", - "gravecmb": "\u0300", - "gravecomb": "\u0300", - "gravedeva": "\u0953", - "gravelowmod": "\u02ce", - "gravemonospace": "\uff40", - "gravetonecmb": "\u0340", - "greater": "\u003e", - "greaterequal": "\u2265", - "greaterequalorless": "\u22db", - "greatermonospace": "\uff1e", - "greaterorequivalent": "\u2273", - "greaterorless": "\u2277", - "greateroverequal": "\u2267", - "greatersmall": "\ufe65", - "gscript": "\u0261", - "gstroke": "\u01e5", - "guhiragana": "\u3050", - "guillemotleft": "\u00ab", - "guillemotright": "\u00bb", - "guilsinglleft": "\u2039", - "guilsinglright": "\u203a", - "gukatakana": "\u30b0", - "guramusquare": "\u3318", - "gysquare": "\u33c9", - "h": "\u0068", - "haabkhasiancyrillic": "\u04a9", - "haaltonearabic": "\u06c1", - "habengali": "\u09b9", - "hadescendercyrillic": "\u04b3", - "hadeva": "\u0939", - "hagujarati": "\u0ab9", - "hagurmukhi": "\u0a39", - "haharabic": "\u062d", - "hahfinalarabic": "\ufea2", - "hahinitialarabic": "\ufea3", - "hahiragana": "\u306f", - "hahmedialarabic": "\ufea4", - "haitusquare": "\u332a", - "hakatakana": "\u30cf", - "hakatakanahalfwidth": "\uff8a", - "halantgurmukhi": "\u0a4d", - "hamzaarabic": "\u0621", - "hamzadammaarabic": "\u0621\u064f", - "hamzadammatanarabic": "\u0621\u064c", - "hamzafathaarabic": "\u0621\u064e", - "hamzafathatanarabic": "\u0621\u064b", - "hamzalowarabic": "\u0621", - "hamzalowkasraarabic": "\u0621\u0650", - "hamzalowkasratanarabic": "\u0621\u064d", - "hamzasukunarabic": "\u0621\u0652", - "hangulfiller": "\u3164", - "hardsigncyrillic": "\u044a", - "harpoonleftbarbup": "\u21bc", - "harpoonrightbarbup": "\u21c0", - "hasquare": "\u33ca", - "hatafpatah": "\u05b2", - "hatafpatah16": "\u05b2", - "hatafpatah23": "\u05b2", - "hatafpatah2f": "\u05b2", - "hatafpatahhebrew": "\u05b2", - "hatafpatahnarrowhebrew": "\u05b2", - "hatafpatahquarterhebrew": "\u05b2", - "hatafpatahwidehebrew": "\u05b2", - "hatafqamats": "\u05b3", - "hatafqamats1b": "\u05b3", - "hatafqamats28": "\u05b3", - "hatafqamats34": "\u05b3", - "hatafqamatshebrew": "\u05b3", - "hatafqamatsnarrowhebrew": "\u05b3", - "hatafqamatsquarterhebrew": "\u05b3", - "hatafqamatswidehebrew": "\u05b3", - "hatafsegol": "\u05b1", - "hatafsegol17": "\u05b1", - "hatafsegol24": "\u05b1", - "hatafsegol30": "\u05b1", - "hatafsegolhebrew": "\u05b1", - "hatafsegolnarrowhebrew": "\u05b1", - "hatafsegolquarterhebrew": "\u05b1", - "hatafsegolwidehebrew": "\u05b1", - "hbar": "\u0127", - "hbopomofo": "\u310f", - "hbrevebelow": "\u1e2b", - "hcedilla": "\u1e29", - "hcircle": "\u24d7", - "hcircumflex": "\u0125", - "hdieresis": "\u1e27", - "hdotaccent": "\u1e23", - "hdotbelow": "\u1e25", - "he": "\u05d4", - "heart": "\u2665", - "heartsuitblack": "\u2665", - "heartsuitwhite": "\u2661", - "hedagesh": "\ufb34", - "hedageshhebrew": "\ufb34", - "hehaltonearabic": "\u06c1", - "heharabic": "\u0647", - "hehebrew": "\u05d4", - "hehfinalaltonearabic": "\ufba7", - "hehfinalalttwoarabic": "\ufeea", - "hehfinalarabic": "\ufeea", - "hehhamzaabovefinalarabic": "\ufba5", - "hehhamzaaboveisolatedarabic": "\ufba4", - "hehinitialaltonearabic": "\ufba8", - "hehinitialarabic": "\ufeeb", - "hehiragana": "\u3078", - "hehmedialaltonearabic": "\ufba9", - "hehmedialarabic": "\ufeec", - "heiseierasquare": "\u337b", - "hekatakana": "\u30d8", - "hekatakanahalfwidth": "\uff8d", - "hekutaarusquare": "\u3336", - "henghook": "\u0267", - "herutusquare": "\u3339", - "het": "\u05d7", - "hethebrew": "\u05d7", - "hhook": "\u0266", - "hhooksuperior": "\u02b1", - "hieuhacirclekorean": "\u327b", - "hieuhaparenkorean": "\u321b", - "hieuhcirclekorean": "\u326d", - "hieuhkorean": "\u314e", - "hieuhparenkorean": "\u320d", - "hihiragana": "\u3072", - "hikatakana": "\u30d2", - "hikatakanahalfwidth": "\uff8b", - "hiriq": "\u05b4", - "hiriq14": "\u05b4", - "hiriq21": "\u05b4", - "hiriq2d": "\u05b4", - "hiriqhebrew": "\u05b4", - "hiriqnarrowhebrew": "\u05b4", - "hiriqquarterhebrew": "\u05b4", - "hiriqwidehebrew": "\u05b4", - "hlinebelow": "\u1e96", - "hmonospace": "\uff48", - "hoarmenian": "\u0570", - "hohipthai": "\u0e2b", - "hohiragana": "\u307b", - "hokatakana": "\u30db", - "hokatakanahalfwidth": "\uff8e", - "holam": "\u05b9", - "holam19": "\u05b9", - "holam26": "\u05b9", - "holam32": "\u05b9", - "holamhebrew": "\u05b9", - "holamnarrowhebrew": "\u05b9", - "holamquarterhebrew": "\u05b9", - "holamwidehebrew": "\u05b9", - "honokhukthai": "\u0e2e", - "hookabovecomb": "\u0309", - "hookcmb": "\u0309", - "hookpalatalizedbelowcmb": "\u0321", - "hookretroflexbelowcmb": "\u0322", - "hoonsquare": "\u3342", - "horicoptic": "\u03e9", - "horizontalbar": "\u2015", - "horncmb": "\u031b", - "hotsprings": "\u2668", - "house": "\u2302", - "hparen": "\u24a3", - "hsuperior": "\u02b0", - "hturned": "\u0265", - "huhiragana": "\u3075", - "huiitosquare": "\u3333", - "hukatakana": "\u30d5", - "hukatakanahalfwidth": "\uff8c", - "hungarumlaut": "\u02dd", - "hungarumlautcmb": "\u030b", - "hv": "\u0195", - "hyphen": "\u002d", - "hypheninferior": "\uf6e5", - "hyphenmonospace": "\uff0d", - "hyphensmall": "\ufe63", - "hyphensuperior": "\uf6e6", - "hyphentwo": "\u2010", - "i": "\u0069", - "iacute": "\u00ed", - "iacyrillic": "\u044f", - "ibengali": "\u0987", - "ibopomofo": "\u3127", - "ibreve": "\u012d", - "icaron": "\u01d0", - "icircle": "\u24d8", - "icircumflex": "\u00ee", - "icyrillic": "\u0456", - "idblgrave": "\u0209", - "ideographearthcircle": "\u328f", - "ideographfirecircle": "\u328b", - "ideographicallianceparen": "\u323f", - "ideographiccallparen": "\u323a", - "ideographiccentrecircle": "\u32a5", - "ideographicclose": "\u3006", - "ideographiccomma": "\u3001", - "ideographiccommaleft": "\uff64", - "ideographiccongratulationparen": "\u3237", - "ideographiccorrectcircle": "\u32a3", - "ideographicearthparen": "\u322f", - "ideographicenterpriseparen": "\u323d", - "ideographicexcellentcircle": "\u329d", - "ideographicfestivalparen": "\u3240", - "ideographicfinancialcircle": "\u3296", - "ideographicfinancialparen": "\u3236", - "ideographicfireparen": "\u322b", - "ideographichaveparen": "\u3232", - "ideographichighcircle": "\u32a4", - "ideographiciterationmark": "\u3005", - "ideographiclaborcircle": "\u3298", - "ideographiclaborparen": "\u3238", - "ideographicleftcircle": "\u32a7", - "ideographiclowcircle": "\u32a6", - "ideographicmedicinecircle": "\u32a9", - "ideographicmetalparen": "\u322e", - "ideographicmoonparen": "\u322a", - "ideographicnameparen": "\u3234", - "ideographicperiod": "\u3002", - "ideographicprintcircle": "\u329e", - "ideographicreachparen": "\u3243", - "ideographicrepresentparen": "\u3239", - "ideographicresourceparen": "\u323e", - "ideographicrightcircle": "\u32a8", - "ideographicsecretcircle": "\u3299", - "ideographicselfparen": "\u3242", - "ideographicsocietyparen": "\u3233", - "ideographicspace": "\u3000", - "ideographicspecialparen": "\u3235", - "ideographicstockparen": "\u3231", - "ideographicstudyparen": "\u323b", - "ideographicsunparen": "\u3230", - "ideographicsuperviseparen": "\u323c", - "ideographicwaterparen": "\u322c", - "ideographicwoodparen": "\u322d", - "ideographiczero": "\u3007", - "ideographmetalcircle": "\u328e", - "ideographmooncircle": "\u328a", - "ideographnamecircle": "\u3294", - "ideographsuncircle": "\u3290", - "ideographwatercircle": "\u328c", - "ideographwoodcircle": "\u328d", - "ideva": "\u0907", - "idieresis": "\u00ef", - "idieresisacute": "\u1e2f", - "idieresiscyrillic": "\u04e5", - "idotbelow": "\u1ecb", - "iebrevecyrillic": "\u04d7", - "iecyrillic": "\u0435", - "ieungacirclekorean": "\u3275", - "ieungaparenkorean": "\u3215", - "ieungcirclekorean": "\u3267", - "ieungkorean": "\u3147", - "ieungparenkorean": "\u3207", - "igrave": "\u00ec", - "igujarati": "\u0a87", - "igurmukhi": "\u0a07", - "ihiragana": "\u3044", - "ihookabove": "\u1ec9", - "iibengali": "\u0988", - "iicyrillic": "\u0438", - "iideva": "\u0908", - "iigujarati": "\u0a88", - "iigurmukhi": "\u0a08", - "iimatragurmukhi": "\u0a40", - "iinvertedbreve": "\u020b", - "iishortcyrillic": "\u0439", - "iivowelsignbengali": "\u09c0", - "iivowelsigndeva": "\u0940", - "iivowelsigngujarati": "\u0ac0", - "ij": "\u0133", - "ikatakana": "\u30a4", - "ikatakanahalfwidth": "\uff72", - "ikorean": "\u3163", - "ilde": "\u02dc", - "iluyhebrew": "\u05ac", - "imacron": "\u012b", - "imacroncyrillic": "\u04e3", - "imageorapproximatelyequal": "\u2253", - "imatragurmukhi": "\u0a3f", - "imonospace": "\uff49", - "increment": "\u2206", - "infinity": "\u221e", - "iniarmenian": "\u056b", - "integral": "\u222b", - "integralbottom": "\u2321", - "integralbt": "\u2321", - "integralex": "\uf8f5", - "integraltop": "\u2320", - "integraltp": "\u2320", - "intersection": "\u2229", - "intisquare": "\u3305", - "invbullet": "\u25d8", - "invcircle": "\u25d9", - "invsmileface": "\u263b", - "iocyrillic": "\u0451", - "iogonek": "\u012f", - "iota": "\u03b9", - "iotadieresis": "\u03ca", - "iotadieresistonos": "\u0390", - "iotalatin": "\u0269", - "iotatonos": "\u03af", - "iparen": "\u24a4", - "irigurmukhi": "\u0a72", - "ismallhiragana": "\u3043", - "ismallkatakana": "\u30a3", - "ismallkatakanahalfwidth": "\uff68", - "issharbengali": "\u09fa", - "istroke": "\u0268", - "isuperior": "\uf6ed", - "iterationhiragana": "\u309d", - "iterationkatakana": "\u30fd", - "itilde": "\u0129", - "itildebelow": "\u1e2d", - "iubopomofo": "\u3129", - "iucyrillic": "\u044e", - "ivowelsignbengali": "\u09bf", - "ivowelsigndeva": "\u093f", - "ivowelsigngujarati": "\u0abf", - "izhitsacyrillic": "\u0475", - "izhitsadblgravecyrillic": "\u0477", - "j": "\u006a", - "jaarmenian": "\u0571", - "jabengali": "\u099c", - "jadeva": "\u091c", - "jagujarati": "\u0a9c", - "jagurmukhi": "\u0a1c", - "jbopomofo": "\u3110", - "jcaron": "\u01f0", - "jcircle": "\u24d9", - "jcircumflex": "\u0135", - "jcrossedtail": "\u029d", - "jdotlessstroke": "\u025f", - "jecyrillic": "\u0458", - "jeemarabic": "\u062c", - "jeemfinalarabic": "\ufe9e", - "jeeminitialarabic": "\ufe9f", - "jeemmedialarabic": "\ufea0", - "jeharabic": "\u0698", - "jehfinalarabic": "\ufb8b", - "jhabengali": "\u099d", - "jhadeva": "\u091d", - "jhagujarati": "\u0a9d", - "jhagurmukhi": "\u0a1d", - "jheharmenian": "\u057b", - "jis": "\u3004", - "jmonospace": "\uff4a", - "jparen": "\u24a5", - "jsuperior": "\u02b2", - "k": "\u006b", - "kabashkircyrillic": "\u04a1", - "kabengali": "\u0995", - "kacute": "\u1e31", - "kacyrillic": "\u043a", - "kadescendercyrillic": "\u049b", - "kadeva": "\u0915", - "kaf": "\u05db", - "kafarabic": "\u0643", - "kafdagesh": "\ufb3b", - "kafdageshhebrew": "\ufb3b", - "kaffinalarabic": "\ufeda", - "kafhebrew": "\u05db", - "kafinitialarabic": "\ufedb", - "kafmedialarabic": "\ufedc", - "kafrafehebrew": "\ufb4d", - "kagujarati": "\u0a95", - "kagurmukhi": "\u0a15", - "kahiragana": "\u304b", - "kahookcyrillic": "\u04c4", - "kakatakana": "\u30ab", - "kakatakanahalfwidth": "\uff76", - "kappa": "\u03ba", - "kappasymbolgreek": "\u03f0", - "kapyeounmieumkorean": "\u3171", - "kapyeounphieuphkorean": "\u3184", - "kapyeounpieupkorean": "\u3178", - "kapyeounssangpieupkorean": "\u3179", - "karoriisquare": "\u330d", - "kashidaautoarabic": "\u0640", - "kashidaautonosidebearingarabic": "\u0640", - "kasmallkatakana": "\u30f5", - "kasquare": "\u3384", - "kasraarabic": "\u0650", - "kasratanarabic": "\u064d", - "kastrokecyrillic": "\u049f", - "katahiraprolongmarkhalfwidth": "\uff70", - "kaverticalstrokecyrillic": "\u049d", - "kbopomofo": "\u310e", - "kcalsquare": "\u3389", - "kcaron": "\u01e9", - "kcedilla": "\u0137", - "kcircle": "\u24da", - "kcommaaccent": "\u0137", - "kdotbelow": "\u1e33", - "keharmenian": "\u0584", - "kehiragana": "\u3051", - "kekatakana": "\u30b1", - "kekatakanahalfwidth": "\uff79", - "kenarmenian": "\u056f", - "kesmallkatakana": "\u30f6", - "kgreenlandic": "\u0138", - "khabengali": "\u0996", - "khacyrillic": "\u0445", - "khadeva": "\u0916", - "khagujarati": "\u0a96", - "khagurmukhi": "\u0a16", - "khaharabic": "\u062e", - "khahfinalarabic": "\ufea6", - "khahinitialarabic": "\ufea7", - "khahmedialarabic": "\ufea8", - "kheicoptic": "\u03e7", - "khhadeva": "\u0959", - "khhagurmukhi": "\u0a59", - "khieukhacirclekorean": "\u3278", - "khieukhaparenkorean": "\u3218", - "khieukhcirclekorean": "\u326a", - "khieukhkorean": "\u314b", - "khieukhparenkorean": "\u320a", - "khokhaithai": "\u0e02", - "khokhonthai": "\u0e05", - "khokhuatthai": "\u0e03", - "khokhwaithai": "\u0e04", - "khomutthai": "\u0e5b", - "khook": "\u0199", - "khorakhangthai": "\u0e06", - "khzsquare": "\u3391", - "kihiragana": "\u304d", - "kikatakana": "\u30ad", - "kikatakanahalfwidth": "\uff77", - "kiroguramusquare": "\u3315", - "kiromeetorusquare": "\u3316", - "kirosquare": "\u3314", - "kiyeokacirclekorean": "\u326e", - "kiyeokaparenkorean": "\u320e", - "kiyeokcirclekorean": "\u3260", - "kiyeokkorean": "\u3131", - "kiyeokparenkorean": "\u3200", - "kiyeoksioskorean": "\u3133", - "kjecyrillic": "\u045c", - "klinebelow": "\u1e35", - "klsquare": "\u3398", - "kmcubedsquare": "\u33a6", - "kmonospace": "\uff4b", - "kmsquaredsquare": "\u33a2", - "kohiragana": "\u3053", - "kohmsquare": "\u33c0", - "kokaithai": "\u0e01", - "kokatakana": "\u30b3", - "kokatakanahalfwidth": "\uff7a", - "kooposquare": "\u331e", - "koppacyrillic": "\u0481", - "koreanstandardsymbol": "\u327f", - "koroniscmb": "\u0343", - "kparen": "\u24a6", - "kpasquare": "\u33aa", - "ksicyrillic": "\u046f", - "ktsquare": "\u33cf", - "kturned": "\u029e", - "kuhiragana": "\u304f", - "kukatakana": "\u30af", - "kukatakanahalfwidth": "\uff78", - "kvsquare": "\u33b8", - "kwsquare": "\u33be", - "l": "\u006c", - "labengali": "\u09b2", - "lacute": "\u013a", - "ladeva": "\u0932", - "lagujarati": "\u0ab2", - "lagurmukhi": "\u0a32", - "lakkhangyaothai": "\u0e45", - "lamaleffinalarabic": "\ufefc", - "lamalefhamzaabovefinalarabic": "\ufef8", - "lamalefhamzaaboveisolatedarabic": "\ufef7", - "lamalefhamzabelowfinalarabic": "\ufefa", - "lamalefhamzabelowisolatedarabic": "\ufef9", - "lamalefisolatedarabic": "\ufefb", - "lamalefmaddaabovefinalarabic": "\ufef6", - "lamalefmaddaaboveisolatedarabic": "\ufef5", - "lamarabic": "\u0644", - "lambda": "\u03bb", - "lambdastroke": "\u019b", - "lamed": "\u05dc", - "lameddagesh": "\ufb3c", - "lameddageshhebrew": "\ufb3c", - "lamedhebrew": "\u05dc", - "lamedholam": "\u05dc\u05b9", - "lamedholamdagesh": "\u05dc\u05b9\u05bc", - "lamedholamdageshhebrew": "\u05dc\u05b9\u05bc", - "lamedholamhebrew": "\u05dc\u05b9", - "lamfinalarabic": "\ufede", - "lamhahinitialarabic": "\ufcca", - "laminitialarabic": "\ufedf", - "lamjeeminitialarabic": "\ufcc9", - "lamkhahinitialarabic": "\ufccb", - "lamlamhehisolatedarabic": "\ufdf2", - "lammedialarabic": "\ufee0", - "lammeemhahinitialarabic": "\ufd88", - "lammeeminitialarabic": "\ufccc", - "lammeemjeeminitialarabic": "\ufedf\ufee4\ufea0", - "lammeemkhahinitialarabic": "\ufedf\ufee4\ufea8", - "largecircle": "\u25ef", - "lbar": "\u019a", - "lbelt": "\u026c", - "lbopomofo": "\u310c", - "lcaron": "\u013e", - "lcedilla": "\u013c", - "lcircle": "\u24db", - "lcircumflexbelow": "\u1e3d", - "lcommaaccent": "\u013c", - "ldot": "\u0140", - "ldotaccent": "\u0140", - "ldotbelow": "\u1e37", - "ldotbelowmacron": "\u1e39", - "leftangleabovecmb": "\u031a", - "lefttackbelowcmb": "\u0318", - "less": "\u003c", - "lessequal": "\u2264", - "lessequalorgreater": "\u22da", - "lessmonospace": "\uff1c", - "lessorequivalent": "\u2272", - "lessorgreater": "\u2276", - "lessoverequal": "\u2266", - "lesssmall": "\ufe64", - "lezh": "\u026e", - "lfblock": "\u258c", - "lhookretroflex": "\u026d", - "lira": "\u20a4", - "liwnarmenian": "\u056c", - "lj": "\u01c9", - "ljecyrillic": "\u0459", - "ll": "\uf6c0", - "lladeva": "\u0933", - "llagujarati": "\u0ab3", - "llinebelow": "\u1e3b", - "llladeva": "\u0934", - "llvocalicbengali": "\u09e1", - "llvocalicdeva": "\u0961", - "llvocalicvowelsignbengali": "\u09e3", - "llvocalicvowelsigndeva": "\u0963", - "lmiddletilde": "\u026b", - "lmonospace": "\uff4c", - "lmsquare": "\u33d0", - "lochulathai": "\u0e2c", - "logicaland": "\u2227", - "logicalnot": "\u00ac", - "logicalnotreversed": "\u2310", - "logicalor": "\u2228", - "lolingthai": "\u0e25", - "longs": "\u017f", - "lowlinecenterline": "\ufe4e", - "lowlinecmb": "\u0332", - "lowlinedashed": "\ufe4d", - "lozenge": "\u25ca", - "lparen": "\u24a7", - "lslash": "\u0142", - "lsquare": "\u2113", - "lsuperior": "\uf6ee", - "ltshade": "\u2591", - "luthai": "\u0e26", - "lvocalicbengali": "\u098c", - "lvocalicdeva": "\u090c", - "lvocalicvowelsignbengali": "\u09e2", - "lvocalicvowelsigndeva": "\u0962", - "lxsquare": "\u33d3", - "m": "\u006d", - "mabengali": "\u09ae", - "macron": "\u00af", - "macronbelowcmb": "\u0331", - "macroncmb": "\u0304", - "macronlowmod": "\u02cd", - "macronmonospace": "\uffe3", - "macute": "\u1e3f", - "madeva": "\u092e", - "magujarati": "\u0aae", - "magurmukhi": "\u0a2e", - "mahapakhhebrew": "\u05a4", - "mahapakhlefthebrew": "\u05a4", - "mahiragana": "\u307e", - "maichattawalowleftthai": "\uf895", - "maichattawalowrightthai": "\uf894", - "maichattawathai": "\u0e4b", - "maichattawaupperleftthai": "\uf893", - "maieklowleftthai": "\uf88c", - "maieklowrightthai": "\uf88b", - "maiekthai": "\u0e48", - "maiekupperleftthai": "\uf88a", - "maihanakatleftthai": "\uf884", - "maihanakatthai": "\u0e31", - "maitaikhuleftthai": "\uf889", - "maitaikhuthai": "\u0e47", - "maitholowleftthai": "\uf88f", - "maitholowrightthai": "\uf88e", - "maithothai": "\u0e49", - "maithoupperleftthai": "\uf88d", - "maitrilowleftthai": "\uf892", - "maitrilowrightthai": "\uf891", - "maitrithai": "\u0e4a", - "maitriupperleftthai": "\uf890", - "maiyamokthai": "\u0e46", - "makatakana": "\u30de", - "makatakanahalfwidth": "\uff8f", - "male": "\u2642", - "mansyonsquare": "\u3347", - "maqafhebrew": "\u05be", - "mars": "\u2642", - "masoracirclehebrew": "\u05af", - "masquare": "\u3383", - "mbopomofo": "\u3107", - "mbsquare": "\u33d4", - "mcircle": "\u24dc", - "mcubedsquare": "\u33a5", - "mdotaccent": "\u1e41", - "mdotbelow": "\u1e43", - "meemarabic": "\u0645", - "meemfinalarabic": "\ufee2", - "meeminitialarabic": "\ufee3", - "meemmedialarabic": "\ufee4", - "meemmeeminitialarabic": "\ufcd1", - "meemmeemisolatedarabic": "\ufc48", - "meetorusquare": "\u334d", - "mehiragana": "\u3081", - "meizierasquare": "\u337e", - "mekatakana": "\u30e1", - "mekatakanahalfwidth": "\uff92", - "mem": "\u05de", - "memdagesh": "\ufb3e", - "memdageshhebrew": "\ufb3e", - "memhebrew": "\u05de", - "menarmenian": "\u0574", - "merkhahebrew": "\u05a5", - "merkhakefulahebrew": "\u05a6", - "merkhakefulalefthebrew": "\u05a6", - "merkhalefthebrew": "\u05a5", - "mhook": "\u0271", - "mhzsquare": "\u3392", - "middledotkatakanahalfwidth": "\uff65", - "middot": "\u00b7", - "mieumacirclekorean": "\u3272", - "mieumaparenkorean": "\u3212", - "mieumcirclekorean": "\u3264", - "mieumkorean": "\u3141", - "mieumpansioskorean": "\u3170", - "mieumparenkorean": "\u3204", - "mieumpieupkorean": "\u316e", - "mieumsioskorean": "\u316f", - "mihiragana": "\u307f", - "mikatakana": "\u30df", - "mikatakanahalfwidth": "\uff90", - "minus": "\u2212", - "minusbelowcmb": "\u0320", - "minuscircle": "\u2296", - "minusmod": "\u02d7", - "minusplus": "\u2213", - "minute": "\u2032", - "miribaarusquare": "\u334a", - "mirisquare": "\u3349", - "mlonglegturned": "\u0270", - "mlsquare": "\u3396", - "mmcubedsquare": "\u33a3", - "mmonospace": "\uff4d", - "mmsquaredsquare": "\u339f", - "mohiragana": "\u3082", - "mohmsquare": "\u33c1", - "mokatakana": "\u30e2", - "mokatakanahalfwidth": "\uff93", - "molsquare": "\u33d6", - "momathai": "\u0e21", - "moverssquare": "\u33a7", - "moverssquaredsquare": "\u33a8", - "mparen": "\u24a8", - "mpasquare": "\u33ab", - "mssquare": "\u33b3", - "msuperior": "\uf6ef", - "mturned": "\u026f", - "mu": "\u00b5", - "mu1": "\u00b5", - "muasquare": "\u3382", - "muchgreater": "\u226b", - "muchless": "\u226a", - "mufsquare": "\u338c", - "mugreek": "\u03bc", - "mugsquare": "\u338d", - "muhiragana": "\u3080", - "mukatakana": "\u30e0", - "mukatakanahalfwidth": "\uff91", - "mulsquare": "\u3395", - "multiply": "\u00d7", - "mumsquare": "\u339b", - "munahhebrew": "\u05a3", - "munahlefthebrew": "\u05a3", - "musicalnote": "\u266a", - "musicalnotedbl": "\u266b", - "musicflatsign": "\u266d", - "musicsharpsign": "\u266f", - "mussquare": "\u33b2", - "muvsquare": "\u33b6", - "muwsquare": "\u33bc", - "mvmegasquare": "\u33b9", - "mvsquare": "\u33b7", - "mwmegasquare": "\u33bf", - "mwsquare": "\u33bd", - "n": "\u006e", - "nabengali": "\u09a8", - "nabla": "\u2207", - "nacute": "\u0144", - "nadeva": "\u0928", - "nagujarati": "\u0aa8", - "nagurmukhi": "\u0a28", - "nahiragana": "\u306a", - "nakatakana": "\u30ca", - "nakatakanahalfwidth": "\uff85", - "napostrophe": "\u0149", - "nasquare": "\u3381", - "nbopomofo": "\u310b", - "nbspace": "\u00a0", - "ncaron": "\u0148", - "ncedilla": "\u0146", - "ncircle": "\u24dd", - "ncircumflexbelow": "\u1e4b", - "ncommaaccent": "\u0146", - "ndotaccent": "\u1e45", - "ndotbelow": "\u1e47", - "nehiragana": "\u306d", - "nekatakana": "\u30cd", - "nekatakanahalfwidth": "\uff88", - "newsheqelsign": "\u20aa", - "nfsquare": "\u338b", - "ngabengali": "\u0999", - "ngadeva": "\u0919", - "ngagujarati": "\u0a99", - "ngagurmukhi": "\u0a19", - "ngonguthai": "\u0e07", - "nhiragana": "\u3093", - "nhookleft": "\u0272", - "nhookretroflex": "\u0273", - "nieunacirclekorean": "\u326f", - "nieunaparenkorean": "\u320f", - "nieuncieuckorean": "\u3135", - "nieuncirclekorean": "\u3261", - "nieunhieuhkorean": "\u3136", - "nieunkorean": "\u3134", - "nieunpansioskorean": "\u3168", - "nieunparenkorean": "\u3201", - "nieunsioskorean": "\u3167", - "nieuntikeutkorean": "\u3166", - "nihiragana": "\u306b", - "nikatakana": "\u30cb", - "nikatakanahalfwidth": "\uff86", - "nikhahitleftthai": "\uf899", - "nikhahitthai": "\u0e4d", - "nine": "\u0039", - "ninearabic": "\u0669", - "ninebengali": "\u09ef", - "ninecircle": "\u2468", - "ninecircleinversesansserif": "\u2792", - "ninedeva": "\u096f", - "ninegujarati": "\u0aef", - "ninegurmukhi": "\u0a6f", - "ninehackarabic": "\u0669", - "ninehangzhou": "\u3029", - "nineideographicparen": "\u3228", - "nineinferior": "\u2089", - "ninemonospace": "\uff19", - "nineoldstyle": "\uf739", - "nineparen": "\u247c", - "nineperiod": "\u2490", - "ninepersian": "\u06f9", - "nineroman": "\u2178", - "ninesuperior": "\u2079", - "nineteencircle": "\u2472", - "nineteenparen": "\u2486", - "nineteenperiod": "\u249a", - "ninethai": "\u0e59", - "nj": "\u01cc", - "njecyrillic": "\u045a", - "nkatakana": "\u30f3", - "nkatakanahalfwidth": "\uff9d", - "nlegrightlong": "\u019e", - "nlinebelow": "\u1e49", - "nmonospace": "\uff4e", - "nmsquare": "\u339a", - "nnabengali": "\u09a3", - "nnadeva": "\u0923", - "nnagujarati": "\u0aa3", - "nnagurmukhi": "\u0a23", - "nnnadeva": "\u0929", - "nohiragana": "\u306e", - "nokatakana": "\u30ce", - "nokatakanahalfwidth": "\uff89", - "nonbreakingspace": "\u00a0", - "nonenthai": "\u0e13", - "nonuthai": "\u0e19", - "noonarabic": "\u0646", - "noonfinalarabic": "\ufee6", - "noonghunnaarabic": "\u06ba", - "noonghunnafinalarabic": "\ufb9f", - "noonhehinitialarabic": "\ufee7\ufeec", - "nooninitialarabic": "\ufee7", - "noonjeeminitialarabic": "\ufcd2", - "noonjeemisolatedarabic": "\ufc4b", - "noonmedialarabic": "\ufee8", - "noonmeeminitialarabic": "\ufcd5", - "noonmeemisolatedarabic": "\ufc4e", - "noonnoonfinalarabic": "\ufc8d", - "notcontains": "\u220c", - "notelement": "\u2209", - "notelementof": "\u2209", - "notequal": "\u2260", - "notgreater": "\u226f", - "notgreaternorequal": "\u2271", - "notgreaternorless": "\u2279", - "notidentical": "\u2262", - "notless": "\u226e", - "notlessnorequal": "\u2270", - "notparallel": "\u2226", - "notprecedes": "\u2280", - "notsubset": "\u2284", - "notsucceeds": "\u2281", - "notsuperset": "\u2285", - "nowarmenian": "\u0576", - "nparen": "\u24a9", - "nssquare": "\u33b1", - "nsuperior": "\u207f", - "ntilde": "\u00f1", - "nu": "\u03bd", - "nuhiragana": "\u306c", - "nukatakana": "\u30cc", - "nukatakanahalfwidth": "\uff87", - "nuktabengali": "\u09bc", - "nuktadeva": "\u093c", - "nuktagujarati": "\u0abc", - "nuktagurmukhi": "\u0a3c", - "numbersign": "\u0023", - "numbersignmonospace": "\uff03", - "numbersignsmall": "\ufe5f", - "numeralsigngreek": "\u0374", - "numeralsignlowergreek": "\u0375", - "numero": "\u2116", - "nun": "\u05e0", - "nundagesh": "\ufb40", - "nundageshhebrew": "\ufb40", - "nunhebrew": "\u05e0", - "nvsquare": "\u33b5", - "nwsquare": "\u33bb", - "nyabengali": "\u099e", - "nyadeva": "\u091e", - "nyagujarati": "\u0a9e", - "nyagurmukhi": "\u0a1e", - "o": "\u006f", - "oacute": "\u00f3", - "oangthai": "\u0e2d", - "obarred": "\u0275", - "obarredcyrillic": "\u04e9", - "obarreddieresiscyrillic": "\u04eb", - "obengali": "\u0993", - "obopomofo": "\u311b", - "obreve": "\u014f", - "ocandradeva": "\u0911", - "ocandragujarati": "\u0a91", - "ocandravowelsigndeva": "\u0949", - "ocandravowelsigngujarati": "\u0ac9", - "ocaron": "\u01d2", - "ocircle": "\u24de", - "ocircumflex": "\u00f4", - "ocircumflexacute": "\u1ed1", - "ocircumflexdotbelow": "\u1ed9", - "ocircumflexgrave": "\u1ed3", - "ocircumflexhookabove": "\u1ed5", - "ocircumflextilde": "\u1ed7", - "ocyrillic": "\u043e", - "odblacute": "\u0151", - "odblgrave": "\u020d", - "odeva": "\u0913", - "odieresis": "\u00f6", - "odieresiscyrillic": "\u04e7", - "odotbelow": "\u1ecd", - "oe": "\u0153", - "oekorean": "\u315a", - "ogonek": "\u02db", - "ogonekcmb": "\u0328", - "ograve": "\u00f2", - "ogujarati": "\u0a93", - "oharmenian": "\u0585", - "ohiragana": "\u304a", - "ohookabove": "\u1ecf", - "ohorn": "\u01a1", - "ohornacute": "\u1edb", - "ohorndotbelow": "\u1ee3", - "ohorngrave": "\u1edd", - "ohornhookabove": "\u1edf", - "ohorntilde": "\u1ee1", - "ohungarumlaut": "\u0151", - "oi": "\u01a3", - "oinvertedbreve": "\u020f", - "okatakana": "\u30aa", - "okatakanahalfwidth": "\uff75", - "okorean": "\u3157", - "olehebrew": "\u05ab", - "omacron": "\u014d", - "omacronacute": "\u1e53", - "omacrongrave": "\u1e51", - "omdeva": "\u0950", - "omega": "\u03c9", - "omega1": "\u03d6", - "omegacyrillic": "\u0461", - "omegalatinclosed": "\u0277", - "omegaroundcyrillic": "\u047b", - "omegatitlocyrillic": "\u047d", - "omegatonos": "\u03ce", - "omgujarati": "\u0ad0", - "omicron": "\u03bf", - "omicrontonos": "\u03cc", - "omonospace": "\uff4f", - "one": "\u0031", - "onearabic": "\u0661", - "onebengali": "\u09e7", - "onecircle": "\u2460", - "onecircleinversesansserif": "\u278a", - "onedeva": "\u0967", - "onedotenleader": "\u2024", - "oneeighth": "\u215b", - "onefitted": "\uf6dc", - "onegujarati": "\u0ae7", - "onegurmukhi": "\u0a67", - "onehackarabic": "\u0661", - "onehalf": "\u00bd", - "onehangzhou": "\u3021", - "oneideographicparen": "\u3220", - "oneinferior": "\u2081", - "onemonospace": "\uff11", - "onenumeratorbengali": "\u09f4", - "oneoldstyle": "\uf731", - "oneparen": "\u2474", - "oneperiod": "\u2488", - "onepersian": "\u06f1", - "onequarter": "\u00bc", - "oneroman": "\u2170", - "onesuperior": "\u00b9", - "onethai": "\u0e51", - "onethird": "\u2153", - "oogonek": "\u01eb", - "oogonekmacron": "\u01ed", - "oogurmukhi": "\u0a13", - "oomatragurmukhi": "\u0a4b", - "oopen": "\u0254", - "oparen": "\u24aa", - "openbullet": "\u25e6", - "option": "\u2325", - "ordfeminine": "\u00aa", - "ordmasculine": "\u00ba", - "orthogonal": "\u221f", - "oshortdeva": "\u0912", - "oshortvowelsigndeva": "\u094a", - "oslash": "\u00f8", - "oslashacute": "\u01ff", - "osmallhiragana": "\u3049", - "osmallkatakana": "\u30a9", - "osmallkatakanahalfwidth": "\uff6b", - "ostrokeacute": "\u01ff", - "osuperior": "\uf6f0", - "otcyrillic": "\u047f", - "otilde": "\u00f5", - "otildeacute": "\u1e4d", - "otildedieresis": "\u1e4f", - "oubopomofo": "\u3121", - "overline": "\u203e", - "overlinecenterline": "\ufe4a", - "overlinecmb": "\u0305", - "overlinedashed": "\ufe49", - "overlinedblwavy": "\ufe4c", - "overlinewavy": "\ufe4b", - "overscore": "\u00af", - "ovowelsignbengali": "\u09cb", - "ovowelsigndeva": "\u094b", - "ovowelsigngujarati": "\u0acb", - "p": "\u0070", - "paampssquare": "\u3380", - "paasentosquare": "\u332b", - "pabengali": "\u09aa", - "pacute": "\u1e55", - "padeva": "\u092a", - "pagedown": "\u21df", - "pageup": "\u21de", - "pagujarati": "\u0aaa", - "pagurmukhi": "\u0a2a", - "pahiragana": "\u3071", - "paiyannoithai": "\u0e2f", - "pakatakana": "\u30d1", - "palatalizationcyrilliccmb": "\u0484", - "palochkacyrillic": "\u04c0", - "pansioskorean": "\u317f", - "paragraph": "\u00b6", - "parallel": "\u2225", - "parenleft": "\u0028", - "parenleftaltonearabic": "\ufd3e", - "parenleftbt": "\uf8ed", - "parenleftex": "\uf8ec", - "parenleftinferior": "\u208d", - "parenleftmonospace": "\uff08", - "parenleftsmall": "\ufe59", - "parenleftsuperior": "\u207d", - "parenlefttp": "\uf8eb", - "parenleftvertical": "\ufe35", - "parenright": "\u0029", - "parenrightaltonearabic": "\ufd3f", - "parenrightbt": "\uf8f8", - "parenrightex": "\uf8f7", - "parenrightinferior": "\u208e", - "parenrightmonospace": "\uff09", - "parenrightsmall": "\ufe5a", - "parenrightsuperior": "\u207e", - "parenrighttp": "\uf8f6", - "parenrightvertical": "\ufe36", - "partialdiff": "\u2202", - "paseqhebrew": "\u05c0", - "pashtahebrew": "\u0599", - "pasquare": "\u33a9", - "patah": "\u05b7", - "patah11": "\u05b7", - "patah1d": "\u05b7", - "patah2a": "\u05b7", - "patahhebrew": "\u05b7", - "patahnarrowhebrew": "\u05b7", - "patahquarterhebrew": "\u05b7", - "patahwidehebrew": "\u05b7", - "pazerhebrew": "\u05a1", - "pbopomofo": "\u3106", - "pcircle": "\u24df", - "pdotaccent": "\u1e57", - "pe": "\u05e4", - "pecyrillic": "\u043f", - "pedagesh": "\ufb44", - "pedageshhebrew": "\ufb44", - "peezisquare": "\u333b", - "pefinaldageshhebrew": "\ufb43", - "peharabic": "\u067e", - "peharmenian": "\u057a", - "pehebrew": "\u05e4", - "pehfinalarabic": "\ufb57", - "pehinitialarabic": "\ufb58", - "pehiragana": "\u307a", - "pehmedialarabic": "\ufb59", - "pekatakana": "\u30da", - "pemiddlehookcyrillic": "\u04a7", - "perafehebrew": "\ufb4e", - "percent": "\u0025", - "percentarabic": "\u066a", - "percentmonospace": "\uff05", - "percentsmall": "\ufe6a", - "period": "\u002e", - "periodarmenian": "\u0589", - "periodcentered": "\u00b7", - "periodhalfwidth": "\uff61", - "periodinferior": "\uf6e7", - "periodmonospace": "\uff0e", - "periodsmall": "\ufe52", - "periodsuperior": "\uf6e8", - "perispomenigreekcmb": "\u0342", - "perpendicular": "\u22a5", - "perthousand": "\u2030", - "peseta": "\u20a7", - "pfsquare": "\u338a", - "phabengali": "\u09ab", - "phadeva": "\u092b", - "phagujarati": "\u0aab", - "phagurmukhi": "\u0a2b", - "phi": "\u03c6", - "phi1": "\u03d5", - "phieuphacirclekorean": "\u327a", - "phieuphaparenkorean": "\u321a", - "phieuphcirclekorean": "\u326c", - "phieuphkorean": "\u314d", - "phieuphparenkorean": "\u320c", - "philatin": "\u0278", - "phinthuthai": "\u0e3a", - "phisymbolgreek": "\u03d5", - "phook": "\u01a5", - "phophanthai": "\u0e1e", - "phophungthai": "\u0e1c", - "phosamphaothai": "\u0e20", - "pi": "\u03c0", - "pieupacirclekorean": "\u3273", - "pieupaparenkorean": "\u3213", - "pieupcieuckorean": "\u3176", - "pieupcirclekorean": "\u3265", - "pieupkiyeokkorean": "\u3172", - "pieupkorean": "\u3142", - "pieupparenkorean": "\u3205", - "pieupsioskiyeokkorean": "\u3174", - "pieupsioskorean": "\u3144", - "pieupsiostikeutkorean": "\u3175", - "pieupthieuthkorean": "\u3177", - "pieuptikeutkorean": "\u3173", - "pihiragana": "\u3074", - "pikatakana": "\u30d4", - "pisymbolgreek": "\u03d6", - "piwrarmenian": "\u0583", - "plus": "\u002b", - "plusbelowcmb": "\u031f", - "pluscircle": "\u2295", - "plusminus": "\u00b1", - "plusmod": "\u02d6", - "plusmonospace": "\uff0b", - "plussmall": "\ufe62", - "plussuperior": "\u207a", - "pmonospace": "\uff50", - "pmsquare": "\u33d8", - "pohiragana": "\u307d", - "pointingindexdownwhite": "\u261f", - "pointingindexleftwhite": "\u261c", - "pointingindexrightwhite": "\u261e", - "pointingindexupwhite": "\u261d", - "pokatakana": "\u30dd", - "poplathai": "\u0e1b", - "postalmark": "\u3012", - "postalmarkface": "\u3020", - "pparen": "\u24ab", - "precedes": "\u227a", - "prescription": "\u211e", - "primemod": "\u02b9", - "primereversed": "\u2035", - "product": "\u220f", - "projective": "\u2305", - "prolongedkana": "\u30fc", - "propellor": "\u2318", - "propersubset": "\u2282", - "propersuperset": "\u2283", - "proportion": "\u2237", - "proportional": "\u221d", - "psi": "\u03c8", - "psicyrillic": "\u0471", - "psilipneumatacyrilliccmb": "\u0486", - "pssquare": "\u33b0", - "puhiragana": "\u3077", - "pukatakana": "\u30d7", - "pvsquare": "\u33b4", - "pwsquare": "\u33ba", - "q": "\u0071", - "qadeva": "\u0958", - "qadmahebrew": "\u05a8", - "qafarabic": "\u0642", - "qaffinalarabic": "\ufed6", - "qafinitialarabic": "\ufed7", - "qafmedialarabic": "\ufed8", - "qamats": "\u05b8", - "qamats10": "\u05b8", - "qamats1a": "\u05b8", - "qamats1c": "\u05b8", - "qamats27": "\u05b8", - "qamats29": "\u05b8", - "qamats33": "\u05b8", - "qamatsde": "\u05b8", - "qamatshebrew": "\u05b8", - "qamatsnarrowhebrew": "\u05b8", - "qamatsqatanhebrew": "\u05b8", - "qamatsqatannarrowhebrew": "\u05b8", - "qamatsqatanquarterhebrew": "\u05b8", - "qamatsqatanwidehebrew": "\u05b8", - "qamatsquarterhebrew": "\u05b8", - "qamatswidehebrew": "\u05b8", - "qarneyparahebrew": "\u059f", - "qbopomofo": "\u3111", - "qcircle": "\u24e0", - "qhook": "\u02a0", - "qmonospace": "\uff51", - "qof": "\u05e7", - "qofdagesh": "\ufb47", - "qofdageshhebrew": "\ufb47", - "qofhatafpatah": "\u05e7\u05b2", - "qofhatafpatahhebrew": "\u05e7\u05b2", - "qofhatafsegol": "\u05e7\u05b1", - "qofhatafsegolhebrew": "\u05e7\u05b1", - "qofhebrew": "\u05e7", - "qofhiriq": "\u05e7\u05b4", - "qofhiriqhebrew": "\u05e7\u05b4", - "qofholam": "\u05e7\u05b9", - "qofholamhebrew": "\u05e7\u05b9", - "qofpatah": "\u05e7\u05b7", - "qofpatahhebrew": "\u05e7\u05b7", - "qofqamats": "\u05e7\u05b8", - "qofqamatshebrew": "\u05e7\u05b8", - "qofqubuts": "\u05e7\u05bb", - "qofqubutshebrew": "\u05e7\u05bb", - "qofsegol": "\u05e7\u05b6", - "qofsegolhebrew": "\u05e7\u05b6", - "qofsheva": "\u05e7\u05b0", - "qofshevahebrew": "\u05e7\u05b0", - "qoftsere": "\u05e7\u05b5", - "qoftserehebrew": "\u05e7\u05b5", - "qparen": "\u24ac", - "quarternote": "\u2669", - "qubuts": "\u05bb", - "qubuts18": "\u05bb", - "qubuts25": "\u05bb", - "qubuts31": "\u05bb", - "qubutshebrew": "\u05bb", - "qubutsnarrowhebrew": "\u05bb", - "qubutsquarterhebrew": "\u05bb", - "qubutswidehebrew": "\u05bb", - "question": "\u003f", - "questionarabic": "\u061f", - "questionarmenian": "\u055e", - "questiondown": "\u00bf", - "questiondownsmall": "\uf7bf", - "questiongreek": "\u037e", - "questionmonospace": "\uff1f", - "questionsmall": "\uf73f", - "quotedbl": "\u0022", - "quotedblbase": "\u201e", - "quotedblleft": "\u201c", - "quotedblmonospace": "\uff02", - "quotedblprime": "\u301e", - "quotedblprimereversed": "\u301d", - "quotedblright": "\u201d", - "quoteleft": "\u2018", - "quoteleftreversed": "\u201b", - "quotereversed": "\u201b", - "quoteright": "\u2019", - "quoterightn": "\u0149", - "quotesinglbase": "\u201a", - "quotesingle": "\u0027", - "quotesinglemonospace": "\uff07", - "r": "\u0072", - "raarmenian": "\u057c", - "rabengali": "\u09b0", - "racute": "\u0155", - "radeva": "\u0930", - "radical": "\u221a", - "radicalex": "\uf8e5", - "radoverssquare": "\u33ae", - "radoverssquaredsquare": "\u33af", - "radsquare": "\u33ad", - "rafe": "\u05bf", - "rafehebrew": "\u05bf", - "ragujarati": "\u0ab0", - "ragurmukhi": "\u0a30", - "rahiragana": "\u3089", - "rakatakana": "\u30e9", - "rakatakanahalfwidth": "\uff97", - "ralowerdiagonalbengali": "\u09f1", - "ramiddlediagonalbengali": "\u09f0", - "ramshorn": "\u0264", - "ratio": "\u2236", - "rbopomofo": "\u3116", - "rcaron": "\u0159", - "rcedilla": "\u0157", - "rcircle": "\u24e1", - "rcommaaccent": "\u0157", - "rdblgrave": "\u0211", - "rdotaccent": "\u1e59", - "rdotbelow": "\u1e5b", - "rdotbelowmacron": "\u1e5d", - "referencemark": "\u203b", - "reflexsubset": "\u2286", - "reflexsuperset": "\u2287", - "registered": "\u00ae", - "registersans": "\uf8e8", - "registerserif": "\uf6da", - "reharabic": "\u0631", - "reharmenian": "\u0580", - "rehfinalarabic": "\ufeae", - "rehiragana": "\u308c", - "rehyehaleflamarabic": "\u0631\ufef3\ufe8e\u0644", - "rekatakana": "\u30ec", - "rekatakanahalfwidth": "\uff9a", - "resh": "\u05e8", - "reshdageshhebrew": "\ufb48", - "reshhatafpatah": "\u05e8\u05b2", - "reshhatafpatahhebrew": "\u05e8\u05b2", - "reshhatafsegol": "\u05e8\u05b1", - "reshhatafsegolhebrew": "\u05e8\u05b1", - "reshhebrew": "\u05e8", - "reshhiriq": "\u05e8\u05b4", - "reshhiriqhebrew": "\u05e8\u05b4", - "reshholam": "\u05e8\u05b9", - "reshholamhebrew": "\u05e8\u05b9", - "reshpatah": "\u05e8\u05b7", - "reshpatahhebrew": "\u05e8\u05b7", - "reshqamats": "\u05e8\u05b8", - "reshqamatshebrew": "\u05e8\u05b8", - "reshqubuts": "\u05e8\u05bb", - "reshqubutshebrew": "\u05e8\u05bb", - "reshsegol": "\u05e8\u05b6", - "reshsegolhebrew": "\u05e8\u05b6", - "reshsheva": "\u05e8\u05b0", - "reshshevahebrew": "\u05e8\u05b0", - "reshtsere": "\u05e8\u05b5", - "reshtserehebrew": "\u05e8\u05b5", - "reversedtilde": "\u223d", - "reviahebrew": "\u0597", - "reviamugrashhebrew": "\u0597", - "revlogicalnot": "\u2310", - "rfishhook": "\u027e", - "rfishhookreversed": "\u027f", - "rhabengali": "\u09dd", - "rhadeva": "\u095d", - "rho": "\u03c1", - "rhook": "\u027d", - "rhookturned": "\u027b", - "rhookturnedsuperior": "\u02b5", - "rhosymbolgreek": "\u03f1", - "rhotichookmod": "\u02de", - "rieulacirclekorean": "\u3271", - "rieulaparenkorean": "\u3211", - "rieulcirclekorean": "\u3263", - "rieulhieuhkorean": "\u3140", - "rieulkiyeokkorean": "\u313a", - "rieulkiyeoksioskorean": "\u3169", - "rieulkorean": "\u3139", - "rieulmieumkorean": "\u313b", - "rieulpansioskorean": "\u316c", - "rieulparenkorean": "\u3203", - "rieulphieuphkorean": "\u313f", - "rieulpieupkorean": "\u313c", - "rieulpieupsioskorean": "\u316b", - "rieulsioskorean": "\u313d", - "rieulthieuthkorean": "\u313e", - "rieultikeutkorean": "\u316a", - "rieulyeorinhieuhkorean": "\u316d", - "rightangle": "\u221f", - "righttackbelowcmb": "\u0319", - "righttriangle": "\u22bf", - "rihiragana": "\u308a", - "rikatakana": "\u30ea", - "rikatakanahalfwidth": "\uff98", - "ring": "\u02da", - "ringbelowcmb": "\u0325", - "ringcmb": "\u030a", - "ringhalfleft": "\u02bf", - "ringhalfleftarmenian": "\u0559", - "ringhalfleftbelowcmb": "\u031c", - "ringhalfleftcentered": "\u02d3", - "ringhalfright": "\u02be", - "ringhalfrightbelowcmb": "\u0339", - "ringhalfrightcentered": "\u02d2", - "rinvertedbreve": "\u0213", - "rittorusquare": "\u3351", - "rlinebelow": "\u1e5f", - "rlongleg": "\u027c", - "rlonglegturned": "\u027a", - "rmonospace": "\uff52", - "rohiragana": "\u308d", - "rokatakana": "\u30ed", - "rokatakanahalfwidth": "\uff9b", - "roruathai": "\u0e23", - "rparen": "\u24ad", - "rrabengali": "\u09dc", - "rradeva": "\u0931", - "rragurmukhi": "\u0a5c", - "rreharabic": "\u0691", - "rrehfinalarabic": "\ufb8d", - "rrvocalicbengali": "\u09e0", - "rrvocalicdeva": "\u0960", - "rrvocalicgujarati": "\u0ae0", - "rrvocalicvowelsignbengali": "\u09c4", - "rrvocalicvowelsigndeva": "\u0944", - "rrvocalicvowelsigngujarati": "\u0ac4", - "rsuperior": "\uf6f1", - "rtblock": "\u2590", - "rturned": "\u0279", - "rturnedsuperior": "\u02b4", - "ruhiragana": "\u308b", - "rukatakana": "\u30eb", - "rukatakanahalfwidth": "\uff99", - "rupeemarkbengali": "\u09f2", - "rupeesignbengali": "\u09f3", - "rupiah": "\uf6dd", - "ruthai": "\u0e24", - "rvocalicbengali": "\u098b", - "rvocalicdeva": "\u090b", - "rvocalicgujarati": "\u0a8b", - "rvocalicvowelsignbengali": "\u09c3", - "rvocalicvowelsigndeva": "\u0943", - "rvocalicvowelsigngujarati": "\u0ac3", - "s": "\u0073", - "sabengali": "\u09b8", - "sacute": "\u015b", - "sacutedotaccent": "\u1e65", - "sadarabic": "\u0635", - "sadeva": "\u0938", - "sadfinalarabic": "\ufeba", - "sadinitialarabic": "\ufebb", - "sadmedialarabic": "\ufebc", - "sagujarati": "\u0ab8", - "sagurmukhi": "\u0a38", - "sahiragana": "\u3055", - "sakatakana": "\u30b5", - "sakatakanahalfwidth": "\uff7b", - "sallallahoualayhewasallamarabic": "\ufdfa", - "samekh": "\u05e1", - "samekhdagesh": "\ufb41", - "samekhdageshhebrew": "\ufb41", - "samekhhebrew": "\u05e1", - "saraaathai": "\u0e32", - "saraaethai": "\u0e41", - "saraaimaimalaithai": "\u0e44", - "saraaimaimuanthai": "\u0e43", - "saraamthai": "\u0e33", - "saraathai": "\u0e30", - "saraethai": "\u0e40", - "saraiileftthai": "\uf886", - "saraiithai": "\u0e35", - "saraileftthai": "\uf885", - "saraithai": "\u0e34", - "saraothai": "\u0e42", - "saraueeleftthai": "\uf888", - "saraueethai": "\u0e37", - "saraueleftthai": "\uf887", - "sarauethai": "\u0e36", - "sarauthai": "\u0e38", - "sarauuthai": "\u0e39", - "sbopomofo": "\u3119", - "scaron": "\u0161", - "scarondotaccent": "\u1e67", - "scedilla": "\u015f", - "schwa": "\u0259", - "schwacyrillic": "\u04d9", - "schwadieresiscyrillic": "\u04db", - "schwahook": "\u025a", - "scircle": "\u24e2", - "scircumflex": "\u015d", - "scommaaccent": "\u0219", - "sdotaccent": "\u1e61", - "sdotbelow": "\u1e63", - "sdotbelowdotaccent": "\u1e69", - "seagullbelowcmb": "\u033c", - "second": "\u2033", - "secondtonechinese": "\u02ca", - "section": "\u00a7", - "seenarabic": "\u0633", - "seenfinalarabic": "\ufeb2", - "seeninitialarabic": "\ufeb3", - "seenmedialarabic": "\ufeb4", - "segol": "\u05b6", - "segol13": "\u05b6", - "segol1f": "\u05b6", - "segol2c": "\u05b6", - "segolhebrew": "\u05b6", - "segolnarrowhebrew": "\u05b6", - "segolquarterhebrew": "\u05b6", - "segoltahebrew": "\u0592", - "segolwidehebrew": "\u05b6", - "seharmenian": "\u057d", - "sehiragana": "\u305b", - "sekatakana": "\u30bb", - "sekatakanahalfwidth": "\uff7e", - "semicolon": "\u003b", - "semicolonarabic": "\u061b", - "semicolonmonospace": "\uff1b", - "semicolonsmall": "\ufe54", - "semivoicedmarkkana": "\u309c", - "semivoicedmarkkanahalfwidth": "\uff9f", - "sentisquare": "\u3322", - "sentosquare": "\u3323", - "seven": "\u0037", - "sevenarabic": "\u0667", - "sevenbengali": "\u09ed", - "sevencircle": "\u2466", - "sevencircleinversesansserif": "\u2790", - "sevendeva": "\u096d", - "seveneighths": "\u215e", - "sevengujarati": "\u0aed", - "sevengurmukhi": "\u0a6d", - "sevenhackarabic": "\u0667", - "sevenhangzhou": "\u3027", - "sevenideographicparen": "\u3226", - "seveninferior": "\u2087", - "sevenmonospace": "\uff17", - "sevenoldstyle": "\uf737", - "sevenparen": "\u247a", - "sevenperiod": "\u248e", - "sevenpersian": "\u06f7", - "sevenroman": "\u2176", - "sevensuperior": "\u2077", - "seventeencircle": "\u2470", - "seventeenparen": "\u2484", - "seventeenperiod": "\u2498", - "seventhai": "\u0e57", - "sfthyphen": "\u00ad", - "shaarmenian": "\u0577", - "shabengali": "\u09b6", - "shacyrillic": "\u0448", - "shaddaarabic": "\u0651", - "shaddadammaarabic": "\ufc61", - "shaddadammatanarabic": "\ufc5e", - "shaddafathaarabic": "\ufc60", - "shaddafathatanarabic": "\u0651\u064b", - "shaddakasraarabic": "\ufc62", - "shaddakasratanarabic": "\ufc5f", - "shade": "\u2592", - "shadedark": "\u2593", - "shadelight": "\u2591", - "shademedium": "\u2592", - "shadeva": "\u0936", - "shagujarati": "\u0ab6", - "shagurmukhi": "\u0a36", - "shalshelethebrew": "\u0593", - "shbopomofo": "\u3115", - "shchacyrillic": "\u0449", - "sheenarabic": "\u0634", - "sheenfinalarabic": "\ufeb6", - "sheeninitialarabic": "\ufeb7", - "sheenmedialarabic": "\ufeb8", - "sheicoptic": "\u03e3", - "sheqel": "\u20aa", - "sheqelhebrew": "\u20aa", - "sheva": "\u05b0", - "sheva115": "\u05b0", - "sheva15": "\u05b0", - "sheva22": "\u05b0", - "sheva2e": "\u05b0", - "shevahebrew": "\u05b0", - "shevanarrowhebrew": "\u05b0", - "shevaquarterhebrew": "\u05b0", - "shevawidehebrew": "\u05b0", - "shhacyrillic": "\u04bb", - "shimacoptic": "\u03ed", - "shin": "\u05e9", - "shindagesh": "\ufb49", - "shindageshhebrew": "\ufb49", - "shindageshshindot": "\ufb2c", - "shindageshshindothebrew": "\ufb2c", - "shindageshsindot": "\ufb2d", - "shindageshsindothebrew": "\ufb2d", - "shindothebrew": "\u05c1", - "shinhebrew": "\u05e9", - "shinshindot": "\ufb2a", - "shinshindothebrew": "\ufb2a", - "shinsindot": "\ufb2b", - "shinsindothebrew": "\ufb2b", - "shook": "\u0282", - "sigma": "\u03c3", - "sigma1": "\u03c2", - "sigmafinal": "\u03c2", - "sigmalunatesymbolgreek": "\u03f2", - "sihiragana": "\u3057", - "sikatakana": "\u30b7", - "sikatakanahalfwidth": "\uff7c", - "siluqhebrew": "\u05bd", - "siluqlefthebrew": "\u05bd", - "similar": "\u223c", - "sindothebrew": "\u05c2", - "siosacirclekorean": "\u3274", - "siosaparenkorean": "\u3214", - "sioscieuckorean": "\u317e", - "sioscirclekorean": "\u3266", - "sioskiyeokkorean": "\u317a", - "sioskorean": "\u3145", - "siosnieunkorean": "\u317b", - "siosparenkorean": "\u3206", - "siospieupkorean": "\u317d", - "siostikeutkorean": "\u317c", - "six": "\u0036", - "sixarabic": "\u0666", - "sixbengali": "\u09ec", - "sixcircle": "\u2465", - "sixcircleinversesansserif": "\u278f", - "sixdeva": "\u096c", - "sixgujarati": "\u0aec", - "sixgurmukhi": "\u0a6c", - "sixhackarabic": "\u0666", - "sixhangzhou": "\u3026", - "sixideographicparen": "\u3225", - "sixinferior": "\u2086", - "sixmonospace": "\uff16", - "sixoldstyle": "\uf736", - "sixparen": "\u2479", - "sixperiod": "\u248d", - "sixpersian": "\u06f6", - "sixroman": "\u2175", - "sixsuperior": "\u2076", - "sixteencircle": "\u246f", - "sixteencurrencydenominatorbengali": "\u09f9", - "sixteenparen": "\u2483", - "sixteenperiod": "\u2497", - "sixthai": "\u0e56", - "slash": "\u002f", - "slashmonospace": "\uff0f", - "slong": "\u017f", - "slongdotaccent": "\u1e9b", - "smileface": "\u263a", - "smonospace": "\uff53", - "sofpasuqhebrew": "\u05c3", - "softhyphen": "\u00ad", - "softsigncyrillic": "\u044c", - "sohiragana": "\u305d", - "sokatakana": "\u30bd", - "sokatakanahalfwidth": "\uff7f", - "soliduslongoverlaycmb": "\u0338", - "solidusshortoverlaycmb": "\u0337", - "sorusithai": "\u0e29", - "sosalathai": "\u0e28", - "sosothai": "\u0e0b", - "sosuathai": "\u0e2a", - "space": "\u0020", - "spacehackarabic": "\u0020", - "spade": "\u2660", - "spadesuitblack": "\u2660", - "spadesuitwhite": "\u2664", - "sparen": "\u24ae", - "squarebelowcmb": "\u033b", - "squarecc": "\u33c4", - "squarecm": "\u339d", - "squarediagonalcrosshatchfill": "\u25a9", - "squarehorizontalfill": "\u25a4", - "squarekg": "\u338f", - "squarekm": "\u339e", - "squarekmcapital": "\u33ce", - "squareln": "\u33d1", - "squarelog": "\u33d2", - "squaremg": "\u338e", - "squaremil": "\u33d5", - "squaremm": "\u339c", - "squaremsquared": "\u33a1", - "squareorthogonalcrosshatchfill": "\u25a6", - "squareupperlefttolowerrightfill": "\u25a7", - "squareupperrighttolowerleftfill": "\u25a8", - "squareverticalfill": "\u25a5", - "squarewhitewithsmallblack": "\u25a3", - "srsquare": "\u33db", - "ssabengali": "\u09b7", - "ssadeva": "\u0937", - "ssagujarati": "\u0ab7", - "ssangcieuckorean": "\u3149", - "ssanghieuhkorean": "\u3185", - "ssangieungkorean": "\u3180", - "ssangkiyeokkorean": "\u3132", - "ssangnieunkorean": "\u3165", - "ssangpieupkorean": "\u3143", - "ssangsioskorean": "\u3146", - "ssangtikeutkorean": "\u3138", - "ssuperior": "\uf6f2", - "sterling": "\u00a3", - "sterlingmonospace": "\uffe1", - "strokelongoverlaycmb": "\u0336", - "strokeshortoverlaycmb": "\u0335", - "subset": "\u2282", - "subsetnotequal": "\u228a", - "subsetorequal": "\u2286", - "succeeds": "\u227b", - "suchthat": "\u220b", - "suhiragana": "\u3059", - "sukatakana": "\u30b9", - "sukatakanahalfwidth": "\uff7d", - "sukunarabic": "\u0652", - "summation": "\u2211", - "sun": "\u263c", - "superset": "\u2283", - "supersetnotequal": "\u228b", - "supersetorequal": "\u2287", - "svsquare": "\u33dc", - "syouwaerasquare": "\u337c", - "t": "\u0074", - "tabengali": "\u09a4", - "tackdown": "\u22a4", - "tackleft": "\u22a3", - "tadeva": "\u0924", - "tagujarati": "\u0aa4", - "tagurmukhi": "\u0a24", - "taharabic": "\u0637", - "tahfinalarabic": "\ufec2", - "tahinitialarabic": "\ufec3", - "tahiragana": "\u305f", - "tahmedialarabic": "\ufec4", - "taisyouerasquare": "\u337d", - "takatakana": "\u30bf", - "takatakanahalfwidth": "\uff80", - "tatweelarabic": "\u0640", - "tau": "\u03c4", - "tav": "\u05ea", - "tavdages": "\ufb4a", - "tavdagesh": "\ufb4a", - "tavdageshhebrew": "\ufb4a", - "tavhebrew": "\u05ea", - "tbar": "\u0167", - "tbopomofo": "\u310a", - "tcaron": "\u0165", - "tccurl": "\u02a8", - "tcedilla": "\u0163", - "tcheharabic": "\u0686", - "tchehfinalarabic": "\ufb7b", - "tchehinitialarabic": "\ufb7c", - "tchehmedialarabic": "\ufb7d", - "tchehmeeminitialarabic": "\ufb7c\ufee4", - "tcircle": "\u24e3", - "tcircumflexbelow": "\u1e71", - "tcommaaccent": "\u0163", - "tdieresis": "\u1e97", - "tdotaccent": "\u1e6b", - "tdotbelow": "\u1e6d", - "tecyrillic": "\u0442", - "tedescendercyrillic": "\u04ad", - "teharabic": "\u062a", - "tehfinalarabic": "\ufe96", - "tehhahinitialarabic": "\ufca2", - "tehhahisolatedarabic": "\ufc0c", - "tehinitialarabic": "\ufe97", - "tehiragana": "\u3066", - "tehjeeminitialarabic": "\ufca1", - "tehjeemisolatedarabic": "\ufc0b", - "tehmarbutaarabic": "\u0629", - "tehmarbutafinalarabic": "\ufe94", - "tehmedialarabic": "\ufe98", - "tehmeeminitialarabic": "\ufca4", - "tehmeemisolatedarabic": "\ufc0e", - "tehnoonfinalarabic": "\ufc73", - "tekatakana": "\u30c6", - "tekatakanahalfwidth": "\uff83", - "telephone": "\u2121", - "telephoneblack": "\u260e", - "telishagedolahebrew": "\u05a0", - "telishaqetanahebrew": "\u05a9", - "tencircle": "\u2469", - "tenideographicparen": "\u3229", - "tenparen": "\u247d", - "tenperiod": "\u2491", - "tenroman": "\u2179", - "tesh": "\u02a7", - "tet": "\u05d8", - "tetdagesh": "\ufb38", - "tetdageshhebrew": "\ufb38", - "tethebrew": "\u05d8", - "tetsecyrillic": "\u04b5", - "tevirhebrew": "\u059b", - "tevirlefthebrew": "\u059b", - "thabengali": "\u09a5", - "thadeva": "\u0925", - "thagujarati": "\u0aa5", - "thagurmukhi": "\u0a25", - "thalarabic": "\u0630", - "thalfinalarabic": "\ufeac", - "thanthakhatlowleftthai": "\uf898", - "thanthakhatlowrightthai": "\uf897", - "thanthakhatthai": "\u0e4c", - "thanthakhatupperleftthai": "\uf896", - "theharabic": "\u062b", - "thehfinalarabic": "\ufe9a", - "thehinitialarabic": "\ufe9b", - "thehmedialarabic": "\ufe9c", - "thereexists": "\u2203", - "therefore": "\u2234", - "theta": "\u03b8", - "theta1": "\u03d1", - "thetasymbolgreek": "\u03d1", - "thieuthacirclekorean": "\u3279", - "thieuthaparenkorean": "\u3219", - "thieuthcirclekorean": "\u326b", - "thieuthkorean": "\u314c", - "thieuthparenkorean": "\u320b", - "thirteencircle": "\u246c", - "thirteenparen": "\u2480", - "thirteenperiod": "\u2494", - "thonangmonthothai": "\u0e11", - "thook": "\u01ad", - "thophuthaothai": "\u0e12", - "thorn": "\u00fe", - "thothahanthai": "\u0e17", - "thothanthai": "\u0e10", - "thothongthai": "\u0e18", - "thothungthai": "\u0e16", - "thousandcyrillic": "\u0482", - "thousandsseparatorarabic": "\u066c", - "thousandsseparatorpersian": "\u066c", - "three": "\u0033", - "threearabic": "\u0663", - "threebengali": "\u09e9", - "threecircle": "\u2462", - "threecircleinversesansserif": "\u278c", - "threedeva": "\u0969", - "threeeighths": "\u215c", - "threegujarati": "\u0ae9", - "threegurmukhi": "\u0a69", - "threehackarabic": "\u0663", - "threehangzhou": "\u3023", - "threeideographicparen": "\u3222", - "threeinferior": "\u2083", - "threemonospace": "\uff13", - "threenumeratorbengali": "\u09f6", - "threeoldstyle": "\uf733", - "threeparen": "\u2476", - "threeperiod": "\u248a", - "threepersian": "\u06f3", - "threequarters": "\u00be", - "threequartersemdash": "\uf6de", - "threeroman": "\u2172", - "threesuperior": "\u00b3", - "threethai": "\u0e53", - "thzsquare": "\u3394", - "tihiragana": "\u3061", - "tikatakana": "\u30c1", - "tikatakanahalfwidth": "\uff81", - "tikeutacirclekorean": "\u3270", - "tikeutaparenkorean": "\u3210", - "tikeutcirclekorean": "\u3262", - "tikeutkorean": "\u3137", - "tikeutparenkorean": "\u3202", - "tilde": "\u02dc", - "tildebelowcmb": "\u0330", - "tildecmb": "\u0303", - "tildecomb": "\u0303", - "tildedoublecmb": "\u0360", - "tildeoperator": "\u223c", - "tildeoverlaycmb": "\u0334", - "tildeverticalcmb": "\u033e", - "timescircle": "\u2297", - "tipehahebrew": "\u0596", - "tipehalefthebrew": "\u0596", - "tippigurmukhi": "\u0a70", - "titlocyrilliccmb": "\u0483", - "tiwnarmenian": "\u057f", - "tlinebelow": "\u1e6f", - "tmonospace": "\uff54", - "toarmenian": "\u0569", - "tohiragana": "\u3068", - "tokatakana": "\u30c8", - "tokatakanahalfwidth": "\uff84", - "tonebarextrahighmod": "\u02e5", - "tonebarextralowmod": "\u02e9", - "tonebarhighmod": "\u02e6", - "tonebarlowmod": "\u02e8", - "tonebarmidmod": "\u02e7", - "tonefive": "\u01bd", - "tonesix": "\u0185", - "tonetwo": "\u01a8", - "tonos": "\u0384", - "tonsquare": "\u3327", - "topatakthai": "\u0e0f", - "tortoiseshellbracketleft": "\u3014", - "tortoiseshellbracketleftsmall": "\ufe5d", - "tortoiseshellbracketleftvertical": "\ufe39", - "tortoiseshellbracketright": "\u3015", - "tortoiseshellbracketrightsmall": "\ufe5e", - "tortoiseshellbracketrightvertical": "\ufe3a", - "totaothai": "\u0e15", - "tpalatalhook": "\u01ab", - "tparen": "\u24af", - "trademark": "\u2122", - "trademarksans": "\uf8ea", - "trademarkserif": "\uf6db", - "tretroflexhook": "\u0288", - "triagdn": "\u25bc", - "triaglf": "\u25c4", - "triagrt": "\u25ba", - "triagup": "\u25b2", - "ts": "\u02a6", - "tsadi": "\u05e6", - "tsadidagesh": "\ufb46", - "tsadidageshhebrew": "\ufb46", - "tsadihebrew": "\u05e6", - "tsecyrillic": "\u0446", - "tsere": "\u05b5", - "tsere12": "\u05b5", - "tsere1e": "\u05b5", - "tsere2b": "\u05b5", - "tserehebrew": "\u05b5", - "tserenarrowhebrew": "\u05b5", - "tserequarterhebrew": "\u05b5", - "tserewidehebrew": "\u05b5", - "tshecyrillic": "\u045b", - "tsuperior": "\uf6f3", - "ttabengali": "\u099f", - "ttadeva": "\u091f", - "ttagujarati": "\u0a9f", - "ttagurmukhi": "\u0a1f", - "tteharabic": "\u0679", - "ttehfinalarabic": "\ufb67", - "ttehinitialarabic": "\ufb68", - "ttehmedialarabic": "\ufb69", - "tthabengali": "\u09a0", - "tthadeva": "\u0920", - "tthagujarati": "\u0aa0", - "tthagurmukhi": "\u0a20", - "tturned": "\u0287", - "tuhiragana": "\u3064", - "tukatakana": "\u30c4", - "tukatakanahalfwidth": "\uff82", - "tusmallhiragana": "\u3063", - "tusmallkatakana": "\u30c3", - "tusmallkatakanahalfwidth": "\uff6f", - "twelvecircle": "\u246b", - "twelveparen": "\u247f", - "twelveperiod": "\u2493", - "twelveroman": "\u217b", - "twentycircle": "\u2473", - "twentyhangzhou": "\u5344", - "twentyparen": "\u2487", - "twentyperiod": "\u249b", - "two": "\u0032", - "twoarabic": "\u0662", - "twobengali": "\u09e8", - "twocircle": "\u2461", - "twocircleinversesansserif": "\u278b", - "twodeva": "\u0968", - "twodotenleader": "\u2025", - "twodotleader": "\u2025", - "twodotleadervertical": "\ufe30", - "twogujarati": "\u0ae8", - "twogurmukhi": "\u0a68", - "twohackarabic": "\u0662", - "twohangzhou": "\u3022", - "twoideographicparen": "\u3221", - "twoinferior": "\u2082", - "twomonospace": "\uff12", - "twonumeratorbengali": "\u09f5", - "twooldstyle": "\uf732", - "twoparen": "\u2475", - "twoperiod": "\u2489", - "twopersian": "\u06f2", - "tworoman": "\u2171", - "twostroke": "\u01bb", - "twosuperior": "\u00b2", - "twothai": "\u0e52", - "twothirds": "\u2154", - "u": "\u0075", - "uacute": "\u00fa", - "ubar": "\u0289", - "ubengali": "\u0989", - "ubopomofo": "\u3128", - "ubreve": "\u016d", - "ucaron": "\u01d4", - "ucircle": "\u24e4", - "ucircumflex": "\u00fb", - "ucircumflexbelow": "\u1e77", - "ucyrillic": "\u0443", - "udattadeva": "\u0951", - "udblacute": "\u0171", - "udblgrave": "\u0215", - "udeva": "\u0909", - "udieresis": "\u00fc", - "udieresisacute": "\u01d8", - "udieresisbelow": "\u1e73", - "udieresiscaron": "\u01da", - "udieresiscyrillic": "\u04f1", - "udieresisgrave": "\u01dc", - "udieresismacron": "\u01d6", - "udotbelow": "\u1ee5", - "ugrave": "\u00f9", - "ugujarati": "\u0a89", - "ugurmukhi": "\u0a09", - "uhiragana": "\u3046", - "uhookabove": "\u1ee7", - "uhorn": "\u01b0", - "uhornacute": "\u1ee9", - "uhorndotbelow": "\u1ef1", - "uhorngrave": "\u1eeb", - "uhornhookabove": "\u1eed", - "uhorntilde": "\u1eef", - "uhungarumlaut": "\u0171", - "uhungarumlautcyrillic": "\u04f3", - "uinvertedbreve": "\u0217", - "ukatakana": "\u30a6", - "ukatakanahalfwidth": "\uff73", - "ukcyrillic": "\u0479", - "ukorean": "\u315c", - "umacron": "\u016b", - "umacroncyrillic": "\u04ef", - "umacrondieresis": "\u1e7b", - "umatragurmukhi": "\u0a41", - "umonospace": "\uff55", - "underscore": "\u005f", - "underscoredbl": "\u2017", - "underscoremonospace": "\uff3f", - "underscorevertical": "\ufe33", - "underscorewavy": "\ufe4f", - "union": "\u222a", - "universal": "\u2200", - "uogonek": "\u0173", - "uparen": "\u24b0", - "upblock": "\u2580", - "upperdothebrew": "\u05c4", - "upsilon": "\u03c5", - "upsilondieresis": "\u03cb", - "upsilondieresistonos": "\u03b0", - "upsilonlatin": "\u028a", - "upsilontonos": "\u03cd", - "uptackbelowcmb": "\u031d", - "uptackmod": "\u02d4", - "uragurmukhi": "\u0a73", - "uring": "\u016f", - "ushortcyrillic": "\u045e", - "usmallhiragana": "\u3045", - "usmallkatakana": "\u30a5", - "usmallkatakanahalfwidth": "\uff69", - "ustraightcyrillic": "\u04af", - "ustraightstrokecyrillic": "\u04b1", - "utilde": "\u0169", - "utildeacute": "\u1e79", - "utildebelow": "\u1e75", - "uubengali": "\u098a", - "uudeva": "\u090a", - "uugujarati": "\u0a8a", - "uugurmukhi": "\u0a0a", - "uumatragurmukhi": "\u0a42", - "uuvowelsignbengali": "\u09c2", - "uuvowelsigndeva": "\u0942", - "uuvowelsigngujarati": "\u0ac2", - "uvowelsignbengali": "\u09c1", - "uvowelsigndeva": "\u0941", - "uvowelsigngujarati": "\u0ac1", - "v": "\u0076", - "vadeva": "\u0935", - "vagujarati": "\u0ab5", - "vagurmukhi": "\u0a35", - "vakatakana": "\u30f7", - "vav": "\u05d5", - "vavdagesh": "\ufb35", - "vavdagesh65": "\ufb35", - "vavdageshhebrew": "\ufb35", - "vavhebrew": "\u05d5", - "vavholam": "\ufb4b", - "vavholamhebrew": "\ufb4b", - "vavvavhebrew": "\u05f0", - "vavyodhebrew": "\u05f1", - "vcircle": "\u24e5", - "vdotbelow": "\u1e7f", - "vecyrillic": "\u0432", - "veharabic": "\u06a4", - "vehfinalarabic": "\ufb6b", - "vehinitialarabic": "\ufb6c", - "vehmedialarabic": "\ufb6d", - "vekatakana": "\u30f9", - "venus": "\u2640", - "verticalbar": "\u007c", - "verticallineabovecmb": "\u030d", - "verticallinebelowcmb": "\u0329", - "verticallinelowmod": "\u02cc", - "verticallinemod": "\u02c8", - "vewarmenian": "\u057e", - "vhook": "\u028b", - "vikatakana": "\u30f8", - "viramabengali": "\u09cd", - "viramadeva": "\u094d", - "viramagujarati": "\u0acd", - "visargabengali": "\u0983", - "visargadeva": "\u0903", - "visargagujarati": "\u0a83", - "vmonospace": "\uff56", - "voarmenian": "\u0578", - "voicediterationhiragana": "\u309e", - "voicediterationkatakana": "\u30fe", - "voicedmarkkana": "\u309b", - "voicedmarkkanahalfwidth": "\uff9e", - "vokatakana": "\u30fa", - "vparen": "\u24b1", - "vtilde": "\u1e7d", - "vturned": "\u028c", - "vuhiragana": "\u3094", - "vukatakana": "\u30f4", - "w": "\u0077", - "wacute": "\u1e83", - "waekorean": "\u3159", - "wahiragana": "\u308f", - "wakatakana": "\u30ef", - "wakatakanahalfwidth": "\uff9c", - "wakorean": "\u3158", - "wasmallhiragana": "\u308e", - "wasmallkatakana": "\u30ee", - "wattosquare": "\u3357", - "wavedash": "\u301c", - "wavyunderscorevertical": "\ufe34", - "wawarabic": "\u0648", - "wawfinalarabic": "\ufeee", - "wawhamzaabovearabic": "\u0624", - "wawhamzaabovefinalarabic": "\ufe86", - "wbsquare": "\u33dd", - "wcircle": "\u24e6", - "wcircumflex": "\u0175", - "wdieresis": "\u1e85", - "wdotaccent": "\u1e87", - "wdotbelow": "\u1e89", - "wehiragana": "\u3091", - "weierstrass": "\u2118", - "wekatakana": "\u30f1", - "wekorean": "\u315e", - "weokorean": "\u315d", - "wgrave": "\u1e81", - "whitebullet": "\u25e6", - "whitecircle": "\u25cb", - "whitecircleinverse": "\u25d9", - "whitecornerbracketleft": "\u300e", - "whitecornerbracketleftvertical": "\ufe43", - "whitecornerbracketright": "\u300f", - "whitecornerbracketrightvertical": "\ufe44", - "whitediamond": "\u25c7", - "whitediamondcontainingblacksmalldiamond": "\u25c8", - "whitedownpointingsmalltriangle": "\u25bf", - "whitedownpointingtriangle": "\u25bd", - "whiteleftpointingsmalltriangle": "\u25c3", - "whiteleftpointingtriangle": "\u25c1", - "whitelenticularbracketleft": "\u3016", - "whitelenticularbracketright": "\u3017", - "whiterightpointingsmalltriangle": "\u25b9", - "whiterightpointingtriangle": "\u25b7", - "whitesmallsquare": "\u25ab", - "whitesmilingface": "\u263a", - "whitesquare": "\u25a1", - "whitestar": "\u2606", - "whitetelephone": "\u260f", - "whitetortoiseshellbracketleft": "\u3018", - "whitetortoiseshellbracketright": "\u3019", - "whiteuppointingsmalltriangle": "\u25b5", - "whiteuppointingtriangle": "\u25b3", - "wihiragana": "\u3090", - "wikatakana": "\u30f0", - "wikorean": "\u315f", - "wmonospace": "\uff57", - "wohiragana": "\u3092", - "wokatakana": "\u30f2", - "wokatakanahalfwidth": "\uff66", - "won": "\u20a9", - "wonmonospace": "\uffe6", - "wowaenthai": "\u0e27", - "wparen": "\u24b2", - "wring": "\u1e98", - "wsuperior": "\u02b7", - "wturned": "\u028d", - "wynn": "\u01bf", - "x": "\u0078", - "xabovecmb": "\u033d", - "xbopomofo": "\u3112", - "xcircle": "\u24e7", - "xdieresis": "\u1e8d", - "xdotaccent": "\u1e8b", - "xeharmenian": "\u056d", - "xi": "\u03be", - "xmonospace": "\uff58", - "xparen": "\u24b3", - "xsuperior": "\u02e3", - "y": "\u0079", - "yaadosquare": "\u334e", - "yabengali": "\u09af", - "yacute": "\u00fd", - "yadeva": "\u092f", - "yaekorean": "\u3152", - "yagujarati": "\u0aaf", - "yagurmukhi": "\u0a2f", - "yahiragana": "\u3084", - "yakatakana": "\u30e4", - "yakatakanahalfwidth": "\uff94", - "yakorean": "\u3151", - "yamakkanthai": "\u0e4e", - "yasmallhiragana": "\u3083", - "yasmallkatakana": "\u30e3", - "yasmallkatakanahalfwidth": "\uff6c", - "yatcyrillic": "\u0463", - "ycircle": "\u24e8", - "ycircumflex": "\u0177", - "ydieresis": "\u00ff", - "ydotaccent": "\u1e8f", - "ydotbelow": "\u1ef5", - "yeharabic": "\u064a", - "yehbarreearabic": "\u06d2", - "yehbarreefinalarabic": "\ufbaf", - "yehfinalarabic": "\ufef2", - "yehhamzaabovearabic": "\u0626", - "yehhamzaabovefinalarabic": "\ufe8a", - "yehhamzaaboveinitialarabic": "\ufe8b", - "yehhamzaabovemedialarabic": "\ufe8c", - "yehinitialarabic": "\ufef3", - "yehmedialarabic": "\ufef4", - "yehmeeminitialarabic": "\ufcdd", - "yehmeemisolatedarabic": "\ufc58", - "yehnoonfinalarabic": "\ufc94", - "yehthreedotsbelowarabic": "\u06d1", - "yekorean": "\u3156", - "yen": "\u00a5", - "yenmonospace": "\uffe5", - "yeokorean": "\u3155", - "yeorinhieuhkorean": "\u3186", - "yerahbenyomohebrew": "\u05aa", - "yerahbenyomolefthebrew": "\u05aa", - "yericyrillic": "\u044b", - "yerudieresiscyrillic": "\u04f9", - "yesieungkorean": "\u3181", - "yesieungpansioskorean": "\u3183", - "yesieungsioskorean": "\u3182", - "yetivhebrew": "\u059a", - "ygrave": "\u1ef3", - "yhook": "\u01b4", - "yhookabove": "\u1ef7", - "yiarmenian": "\u0575", - "yicyrillic": "\u0457", - "yikorean": "\u3162", - "yinyang": "\u262f", - "yiwnarmenian": "\u0582", - "ymonospace": "\uff59", - "yod": "\u05d9", - "yoddagesh": "\ufb39", - "yoddageshhebrew": "\ufb39", - "yodhebrew": "\u05d9", - "yodyodhebrew": "\u05f2", - "yodyodpatahhebrew": "\ufb1f", - "yohiragana": "\u3088", - "yoikorean": "\u3189", - "yokatakana": "\u30e8", - "yokatakanahalfwidth": "\uff96", - "yokorean": "\u315b", - "yosmallhiragana": "\u3087", - "yosmallkatakana": "\u30e7", - "yosmallkatakanahalfwidth": "\uff6e", - "yotgreek": "\u03f3", - "yoyaekorean": "\u3188", - "yoyakorean": "\u3187", - "yoyakthai": "\u0e22", - "yoyingthai": "\u0e0d", - "yparen": "\u24b4", - "ypogegrammeni": "\u037a", - "ypogegrammenigreekcmb": "\u0345", - "yr": "\u01a6", - "yring": "\u1e99", - "ysuperior": "\u02b8", - "ytilde": "\u1ef9", - "yturned": "\u028e", - "yuhiragana": "\u3086", - "yuikorean": "\u318c", - "yukatakana": "\u30e6", - "yukatakanahalfwidth": "\uff95", - "yukorean": "\u3160", - "yusbigcyrillic": "\u046b", - "yusbigiotifiedcyrillic": "\u046d", - "yuslittlecyrillic": "\u0467", - "yuslittleiotifiedcyrillic": "\u0469", - "yusmallhiragana": "\u3085", - "yusmallkatakana": "\u30e5", - "yusmallkatakanahalfwidth": "\uff6d", - "yuyekorean": "\u318b", - "yuyeokorean": "\u318a", - "yyabengali": "\u09df", - "yyadeva": "\u095f", - "z": "\u007a", - "zaarmenian": "\u0566", - "zacute": "\u017a", - "zadeva": "\u095b", - "zagurmukhi": "\u0a5b", - "zaharabic": "\u0638", - "zahfinalarabic": "\ufec6", - "zahinitialarabic": "\ufec7", - "zahiragana": "\u3056", - "zahmedialarabic": "\ufec8", - "zainarabic": "\u0632", - "zainfinalarabic": "\ufeb0", - "zakatakana": "\u30b6", - "zaqefgadolhebrew": "\u0595", - "zaqefqatanhebrew": "\u0594", - "zarqahebrew": "\u0598", - "zayin": "\u05d6", - "zayindagesh": "\ufb36", - "zayindageshhebrew": "\ufb36", - "zayinhebrew": "\u05d6", - "zbopomofo": "\u3117", - "zcaron": "\u017e", - "zcircle": "\u24e9", - "zcircumflex": "\u1e91", - "zcurl": "\u0291", - "zdot": "\u017c", - "zdotaccent": "\u017c", - "zdotbelow": "\u1e93", - "zecyrillic": "\u0437", - "zedescendercyrillic": "\u0499", - "zedieresiscyrillic": "\u04df", - "zehiragana": "\u305c", - "zekatakana": "\u30bc", - "zero": "\u0030", - "zeroarabic": "\u0660", - "zerobengali": "\u09e6", - "zerodeva": "\u0966", - "zerogujarati": "\u0ae6", - "zerogurmukhi": "\u0a66", - "zerohackarabic": "\u0660", - "zeroinferior": "\u2080", - "zeromonospace": "\uff10", - "zerooldstyle": "\uf730", - "zeropersian": "\u06f0", - "zerosuperior": "\u2070", - "zerothai": "\u0e50", - "zerowidthjoiner": "\ufeff", - "zerowidthnonjoiner": "\u200c", - "zerowidthspace": "\u200b", - "zeta": "\u03b6", - "zhbopomofo": "\u3113", - "zhearmenian": "\u056a", - "zhebrevecyrillic": "\u04c2", - "zhecyrillic": "\u0436", - "zhedescendercyrillic": "\u0497", - "zhedieresiscyrillic": "\u04dd", - "zihiragana": "\u3058", - "zikatakana": "\u30b8", - "zinorhebrew": "\u05ae", - "zlinebelow": "\u1e95", - "zmonospace": "\uff5a", - "zohiragana": "\u305e", - "zokatakana": "\u30be", - "zparen": "\u24b5", - "zretroflexhook": "\u0290", - "zstroke": "\u01b6", - "zuhiragana": "\u305a", - "zukatakana": "\u30ba", -} -# --end diff --git a/pdf2zh/high_level.py b/pdf2zh/high_level.py index c525454..f696270 100644 --- a/pdf2zh/high_level.py +++ b/pdf2zh/high_level.py @@ -1,45 +1,36 @@ """Functions that can be used for the most common use-cases for pdf2zh.six""" import logging -import sys -from io import StringIO -from typing import Any, BinaryIO, Container, Iterator, Optional, cast +from typing import BinaryIO import numpy as np import tqdm from pymupdf import Document +from pdfminer.pdfpage import PDFPage +from pdfminer.pdfinterp import PDFResourceManager +from pdfminer.pdfdocument import PDFDocument +from pdfminer.pdfparser import PDFParser +from pdf2zh.converter import TranslateConverter +from pdf2zh.pdfinterp import PDFPageInterpreterEx -from pdf2zh.converter import ( - HOCRConverter, - HTMLConverter, - PDFPageAggregator, - TextConverter, - XMLConverter, -) -from pdf2zh.image import ImageWriter -from pdf2zh.layout import LAParams, LTPage -from pdf2zh.pdfdevice import PDFDevice, TagExtractor -from pdf2zh.pdfexceptions import PDFValueError -from pdf2zh.pdfinterp import PDFPageInterpreter, PDFResourceManager -from pdf2zh.pdfpage import PDFPage -from pdf2zh.utils import AnyIO, FileOrName, open_filename, get_device + +def get_device(): + """Get the device to use for computation.""" + try: + import torch + + if torch.cuda.is_available(): + return "cuda:0" + except ImportError: + pass + + return "cpu" def extract_text_to_fp( inf: BinaryIO, - outfp: AnyIO, - output_type: str = "text", - codec: str = "utf-8", - laparams: Optional[LAParams] = None, - maxpages: int = 0, - pages: Optional[Container[int]] = None, + pages=None, password: str = "", - scale: float = 1.0, - rotation: int = 0, - layoutmode: str = "normal", - output_dir: Optional[str] = None, - strip_control: bool = False, debug: bool = False, - disable_caching: bool = False, page_count: int = 0, vfont: str = "", vchar: str = "", @@ -50,126 +41,37 @@ def extract_text_to_fp( lang_out: str = "", service: str = "", callback: object = None, - **kwargs: Any, + **kwarg, ) -> None: - """Parses text from inf-file and writes to outfp file-like object. - - Takes loads of optional arguments but the defaults are somewhat sane. - Beware laparams: Including an empty LAParams is not the same as passing - None! - - :param inf: a file-like object to read PDF structure from, such as a - file handler (using the builtin `open()` function) or a `BytesIO`. - :param outfp: a file-like object to write the text to. - :param output_type: May be 'text', 'xml', 'html', 'hocr', 'tag'. - Only 'text' works properly. - :param codec: Text decoding codec - :param laparams: An LAParams object from pdf2zh.layout. Default is None - but may not layout correctly. - :param maxpages: How many pages to stop parsing after - :param page_numbers: zero-indexed page numbers to operate on. - :param password: For encrypted PDFs, the password to decrypt. - :param scale: Scale factor - :param rotation: Rotation factor - :param layoutmode: Default is 'normal', see - pdf2zh.converter.HTMLConverter - :param output_dir: If given, creates an ImageWriter for extracted images. - :param strip_control: Does what it says on the tin - :param debug: Output more logging data - :param disable_caching: Does what it says on the tin - :param other: - :return: nothing, acting as it does on two streams. Use StringIO to get - strings. - """ if debug: logging.getLogger().setLevel(logging.DEBUG) - imagewriter = None - if output_dir: - imagewriter = ImageWriter(output_dir) - - rsrcmgr = PDFResourceManager(caching=not disable_caching) - device: Optional[PDFDevice] = None + rsrcmgr = PDFResourceManager() layout = {} - - if output_type != "text" and outfp == sys.stdout: - outfp = sys.stdout.buffer - - if output_type == "text": - device = TextConverter( - rsrcmgr, - outfp, - codec=codec, - laparams=laparams, - imagewriter=imagewriter, - vfont=vfont, - vchar=vchar, - thread=thread, - layout=layout, - lang_in=lang_in, - lang_out=lang_out, - service=service, - ) - - elif output_type == "xml": - device = XMLConverter( - rsrcmgr, - outfp, - codec=codec, - laparams=laparams, - imagewriter=imagewriter, - stripcontrol=strip_control, - ) - - elif output_type == "html": - device = HTMLConverter( - rsrcmgr, - outfp, - codec=codec, - scale=scale, - layoutmode=layoutmode, - laparams=laparams, - imagewriter=imagewriter, - ) - - elif output_type == "hocr": - device = HOCRConverter( - rsrcmgr, - outfp, - codec=codec, - laparams=laparams, - stripcontrol=strip_control, - ) - - elif output_type == "tag": - # Binary I/O is required, but we have no good way to test it here. - device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec) - - else: - msg = f"Output type can be text, html, xml or tag but is {output_type}" - raise PDFValueError(msg) + device = TranslateConverter( + rsrcmgr, vfont, vchar, thread, layout, lang_in, lang_out, service + ) assert device is not None obj_patch = {} - interpreter = PDFPageInterpreter(rsrcmgr, device, obj_patch) + interpreter = PDFPageInterpreterEx(rsrcmgr, device, obj_patch) if pages: total_pages = len(pages) else: total_pages = page_count + + parser = PDFParser(inf) + doc = PDFDocument(parser, password=password) with tqdm.tqdm( - PDFPage.get_pages( - inf, - pages, - maxpages=maxpages, - password=password, - caching=not disable_caching, - ), + enumerate(PDFPage.create_pages(doc)), total=total_pages, - position=0, ) as progress: - for page in progress: + for pageno, page in progress: + if pages and (pageno not in pages): + continue if callback: callback(progress) + page.pageno = pageno pix = doc_en[page.pageno].get_pixmap() image = np.fromstring(pix.samples, np.uint8).reshape( pix.height, pix.width, 3 @@ -202,8 +104,6 @@ def extract_text_to_fp( ) box[y0:y1, x0:x1] = 0 layout[page.pageno] = box - # print(page.number,page_layout) - page.rotate = (page.rotate + rotation) % 360 # 新建一个 xref 存放新指令流 page.page_xref = doc_en.get_new_xref() # hack 插入页面的新 xref doc_en.update_object(page.page_xref, "<<>>") @@ -213,86 +113,3 @@ def extract_text_to_fp( device.close() return obj_patch - - -def extract_text( - pdf_file: FileOrName, - password: str = "", - page_numbers: Optional[Container[int]] = None, - maxpages: int = 0, - caching: bool = True, - codec: str = "utf-8", - laparams: Optional[LAParams] = None, -) -> str: - """Parse and return the text contained in a PDF file. - - :param pdf_file: Either a file path or a file-like object for the PDF file - to be worked on. - :param password: For encrypted PDFs, the password to decrypt. - :param page_numbers: List of zero-indexed page numbers to extract. - :param maxpages: The maximum number of pages to parse - :param caching: If resources should be cached - :param codec: Text decoding codec - :param laparams: An LAParams object from pdf2zh.layout. If None, uses - some default settings that often work well. - :return: a string containing all of the text extracted. - """ - if laparams is None: - laparams = LAParams() - - with open_filename(pdf_file, "rb") as fp, StringIO() as output_string: - fp = cast(BinaryIO, fp) # we opened in binary mode - rsrcmgr = PDFResourceManager(caching=caching) - device = TextConverter(rsrcmgr, output_string, codec=codec, laparams=laparams) - interpreter = PDFPageInterpreter(rsrcmgr, device) - - for page in PDFPage.get_pages( - fp, - page_numbers, - maxpages=maxpages, - password=password, - caching=caching, - ): - interpreter.process_page(page) - - return output_string.getvalue() - - -def extract_pages( - pdf_file: FileOrName, - password: str = "", - page_numbers: Optional[Container[int]] = None, - maxpages: int = 0, - caching: bool = True, - laparams: Optional[LAParams] = None, -) -> Iterator[LTPage]: - """Extract and yield LTPage objects - - :param pdf_file: Either a file path or a file-like object for the PDF file - to be worked on. - :param password: For encrypted PDFs, the password to decrypt. - :param page_numbers: List of zero-indexed page numbers to extract. - :param maxpages: The maximum number of pages to parse - :param caching: If resources should be cached - :param laparams: An LAParams object from pdf2zh.layout. If None, uses - some default settings that often work well. - :return: LTPage objects - """ - if laparams is None: - laparams = LAParams() - - with open_filename(pdf_file, "rb") as fp: - fp = cast(BinaryIO, fp) # we opened in binary mode - resource_manager = PDFResourceManager(caching=caching) - device = PDFPageAggregator(resource_manager, laparams=laparams) - interpreter = PDFPageInterpreter(resource_manager, device) - for page in PDFPage.get_pages( - fp, - page_numbers, - maxpages=maxpages, - password=password, - caching=caching, - ): - interpreter.process_page(page) - layout = device.get_result() - yield layout diff --git a/pdf2zh/image.py b/pdf2zh/image.py deleted file mode 100644 index 99e8e8c..0000000 --- a/pdf2zh/image.py +++ /dev/null @@ -1,297 +0,0 @@ -import os -import os.path -import struct -from io import BytesIO -from typing import BinaryIO, Tuple - -try: - from typing import Literal -except ImportError: - # Literal was introduced in Python 3.8 - from typing_extensions import Literal # type: ignore[assignment] - -from pdf2zh.jbig2 import JBIG2StreamReader, JBIG2StreamWriter -from pdf2zh.layout import LTImage -from pdf2zh.pdfcolor import ( - LITERAL_DEVICE_CMYK, - LITERAL_DEVICE_GRAY, - LITERAL_DEVICE_RGB, - LITERAL_INLINE_DEVICE_GRAY, - LITERAL_INLINE_DEVICE_RGB, -) -from pdf2zh.pdfexceptions import PDFValueError -from pdf2zh.pdftypes import ( - LITERALS_DCT_DECODE, - LITERALS_FLATE_DECODE, - LITERALS_JBIG2_DECODE, - LITERALS_JPX_DECODE, -) - -PIL_ERROR_MESSAGE = ( - "Could not import Pillow. This dependency of pdf2zh.six is not " - "installed by default. You need it to to save jpg images to a file. Install it " - "with `pip install 'pdf2zh.six[image]'`" -) - - -def align32(x: int) -> int: - return ((x + 3) // 4) * 4 - - -class BMPWriter: - def __init__(self, fp: BinaryIO, bits: int, width: int, height: int) -> None: - self.fp = fp - self.bits = bits - self.width = width - self.height = height - if bits == 1: - ncols = 2 - elif bits == 8: - ncols = 256 - elif bits == 24: - ncols = 0 - else: - raise PDFValueError(bits) - self.linesize = align32((self.width * self.bits + 7) // 8) - self.datasize = self.linesize * self.height - headersize = 14 + 40 + ncols * 4 - info = struct.pack( - " None: - self.fp.seek(self.pos1 - (y + 1) * self.linesize) - self.fp.write(data) - - -class ImageWriter: - """Write image to a file - - Supports various image types: JPEG, JBIG2 and bitmaps - """ - - def __init__(self, outdir: str) -> None: - self.outdir = outdir - if not os.path.exists(self.outdir): - os.makedirs(self.outdir) - - def export_image(self, image: LTImage) -> str: - """Save an LTImage to disk""" - (width, height) = image.srcsize - - filters = image.stream.get_filters() - - if filters[-1][0] in LITERALS_DCT_DECODE: - name = self._save_jpeg(image) - - elif filters[-1][0] in LITERALS_JPX_DECODE: - name = self._save_jpeg2000(image) - - elif self._is_jbig2_iamge(image): - name = self._save_jbig2(image) - - elif image.bits == 1: - name = self._save_bmp(image, width, height, (width + 7) // 8, image.bits) - - elif image.bits == 8 and ( - LITERAL_DEVICE_RGB in image.colorspace - or LITERAL_INLINE_DEVICE_RGB in image.colorspace - ): - name = self._save_bmp(image, width, height, width * 3, image.bits * 3) - - elif image.bits == 8 and ( - LITERAL_DEVICE_GRAY in image.colorspace - or LITERAL_INLINE_DEVICE_GRAY in image.colorspace - ): - name = self._save_bmp(image, width, height, width, image.bits) - - elif len(filters) == 1 and filters[0][0] in LITERALS_FLATE_DECODE: - name = self._save_bytes(image) - - else: - name = self._save_raw(image) - - return name - - def _save_jpeg(self, image: LTImage) -> str: - """Save a JPEG encoded image""" - data = image.stream.get_data() - - name, path = self._create_unique_image_name(image, ".jpg") - with open(path, "wb") as fp: - if LITERAL_DEVICE_CMYK in image.colorspace: - try: - from PIL import Image, ImageChops # type: ignore[import] - except ImportError: - raise ImportError(PIL_ERROR_MESSAGE) - - ifp = BytesIO(data) - i = Image.open(ifp) - i = ImageChops.invert(i) - i = i.convert("RGB") - i.save(fp, "JPEG") - else: - fp.write(data) - - return name - - def _save_jpeg2000(self, image: LTImage) -> str: - """Save a JPEG 2000 encoded image""" - data = image.stream.get_data() - - name, path = self._create_unique_image_name(image, ".jp2") - with open(path, "wb") as fp: - try: - from PIL import Image # type: ignore[import] - except ImportError: - raise ImportError(PIL_ERROR_MESSAGE) - - # if we just write the raw data, most image programs - # that I have tried cannot open the file. However, - # open and saving with PIL produces a file that - # seems to be easily opened by other programs - ifp = BytesIO(data) - i = Image.open(ifp) - i.save(fp, "JPEG2000") - return name - - def _save_jbig2(self, image: LTImage) -> str: - """Save a JBIG2 encoded image""" - name, path = self._create_unique_image_name(image, ".jb2") - with open(path, "wb") as fp: - input_stream = BytesIO() - - global_streams = [] - filters = image.stream.get_filters() - for filter_name, params in filters: - if filter_name in LITERALS_JBIG2_DECODE: - global_streams.append(params["JBIG2Globals"].resolve()) - - if len(global_streams) > 1: - msg = ( - "There should never be more than one JBIG2Globals " - "associated with a JBIG2 embedded image" - ) - raise PDFValueError(msg) - if len(global_streams) == 1: - input_stream.write(global_streams[0].get_data().rstrip(b"\n")) - input_stream.write(image.stream.get_data()) - input_stream.seek(0) - reader = JBIG2StreamReader(input_stream) - segments = reader.get_segments() - - writer = JBIG2StreamWriter(fp) - writer.write_file(segments) - return name - - def _save_bmp( - self, - image: LTImage, - width: int, - height: int, - bytes_per_line: int, - bits: int, - ) -> str: - """Save a BMP encoded image""" - name, path = self._create_unique_image_name(image, ".bmp") - with open(path, "wb") as fp: - bmp = BMPWriter(fp, bits, width, height) - data = image.stream.get_data() - i = 0 - for y in range(height): - bmp.write_line(y, data[i : i + bytes_per_line]) - i += bytes_per_line - return name - - def _save_bytes(self, image: LTImage) -> str: - """Save an image without encoding, just bytes""" - name, path = self._create_unique_image_name(image, ".jpg") - width, height = image.srcsize - channels = len(image.stream.get_data()) / width / height / (image.bits / 8) - with open(path, "wb") as fp: - try: - from PIL import ( - Image, # type: ignore[import] - ImageOps, - ) - except ImportError: - raise ImportError(PIL_ERROR_MESSAGE) - - mode: Literal["1", "L", "RGB", "CMYK"] - if image.bits == 1: - mode = "1" - elif image.bits == 8 and channels == 1: - mode = "L" - elif image.bits == 8 and channels == 3: - mode = "RGB" - elif image.bits == 8 and channels == 4: - mode = "CMYK" - - img = Image.frombytes(mode, image.srcsize, image.stream.get_data(), "raw") - if mode == "L": - img = ImageOps.invert(img) - - img.save(fp) - - return name - - def _save_raw(self, image: LTImage) -> str: - """Save an image with unknown encoding""" - ext = ".%d.%dx%d.img" % (image.bits, image.srcsize[0], image.srcsize[1]) - name, path = self._create_unique_image_name(image, ext) - - with open(path, "wb") as fp: - fp.write(image.stream.get_data()) - return name - - @staticmethod - def _is_jbig2_iamge(image: LTImage) -> bool: - filters = image.stream.get_filters() - for filter_name, params in filters: - if filter_name in LITERALS_JBIG2_DECODE: - return True - return False - - def _create_unique_image_name(self, image: LTImage, ext: str) -> Tuple[str, str]: - name = image.name + ext - path = os.path.join(self.outdir, name) - img_index = 0 - while os.path.exists(path): - name = "%s.%d%s" % (image.name, img_index, ext) - path = os.path.join(self.outdir, name) - img_index += 1 - return name, path diff --git a/pdf2zh/jbig2.py b/pdf2zh/jbig2.py deleted file mode 100644 index 594abbf..0000000 --- a/pdf2zh/jbig2.py +++ /dev/null @@ -1,373 +0,0 @@ -import math -import os -from struct import calcsize, pack, unpack -from typing import BinaryIO, Dict, Iterable, List, Optional, Tuple, Union, cast - -from pdf2zh.pdfexceptions import PDFValueError - -# segment structure base -SEG_STRUCT = [ - (">L", "number"), - (">B", "flags"), - (">B", "retention_flags"), - (">B", "page_assoc"), - (">L", "data_length"), -] - -# segment header literals -HEADER_FLAG_DEFERRED = 0b10000000 -HEADER_FLAG_PAGE_ASSOC_LONG = 0b01000000 - -SEG_TYPE_MASK = 0b00111111 - -REF_COUNT_SHORT_MASK = 0b11100000 -REF_COUNT_LONG_MASK = 0x1FFFFFFF -REF_COUNT_LONG = 7 - -DATA_LEN_UNKNOWN = 0xFFFFFFFF - -# segment types -SEG_TYPE_IMMEDIATE_GEN_REGION = 38 -SEG_TYPE_END_OF_PAGE = 49 -SEG_TYPE_END_OF_FILE = 51 - -# file literals -FILE_HEADER_ID = b"\x97\x4a\x42\x32\x0d\x0a\x1a\x0a" -FILE_HEAD_FLAG_SEQUENTIAL = 0b00000001 - - -def bit_set(bit_pos: int, value: int) -> bool: - return bool((value >> bit_pos) & 1) - - -def check_flag(flag: int, value: int) -> bool: - return bool(flag & value) - - -def masked_value(mask: int, value: int) -> int: - for bit_pos in range(31): - if bit_set(bit_pos, mask): - return (value & mask) >> bit_pos - - raise PDFValueError("Invalid mask or value") - - -def mask_value(mask: int, value: int) -> int: - for bit_pos in range(31): - if bit_set(bit_pos, mask): - return (value & (mask >> bit_pos)) << bit_pos - - raise PDFValueError("Invalid mask or value") - - -def unpack_int(format: str, buffer: bytes) -> int: - assert format in {">B", ">I", ">L"} - [result] = cast(Tuple[int], unpack(format, buffer)) - return result - - -JBIG2SegmentFlags = Dict[str, Union[int, bool]] -JBIG2RetentionFlags = Dict[str, Union[int, List[int], List[bool]]] -JBIG2Segment = Dict[ - str, - Union[bool, int, bytes, JBIG2SegmentFlags, JBIG2RetentionFlags], -] - - -class JBIG2StreamReader: - """Read segments from a JBIG2 byte stream""" - - def __init__(self, stream: BinaryIO) -> None: - self.stream = stream - - def get_segments(self) -> List[JBIG2Segment]: - segments: List[JBIG2Segment] = [] - while not self.is_eof(): - segment: JBIG2Segment = {} - for field_format, name in SEG_STRUCT: - field_len = calcsize(field_format) - field = self.stream.read(field_len) - if len(field) < field_len: - segment["_error"] = True - break - value = unpack_int(field_format, field) - parser = getattr(self, "parse_%s" % name, None) - if callable(parser): - value = parser(segment, value, field) - segment[name] = value - - if not segment.get("_error"): - segments.append(segment) - return segments - - def is_eof(self) -> bool: - if self.stream.read(1) == b"": - return True - else: - self.stream.seek(-1, os.SEEK_CUR) - return False - - def parse_flags( - self, - segment: JBIG2Segment, - flags: int, - field: bytes, - ) -> JBIG2SegmentFlags: - return { - "deferred": check_flag(HEADER_FLAG_DEFERRED, flags), - "page_assoc_long": check_flag(HEADER_FLAG_PAGE_ASSOC_LONG, flags), - "type": masked_value(SEG_TYPE_MASK, flags), - } - - def parse_retention_flags( - self, - segment: JBIG2Segment, - flags: int, - field: bytes, - ) -> JBIG2RetentionFlags: - ref_count = masked_value(REF_COUNT_SHORT_MASK, flags) - retain_segments = [] - ref_segments = [] - - if ref_count < REF_COUNT_LONG: - for bit_pos in range(5): - retain_segments.append(bit_set(bit_pos, flags)) - else: - field += self.stream.read(3) - ref_count = unpack_int(">L", field) - ref_count = masked_value(REF_COUNT_LONG_MASK, ref_count) - ret_bytes_count = int(math.ceil((ref_count + 1) / 8)) - for ret_byte_index in range(ret_bytes_count): - ret_byte = unpack_int(">B", self.stream.read(1)) - for bit_pos in range(7): - retain_segments.append(bit_set(bit_pos, ret_byte)) - - seg_num = segment["number"] - assert isinstance(seg_num, int) - if seg_num <= 256: - ref_format = ">B" - elif seg_num <= 65536: - ref_format = ">I" - else: - ref_format = ">L" - - ref_size = calcsize(ref_format) - - for ref_index in range(ref_count): - ref_data = self.stream.read(ref_size) - ref = unpack_int(ref_format, ref_data) - ref_segments.append(ref) - - return { - "ref_count": ref_count, - "retain_segments": retain_segments, - "ref_segments": ref_segments, - } - - def parse_page_assoc(self, segment: JBIG2Segment, page: int, field: bytes) -> int: - if cast(JBIG2SegmentFlags, segment["flags"])["page_assoc_long"]: - field += self.stream.read(3) - page = unpack_int(">L", field) - return page - - def parse_data_length( - self, - segment: JBIG2Segment, - length: int, - field: bytes, - ) -> int: - if length: - if ( - cast(JBIG2SegmentFlags, segment["flags"])["type"] - == SEG_TYPE_IMMEDIATE_GEN_REGION - ) and (length == DATA_LEN_UNKNOWN): - raise NotImplementedError( - "Working with unknown segment length is not implemented yet", - ) - else: - segment["raw_data"] = self.stream.read(length) - - return length - - -class JBIG2StreamWriter: - """Write JBIG2 segments to a file in JBIG2 format""" - - EMPTY_RETENTION_FLAGS: JBIG2RetentionFlags = { - "ref_count": 0, - "ref_segments": cast(List[int], []), - "retain_segments": cast(List[bool], []), - } - - def __init__(self, stream: BinaryIO) -> None: - self.stream = stream - - def write_segments( - self, - segments: Iterable[JBIG2Segment], - fix_last_page: bool = True, - ) -> int: - data_len = 0 - current_page: Optional[int] = None - seg_num: Optional[int] = None - - for segment in segments: - data = self.encode_segment(segment) - self.stream.write(data) - data_len += len(data) - - seg_num = cast(Optional[int], segment["number"]) - - if fix_last_page: - seg_page = cast(int, segment.get("page_assoc")) - - if ( - cast(JBIG2SegmentFlags, segment["flags"])["type"] - == SEG_TYPE_END_OF_PAGE - ): - current_page = None - elif seg_page: - current_page = seg_page - - if fix_last_page and current_page and (seg_num is not None): - segment = self.get_eop_segment(seg_num + 1, current_page) - data = self.encode_segment(segment) - self.stream.write(data) - data_len += len(data) - - return data_len - - def write_file( - self, - segments: Iterable[JBIG2Segment], - fix_last_page: bool = True, - ) -> int: - header = FILE_HEADER_ID - header_flags = FILE_HEAD_FLAG_SEQUENTIAL - header += pack(">B", header_flags) - # The embedded JBIG2 files in a PDF always - # only have one page - number_of_pages = pack(">L", 1) - header += number_of_pages - self.stream.write(header) - data_len = len(header) - - data_len += self.write_segments(segments, fix_last_page) - - seg_num = 0 - for segment in segments: - seg_num = cast(int, segment["number"]) - - if fix_last_page: - seg_num_offset = 2 - else: - seg_num_offset = 1 - eof_segment = self.get_eof_segment(seg_num + seg_num_offset) - data = self.encode_segment(eof_segment) - - self.stream.write(data) - data_len += len(data) - - return data_len - - def encode_segment(self, segment: JBIG2Segment) -> bytes: - data = b"" - for field_format, name in SEG_STRUCT: - value = segment.get(name) - encoder = getattr(self, "encode_%s" % name, None) - if callable(encoder): - field = encoder(value, segment) - else: - field = pack(field_format, value) - data += field - return data - - def encode_flags(self, value: JBIG2SegmentFlags, segment: JBIG2Segment) -> bytes: - flags = 0 - if value.get("deferred"): - flags |= HEADER_FLAG_DEFERRED - - if "page_assoc_long" in value: - flags |= HEADER_FLAG_PAGE_ASSOC_LONG if value["page_assoc_long"] else flags - else: - flags |= ( - HEADER_FLAG_PAGE_ASSOC_LONG - if cast(int, segment.get("page", 0)) > 255 - else flags - ) - - flags |= mask_value(SEG_TYPE_MASK, value["type"]) - - return pack(">B", flags) - - def encode_retention_flags( - self, - value: JBIG2RetentionFlags, - segment: JBIG2Segment, - ) -> bytes: - flags = [] - flags_format = ">B" - ref_count = value["ref_count"] - assert isinstance(ref_count, int) - retain_segments = cast(List[bool], value.get("retain_segments", [])) - - if ref_count <= 4: - flags_byte = mask_value(REF_COUNT_SHORT_MASK, ref_count) - for ref_index, ref_retain in enumerate(retain_segments): - if ref_retain: - flags_byte |= 1 << ref_index - flags.append(flags_byte) - else: - bytes_count = math.ceil((ref_count + 1) / 8) - flags_format = ">L" + ("B" * bytes_count) - flags_dword = mask_value(REF_COUNT_SHORT_MASK, REF_COUNT_LONG) << 24 - flags.append(flags_dword) - - for byte_index in range(bytes_count): - ret_byte = 0 - ret_part = retain_segments[byte_index * 8 : byte_index * 8 + 8] - for bit_pos, ret_seg in enumerate(ret_part): - ret_byte |= 1 << bit_pos if ret_seg else ret_byte - - flags.append(ret_byte) - - ref_segments = cast(List[int], value.get("ref_segments", [])) - - seg_num = cast(int, segment["number"]) - if seg_num <= 256: - ref_format = "B" - elif seg_num <= 65536: - ref_format = "I" - else: - ref_format = "L" - - for ref in ref_segments: - flags_format += ref_format - flags.append(ref) - - return pack(flags_format, *flags) - - def encode_data_length(self, value: int, segment: JBIG2Segment) -> bytes: - data = pack(">L", value) - data += cast(bytes, segment["raw_data"]) - return data - - def get_eop_segment(self, seg_number: int, page_number: int) -> JBIG2Segment: - return { - "data_length": 0, - "flags": {"deferred": False, "type": SEG_TYPE_END_OF_PAGE}, - "number": seg_number, - "page_assoc": page_number, - "raw_data": b"", - "retention_flags": JBIG2StreamWriter.EMPTY_RETENTION_FLAGS, - } - - def get_eof_segment(self, seg_number: int) -> JBIG2Segment: - return { - "data_length": 0, - "flags": {"deferred": False, "type": SEG_TYPE_END_OF_FILE}, - "number": seg_number, - "page_assoc": 0, - "raw_data": b"", - "retention_flags": JBIG2StreamWriter.EMPTY_RETENTION_FLAGS, - } diff --git a/pdf2zh/latin_enc.py b/pdf2zh/latin_enc.py deleted file mode 100644 index c5e8305..0000000 --- a/pdf2zh/latin_enc.py +++ /dev/null @@ -1,246 +0,0 @@ -"""Standard encoding tables used in PDF. - -This table is extracted from PDF Reference Manual 1.6, pp.925 - "D.1 Latin Character Set and Encodings" - -""" - -from typing import List, Optional, Tuple - -EncodingRow = Tuple[str, Optional[int], Optional[int], Optional[int], Optional[int]] - -ENCODING: List[EncodingRow] = [ - # (name, std, mac, win, pdf) - ("A", 65, 65, 65, 65), - ("AE", 225, 174, 198, 198), - ("Aacute", None, 231, 193, 193), - ("Acircumflex", None, 229, 194, 194), - ("Adieresis", None, 128, 196, 196), - ("Agrave", None, 203, 192, 192), - ("Aring", None, 129, 197, 197), - ("Atilde", None, 204, 195, 195), - ("B", 66, 66, 66, 66), - ("C", 67, 67, 67, 67), - ("Ccedilla", None, 130, 199, 199), - ("D", 68, 68, 68, 68), - ("E", 69, 69, 69, 69), - ("Eacute", None, 131, 201, 201), - ("Ecircumflex", None, 230, 202, 202), - ("Edieresis", None, 232, 203, 203), - ("Egrave", None, 233, 200, 200), - ("Eth", None, None, 208, 208), - ("Euro", None, None, 128, 160), - ("F", 70, 70, 70, 70), - ("G", 71, 71, 71, 71), - ("H", 72, 72, 72, 72), - ("I", 73, 73, 73, 73), - ("Iacute", None, 234, 205, 205), - ("Icircumflex", None, 235, 206, 206), - ("Idieresis", None, 236, 207, 207), - ("Igrave", None, 237, 204, 204), - ("J", 74, 74, 74, 74), - ("K", 75, 75, 75, 75), - ("L", 76, 76, 76, 76), - ("Lslash", 232, None, None, 149), - ("M", 77, 77, 77, 77), - ("N", 78, 78, 78, 78), - ("Ntilde", None, 132, 209, 209), - ("O", 79, 79, 79, 79), - ("OE", 234, 206, 140, 150), - ("Oacute", None, 238, 211, 211), - ("Ocircumflex", None, 239, 212, 212), - ("Odieresis", None, 133, 214, 214), - ("Ograve", None, 241, 210, 210), - ("Oslash", 233, 175, 216, 216), - ("Otilde", None, 205, 213, 213), - ("P", 80, 80, 80, 80), - ("Q", 81, 81, 81, 81), - ("R", 82, 82, 82, 82), - ("S", 83, 83, 83, 83), - ("Scaron", None, None, 138, 151), - ("T", 84, 84, 84, 84), - ("Thorn", None, None, 222, 222), - ("U", 85, 85, 85, 85), - ("Uacute", None, 242, 218, 218), - ("Ucircumflex", None, 243, 219, 219), - ("Udieresis", None, 134, 220, 220), - ("Ugrave", None, 244, 217, 217), - ("V", 86, 86, 86, 86), - ("W", 87, 87, 87, 87), - ("X", 88, 88, 88, 88), - ("Y", 89, 89, 89, 89), - ("Yacute", None, None, 221, 221), - ("Ydieresis", None, 217, 159, 152), - ("Z", 90, 90, 90, 90), - ("Zcaron", None, None, 142, 153), - ("a", 97, 97, 97, 97), - ("aacute", None, 135, 225, 225), - ("acircumflex", None, 137, 226, 226), - ("acute", 194, 171, 180, 180), - ("adieresis", None, 138, 228, 228), - ("ae", 241, 190, 230, 230), - ("agrave", None, 136, 224, 224), - ("ampersand", 38, 38, 38, 38), - ("aring", None, 140, 229, 229), - ("asciicircum", 94, 94, 94, 94), - ("asciitilde", 126, 126, 126, 126), - ("asterisk", 42, 42, 42, 42), - ("at", 64, 64, 64, 64), - ("atilde", None, 139, 227, 227), - ("b", 98, 98, 98, 98), - ("backslash", 92, 92, 92, 92), - ("bar", 124, 124, 124, 124), - ("braceleft", 123, 123, 123, 123), - ("braceright", 125, 125, 125, 125), - ("bracketleft", 91, 91, 91, 91), - ("bracketright", 93, 93, 93, 93), - ("breve", 198, 249, None, 24), - ("brokenbar", None, None, 166, 166), - ("bullet", 183, 165, 149, 128), - ("c", 99, 99, 99, 99), - ("caron", 207, 255, None, 25), - ("ccedilla", None, 141, 231, 231), - ("cedilla", 203, 252, 184, 184), - ("cent", 162, 162, 162, 162), - ("circumflex", 195, 246, 136, 26), - ("colon", 58, 58, 58, 58), - ("comma", 44, 44, 44, 44), - ("copyright", None, 169, 169, 169), - ("currency", 168, 219, 164, 164), - ("d", 100, 100, 100, 100), - ("dagger", 178, 160, 134, 129), - ("daggerdbl", 179, 224, 135, 130), - ("degree", None, 161, 176, 176), - ("dieresis", 200, 172, 168, 168), - ("divide", None, 214, 247, 247), - ("dollar", 36, 36, 36, 36), - ("dotaccent", 199, 250, None, 27), - ("dotlessi", 245, 245, None, 154), - ("e", 101, 101, 101, 101), - ("eacute", None, 142, 233, 233), - ("ecircumflex", None, 144, 234, 234), - ("edieresis", None, 145, 235, 235), - ("egrave", None, 143, 232, 232), - ("eight", 56, 56, 56, 56), - ("ellipsis", 188, 201, 133, 131), - ("emdash", 208, 209, 151, 132), - ("endash", 177, 208, 150, 133), - ("equal", 61, 61, 61, 61), - ("eth", None, None, 240, 240), - ("exclam", 33, 33, 33, 33), - ("exclamdown", 161, 193, 161, 161), - ("f", 102, 102, 102, 102), - ("fi", 174, 222, None, 147), - ("five", 53, 53, 53, 53), - ("fl", 175, 223, None, 148), - ("florin", 166, 196, 131, 134), - ("four", 52, 52, 52, 52), - ("fraction", 164, 218, None, 135), - ("g", 103, 103, 103, 103), - ("germandbls", 251, 167, 223, 223), - ("grave", 193, 96, 96, 96), - ("greater", 62, 62, 62, 62), - ("guillemotleft", 171, 199, 171, 171), - ("guillemotright", 187, 200, 187, 187), - ("guilsinglleft", 172, 220, 139, 136), - ("guilsinglright", 173, 221, 155, 137), - ("h", 104, 104, 104, 104), - ("hungarumlaut", 205, 253, None, 28), - ("hyphen", 45, 45, 45, 45), - ("i", 105, 105, 105, 105), - ("iacute", None, 146, 237, 237), - ("icircumflex", None, 148, 238, 238), - ("idieresis", None, 149, 239, 239), - ("igrave", None, 147, 236, 236), - ("j", 106, 106, 106, 106), - ("k", 107, 107, 107, 107), - ("l", 108, 108, 108, 108), - ("less", 60, 60, 60, 60), - ("logicalnot", None, 194, 172, 172), - ("lslash", 248, None, None, 155), - ("m", 109, 109, 109, 109), - ("macron", 197, 248, 175, 175), - ("minus", None, None, None, 138), - ("mu", None, 181, 181, 181), - ("multiply", None, None, 215, 215), - ("n", 110, 110, 110, 110), - ("nbspace", None, 202, 160, None), - ("nine", 57, 57, 57, 57), - ("ntilde", None, 150, 241, 241), - ("numbersign", 35, 35, 35, 35), - ("o", 111, 111, 111, 111), - ("oacute", None, 151, 243, 243), - ("ocircumflex", None, 153, 244, 244), - ("odieresis", None, 154, 246, 246), - ("oe", 250, 207, 156, 156), - ("ogonek", 206, 254, None, 29), - ("ograve", None, 152, 242, 242), - ("one", 49, 49, 49, 49), - ("onehalf", None, None, 189, 189), - ("onequarter", None, None, 188, 188), - ("onesuperior", None, None, 185, 185), - ("ordfeminine", 227, 187, 170, 170), - ("ordmasculine", 235, 188, 186, 186), - ("oslash", 249, 191, 248, 248), - ("otilde", None, 155, 245, 245), - ("p", 112, 112, 112, 112), - ("paragraph", 182, 166, 182, 182), - ("parenleft", 40, 40, 40, 40), - ("parenright", 41, 41, 41, 41), - ("percent", 37, 37, 37, 37), - ("period", 46, 46, 46, 46), - ("periodcentered", 180, 225, 183, 183), - ("perthousand", 189, 228, 137, 139), - ("plus", 43, 43, 43, 43), - ("plusminus", None, 177, 177, 177), - ("q", 113, 113, 113, 113), - ("question", 63, 63, 63, 63), - ("questiondown", 191, 192, 191, 191), - ("quotedbl", 34, 34, 34, 34), - ("quotedblbase", 185, 227, 132, 140), - ("quotedblleft", 170, 210, 147, 141), - ("quotedblright", 186, 211, 148, 142), - ("quoteleft", 96, 212, 145, 143), - ("quoteright", 39, 213, 146, 144), - ("quotesinglbase", 184, 226, 130, 145), - ("quotesingle", 169, 39, 39, 39), - ("r", 114, 114, 114, 114), - ("registered", None, 168, 174, 174), - ("ring", 202, 251, None, 30), - ("s", 115, 115, 115, 115), - ("scaron", None, None, 154, 157), - ("section", 167, 164, 167, 167), - ("semicolon", 59, 59, 59, 59), - ("seven", 55, 55, 55, 55), - ("six", 54, 54, 54, 54), - ("slash", 47, 47, 47, 47), - ("space", 32, 32, 32, 32), - ("space", None, 202, 160, None), - ("space", None, 202, 173, None), - ("sterling", 163, 163, 163, 163), - ("t", 116, 116, 116, 116), - ("thorn", None, None, 254, 254), - ("three", 51, 51, 51, 51), - ("threequarters", None, None, 190, 190), - ("threesuperior", None, None, 179, 179), - ("tilde", 196, 247, 152, 31), - ("trademark", None, 170, 153, 146), - ("two", 50, 50, 50, 50), - ("twosuperior", None, None, 178, 178), - ("u", 117, 117, 117, 117), - ("uacute", None, 156, 250, 250), - ("ucircumflex", None, 158, 251, 251), - ("udieresis", None, 159, 252, 252), - ("ugrave", None, 157, 249, 249), - ("underscore", 95, 95, 95, 95), - ("v", 118, 118, 118, 118), - ("w", 119, 119, 119, 119), - ("x", 120, 120, 120, 120), - ("y", 121, 121, 121, 121), - ("yacute", None, None, 253, 253), - ("ydieresis", None, 216, 255, 255), - ("yen", 165, 180, 165, 165), - ("z", 122, 122, 122, 122), - ("zcaron", None, None, 158, 158), - ("zero", 48, 48, 48, 48), -] diff --git a/pdf2zh/layout.py b/pdf2zh/layout.py deleted file mode 100644 index 0920856..0000000 --- a/pdf2zh/layout.py +++ /dev/null @@ -1,993 +0,0 @@ -import heapq -import logging -from typing import ( - Dict, - Generic, - Iterable, - Iterator, - List, - Optional, - Sequence, - Set, - Tuple, - TypeVar, - Union, - cast, -) - -from pdf2zh.pdfcolor import PDFColorSpace -from pdf2zh.pdfexceptions import PDFTypeError, PDFValueError -from pdf2zh.pdffont import PDFFont -from pdf2zh.pdfinterp import Color, PDFGraphicState -from pdf2zh.pdftypes import PDFStream -from pdf2zh.utils import ( - INF, - LTComponentT, - Matrix, - PathSegment, - Plane, - Point, - Rect, - apply_matrix_pt, - bbox2str, - fsplit, - get_bound, - matrix2str, - uniq, -) - -logger = logging.getLogger(__name__) - - -class IndexAssigner: - def __init__(self, index: int = 0) -> None: - self.index = index - - def run(self, obj: "LTItem") -> None: - if isinstance(obj, LTTextBox): - obj.index = self.index - self.index += 1 - elif isinstance(obj, LTTextGroup): - for x in obj: - self.run(x) - - -class LAParams: - """Parameters for layout analysis - - :param line_overlap: If two characters have more overlap than this they - are considered to be on the same line. The overlap is specified - relative to the minimum height of both characters. - :param char_margin: If two characters are closer together than this - margin they are considered part of the same line. The margin is - specified relative to the width of the character. - :param word_margin: If two characters on the same line are further apart - than this margin then they are considered to be two separate words, and - an intermediate space will be added for readability. The margin is - specified relative to the width of the character. - :param line_margin: If two lines are are close together they are - considered to be part of the same paragraph. The margin is - specified relative to the height of a line. - :param boxes_flow: Specifies how much a horizontal and vertical position - of a text matters when determining the order of text boxes. The value - should be within the range of -1.0 (only horizontal position - matters) to +1.0 (only vertical position matters). You can also pass - `None` to disable advanced layout analysis, and instead return text - based on the position of the bottom left corner of the text box. - :param detect_vertical: If vertical text should be considered during - layout analysis - :param all_texts: If layout analysis should be performed on text in - figures. - """ - - def __init__( - self, - line_overlap: float = 0.5, - char_margin: float = 2.0, - line_margin: float = 0.5, - word_margin: float = 0.1, - boxes_flow: Optional[float] = 0.5, - detect_vertical: bool = False, - all_texts: bool = False, - ) -> None: - self.line_overlap = line_overlap - self.char_margin = char_margin - self.line_margin = line_margin - self.word_margin = word_margin - self.boxes_flow = boxes_flow - self.detect_vertical = detect_vertical - self.all_texts = all_texts - - self._validate() - - def _validate(self) -> None: - if self.boxes_flow is not None: - boxes_flow_err_msg = ( - "LAParam boxes_flow should be None, or a number between -1 and +1" - ) - if not ( - isinstance(self.boxes_flow, int) or isinstance(self.boxes_flow, float) - ): - raise PDFTypeError(boxes_flow_err_msg) - if not -1 <= self.boxes_flow <= 1: - raise PDFValueError(boxes_flow_err_msg) - - def __repr__(self) -> str: - return ( - "" - % (self.char_margin, self.line_margin, self.word_margin, self.all_texts) - ) - - -class LTItem: - """Interface for things that can be analyzed""" - - def analyze(self, laparams: LAParams) -> None: - """Perform the layout analysis.""" - - -class LTText: - """Interface for things that have text""" - - def __repr__(self) -> str: - return f"<{self.__class__.__name__} {self.get_text()!r}>" - - def get_text(self) -> str: - """Text contained in this object""" - raise NotImplementedError - - -class LTComponent(LTItem): - """Object with a bounding box""" - - def __init__(self, bbox: Rect) -> None: - LTItem.__init__(self) - self.set_bbox(bbox) - - def __repr__(self) -> str: - return f"<{self.__class__.__name__} {bbox2str(self.bbox)}>" - - # Disable comparison. - def __lt__(self, _: object) -> bool: - raise PDFValueError - - def __le__(self, _: object) -> bool: - raise PDFValueError - - def __gt__(self, _: object) -> bool: - raise PDFValueError - - def __ge__(self, _: object) -> bool: - raise PDFValueError - - def set_bbox(self, bbox: Rect) -> None: - (x0, y0, x1, y1) = bbox - self.x0 = x0 - self.y0 = y0 - self.x1 = x1 - self.y1 = y1 - self.width = x1 - x0 - self.height = y1 - y0 - self.bbox = bbox - - def is_empty(self) -> bool: - return self.width <= 0 or self.height <= 0 - - def is_hoverlap(self, obj: "LTComponent") -> bool: - assert isinstance(obj, LTComponent), str(type(obj)) - return obj.x0 <= self.x1 and self.x0 <= obj.x1 - - def hdistance(self, obj: "LTComponent") -> float: - assert isinstance(obj, LTComponent), str(type(obj)) - if self.is_hoverlap(obj): - return 0 - else: - return min(abs(self.x0 - obj.x1), abs(self.x1 - obj.x0)) - - def hoverlap(self, obj: "LTComponent") -> float: - assert isinstance(obj, LTComponent), str(type(obj)) - if self.is_hoverlap(obj): - return min(abs(self.x0 - obj.x1), abs(self.x1 - obj.x0)) - else: - return 0 - - def is_voverlap(self, obj: "LTComponent") -> bool: - assert isinstance(obj, LTComponent), str(type(obj)) - return obj.y0 <= self.y1 and self.y0 <= obj.y1 - - def vdistance(self, obj: "LTComponent") -> float: - assert isinstance(obj, LTComponent), str(type(obj)) - if self.is_voverlap(obj): - return 0 - else: - return min(abs(self.y0 - obj.y1), abs(self.y1 - obj.y0)) - - def voverlap(self, obj: "LTComponent") -> float: - assert isinstance(obj, LTComponent), str(type(obj)) - if self.is_voverlap(obj): - return min(abs(self.y0 - obj.y1), abs(self.y1 - obj.y0)) - else: - return 0 - - -class LTCurve(LTComponent): - """A generic Bezier curve - - The parameter `original_path` contains the original - pathing information from the pdf (e.g. for reconstructing Bezier Curves). - - `dashing_style` contains the Dashing information if any. - """ - - def __init__( - self, - linewidth: float, - pts: List[Point], - stroke: bool = False, - fill: bool = False, - evenodd: bool = False, - stroking_color: Optional[Color] = None, - non_stroking_color: Optional[Color] = None, - original_path: Optional[List[PathSegment]] = None, - dashing_style: Optional[Tuple[object, object]] = None, - ) -> None: - LTComponent.__init__(self, get_bound(pts)) - self.pts = pts - self.linewidth = linewidth - self.stroke = stroke - self.fill = fill - self.evenodd = evenodd - self.stroking_color = stroking_color - self.non_stroking_color = non_stroking_color - self.original_path = original_path - self.dashing_style = dashing_style - - def get_pts(self) -> str: - return ",".join("%.3f,%.3f" % p for p in self.pts) - - -class LTLine(LTCurve): - """A single straight line. - - Could be used for separating text or figures. - """ - - def __init__( - self, - linewidth: float, - p0: Point, - p1: Point, - stroke: bool = False, - fill: bool = False, - evenodd: bool = False, - stroking_color: Optional[Color] = None, - non_stroking_color: Optional[Color] = None, - original_path: Optional[List[PathSegment]] = None, - dashing_style: Optional[Tuple[object, object]] = None, - ) -> None: - LTCurve.__init__( - self, - linewidth, - [p0, p1], - stroke, - fill, - evenodd, - stroking_color, - non_stroking_color, - original_path, - dashing_style, - ) - - -class LTRect(LTCurve): - """A rectangle. - - Could be used for framing another pictures or figures. - """ - - def __init__( - self, - linewidth: float, - bbox: Rect, - stroke: bool = False, - fill: bool = False, - evenodd: bool = False, - stroking_color: Optional[Color] = None, - non_stroking_color: Optional[Color] = None, - original_path: Optional[List[PathSegment]] = None, - dashing_style: Optional[Tuple[object, object]] = None, - ) -> None: - (x0, y0, x1, y1) = bbox - LTCurve.__init__( - self, - linewidth, - [(x0, y0), (x1, y0), (x1, y1), (x0, y1)], - stroke, - fill, - evenodd, - stroking_color, - non_stroking_color, - original_path, - dashing_style, - ) - - -class LTImage(LTComponent): - """An image object. - - Embedded images can be in JPEG, Bitmap or JBIG2. - """ - - def __init__(self, name: str, stream: PDFStream, bbox: Rect) -> None: - LTComponent.__init__(self, bbox) - self.name = name - self.stream = stream - self.srcsize = (stream.get_any(("W", "Width")), stream.get_any(("H", "Height"))) - self.imagemask = stream.get_any(("IM", "ImageMask")) - self.bits = stream.get_any(("BPC", "BitsPerComponent"), 1) - self.colorspace = stream.get_any(("CS", "ColorSpace")) - if not isinstance(self.colorspace, list): - self.colorspace = [self.colorspace] - - def __repr__(self) -> str: - return f"<{self.__class__.__name__}({self.name}) {bbox2str(self.bbox)} {self.srcsize!r}>" - - -class LTAnno(LTItem, LTText): - """Actual letter in the text as a Unicode string. - - Note that, while a LTChar object has actual boundaries, LTAnno objects does - not, as these are "virtual" characters, inserted by a layout analyzer - according to the relationship between two characters (e.g. a space). - """ - - def __init__(self, text: str) -> None: - self._text = text - - def get_text(self) -> str: - return self._text - - -class LTChar(LTComponent, LTText): - """Actual letter in the text as a Unicode string.""" - - def __init__( - self, - matrix: Matrix, - font: PDFFont, - fontsize: float, - scaling: float, - rise: float, - text: str, - textwidth: float, - textdisp: Union[float, Tuple[Optional[float], float]], - ncs: PDFColorSpace, - graphicstate: PDFGraphicState, - ) -> None: - LTText.__init__(self) - self._text = text - self.matrix = matrix - self.font = font - self.fontname = font.fontname - self.ncs = ncs - self.graphicstate = graphicstate - self.adv = textwidth * fontsize * scaling - # compute the boundary rectangle. - if font.is_vertical(): - # vertical - assert isinstance(textdisp, tuple) - (vx, vy) = textdisp - if vx is None: - vx = fontsize * 0.5 - else: - vx = vx * fontsize * 0.001 - vy = (1000 - vy) * fontsize * 0.001 - bbox_lower_left = (-vx, vy + rise + self.adv) - bbox_upper_right = (-vx + fontsize, vy + rise) - else: - # horizontal - descent = 0 # descent = font.get_descent() * fontsize - bbox_lower_left = (0, descent + rise) - bbox_upper_right = (self.adv, descent + rise + fontsize) - (a, b, c, d, e, f) = self.matrix - self.upright = a * d * scaling > 0 and b * c <= 0 - (x0, y0) = apply_matrix_pt(self.matrix, bbox_lower_left) - (x1, y1) = apply_matrix_pt(self.matrix, bbox_upper_right) - if x1 < x0: - (x0, x1) = (x1, x0) - if y1 < y0: - (y0, y1) = (y1, y0) - LTComponent.__init__(self, (x0, y0, x1, y1)) - if font.is_vertical(): - self.size = self.width - else: - self.size = self.height - - def __repr__(self) -> str: - return "<{} {} matrix={} font={} adv={} text={}>".format( - self.__class__.__name__, - bbox2str(self.bbox), - matrix2str(self.matrix), - repr(self.fontname), - self.adv, - repr(self.get_text()), - ) - - def get_text(self) -> str: - return self._text - - -LTItemT = TypeVar("LTItemT", bound=LTItem) - - -class LTContainer(LTComponent, Generic[LTItemT]): - """Object that can be extended and analyzed""" - - def __init__(self, bbox: Rect) -> None: - LTComponent.__init__(self, bbox) - self._objs: List[LTItemT] = [] - - def __iter__(self) -> Iterator[LTItemT]: - return iter(self._objs) - - def __len__(self) -> int: - return len(self._objs) - - def add(self, obj: LTItemT) -> None: - self._objs.append(obj) - - def extend(self, objs: Iterable[LTItemT]) -> None: - for obj in objs: - self.add(obj) - - def analyze(self, laparams: LAParams) -> None: - for obj in self._objs: - obj.analyze(laparams) - - -class LTExpandableContainer(LTContainer[LTItemT]): - def __init__(self) -> None: - LTContainer.__init__(self, (+INF, +INF, -INF, -INF)) - - # Incompatible override: we take an LTComponent (with bounding box), but - # super() LTContainer only considers LTItem (no bounding box). - def add(self, obj: LTComponent) -> None: # type: ignore[override] - LTContainer.add(self, cast(LTItemT, obj)) - self.set_bbox( - ( - min(self.x0, obj.x0), - min(self.y0, obj.y0), - max(self.x1, obj.x1), - max(self.y1, obj.y1), - ), - ) - - -class LTTextContainer(LTExpandableContainer[LTItemT], LTText): - def __init__(self) -> None: - LTText.__init__(self) - LTExpandableContainer.__init__(self) - - def get_text(self) -> str: - return "".join( - cast(LTText, obj).get_text() for obj in self if isinstance(obj, LTText) - ) - - -TextLineElement = Union[LTChar, LTAnno] - - -class LTTextLine(LTTextContainer[TextLineElement]): - """Contains a list of LTChar objects that represent a single text line. - - The characters are aligned either horizontally or vertically, depending on - the text's writing mode. - """ - - def __init__(self, word_margin: float) -> None: - super().__init__() - self.word_margin = word_margin - - def __repr__(self) -> str: - return f"<{self.__class__.__name__} {bbox2str(self.bbox)} {self.get_text()!r}>" - - def analyze(self, laparams: LAParams) -> None: - for obj in self._objs: - obj.analyze(laparams) - LTContainer.add(self, LTAnno("\n")) - - def find_neighbors( - self, - plane: Plane[LTComponentT], - ratio: float, - ) -> List["LTTextLine"]: - raise NotImplementedError - - def is_empty(self) -> bool: - return super().is_empty() or self.get_text().isspace() - - -class LTTextLineHorizontal(LTTextLine): - def __init__(self, word_margin: float) -> None: - LTTextLine.__init__(self, word_margin) - self._x1: float = +INF - - # Incompatible override: we take an LTComponent (with bounding box), but - # LTContainer only considers LTItem (no bounding box). - def add(self, obj: LTComponent) -> None: # type: ignore[override] - if isinstance(obj, LTChar) and self.word_margin: - margin = self.word_margin * max(obj.width, obj.height) - if self._x1 < obj.x0 - margin: - LTContainer.add(self, LTAnno(" ")) - self._x1 = obj.x1 - super().add(obj) - - def find_neighbors( - self, - plane: Plane[LTComponentT], - ratio: float, - ) -> List[LTTextLine]: - """Finds neighboring LTTextLineHorizontals in the plane. - - Returns a list of other LTTestLineHorizontals in the plane which are - close to self. "Close" can be controlled by ratio. The returned objects - will be the same height as self, and also either left-, right-, or - centrally-aligned. - """ - d = ratio * self.height - objs = plane.find((self.x0, self.y0 - d, self.x1, self.y1 + d)) - return [ - obj - for obj in objs - if ( - isinstance(obj, LTTextLineHorizontal) - and self._is_same_height_as(obj, tolerance=d) - and ( - self._is_left_aligned_with(obj, tolerance=d) - or self._is_right_aligned_with(obj, tolerance=d) - or self._is_centrally_aligned_with(obj, tolerance=d) - ) - ) - ] - - def _is_left_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool: - """Whether the left-hand edge of `other` is within `tolerance`.""" - return abs(other.x0 - self.x0) <= tolerance - - def _is_right_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool: - """Whether the right-hand edge of `other` is within `tolerance`.""" - return abs(other.x1 - self.x1) <= tolerance - - def _is_centrally_aligned_with( - self, - other: LTComponent, - tolerance: float = 0, - ) -> bool: - """Whether the horizontal center of `other` is within `tolerance`.""" - return abs((other.x0 + other.x1) / 2 - (self.x0 + self.x1) / 2) <= tolerance - - def _is_same_height_as(self, other: LTComponent, tolerance: float = 0) -> bool: - return abs(other.height - self.height) <= tolerance - - -class LTTextLineVertical(LTTextLine): - def __init__(self, word_margin: float) -> None: - LTTextLine.__init__(self, word_margin) - self._y0: float = -INF - - # Incompatible override: we take an LTComponent (with bounding box), but - # LTContainer only considers LTItem (no bounding box). - def add(self, obj: LTComponent) -> None: # type: ignore[override] - if isinstance(obj, LTChar) and self.word_margin: - margin = self.word_margin * max(obj.width, obj.height) - if obj.y1 + margin < self._y0: - LTContainer.add(self, LTAnno(" ")) - self._y0 = obj.y0 - super().add(obj) - - def find_neighbors( - self, - plane: Plane[LTComponentT], - ratio: float, - ) -> List[LTTextLine]: - """Finds neighboring LTTextLineVerticals in the plane. - - Returns a list of other LTTextLineVerticals in the plane which are - close to self. "Close" can be controlled by ratio. The returned objects - will be the same width as self, and also either upper-, lower-, or - centrally-aligned. - """ - d = ratio * self.width - objs = plane.find((self.x0 - d, self.y0, self.x1 + d, self.y1)) - return [ - obj - for obj in objs - if ( - isinstance(obj, LTTextLineVertical) - and self._is_same_width_as(obj, tolerance=d) - and ( - self._is_lower_aligned_with(obj, tolerance=d) - or self._is_upper_aligned_with(obj, tolerance=d) - or self._is_centrally_aligned_with(obj, tolerance=d) - ) - ) - ] - - def _is_lower_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool: - """Whether the lower edge of `other` is within `tolerance`.""" - return abs(other.y0 - self.y0) <= tolerance - - def _is_upper_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool: - """Whether the upper edge of `other` is within `tolerance`.""" - return abs(other.y1 - self.y1) <= tolerance - - def _is_centrally_aligned_with( - self, - other: LTComponent, - tolerance: float = 0, - ) -> bool: - """Whether the vertical center of `other` is within `tolerance`.""" - return abs((other.y0 + other.y1) / 2 - (self.y0 + self.y1) / 2) <= tolerance - - def _is_same_width_as(self, other: LTComponent, tolerance: float) -> bool: - return abs(other.width - self.width) <= tolerance - - -class LTTextBox(LTTextContainer[LTTextLine]): - """Represents a group of text chunks in a rectangular area. - - Note that this box is created by geometric analysis and does not - necessarily represents a logical boundary of the text. It contains a list - of LTTextLine objects. - """ - - def __init__(self) -> None: - LTTextContainer.__init__(self) - self.index: int = -1 - - def __repr__(self) -> str: - return f"<{self.__class__.__name__}({self.index}) {bbox2str(self.bbox)} {self.get_text()!r}>" - - def get_writing_mode(self) -> str: - raise NotImplementedError - - -class LTTextBoxHorizontal(LTTextBox): - def analyze(self, laparams: LAParams) -> None: - super().analyze(laparams) - self._objs.sort(key=lambda obj: -obj.y1) - - def get_writing_mode(self) -> str: - return "lr-tb" - - -class LTTextBoxVertical(LTTextBox): - def analyze(self, laparams: LAParams) -> None: - super().analyze(laparams) - self._objs.sort(key=lambda obj: -obj.x1) - - def get_writing_mode(self) -> str: - return "tb-rl" - - -TextGroupElement = Union[LTTextBox, "LTTextGroup"] - - -class LTTextGroup(LTTextContainer[TextGroupElement]): - def __init__(self, objs: Iterable[TextGroupElement]) -> None: - super().__init__() - self.extend(objs) - - -class LTTextGroupLRTB(LTTextGroup): - def analyze(self, laparams: LAParams) -> None: - super().analyze(laparams) - assert laparams.boxes_flow is not None - boxes_flow = laparams.boxes_flow - # reorder the objects from top-left to bottom-right. - self._objs.sort( - key=lambda obj: (1 - boxes_flow) * obj.x0 - - (1 + boxes_flow) * (obj.y0 + obj.y1), - ) - - -class LTTextGroupTBRL(LTTextGroup): - def analyze(self, laparams: LAParams) -> None: - super().analyze(laparams) - assert laparams.boxes_flow is not None - boxes_flow = laparams.boxes_flow - # reorder the objects from top-right to bottom-left. - self._objs.sort( - key=lambda obj: -(1 + boxes_flow) * (obj.x0 + obj.x1) - - (1 - boxes_flow) * obj.y1, - ) - - -class LTLayoutContainer(LTContainer[LTComponent]): - def __init__(self, bbox: Rect) -> None: - LTContainer.__init__(self, bbox) - self.groups: Optional[List[LTTextGroup]] = None - - # group_objects: group text object to textlines. - def group_objects( - self, - laparams: LAParams, - objs: Iterable[LTComponent], - ) -> Iterator[LTTextLine]: - obj0 = None - line = None - for obj1 in objs: - if obj0 is not None: - # halign: obj0 and obj1 is horizontally aligned. - # - # +------+ - - - - # | obj0 | - - +------+ - - # | | | obj1 | | (line_overlap) - # +------+ - - | | - - # - - - +------+ - # - # |<--->| - # (char_margin) - halign = ( - obj0.is_voverlap(obj1) - and min(obj0.height, obj1.height) * laparams.line_overlap - < obj0.voverlap(obj1) - and obj0.hdistance(obj1) - < max(obj0.width, obj1.width) * laparams.char_margin - ) - - # valign: obj0 and obj1 is vertically aligned. - # - # +------+ - # | obj0 | - # | | - # +------+ - - - - # | | | (char_margin) - # +------+ - - - # | obj1 | - # | | - # +------+ - # - # |<-->| - # (line_overlap) - valign = ( - laparams.detect_vertical - and obj0.is_hoverlap(obj1) - and min(obj0.width, obj1.width) * laparams.line_overlap - < obj0.hoverlap(obj1) - and obj0.vdistance(obj1) - < max(obj0.height, obj1.height) * laparams.char_margin - ) - - if (halign and isinstance(line, LTTextLineHorizontal)) or ( - valign and isinstance(line, LTTextLineVertical) - ): - line.add(obj1) - elif line is not None: - yield line - line = None - elif valign and not halign: - line = LTTextLineVertical(laparams.word_margin) - line.add(obj0) - line.add(obj1) - elif halign and not valign: - line = LTTextLineHorizontal(laparams.word_margin) - line.add(obj0) - line.add(obj1) - else: - line = LTTextLineHorizontal(laparams.word_margin) - line.add(obj0) - yield line - line = None - obj0 = obj1 - if line is None: - line = LTTextLineHorizontal(laparams.word_margin) - assert obj0 is not None - line.add(obj0) - yield line - - def group_textlines( - self, - laparams: LAParams, - lines: Iterable[LTTextLine], - ) -> Iterator[LTTextBox]: - """Group neighboring lines to textboxes""" - plane: Plane[LTTextLine] = Plane(self.bbox) - plane.extend(lines) - boxes: Dict[LTTextLine, LTTextBox] = {} - for line in lines: - neighbors = line.find_neighbors(plane, laparams.line_margin) - members = [line] - for obj1 in neighbors: - members.append(obj1) - if obj1 in boxes: - members.extend(boxes.pop(obj1)) - if isinstance(line, LTTextLineHorizontal): - box: LTTextBox = LTTextBoxHorizontal() - else: - box = LTTextBoxVertical() - for obj in uniq(members): - box.add(obj) - boxes[obj] = box - done = set() - for line in lines: - if line not in boxes: - continue - box = boxes[line] - if box in done: - continue - done.add(box) - if not box.is_empty(): - yield box - - def group_textboxes( - self, - laparams: LAParams, - boxes: Sequence[LTTextBox], - ) -> List[LTTextGroup]: - """Group textboxes hierarchically. - - Get pair-wise distances, via dist func defined below, and then merge - from the closest textbox pair. Once obj1 and obj2 are merged / - grouped, the resulting group is considered as a new object, and its - distances to other objects & groups are added to the process queue. - - For performance reason, pair-wise distances and object pair info are - maintained in a heap of (idx, dist, id(obj1), id(obj2), obj1, obj2) - tuples. It ensures quick access to the smallest element. Note that - since comparison operators, e.g., __lt__, are disabled for - LTComponent, id(obj) has to appear before obj in element tuples. - - :param laparams: LAParams object. - :param boxes: All textbox objects to be grouped. - :return: a list that has only one element, the final top level group. - """ - ElementT = Union[LTTextBox, LTTextGroup] - plane: Plane[ElementT] = Plane(self.bbox) - - def dist(obj1: LTComponent, obj2: LTComponent) -> float: - """A distance function between two TextBoxes. - - Consider the bounding rectangle for obj1 and obj2. - Return its area less the areas of obj1 and obj2, - shown as 'www' below. This value may be negative. - +------+..........+ (x1, y1) - | obj1 |wwwwwwwwww: - +------+www+------+ - :wwwwwwwwww| obj2 | - (x0, y0) +..........+------+ - """ - x0 = min(obj1.x0, obj2.x0) - y0 = min(obj1.y0, obj2.y0) - x1 = max(obj1.x1, obj2.x1) - y1 = max(obj1.y1, obj2.y1) - return ( - (x1 - x0) * (y1 - y0) - - obj1.width * obj1.height - - obj2.width * obj2.height - ) - - def isany(obj1: ElementT, obj2: ElementT) -> Set[ElementT]: - """Check if there's any other object between obj1 and obj2.""" - x0 = min(obj1.x0, obj2.x0) - y0 = min(obj1.y0, obj2.y0) - x1 = max(obj1.x1, obj2.x1) - y1 = max(obj1.y1, obj2.y1) - objs = set(plane.find((x0, y0, x1, y1))) - return objs.difference((obj1, obj2)) - - dists: List[Tuple[bool, float, int, int, ElementT, ElementT]] = [] - for i in range(len(boxes)): - box1 = boxes[i] - for j in range(i + 1, len(boxes)): - box2 = boxes[j] - dists.append((False, dist(box1, box2), id(box1), id(box2), box1, box2)) - heapq.heapify(dists) - - plane.extend(boxes) - done = set() - while len(dists) > 0: - (skip_isany, d, id1, id2, obj1, obj2) = heapq.heappop(dists) - # Skip objects that are already merged - if (id1 not in done) and (id2 not in done): - if not skip_isany and isany(obj1, obj2): - heapq.heappush(dists, (True, d, id1, id2, obj1, obj2)) - continue - if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or isinstance( - obj2, - (LTTextBoxVertical, LTTextGroupTBRL), - ): - group: LTTextGroup = LTTextGroupTBRL([obj1, obj2]) - else: - group = LTTextGroupLRTB([obj1, obj2]) - plane.remove(obj1) - plane.remove(obj2) - done.update([id1, id2]) - - for other in plane: - heapq.heappush( - dists, - (False, dist(group, other), id(group), id(other), group, other), - ) - plane.add(group) - # By now only groups are in the plane - return list(cast(LTTextGroup, g) for g in plane) - - def analyze(self, laparams: LAParams) -> None: - # textobjs is a list of LTChar objects, i.e. - # it has all the individual characters in the page. - (textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self) - for obj in otherobjs: - obj.analyze(laparams) - if not textobjs: - return - textlines = list(self.group_objects(laparams, textobjs)) - (empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines) - for obj in empties: - obj.analyze(laparams) - textboxes = list(self.group_textlines(laparams, textlines)) - if laparams.boxes_flow is None: - for textbox in textboxes: - textbox.analyze(laparams) - - def getkey(box: LTTextBox) -> Tuple[int, float, float]: - if isinstance(box, LTTextBoxVertical): - return (0, -box.x1, -box.y0) - else: - return (1, -box.y0, box.x0) - - textboxes.sort(key=getkey) - else: - self.groups = self.group_textboxes(laparams, textboxes) - assigner = IndexAssigner() - for group in self.groups: - group.analyze(laparams) - assigner.run(group) - textboxes.sort(key=lambda box: box.index) - self._objs = ( - cast(List[LTComponent], textboxes) - + otherobjs - + cast(List[LTComponent], empties) - ) - - -class LTFigure(LTLayoutContainer): - """Represents an area used by PDF Form objects. - - PDF Forms can be used to present figures or pictures by embedding yet - another PDF document within a page. Note that LTFigure objects can appear - recursively. - """ - - def __init__(self, name: str, bbox: Rect, matrix: Matrix) -> None: - self.name = name - self.matrix = matrix - (x, y, w, h) = bbox - bounds = ((x, y), (x + w, y), (x, y + h), (x + w, y + h)) - bbox = get_bound(apply_matrix_pt(matrix, (p, q)) for (p, q) in bounds) - LTLayoutContainer.__init__(self, bbox) - - def __repr__(self) -> str: - return f"<{self.__class__.__name__}({self.name}) {bbox2str(self.bbox)} matrix={matrix2str(self.matrix)}>" - - def analyze(self, laparams: LAParams) -> None: - if not laparams.all_texts: - return - LTLayoutContainer.analyze(self, laparams) - - -class LTPage(LTLayoutContainer): - """Represents an entire page. - - Like any other LTLayoutContainer, an LTPage can be iterated to obtain child - objects like LTTextBox, LTFigure, LTImage, LTRect, LTCurve and LTLine. - """ - - def __init__(self, pageid: int, bbox: Rect, rotate: float = 0) -> None: - LTLayoutContainer.__init__(self, bbox) - self.pageid = pageid - self.rotate = rotate - - def __repr__(self) -> str: - return f"<{self.__class__.__name__}({self.pageid!r}) {bbox2str(self.bbox)} rotate={self.rotate!r}>" diff --git a/pdf2zh/lzw.py b/pdf2zh/lzw.py deleted file mode 100644 index 82a4941..0000000 --- a/pdf2zh/lzw.py +++ /dev/null @@ -1,105 +0,0 @@ -import logging -from io import BytesIO -from typing import BinaryIO, Iterator, List, Optional, cast - -from pdf2zh.pdfexceptions import PDFEOFError, PDFException - -logger = logging.getLogger(__name__) - - -class CorruptDataError(PDFException): - pass - - -class LZWDecoder: - def __init__(self, fp: BinaryIO) -> None: - self.fp = fp - self.buff = 0 - self.bpos = 8 - self.nbits = 9 - # NB: self.table stores None only in indices 256 and 257 - self.table: List[Optional[bytes]] = [] - self.prevbuf: Optional[bytes] = None - - def readbits(self, bits: int) -> int: - v = 0 - while 1: - # the number of remaining bits we can get from the current buffer. - r = 8 - self.bpos - if bits <= r: - # |-----8-bits-----| - # |-bpos-|-bits-| | - # | |----r----| - v = (v << bits) | ((self.buff >> (r - bits)) & ((1 << bits) - 1)) - self.bpos += bits - break - else: - # |-----8-bits-----| - # |-bpos-|---bits----... - # | |----r----| - v = (v << r) | (self.buff & ((1 << r) - 1)) - bits -= r - x = self.fp.read(1) - if not x: - raise PDFEOFError - self.buff = ord(x) - self.bpos = 0 - return v - - def feed(self, code: int) -> bytes: - x = b"" - if code == 256: - self.table = [bytes((c,)) for c in range(256)] # 0-255 - self.table.append(None) # 256 - self.table.append(None) # 257 - self.prevbuf = b"" - self.nbits = 9 - elif code == 257: - pass - elif not self.prevbuf: - x = self.prevbuf = cast(bytes, self.table[code]) # assume not None - else: - if code < len(self.table): - x = cast(bytes, self.table[code]) # assume not None - self.table.append(self.prevbuf + x[:1]) - elif code == len(self.table): - self.table.append(self.prevbuf + self.prevbuf[:1]) - x = cast(bytes, self.table[code]) - else: - raise CorruptDataError - table_length = len(self.table) - if table_length == 511: - self.nbits = 10 - elif table_length == 1023: - self.nbits = 11 - elif table_length == 2047: - self.nbits = 12 - self.prevbuf = x - return x - - def run(self) -> Iterator[bytes]: - while 1: - try: - code = self.readbits(self.nbits) - except EOFError: - break - try: - x = self.feed(code) - except CorruptDataError: - # just ignore corrupt data and stop yielding there - break - yield x - - # logger.debug( - # "nbits=%d, code=%d, output=%r, table=%r", - # self.nbits, - # code, - # x, - # self.table[258:], - # ) - - -def lzwdecode(data: bytes) -> bytes: - fp = BytesIO(data) - s = LZWDecoder(fp).run() - return b"".join(s) diff --git a/pdf2zh/pdf2zh.py b/pdf2zh/pdf2zh.py index ac4ddcf..a60c227 100644 --- a/pdf2zh/pdf2zh.py +++ b/pdf2zh/pdf2zh.py @@ -6,34 +6,16 @@ from __future__ import annotations import argparse -import logging import os import sys from pathlib import Path -from typing import TYPE_CHECKING, Any, Container, Iterable, List, Optional +from typing import Any, Container, Iterable, List, Optional +from pdfminer.pdfexceptions import PDFValueError import pymupdf import requests from pdf2zh import __version__ -from pdf2zh.pdfexceptions import PDFValueError - -if TYPE_CHECKING: - from pdf2zh.layout import LAParams - from pdf2zh.utils import AnyIO - -OUTPUT_TYPES = ((".htm", "html"), (".html", "html"), (".xml", "xml"), (".tag", "tag")) - - -def setup_log() -> None: - logging.basicConfig() - - try: - import doclayout_yolo - - doclayout_yolo.utils.LOGGER.setLevel(logging.WARNING) - except ImportError: - pass def check_files(files: List[str]) -> List[str]: @@ -47,31 +29,11 @@ def check_files(files: List[str]) -> List[str]: return missing_files -def float_or_disabled(x: str) -> Optional[float]: - if x.lower().strip() == "disabled": - return None - try: - return float(x) - except ValueError: - raise argparse.ArgumentTypeError(f"invalid float value: {x}") - - def extract_text( files: Iterable[str] = [], - outfile: str = "-", - laparams: Optional[LAParams] = None, - output_type: str = "text", - codec: str = "utf-8", - strip_control: bool = False, - maxpages: int = 0, pages: Optional[Container[int]] = None, password: str = "", - scale: float = 1.0, - rotation: int = 0, - layoutmode: str = "normal", - output_dir: Optional[str] = None, debug: bool = False, - disable_caching: bool = False, vfont: str = "", vchar: str = "", thread: int = 0, @@ -81,19 +43,13 @@ def extract_text( callback: object = None, output: str = "", **kwargs: Any, -) -> AnyIO: +): import pdf2zh.high_level from pdf2zh.doclayout import DocLayoutModel if not files: raise PDFValueError("Must provide files to work upon!") - if output_type == "text" and outfile != "-": - for override, alttype in OUTPUT_TYPES: - if outfile.endswith(override): - output_type = alttype - - outfp: AnyIO = sys.stdout model = DocLayoutModel.load_available() for file in files: @@ -300,11 +256,9 @@ def main(args: Optional[List[str]] = None) -> int: setup_gui(parsed_args.share) return 0 - setup_log() extract_text(**vars(parsed_args)) return 0 if __name__ == "__main__": sys.exit(main()) - sys.exit(main()) diff --git a/pdf2zh/pdfcolor.py b/pdf2zh/pdfcolor.py deleted file mode 100644 index 08e044e..0000000 --- a/pdf2zh/pdfcolor.py +++ /dev/null @@ -1,37 +0,0 @@ -import collections -from typing import Dict - -from pdf2zh.psparser import LIT - -LITERAL_DEVICE_GRAY = LIT("DeviceGray") -LITERAL_DEVICE_RGB = LIT("DeviceRGB") -LITERAL_DEVICE_CMYK = LIT("DeviceCMYK") -# Abbreviations for inline images -LITERAL_INLINE_DEVICE_GRAY = LIT("G") -LITERAL_INLINE_DEVICE_RGB = LIT("RGB") -LITERAL_INLINE_DEVICE_CMYK = LIT("CMYK") - - -class PDFColorSpace: - def __init__(self, name: str, ncomponents: int) -> None: - self.name = name - self.ncomponents = ncomponents - - def __repr__(self) -> str: - return "" % (self.name, self.ncomponents) - - -PREDEFINED_COLORSPACE: Dict[str, PDFColorSpace] = collections.OrderedDict() - -for name, n in [ - ("DeviceGray", 1), # default value first - ("CalRGB", 3), - ("CalGray", 1), - ("Lab", 3), - ("DeviceRGB", 3), - ("DeviceCMYK", 4), - ("Separation", 1), - ("Indexed", 1), - ("Pattern", 1), -]: - PREDEFINED_COLORSPACE[name] = PDFColorSpace(name, n) diff --git a/pdf2zh/pdfdevice.py b/pdf2zh/pdfdevice.py deleted file mode 100644 index edbbe99..0000000 --- a/pdf2zh/pdfdevice.py +++ /dev/null @@ -1,316 +0,0 @@ -from typing import ( - TYPE_CHECKING, - BinaryIO, - Iterable, - List, - Optional, - Sequence, - Union, - cast, -) - -from pdf2zh import utils -from pdf2zh.pdfcolor import PDFColorSpace -from pdf2zh.pdffont import PDFFont, PDFUnicodeNotDefined -from pdf2zh.pdfpage import PDFPage -from pdf2zh.pdftypes import PDFStream -from pdf2zh.psparser import PSLiteral -from pdf2zh.utils import Matrix, PathSegment, Point, Rect - -if TYPE_CHECKING: - from pdf2zh.pdfinterp import ( - PDFGraphicState, - PDFResourceManager, - PDFStackT, - PDFTextState, - ) - - -PDFTextSeq = Iterable[Union[int, float, bytes]] - - -class PDFDevice: - """Translate the output of PDFPageInterpreter to the output that is needed""" - - def __init__(self, rsrcmgr: "PDFResourceManager") -> None: - self.rsrcmgr = rsrcmgr - self.ctm: Optional[Matrix] = None - - def __repr__(self) -> str: - return "" - - def __enter__(self) -> "PDFDevice": - return self - - def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None: - self.close() - - def close(self) -> None: - pass - - def set_ctm(self, ctm: Matrix) -> None: - self.ctm = ctm - - def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None: - pass - - def end_tag(self) -> None: - pass - - def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None: - pass - - def begin_page(self, page: PDFPage, ctm: Matrix) -> None: - pass - - def end_page(self, page: PDFPage) -> None: - pass - - def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None: - pass - - def end_figure(self, name: str) -> None: - pass - - def paint_path( - self, - graphicstate: "PDFGraphicState", - stroke: bool, - fill: bool, - evenodd: bool, - path: Sequence[PathSegment], - ) -> None: - pass - - def render_image(self, name: str, stream: PDFStream) -> None: - pass - - def render_string( - self, - textstate: "PDFTextState", - seq: PDFTextSeq, - ncs: PDFColorSpace, - graphicstate: "PDFGraphicState", - ) -> None: - pass - - -class PDFTextDevice(PDFDevice): - def render_string( - self, - textstate: "PDFTextState", - seq: PDFTextSeq, - ncs: PDFColorSpace, - graphicstate: "PDFGraphicState", - ) -> None: - assert self.ctm is not None - matrix = utils.mult_matrix(textstate.matrix, self.ctm) - font = textstate.font - fontsize = textstate.fontsize - scaling = textstate.scaling * 0.01 - charspace = textstate.charspace * scaling - wordspace = textstate.wordspace * scaling - rise = textstate.rise - assert font is not None - if font.is_multibyte(): - wordspace = 0 - dxscale = 0.001 * fontsize * scaling - if font.is_vertical(): - textstate.linematrix = self.render_string_vertical( - seq, - matrix, - textstate.linematrix, - font, - fontsize, - scaling, - charspace, - wordspace, - rise, - dxscale, - ncs, - graphicstate, - ) - else: - textstate.linematrix = self.render_string_horizontal( - seq, - matrix, - textstate.linematrix, - font, - fontsize, - scaling, - charspace, - wordspace, - rise, - dxscale, - ncs, - graphicstate, - ) - - def render_string_horizontal( - self, - seq: PDFTextSeq, - matrix: Matrix, - pos: Point, - font: PDFFont, - fontsize: float, - scaling: float, - charspace: float, - wordspace: float, - rise: float, - dxscale: float, - ncs: PDFColorSpace, - graphicstate: "PDFGraphicState", - ) -> Point: - (x, y) = pos - needcharspace = False - for obj in seq: - if isinstance(obj, (int, float)): - x -= obj * dxscale - needcharspace = True - else: - for cid in font.decode(obj): - if needcharspace: - x += charspace - x += self.render_char( - utils.translate_matrix(matrix, (x, y)), - font, - fontsize, - scaling, - rise, - cid, - ncs, - graphicstate, - ) - if cid == 32 and wordspace: - x += wordspace - needcharspace = True - return (x, y) - - def render_string_vertical( - self, - seq: PDFTextSeq, - matrix: Matrix, - pos: Point, - font: PDFFont, - fontsize: float, - scaling: float, - charspace: float, - wordspace: float, - rise: float, - dxscale: float, - ncs: PDFColorSpace, - graphicstate: "PDFGraphicState", - ) -> Point: - (x, y) = pos - needcharspace = False - for obj in seq: - if isinstance(obj, (int, float)): - y -= obj * dxscale - needcharspace = True - else: - for cid in font.decode(obj): - if needcharspace: - y += charspace - y += self.render_char( - utils.translate_matrix(matrix, (x, y)), - font, - fontsize, - scaling, - rise, - cid, - ncs, - graphicstate, - ) - if cid == 32 and wordspace: - y += wordspace - needcharspace = True - return (x, y) - - def render_char( - self, - matrix: Matrix, - font: PDFFont, - fontsize: float, - scaling: float, - rise: float, - cid: int, - ncs: PDFColorSpace, - graphicstate: "PDFGraphicState", - ) -> float: - return 0 - - -class TagExtractor(PDFDevice): - def __init__( - self, - rsrcmgr: "PDFResourceManager", - outfp: BinaryIO, - codec: str = "utf-8", - ) -> None: - PDFDevice.__init__(self, rsrcmgr) - self.outfp = outfp - self.codec = codec - self.pageno = 0 - self._stack: List[PSLiteral] = [] - - def render_string( - self, - textstate: "PDFTextState", - seq: PDFTextSeq, - ncs: PDFColorSpace, - graphicstate: "PDFGraphicState", - ) -> None: - font = textstate.font - assert font is not None - text = "" - for obj in seq: - if isinstance(obj, str): - obj = utils.make_compat_bytes(obj) - if not isinstance(obj, bytes): - continue - chars = font.decode(obj) - for cid in chars: - try: - char = font.to_unichr(cid) - text += char - except PDFUnicodeNotDefined: - pass - self._write(utils.enc(text)) - - def begin_page(self, page: PDFPage, ctm: Matrix) -> None: - output = '' % ( - self.pageno, - utils.bbox2str(page.mediabox), - page.rotate, - ) - self._write(output) - - def end_page(self, page: PDFPage) -> None: - self._write("\n") - self.pageno += 1 - - def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None: - s = "" - if isinstance(props, dict): - s = "".join( - [ - f' {utils.enc(k)}="{utils.make_compat_str(v)}"' - for (k, v) in sorted(props.items()) - ], - ) - out_s = f"<{utils.enc(cast(str, tag.name))}{s}>" - self._write(out_s) - self._stack.append(tag) - - def end_tag(self) -> None: - assert self._stack, str(self.pageno) - tag = self._stack.pop(-1) - out_s = "" % utils.enc(cast(str, tag.name)) - self._write(out_s) - - def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None: - self.begin_tag(tag, props) - self._stack.pop(-1) - - def _write(self, s: str) -> None: - self.outfp.write(s.encode(self.codec)) diff --git a/pdf2zh/pdfdocument.py b/pdf2zh/pdfdocument.py deleted file mode 100644 index 535459e..0000000 --- a/pdf2zh/pdfdocument.py +++ /dev/null @@ -1,1069 +0,0 @@ -import itertools -import logging -import re -import struct -from hashlib import md5, sha256, sha384, sha512 -from typing import ( - Any, - Callable, - Dict, - Iterable, - Iterator, - KeysView, - List, - Optional, - Sequence, - Tuple, - Type, - Union, - cast, -) - -from cryptography.hazmat.backends import default_backend -from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes - -from pdf2zh import settings -from pdf2zh.arcfour import Arcfour -from pdf2zh.data_structures import NumberTree -from pdf2zh.pdfexceptions import ( - PDFException, - PDFKeyError, - PDFObjectNotFound, - PDFTypeError, -) -from pdf2zh.pdfparser import PDFParser, PDFStreamParser, PDFSyntaxError -from pdf2zh.pdftypes import ( - DecipherCallable, - PDFStream, - decipher_all, - dict_value, - int_value, - list_value, - str_value, - stream_value, - uint_value, -) -from pdf2zh.psexceptions import PSEOF -from pdf2zh.psparser import KWD, LIT, literal_name -from pdf2zh.utils import ( - choplist, - decode_text, - format_int_alpha, - format_int_roman, - nunpack, -) - -log = logging.getLogger(__name__) - - -class PDFNoValidXRef(PDFSyntaxError): - pass - - -class PDFNoValidXRefWarning(SyntaxWarning): - """Legacy warning for missing xref. - - Not used anymore because warnings.warn is replaced by logger.Logger.warn. - """ - - -class PDFNoOutlines(PDFException): - pass - - -class PDFNoPageLabels(PDFException): - pass - - -class PDFDestinationNotFound(PDFException): - pass - - -class PDFEncryptionError(PDFException): - pass - - -class PDFPasswordIncorrect(PDFEncryptionError): - pass - - -class PDFEncryptionWarning(UserWarning): - """Legacy warning for failed decryption. - - Not used anymore because warnings.warn is replaced by logger.Logger.warn. - """ - - -class PDFTextExtractionNotAllowedWarning(UserWarning): - """Legacy warning for PDF that does not allow extraction. - - Not used anymore because warnings.warn is replaced by logger.Logger.warn. - """ - - -class PDFTextExtractionNotAllowed(PDFEncryptionError): - pass - - -# some predefined literals and keywords. -LITERAL_OBJSTM = LIT("ObjStm") -LITERAL_XREF = LIT("XRef") -LITERAL_CATALOG = LIT("Catalog") - - -class PDFBaseXRef: - def get_trailer(self) -> Dict[str, Any]: - raise NotImplementedError - - def get_objids(self) -> Iterable[int]: - return [] - - # Must return - # (strmid, index, genno) - # or (None, pos, genno) - def get_pos(self, objid: int) -> Tuple[Optional[int], int, int]: - raise PDFKeyError(objid) - - def load(self, parser: PDFParser) -> None: - raise NotImplementedError - - -class PDFXRef(PDFBaseXRef): - def __init__(self) -> None: - self.offsets: Dict[int, Tuple[Optional[int], int, int]] = {} - self.trailer: Dict[str, Any] = {} - - def __repr__(self) -> str: - return "" % (self.offsets.keys()) - - def load(self, parser: PDFParser) -> None: - while True: - try: - (pos, line) = parser.nextline() - line = line.strip() - if not line: - continue - except PSEOF: - raise PDFNoValidXRef("Unexpected EOF - file corrupted?") - if line.startswith(b"trailer"): - parser.seek(pos) - break - f = line.split(b" ") - if len(f) != 2: - error_msg = f"Trailer not found: {parser!r}: line={line!r}" - raise PDFNoValidXRef(error_msg) - try: - (start, nobjs) = map(int, f) - except ValueError: - error_msg = f"Invalid line: {parser!r}: line={line!r}" - raise PDFNoValidXRef(error_msg) - for objid in range(start, start + nobjs): - try: - (_, line) = parser.nextline() - line = line.strip() - except PSEOF: - raise PDFNoValidXRef("Unexpected EOF - file corrupted?") - f = line.split(b" ") - if len(f) != 3: - error_msg = f"Invalid XRef format: {parser!r}, line={line!r}" - raise PDFNoValidXRef(error_msg) - (pos_b, genno_b, use_b) = f - if use_b != b"n": - continue - self.offsets[objid] = (None, int(pos_b), int(genno_b)) - # log.debug("xref objects: %r", self.offsets) - self.load_trailer(parser) - - def load_trailer(self, parser: PDFParser) -> None: - try: - (_, kwd) = parser.nexttoken() - assert kwd is KWD(b"trailer"), str(kwd) - _, (_, dic) = parser.nextobject() - except PSEOF: - x = parser.pop(1) - if not x: - raise PDFNoValidXRef("Unexpected EOF - file corrupted") - (_, dic) = x[0] - self.trailer.update(dict_value(dic)) - # log.debug("trailer=%r", self.trailer) - - def get_trailer(self) -> Dict[str, Any]: - return self.trailer - - def get_objids(self) -> KeysView[int]: - return self.offsets.keys() - - def get_pos(self, objid: int) -> Tuple[Optional[int], int, int]: - return self.offsets[objid] - - -class PDFXRefFallback(PDFXRef): - def __repr__(self) -> str: - return "" % (self.offsets.keys()) - - PDFOBJ_CUE = re.compile(r"^(\d+)\s+(\d+)\s+obj\b") - - def load(self, parser: PDFParser) -> None: - parser.seek(0) - while 1: - try: - (pos, line_bytes) = parser.nextline() - except PSEOF: - break - if line_bytes.startswith(b"trailer"): - parser.seek(pos) - self.load_trailer(parser) - # log.debug("trailer: %r", self.trailer) - break - line = line_bytes.decode("latin-1") # default pdf encoding - m = self.PDFOBJ_CUE.match(line) - if not m: - continue - (objid_s, genno_s) = m.groups() - objid = int(objid_s) - genno = int(genno_s) - self.offsets[objid] = (None, pos, genno) - # expand ObjStm. - parser.seek(pos) - _, (_, obj) = parser.nextobject() - if isinstance(obj, PDFStream) and obj.get("Type") is LITERAL_OBJSTM: - stream = stream_value(obj) - try: - n = stream["N"] - except KeyError: - if settings.STRICT: - raise PDFSyntaxError("N is not defined: %r" % stream) - n = 0 - parser1 = PDFStreamParser(stream.get_data()) - objs: List[int] = [] - try: - while 1: - _, (_, obj) = parser1.nextobject() - objs.append(cast(int, obj)) - except PSEOF: - pass - n = min(n, len(objs) // 2) - for index in range(n): - objid1 = objs[index * 2] - self.offsets[objid1] = (objid, index, 0) - - -class PDFXRefStream(PDFBaseXRef): - def __init__(self) -> None: - self.data: Optional[bytes] = None - self.entlen: Optional[int] = None - self.fl1: Optional[int] = None - self.fl2: Optional[int] = None - self.fl3: Optional[int] = None - self.ranges: List[Tuple[int, int]] = [] - - def __repr__(self) -> str: - return "" % (self.ranges) - - def load(self, parser: PDFParser) -> None: - (_, objid) = parser.nexttoken() # ignored - (_, genno) = parser.nexttoken() # ignored - (_, kwd) = parser.nexttoken() - _, (_, stream) = parser.nextobject() - if not isinstance(stream, PDFStream) or stream.get("Type") is not LITERAL_XREF: - raise PDFNoValidXRef("Invalid PDF stream spec.") - size = stream["Size"] - index_array = stream.get("Index", (0, size)) - if len(index_array) % 2 != 0: - raise PDFSyntaxError("Invalid index number") - self.ranges.extend(cast(Iterator[Tuple[int, int]], choplist(2, index_array))) - (self.fl1, self.fl2, self.fl3) = stream["W"] - assert self.fl1 is not None and self.fl2 is not None and self.fl3 is not None - self.data = stream.get_data() - self.entlen = self.fl1 + self.fl2 + self.fl3 - self.trailer = stream.attrs - # log.debug( - # "xref stream: objid=%s, fields=%d,%d,%d", - # ", ".join(map(repr, self.ranges)), - # self.fl1, - # self.fl2, - # self.fl3, - # ) - - def get_trailer(self) -> Dict[str, Any]: - return self.trailer - - def get_objids(self) -> Iterator[int]: - for start, nobjs in self.ranges: - for i in range(nobjs): - assert self.entlen is not None - assert self.data is not None - offset = self.entlen * i - ent = self.data[offset : offset + self.entlen] - f1 = nunpack(ent[: self.fl1], 1) - if f1 == 1 or f1 == 2: - yield start + i - - def get_pos(self, objid: int) -> Tuple[Optional[int], int, int]: - index = 0 - for start, nobjs in self.ranges: - if start <= objid and objid < start + nobjs: - index += objid - start - break - else: - index += nobjs - else: - raise PDFKeyError(objid) - assert self.entlen is not None - assert self.data is not None - assert self.fl1 is not None and self.fl2 is not None and self.fl3 is not None - offset = self.entlen * index - ent = self.data[offset : offset + self.entlen] - f1 = nunpack(ent[: self.fl1], 1) - f2 = nunpack(ent[self.fl1 : self.fl1 + self.fl2]) - f3 = nunpack(ent[self.fl1 + self.fl2 :]) - if f1 == 1: - return (None, f2, f3) - elif f1 == 2: - return (f2, f3, 0) - else: - # this is a free object - raise PDFKeyError(objid) - - -class PDFStandardSecurityHandler: - PASSWORD_PADDING = ( - b"(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08" - b"..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz" - ) - supported_revisions: Tuple[int, ...] = (2, 3) - - def __init__( - self, - docid: Sequence[bytes], - param: Dict[str, Any], - password: str = "", - ) -> None: - self.docid = docid - self.param = param - self.password = password - self.init() - - def init(self) -> None: - self.init_params() - if self.r not in self.supported_revisions: - error_msg = "Unsupported revision: param=%r" % self.param - raise PDFEncryptionError(error_msg) - self.init_key() - - def init_params(self) -> None: - self.v = int_value(self.param.get("V", 0)) - self.r = int_value(self.param["R"]) - self.p = uint_value(self.param["P"], 32) - self.o = str_value(self.param["O"]) - self.u = str_value(self.param["U"]) - self.length = int_value(self.param.get("Length", 40)) - - def init_key(self) -> None: - self.key = self.authenticate(self.password) - if self.key is None: - raise PDFPasswordIncorrect - - def is_printable(self) -> bool: - return bool(self.p & 4) - - def is_modifiable(self) -> bool: - return bool(self.p & 8) - - def is_extractable(self) -> bool: - return bool(self.p & 16) - - def compute_u(self, key: bytes) -> bytes: - if self.r == 2: - # Algorithm 3.4 - return Arcfour(key).encrypt(self.PASSWORD_PADDING) # 2 - else: - # Algorithm 3.5 - hash = md5(self.PASSWORD_PADDING) # 2 - hash.update(self.docid[0]) # 3 - result = Arcfour(key).encrypt(hash.digest()) # 4 - for i in range(1, 20): # 5 - k = b"".join(bytes((c ^ i,)) for c in iter(key)) - result = Arcfour(k).encrypt(result) - result += result # 6 - return result - - def compute_encryption_key(self, password: bytes) -> bytes: - # Algorithm 3.2 - password = (password + self.PASSWORD_PADDING)[:32] # 1 - hash = md5(password) # 2 - hash.update(self.o) # 3 - # See https://github.com/pdf2zh/pdf2zh.six/issues/186 - hash.update(struct.pack("= 4: - if not cast(PDFStandardSecurityHandlerV4, self).encrypt_metadata: - hash.update(b"\xff\xff\xff\xff") - result = hash.digest() - n = 5 - if self.r >= 3: - n = self.length // 8 - for _ in range(50): - result = md5(result[:n]).digest() - return result[:n] - - def authenticate(self, password: str) -> Optional[bytes]: - password_bytes = password.encode("latin1") - key = self.authenticate_user_password(password_bytes) - if key is None: - key = self.authenticate_owner_password(password_bytes) - return key - - def authenticate_user_password(self, password: bytes) -> Optional[bytes]: - key = self.compute_encryption_key(password) - if self.verify_encryption_key(key): - return key - else: - return None - - def verify_encryption_key(self, key: bytes) -> bool: - # Algorithm 3.6 - u = self.compute_u(key) - if self.r == 2: - return u == self.u - return u[:16] == self.u[:16] - - def authenticate_owner_password(self, password: bytes) -> Optional[bytes]: - # Algorithm 3.7 - password = (password + self.PASSWORD_PADDING)[:32] - hash = md5(password) - if self.r >= 3: - for _ in range(50): - hash = md5(hash.digest()) - n = 5 - if self.r >= 3: - n = self.length // 8 - key = hash.digest()[:n] - if self.r == 2: - user_password = Arcfour(key).decrypt(self.o) - else: - user_password = self.o - for i in range(19, -1, -1): - k = b"".join(bytes((c ^ i,)) for c in iter(key)) - user_password = Arcfour(k).decrypt(user_password) - return self.authenticate_user_password(user_password) - - def decrypt( - self, - objid: int, - genno: int, - data: bytes, - attrs: Optional[Dict[str, Any]] = None, - ) -> bytes: - return self.decrypt_rc4(objid, genno, data) - - def decrypt_rc4(self, objid: int, genno: int, data: bytes) -> bytes: - assert self.key is not None - key = self.key + struct.pack(" None: - super().init_params() - self.length = 128 - self.cf = dict_value(self.param.get("CF")) - self.stmf = literal_name(self.param["StmF"]) - self.strf = literal_name(self.param["StrF"]) - self.encrypt_metadata = bool(self.param.get("EncryptMetadata", True)) - if self.stmf != self.strf: - error_msg = "Unsupported crypt filter: param=%r" % self.param - raise PDFEncryptionError(error_msg) - self.cfm = {} - for k, v in self.cf.items(): - f = self.get_cfm(literal_name(v["CFM"])) - if f is None: - error_msg = "Unknown crypt filter method: param=%r" % self.param - raise PDFEncryptionError(error_msg) - self.cfm[k] = f - self.cfm["Identity"] = self.decrypt_identity - if self.strf not in self.cfm: - error_msg = "Undefined crypt filter: param=%r" % self.param - raise PDFEncryptionError(error_msg) - - def get_cfm(self, name: str) -> Optional[Callable[[int, int, bytes], bytes]]: - if name == "V2": - return self.decrypt_rc4 - elif name == "AESV2": - return self.decrypt_aes128 - else: - return None - - def decrypt( - self, - objid: int, - genno: int, - data: bytes, - attrs: Optional[Dict[str, Any]] = None, - name: Optional[str] = None, - ) -> bytes: - if not self.encrypt_metadata and attrs is not None: - t = attrs.get("Type") - if t is not None and literal_name(t) == "Metadata": - return data - if name is None: - name = self.strf - return self.cfm[name](objid, genno, data) - - def decrypt_identity(self, objid: int, genno: int, data: bytes) -> bytes: - return data - - def decrypt_aes128(self, objid: int, genno: int, data: bytes) -> bytes: - assert self.key is not None - key = ( - self.key - + struct.pack(" None: - super().init_params() - self.length = 256 - self.oe = str_value(self.param["OE"]) - self.ue = str_value(self.param["UE"]) - self.o_hash = self.o[:32] - self.o_validation_salt = self.o[32:40] - self.o_key_salt = self.o[40:] - self.u_hash = self.u[:32] - self.u_validation_salt = self.u[32:40] - self.u_key_salt = self.u[40:] - - def get_cfm(self, name: str) -> Optional[Callable[[int, int, bytes], bytes]]: - if name == "AESV3": - return self.decrypt_aes256 - else: - return None - - def authenticate(self, password: str) -> Optional[bytes]: - password_b = self._normalize_password(password) - hash = self._password_hash(password_b, self.o_validation_salt, self.u) - if hash == self.o_hash: - hash = self._password_hash(password_b, self.o_key_salt, self.u) - cipher = Cipher( - algorithms.AES(hash), - modes.CBC(b"\0" * 16), - backend=default_backend(), - ) # type: ignore - return cipher.decryptor().update(self.oe) # type: ignore - hash = self._password_hash(password_b, self.u_validation_salt) - if hash == self.u_hash: - hash = self._password_hash(password_b, self.u_key_salt) - cipher = Cipher( - algorithms.AES(hash), - modes.CBC(b"\0" * 16), - backend=default_backend(), - ) # type: ignore - return cipher.decryptor().update(self.ue) # type: ignore - return None - - def _normalize_password(self, password: str) -> bytes: - if self.r == 6: - # saslprep expects non-empty strings, apparently - if not password: - return b"" - from pdf2zh._saslprep import saslprep - - password = saslprep(password) - return password.encode("utf-8")[:127] - - def _password_hash( - self, - password: bytes, - salt: bytes, - vector: Optional[bytes] = None, - ) -> bytes: - """Compute password hash depending on revision number""" - if self.r == 5: - return self._r5_password(password, salt, vector) - return self._r6_password(password, salt[0:8], vector) - - def _r5_password( - self, - password: bytes, - salt: bytes, - vector: Optional[bytes] = None, - ) -> bytes: - """Compute the password for revision 5""" - hash = sha256(password) - hash.update(salt) - if vector is not None: - hash.update(vector) - return hash.digest() - - def _r6_password( - self, - password: bytes, - salt: bytes, - vector: Optional[bytes] = None, - ) -> bytes: - """Compute the password for revision 6""" - initial_hash = sha256(password) - initial_hash.update(salt) - if vector is not None: - initial_hash.update(vector) - k = initial_hash.digest() - hashes = (sha256, sha384, sha512) - round_no = last_byte_val = 0 - while round_no < 64 or last_byte_val > round_no - 32: - k1 = (password + k + (vector or b"")) * 64 - e = self._aes_cbc_encrypt(key=k[:16], iv=k[16:32], data=k1) - # compute the first 16 bytes of e, - # interpreted as an unsigned integer mod 3 - next_hash = hashes[self._bytes_mod_3(e[:16])] - k = next_hash(e).digest() - last_byte_val = e[len(e) - 1] - round_no += 1 - return k[:32] - - @staticmethod - def _bytes_mod_3(input_bytes: bytes) -> int: - # 256 is 1 mod 3, so we can just sum 'em - return sum(b % 3 for b in input_bytes) % 3 - - def _aes_cbc_encrypt(self, key: bytes, iv: bytes, data: bytes) -> bytes: - cipher = Cipher(algorithms.AES(key), modes.CBC(iv)) - encryptor = cipher.encryptor() # type: ignore - return encryptor.update(data) + encryptor.finalize() # type: ignore - - def decrypt_aes256(self, objid: int, genno: int, data: bytes) -> bytes: - initialization_vector = data[:16] - ciphertext = data[16:] - assert self.key is not None - cipher = Cipher( - algorithms.AES(self.key), - modes.CBC(initialization_vector), - backend=default_backend(), - ) # type: ignore - return cipher.decryptor().update(ciphertext) # type: ignore - - -class PDFDocument: - """PDFDocument object represents a PDF document. - - Since a PDF file can be very big, normally it is not loaded at - once. So PDF document has to cooperate with a PDF parser in order to - dynamically import the data as processing goes. - - Typical usage: - doc = PDFDocument(parser, password) - obj = doc.getobj(objid) - - """ - - security_handler_registry: Dict[int, Type[PDFStandardSecurityHandler]] = { - 1: PDFStandardSecurityHandler, - 2: PDFStandardSecurityHandler, - 4: PDFStandardSecurityHandlerV4, - 5: PDFStandardSecurityHandlerV5, - } - - def __init__( - self, - parser: PDFParser, - password: str = "", - caching: bool = True, - fallback: bool = True, - ) -> None: - """Set the document to use a given PDFParser object.""" - self.caching = caching - self.xrefs: List[PDFBaseXRef] = [] - self.info = [] - self.catalog: Dict[str, Any] = {} - self.encryption: Optional[Tuple[Any, Any]] = None - self.decipher: Optional[DecipherCallable] = None - self._parser = None - self._cached_objs: Dict[int, Tuple[object, int]] = {} - self._parsed_objs: Dict[int, Tuple[List[object], int]] = {} - self._parser = parser - self._parser.set_document(self) - self.is_printable = self.is_modifiable = self.is_extractable = True - # Retrieve the information of each header that was appended - # (maybe multiple times) at the end of the document. - try: - # print('FIND XREF') - pos = self.find_xref(parser) - self.pos = pos - self.read_xref_from(parser, pos, self.xrefs) - except PDFNoValidXRef: - if fallback: - parser.fallback = True - newxref = PDFXRefFallback() - newxref.load(parser) - self.xrefs.append(newxref) - # print(f'XREF {self.xrefs}') - for xref in self.xrefs: - trailer = xref.get_trailer() - if not trailer: - continue - # If there's an encryption info, remember it. - if "Encrypt" in trailer: - if "ID" in trailer: - id_value = list_value(trailer["ID"]) - else: - # Some documents may not have a /ID, use two empty - # byte strings instead. Solves - # https://github.com/pdf2zh/pdf2zh.six/issues/594 - id_value = (b"", b"") - self.encryption = (id_value, dict_value(trailer["Encrypt"])) - self._initialize_password(password) - if "Info" in trailer: - self.info.append(dict_value(trailer["Info"])) - if "Root" in trailer: - # Every PDF file must have exactly one /Root dictionary. - self.catalog = dict_value(trailer["Root"]) - break - else: - raise PDFSyntaxError("No /Root object! - Is this really a PDF?") - if self.catalog.get("Type") is not LITERAL_CATALOG: - if settings.STRICT: - raise PDFSyntaxError("Catalog not found!") - - KEYWORD_OBJ = KWD(b"obj") - - # _initialize_password(password=b'') - # Perform the initialization with a given password. - def _initialize_password(self, password: str = "") -> None: - assert self.encryption is not None - (docid, param) = self.encryption - if literal_name(param.get("Filter")) != "Standard": - raise PDFEncryptionError("Unknown filter: param=%r" % param) - v = int_value(param.get("V", 0)) - factory = self.security_handler_registry.get(v) - if factory is None: - raise PDFEncryptionError("Unknown algorithm: param=%r" % param) - handler = factory(docid, param, password) - self.decipher = handler.decrypt - self.is_printable = handler.is_printable() - self.is_modifiable = handler.is_modifiable() - self.is_extractable = handler.is_extractable() - assert self._parser is not None - self._parser.fallback = False # need to read streams with exact length - - def _getobj_objstm(self, stream: PDFStream, index: int, objid: int) -> object: - if stream.objid in self._parsed_objs: - (objs, n) = self._parsed_objs[stream.objid] - else: - (objs, n) = self._get_objects(stream) - if self.caching: - assert stream.objid is not None - self._parsed_objs[stream.objid] = (objs, n) - i = n * 2 + index - try: - obj = objs[i] - except IndexError: - raise PDFSyntaxError("index too big: %r" % index) - return obj - - def _get_objects(self, stream: PDFStream) -> Tuple[List[object], int]: - if stream.get("Type") is not LITERAL_OBJSTM: - if settings.STRICT: - raise PDFSyntaxError("Not a stream object: %r" % stream) - try: - n = cast(int, stream["N"]) - except KeyError: - if settings.STRICT: - raise PDFSyntaxError("N is not defined: %r" % stream) - n = 0 - parser = PDFStreamParser(stream.get_data()) - parser.set_document(self) - objs: List[object] = [] - try: - while 1: - _, (_, obj) = parser.nextobject() - objs.append(obj) - except PSEOF: - pass - return (objs, n) - - def _getobj_parse(self, pos: int, objid: int) -> object: - assert self._parser is not None - self._parser.seek(pos) - (_, objid1) = self._parser.nexttoken() # objid - (_, genno) = self._parser.nexttoken() # genno - (_, kwd) = self._parser.nexttoken() - # hack around malformed pdf files - # copied from https://github.com/jaepil/pdf2zh3k/blob/master/ - # pdf2zh/pdfparser.py#L399 - # to solve https://github.com/pdf2zh/pdf2zh.six/issues/56 - # assert objid1 == objid, str((objid1, objid)) - if objid1 != objid: - x = [] - while kwd is not self.KEYWORD_OBJ: - (_, kwd) = self._parser.nexttoken() - x.append(kwd) - if len(x) >= 2: - objid1 = x[-2] - # #### end hack around malformed pdf files - if objid1 != objid: - raise PDFSyntaxError(f"objid mismatch: {objid1!r}={objid!r}") - - if kwd != KWD(b"obj"): - raise PDFSyntaxError("Invalid object spec: offset=%r" % pos) - end, (_, obj) = self._parser.nextobject() - return end, obj - - # can raise PDFObjectNotFound - def getobj(self, objid: int) -> object: - """Get object from PDF - - :raises PDFException if PDFDocument is not initialized - :raises PDFObjectNotFound if objid does not exist in PDF - """ - if not self.xrefs: - raise PDFException("PDFDocument is not initialized") - # log.debug("getobj: objid=%r", objid) - if objid in self._cached_objs: - (obj, genno) = self._cached_objs[objid] - else: - for xref in self.xrefs: - try: - (strmid, index, genno) = xref.get_pos(objid) - except KeyError: - continue - try: - if strmid is not None: - stream = stream_value(self.getobj(strmid)) - obj = self._getobj_objstm(stream, index, objid) - else: - end, obj = self._getobj_parse(index, objid) - if self.decipher: - obj = decipher_all(self.decipher, objid, genno, obj) - - if isinstance(obj, PDFStream): - obj.set_objid(objid, genno) - break - except (PSEOF, PDFSyntaxError): - continue - else: - raise PDFObjectNotFound(objid) - # log.debug("register: objid=%r: %r", objid, obj) - if self.caching: - self._cached_objs[objid] = (obj, genno) - return obj - - OutlineType = Tuple[Any, Any, Any, Any, Any] - - def get_outlines(self) -> Iterator[OutlineType]: - if "Outlines" not in self.catalog: - raise PDFNoOutlines - - def search(entry: object, level: int) -> Iterator[PDFDocument.OutlineType]: - entry = dict_value(entry) - if "Title" in entry: - if "A" in entry or "Dest" in entry: - title = decode_text(str_value(entry["Title"])) - dest = entry.get("Dest") - action = entry.get("A") - se = entry.get("SE") - yield (level, title, dest, action, se) - if "First" in entry and "Last" in entry: - yield from search(entry["First"], level + 1) - if "Next" in entry: - yield from search(entry["Next"], level) - - return search(self.catalog["Outlines"], 0) - - def get_page_labels(self) -> Iterator[str]: - """Generate page label strings for the PDF document. - - If the document includes page labels, generates strings, one per page. - If not, raises PDFNoPageLabels. - - The resulting iteration is unbounded. - """ - assert self.catalog is not None - - try: - page_labels = PageLabels(self.catalog["PageLabels"]) - except (PDFTypeError, KeyError): - raise PDFNoPageLabels - - return page_labels.labels - - def lookup_name(self, cat: str, key: Union[str, bytes]) -> Any: - try: - names = dict_value(self.catalog["Names"]) - except (PDFTypeError, KeyError): - raise PDFKeyError((cat, key)) - # may raise KeyError - d0 = dict_value(names[cat]) - - def lookup(d: Dict[str, Any]) -> Any: - if "Limits" in d: - (k1, k2) = list_value(d["Limits"]) - if key < k1 or k2 < key: - return None - if "Names" in d: - objs = list_value(d["Names"]) - names = dict( - cast(Iterator[Tuple[Union[str, bytes], Any]], choplist(2, objs)), - ) - return names[key] - if "Kids" in d: - for c in list_value(d["Kids"]): - v = lookup(dict_value(c)) - if v: - return v - raise PDFKeyError((cat, key)) - - return lookup(d0) - - def get_dest(self, name: Union[str, bytes]) -> Any: - try: - # PDF-1.2 or later - obj = self.lookup_name("Dests", name) - except KeyError: - # PDF-1.1 or prior - if "Dests" not in self.catalog: - raise PDFDestinationNotFound(name) - d0 = dict_value(self.catalog["Dests"]) - if name not in d0: - raise PDFDestinationNotFound(name) - obj = d0[name] - return obj - - # find_xref - def find_xref(self, parser: PDFParser) -> int: - """Internal function used to locate the first XRef.""" - # search the last xref table by scanning the file backwards. - prev = b"" - for line in parser.revreadlines(): - line = line.strip() - # log.debug("find_xref: %r", line) - - if line == b"startxref": - # log.debug("xref found: pos=%r", prev) - - if not prev.isdigit(): - raise PDFNoValidXRef(f"Invalid xref position: {prev!r}") - - start = int(prev) - - if not start >= 0: - raise PDFNoValidXRef(f"Invalid negative xref position: {start}") - - return start - - if line: - prev = line - - raise PDFNoValidXRef("Unexpected EOF") - - # read xref table - def read_xref_from( - self, - parser: PDFParser, - start: int, - xrefs: List[PDFBaseXRef], - ) -> None: - """Reads XRefs from the given location.""" - parser.seek(start) - parser.reset() - try: - (pos, token) = parser.nexttoken() - except PSEOF: - raise PDFNoValidXRef("Unexpected EOF") - # log.debug("read_xref_from: start=%d, token=%r", start, token) - if isinstance(token, int): - # XRefStream: PDF-1.5 - parser.seek(pos) - parser.reset() - xref: PDFBaseXRef = PDFXRefStream() - xref.load(parser) - else: - if token is parser.KEYWORD_XREF: - parser.nextline() - xref = PDFXRef() - xref.load(parser) - xrefs.append(xref) - trailer = xref.get_trailer() - # log.debug("trailer: %r", trailer) - if "XRefStm" in trailer: - pos = int_value(trailer["XRefStm"]) - self.read_xref_from(parser, pos, xrefs) - if "Prev" in trailer: - # find previous xref - pos = int_value(trailer["Prev"]) - self.read_xref_from(parser, pos, xrefs) - - -class PageLabels(NumberTree): - """PageLabels from the document catalog. - - See Section 8.3.1 in the PDF Reference. - """ - - @property - def labels(self) -> Iterator[str]: - ranges = self.values - - # The tree must begin with page index 0 - if len(ranges) == 0 or ranges[0][0] != 0: - if settings.STRICT: - raise PDFSyntaxError("PageLabels is missing page index 0") - else: - # Try to cope, by assuming empty labels for the initial pages - ranges.insert(0, (0, {})) - - for next, (start, label_dict_unchecked) in enumerate(ranges, 1): - label_dict = dict_value(label_dict_unchecked) - style = label_dict.get("S") - prefix = decode_text(str_value(label_dict.get("P", b""))) - first_value = int_value(label_dict.get("St", 1)) - - if next == len(ranges): - # This is the last specified range. It continues until the end - # of the document. - values: Iterable[int] = itertools.count(first_value) - else: - end, _ = ranges[next] - range_length = end - start - values = range(first_value, first_value + range_length) - - for value in values: - label = self._format_page_label(value, style) - yield prefix + label - - @staticmethod - def _format_page_label(value: int, style: Any) -> str: - """Format page label value in a specific style""" - if style is None: - label = "" - elif style is LIT("D"): # Decimal arabic numerals - label = str(value) - elif style is LIT("R"): # Uppercase roman numerals - label = format_int_roman(value).upper() - elif style is LIT("r"): # Lowercase roman numerals - label = format_int_roman(value) - elif style is LIT("A"): # Uppercase letters A-Z, AA-ZZ... - label = format_int_alpha(value).upper() - elif style is LIT("a"): # Lowercase letters a-z, aa-zz... - label = format_int_alpha(value) - else: - log.warning("Unknown page label style: %r", style) - label = "" - return label diff --git a/pdf2zh/pdfexceptions.py b/pdf2zh/pdfexceptions.py deleted file mode 100644 index e1a82ac..0000000 --- a/pdf2zh/pdfexceptions.py +++ /dev/null @@ -1,33 +0,0 @@ -from pdf2zh.psexceptions import PSException - - -class PDFException(PSException): - pass - - -class PDFTypeError(PDFException, TypeError): - pass - - -class PDFValueError(PDFException, ValueError): - pass - - -class PDFObjectNotFound(PDFException): - pass - - -class PDFNotImplementedError(PDFException, NotImplementedError): - pass - - -class PDFKeyError(PDFException, KeyError): - pass - - -class PDFEOFError(PDFException, EOFError): - pass - - -class PDFIOError(PDFException, IOError): - pass diff --git a/pdf2zh/pdffont.py b/pdf2zh/pdffont.py deleted file mode 100644 index 5591e1e..0000000 --- a/pdf2zh/pdffont.py +++ /dev/null @@ -1,1190 +0,0 @@ -import logging -import struct -from io import BytesIO -from typing import ( - TYPE_CHECKING, - Any, - BinaryIO, - Dict, - Iterable, - Iterator, - List, - Mapping, - Optional, - Tuple, - Union, - cast, -) - -from pdf2zh import settings -from pdf2zh.cmapdb import ( - CMap, - CMapBase, - CMapDB, - CMapParser, - FileUnicodeMap, - IdentityUnicodeMap, - UnicodeMap, -) -from pdf2zh.encodingdb import EncodingDB, name2unicode -from pdf2zh.fontmetrics import FONT_METRICS -from pdf2zh.pdfexceptions import PDFException, PDFKeyError, PDFValueError -from pdf2zh.pdftypes import ( - PDFStream, - dict_value, - int_value, - list_value, - num_value, - resolve1, - resolve_all, - stream_value, -) -from pdf2zh.psexceptions import PSEOF -from pdf2zh.psparser import ( - KWD, - LIT, - PSKeyword, - PSLiteral, - PSStackParser, - literal_name, -) -from pdf2zh.utils import Matrix, Point, Rect, apply_matrix_norm, choplist, nunpack - -if TYPE_CHECKING: - from pdf2zh.pdfinterp import PDFResourceManager - -log = logging.getLogger(__name__) - - -def get_widths(seq: Iterable[object]) -> Dict[int, float]: - """Build a mapping of character widths for horizontal writing.""" - widths: Dict[int, float] = {} - r: List[float] = [] - for v in seq: - if isinstance(v, list): - if r: - char1 = r[-1] - for i, w in enumerate(v): - widths[cast(int, char1) + i] = w - r = [] - elif isinstance(v, (int, float)): # == utils.isnumber(v) - r.append(v) - if len(r) == 3: - (char1, char2, w) = r - for i in range(cast(int, char1), cast(int, char2) + 1): - widths[i] = w - r = [] - return widths - - -def get_widths2(seq: Iterable[object]) -> Dict[int, Tuple[float, Point]]: - """Build a mapping of character widths for vertical writing.""" - widths: Dict[int, Tuple[float, Point]] = {} - r: List[float] = [] - for v in seq: - if isinstance(v, list): - if r: - char1 = r[-1] - for i, (w, vx, vy) in enumerate(choplist(3, v)): - widths[cast(int, char1) + i] = (w, (vx, vy)) - r = [] - elif isinstance(v, (int, float)): # == utils.isnumber(v) - r.append(v) - if len(r) == 5: - (char1, char2, w, vx, vy) = r - for i in range(cast(int, char1), cast(int, char2) + 1): - widths[i] = (w, (vx, vy)) - r = [] - return widths - - -class FontMetricsDB: - @classmethod - def get_metrics(cls, fontname: str) -> Tuple[Dict[str, object], Dict[str, int]]: - return FONT_METRICS[fontname] - - -# int here means that we're not extending PSStackParser with additional types. -class Type1FontHeaderParser(PSStackParser[int]): - KEYWORD_BEGIN = KWD(b"begin") - KEYWORD_END = KWD(b"end") - KEYWORD_DEF = KWD(b"def") - KEYWORD_PUT = KWD(b"put") - KEYWORD_DICT = KWD(b"dict") - KEYWORD_ARRAY = KWD(b"array") - KEYWORD_READONLY = KWD(b"readonly") - KEYWORD_FOR = KWD(b"for") - - def __init__(self, data: BinaryIO) -> None: - PSStackParser.__init__(self, data) - self._cid2unicode: Dict[int, str] = {} - - def get_encoding(self) -> Dict[int, str]: - """Parse the font encoding. - - The Type1 font encoding maps character codes to character names. These - character names could either be standard Adobe glyph names, or - character names associated with custom CharStrings for this font. A - CharString is a sequence of operations that describe how the character - should be drawn. Currently, this function returns '' (empty string) - for character names that are associated with a CharStrings. - - Reference: Adobe Systems Incorporated, Adobe Type 1 Font Format - - :returns mapping of character identifiers (cid's) to unicode characters - """ - while 1: - try: - _, (cid, name) = self.nextobject() - except PSEOF: - break - try: - self._cid2unicode[cid] = name2unicode(cast(str, name)) - except KeyError: - # log.debug(str(e)) - pass - return self._cid2unicode - - def do_keyword(self, pos: int, token: PSKeyword) -> None: - if token is self.KEYWORD_PUT: - ((_, key), (_, value)) = self.pop(2) - if isinstance(key, int) and isinstance(value, PSLiteral): - self.add_results((key, literal_name(value))) - - -NIBBLES = ("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ".", "e", "e-", None, "-") - -# Mapping of cmap names. Original cmap name is kept if not in the mapping. -# (missing reference for why DLIdent is mapped to Identity) -IDENTITY_ENCODER = { - "DLIdent-H": "Identity-H", - "DLIdent-V": "Identity-V", -} - - -def getdict(data: bytes) -> Dict[int, List[Union[float, int]]]: - d: Dict[int, List[Union[float, int]]] = {} - fp = BytesIO(data) - stack: List[Union[float, int]] = [] - while 1: - c = fp.read(1) - if not c: - break - b0 = ord(c) - if b0 <= 21: - d[b0] = stack - stack = [] - continue - if b0 == 30: - s = "" - loop = True - while loop: - b = ord(fp.read(1)) - for n in (b >> 4, b & 15): - if n == 15: - loop = False - else: - nibble = NIBBLES[n] - assert nibble is not None - s += nibble - value = float(s) - elif b0 >= 32 and b0 <= 246: - value = b0 - 139 - else: - b1 = ord(fp.read(1)) - if b0 >= 247 and b0 <= 250: - value = ((b0 - 247) << 8) + b1 + 108 - elif b0 >= 251 and b0 <= 254: - value = -((b0 - 251) << 8) - b1 - 108 - else: - b2 = ord(fp.read(1)) - if b1 >= 128: - b1 -= 256 - if b0 == 28: - value = b1 << 8 | b2 - else: - value = b1 << 24 | b2 << 16 | struct.unpack(">H", fp.read(2))[0] - stack.append(value) - return d - - -class CFFFont: - STANDARD_STRINGS = ( - ".notdef", - "space", - "exclam", - "quotedbl", - "numbersign", - "dollar", - "percent", - "ampersand", - "quoteright", - "parenleft", - "parenright", - "asterisk", - "plus", - "comma", - "hyphen", - "period", - "slash", - "zero", - "one", - "two", - "three", - "four", - "five", - "six", - "seven", - "eight", - "nine", - "colon", - "semicolon", - "less", - "equal", - "greater", - "question", - "at", - "A", - "B", - "C", - "D", - "E", - "F", - "G", - "H", - "I", - "J", - "K", - "L", - "M", - "N", - "O", - "P", - "Q", - "R", - "S", - "T", - "U", - "V", - "W", - "X", - "Y", - "Z", - "bracketleft", - "backslash", - "bracketright", - "asciicircum", - "underscore", - "quoteleft", - "a", - "b", - "c", - "d", - "e", - "f", - "g", - "h", - "i", - "j", - "k", - "l", - "m", - "n", - "o", - "p", - "q", - "r", - "s", - "t", - "u", - "v", - "w", - "x", - "y", - "z", - "braceleft", - "bar", - "braceright", - "asciitilde", - "exclamdown", - "cent", - "sterling", - "fraction", - "yen", - "florin", - "section", - "currency", - "quotesingle", - "quotedblleft", - "guillemotleft", - "guilsinglleft", - "guilsinglright", - "fi", - "fl", - "endash", - "dagger", - "daggerdbl", - "periodcentered", - "paragraph", - "bullet", - "quotesinglbase", - "quotedblbase", - "quotedblright", - "guillemotright", - "ellipsis", - "perthousand", - "questiondown", - "grave", - "acute", - "circumflex", - "tilde", - "macron", - "breve", - "dotaccent", - "dieresis", - "ring", - "cedilla", - "hungarumlaut", - "ogonek", - "caron", - "emdash", - "AE", - "ordfeminine", - "Lslash", - "Oslash", - "OE", - "ordmasculine", - "ae", - "dotlessi", - "lslash", - "oslash", - "oe", - "germandbls", - "onesuperior", - "logicalnot", - "mu", - "trademark", - "Eth", - "onehalf", - "plusminus", - "Thorn", - "onequarter", - "divide", - "brokenbar", - "degree", - "thorn", - "threequarters", - "twosuperior", - "registered", - "minus", - "eth", - "multiply", - "threesuperior", - "copyright", - "Aacute", - "Acircumflex", - "Adieresis", - "Agrave", - "Aring", - "Atilde", - "Ccedilla", - "Eacute", - "Ecircumflex", - "Edieresis", - "Egrave", - "Iacute", - "Icircumflex", - "Idieresis", - "Igrave", - "Ntilde", - "Oacute", - "Ocircumflex", - "Odieresis", - "Ograve", - "Otilde", - "Scaron", - "Uacute", - "Ucircumflex", - "Udieresis", - "Ugrave", - "Yacute", - "Ydieresis", - "Zcaron", - "aacute", - "acircumflex", - "adieresis", - "agrave", - "aring", - "atilde", - "ccedilla", - "eacute", - "ecircumflex", - "edieresis", - "egrave", - "iacute", - "icircumflex", - "idieresis", - "igrave", - "ntilde", - "oacute", - "ocircumflex", - "odieresis", - "ograve", - "otilde", - "scaron", - "uacute", - "ucircumflex", - "udieresis", - "ugrave", - "yacute", - "ydieresis", - "zcaron", - "exclamsmall", - "Hungarumlautsmall", - "dollaroldstyle", - "dollarsuperior", - "ampersandsmall", - "Acutesmall", - "parenleftsuperior", - "parenrightsuperior", - "twodotenleader", - "onedotenleader", - "zerooldstyle", - "oneoldstyle", - "twooldstyle", - "threeoldstyle", - "fouroldstyle", - "fiveoldstyle", - "sixoldstyle", - "sevenoldstyle", - "eightoldstyle", - "nineoldstyle", - "commasuperior", - "threequartersemdash", - "periodsuperior", - "questionsmall", - "asuperior", - "bsuperior", - "centsuperior", - "dsuperior", - "esuperior", - "isuperior", - "lsuperior", - "msuperior", - "nsuperior", - "osuperior", - "rsuperior", - "ssuperior", - "tsuperior", - "ff", - "ffi", - "ffl", - "parenleftinferior", - "parenrightinferior", - "Circumflexsmall", - "hyphensuperior", - "Gravesmall", - "Asmall", - "Bsmall", - "Csmall", - "Dsmall", - "Esmall", - "Fsmall", - "Gsmall", - "Hsmall", - "Ismall", - "Jsmall", - "Ksmall", - "Lsmall", - "Msmall", - "Nsmall", - "Osmall", - "Psmall", - "Qsmall", - "Rsmall", - "Ssmall", - "Tsmall", - "Usmall", - "Vsmall", - "Wsmall", - "Xsmall", - "Ysmall", - "Zsmall", - "colonmonetary", - "onefitted", - "rupiah", - "Tildesmall", - "exclamdownsmall", - "centoldstyle", - "Lslashsmall", - "Scaronsmall", - "Zcaronsmall", - "Dieresissmall", - "Brevesmall", - "Caronsmall", - "Dotaccentsmall", - "Macronsmall", - "figuredash", - "hypheninferior", - "Ogoneksmall", - "Ringsmall", - "Cedillasmall", - "questiondownsmall", - "oneeighth", - "threeeighths", - "fiveeighths", - "seveneighths", - "onethird", - "twothirds", - "zerosuperior", - "foursuperior", - "fivesuperior", - "sixsuperior", - "sevensuperior", - "eightsuperior", - "ninesuperior", - "zeroinferior", - "oneinferior", - "twoinferior", - "threeinferior", - "fourinferior", - "fiveinferior", - "sixinferior", - "seveninferior", - "eightinferior", - "nineinferior", - "centinferior", - "dollarinferior", - "periodinferior", - "commainferior", - "Agravesmall", - "Aacutesmall", - "Acircumflexsmall", - "Atildesmall", - "Adieresissmall", - "Aringsmall", - "AEsmall", - "Ccedillasmall", - "Egravesmall", - "Eacutesmall", - "Ecircumflexsmall", - "Edieresissmall", - "Igravesmall", - "Iacutesmall", - "Icircumflexsmall", - "Idieresissmall", - "Ethsmall", - "Ntildesmall", - "Ogravesmall", - "Oacutesmall", - "Ocircumflexsmall", - "Otildesmall", - "Odieresissmall", - "OEsmall", - "Oslashsmall", - "Ugravesmall", - "Uacutesmall", - "Ucircumflexsmall", - "Udieresissmall", - "Yacutesmall", - "Thornsmall", - "Ydieresissmall", - "001.000", - "001.001", - "001.002", - "001.003", - "Black", - "Bold", - "Book", - "Light", - "Medium", - "Regular", - "Roman", - "Semibold", - ) - - class INDEX: - def __init__(self, fp: BinaryIO) -> None: - self.fp = fp - self.offsets: List[int] = [] - (count, offsize) = struct.unpack(">HB", self.fp.read(3)) - for i in range(count + 1): - self.offsets.append(nunpack(self.fp.read(offsize))) - self.base = self.fp.tell() - 1 - self.fp.seek(self.base + self.offsets[-1]) - - def __repr__(self) -> str: - return "" % len(self) - - def __len__(self) -> int: - return len(self.offsets) - 1 - - def __getitem__(self, i: int) -> bytes: - self.fp.seek(self.base + self.offsets[i]) - return self.fp.read(self.offsets[i + 1] - self.offsets[i]) - - def __iter__(self) -> Iterator[bytes]: - return iter(self[i] for i in range(len(self))) - - def __init__(self, name: str, fp: BinaryIO) -> None: - self.name = name - self.fp = fp - # Header - (_major, _minor, hdrsize, offsize) = struct.unpack("BBBB", self.fp.read(4)) - self.fp.read(hdrsize - 4) - # Name INDEX - self.name_index = self.INDEX(self.fp) - # Top DICT INDEX - self.dict_index = self.INDEX(self.fp) - # String INDEX - self.string_index = self.INDEX(self.fp) - # Global Subr INDEX - self.subr_index = self.INDEX(self.fp) - # Top DICT DATA - self.top_dict = getdict(self.dict_index[0]) - (charset_pos,) = self.top_dict.get(15, [0]) - (encoding_pos,) = self.top_dict.get(16, [0]) - (charstring_pos,) = self.top_dict.get(17, [0]) - # CharStrings - self.fp.seek(cast(int, charstring_pos)) - self.charstring = self.INDEX(self.fp) - self.nglyphs = len(self.charstring) - # Encodings - self.code2gid = {} - self.gid2code = {} - self.fp.seek(cast(int, encoding_pos)) - format = self.fp.read(1) - if format == b"\x00": - # Format 0 - (n,) = struct.unpack("B", self.fp.read(1)) - for code, gid in enumerate(struct.unpack("B" * n, self.fp.read(n))): - self.code2gid[code] = gid - self.gid2code[gid] = code - elif format == b"\x01": - # Format 1 - (n,) = struct.unpack("B", self.fp.read(1)) - code = 0 - for i in range(n): - (first, nleft) = struct.unpack("BB", self.fp.read(2)) - for gid in range(first, first + nleft + 1): - self.code2gid[code] = gid - self.gid2code[gid] = code - code += 1 - else: - raise PDFValueError("unsupported encoding format: %r" % format) - # Charsets - self.name2gid = {} - self.gid2name = {} - self.fp.seek(cast(int, charset_pos)) - format = self.fp.read(1) - if format == b"\x00": - # Format 0 - n = self.nglyphs - 1 - for gid, sid in enumerate( - cast( - Tuple[int, ...], struct.unpack(">" + "H" * n, self.fp.read(2 * n)) - ), - ): - gid += 1 - sidname = self.getstr(sid) - self.name2gid[sidname] = gid - self.gid2name[gid] = sidname - elif format == b"\x01": - # Format 1 - (n,) = struct.unpack("B", self.fp.read(1)) - sid = 0 - for i in range(n): - (first, nleft) = struct.unpack("BB", self.fp.read(2)) - for gid in range(first, first + nleft + 1): - sidname = self.getstr(sid) - self.name2gid[sidname] = gid - self.gid2name[gid] = sidname - sid += 1 - elif format == b"\x02": - # Format 2 - assert False, str(("Unhandled", format)) - else: - raise PDFValueError("unsupported charset format: %r" % format) - - def getstr(self, sid: int) -> Union[str, bytes]: - # This returns str for one of the STANDARD_STRINGS but bytes otherwise, - # and appears to be a needless source of type complexity. - if sid < len(self.STANDARD_STRINGS): - return self.STANDARD_STRINGS[sid] - return self.string_index[sid - len(self.STANDARD_STRINGS)] - - -class TrueTypeFont: - class CMapNotFound(PDFException): - pass - - def __init__(self, name: str, fp: BinaryIO) -> None: - self.name = name - self.fp = fp - self.tables: Dict[bytes, Tuple[int, int]] = {} - self.fonttype = fp.read(4) - try: - (ntables, _1, _2, _3) = cast( - Tuple[int, int, int, int], - struct.unpack(">HHHH", fp.read(8)), - ) - for _ in range(ntables): - (name_bytes, tsum, offset, length) = cast( - Tuple[bytes, int, int, int], - struct.unpack(">4sLLL", fp.read(16)), - ) - self.tables[name_bytes] = (offset, length) - except struct.error: - # Do not fail if there are not enough bytes to read. Even for - # corrupted PDFs we would like to get as much information as - # possible, so continue. - pass - - def create_unicode_map(self) -> FileUnicodeMap: - if b"cmap" not in self.tables: - raise TrueTypeFont.CMapNotFound - (base_offset, length) = self.tables[b"cmap"] - fp = self.fp - fp.seek(base_offset) - (version, nsubtables) = cast(Tuple[int, int], struct.unpack(">HH", fp.read(4))) - subtables: List[Tuple[int, int, int]] = [] - for i in range(nsubtables): - subtables.append( - cast(Tuple[int, int, int], struct.unpack(">HHL", fp.read(8))), - ) - char2gid: Dict[int, int] = {} - # Only supports subtable type 0, 2 and 4. - for platform_id, encoding_id, st_offset in subtables: - # Skip non-Unicode cmaps. - # https://docs.microsoft.com/en-us/typography/opentype/spec/cmap - if not (platform_id == 0 or (platform_id == 3 and encoding_id in [1, 10])): - continue - fp.seek(base_offset + st_offset) - (fmttype, fmtlen, fmtlang) = cast( - Tuple[int, int, int], - struct.unpack(">HHH", fp.read(6)), - ) - if fmttype == 0: - char2gid.update( - enumerate( - cast(Tuple[int, ...], struct.unpack(">256B", fp.read(256))), - ), - ) - elif fmttype == 2: - subheaderkeys = cast( - Tuple[int, ...], - struct.unpack(">256H", fp.read(512)), - ) - firstbytes = [0] * 8192 - for i, k in enumerate(subheaderkeys): - firstbytes[k // 8] = i - nhdrs = max(subheaderkeys) // 8 + 1 - hdrs: List[Tuple[int, int, int, int, int]] = [] - for i in range(nhdrs): - (firstcode, entcount, delta, offset) = cast( - Tuple[int, int, int, int], - struct.unpack(">HHhH", fp.read(8)), - ) - hdrs.append((i, firstcode, entcount, delta, fp.tell() - 2 + offset)) - for i, firstcode, entcount, delta, pos in hdrs: - if not entcount: - continue - first = firstcode + (firstbytes[i] << 8) - fp.seek(pos) - for c in range(entcount): - gid = cast(Tuple[int], struct.unpack(">H", fp.read(2)))[0] - if gid: - gid += delta - char2gid[first + c] = gid - elif fmttype == 4: - (segcount, _1, _2, _3) = cast( - Tuple[int, int, int, int], - struct.unpack(">HHHH", fp.read(8)), - ) - segcount //= 2 - ecs = cast( - Tuple[int, ...], - struct.unpack(">%dH" % segcount, fp.read(2 * segcount)), - ) - fp.read(2) - scs = cast( - Tuple[int, ...], - struct.unpack(">%dH" % segcount, fp.read(2 * segcount)), - ) - idds = cast( - Tuple[int, ...], - struct.unpack(">%dh" % segcount, fp.read(2 * segcount)), - ) - pos = fp.tell() - idrs = cast( - Tuple[int, ...], - struct.unpack(">%dH" % segcount, fp.read(2 * segcount)), - ) - for ec, sc, idd, idr in zip(ecs, scs, idds, idrs): - if idr: - fp.seek(pos + idr) - for c in range(sc, ec + 1): - b = cast(Tuple[int], struct.unpack(">H", fp.read(2)))[0] - char2gid[c] = (b + idd) & 0xFFFF - else: - for c in range(sc, ec + 1): - char2gid[c] = (c + idd) & 0xFFFF - else: - assert False, str(("Unhandled", fmttype)) - if not char2gid: - raise TrueTypeFont.CMapNotFound - # create unicode map - unicode_map = FileUnicodeMap() - for char, gid in char2gid.items(): - unicode_map.add_cid2unichr(gid, char) - return unicode_map - - -class PDFFontError(PDFException): - pass - - -class PDFUnicodeNotDefined(PDFFontError): - pass - - -LITERAL_STANDARD_ENCODING = LIT("StandardEncoding") -LITERAL_TYPE1C = LIT("Type1C") - -# Font widths are maintained in a dict type that maps from *either* unicode -# chars or integer character IDs. -FontWidthDict = Union[Dict[int, float], Dict[str, float]] - - -class PDFFont: - def __init__( - self, - descriptor: Mapping[str, Any], - widths: FontWidthDict, - default_width: Optional[float] = None, - ) -> None: - self.descriptor = descriptor - self.widths: FontWidthDict = resolve_all(widths) - self.fontname = resolve1(descriptor.get("FontName", "unknown")) - if isinstance(self.fontname, PSLiteral): - self.fontname = literal_name(self.fontname) - self.flags = int_value(descriptor.get("Flags", 0)) - self.ascent = num_value(descriptor.get("Ascent", 0)) - self.descent = num_value(descriptor.get("Descent", 0)) - self.italic_angle = num_value(descriptor.get("ItalicAngle", 0)) - if default_width is None: - self.default_width = num_value(descriptor.get("MissingWidth", 0)) - else: - self.default_width = default_width - self.default_width = resolve1(self.default_width) - self.leading = num_value(descriptor.get("Leading", 0)) - self.bbox = cast( - Rect, - list_value(resolve_all(descriptor.get("FontBBox", (0, 0, 0, 0)))), - ) - self.hscale = self.vscale = 0.001 - - # PDF RM 9.8.1 specifies /Descent should always be a negative number. - # PScript5.dll seems to produce Descent with a positive number, but - # text analysis will be wrong if this is taken as correct. So force - # descent to negative. - if self.descent > 0: - self.descent = -self.descent - - def __repr__(self) -> str: - return "" - - def is_vertical(self) -> bool: - return False - - def is_multibyte(self) -> bool: - return False - - def decode(self, bytes: bytes) -> Iterable[int]: - return bytearray(bytes) # map(ord, bytes) - - def get_ascent(self) -> float: - """Ascent above the baseline, in text space units""" - return self.ascent * self.vscale - - def get_descent(self) -> float: - """Descent below the baseline, in text space units; always negative""" - return self.descent * self.vscale - - def get_width(self) -> float: - w = self.bbox[2] - self.bbox[0] - if w == 0: - w = -self.default_width - return w * self.hscale - - def get_height(self) -> float: - h = self.bbox[3] - self.bbox[1] - if h == 0: - h = self.ascent - self.descent - return h * self.vscale - - def char_width(self, cid: int) -> float: - # Because character widths may be mapping either IDs or strings, - # we try to lookup the character ID first, then its str equivalent. - try: - return cast(Dict[int, float], self.widths)[cid] * self.hscale - except KeyError: - str_widths = cast(Dict[str, float], self.widths) - try: - return str_widths[self.to_unichr(cid)] * self.hscale - except (KeyError, PDFUnicodeNotDefined): - return self.default_width * self.hscale - - def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]: - """Returns an integer for horizontal fonts, a tuple for vertical fonts.""" - return 0 - - def string_width(self, s: bytes) -> float: - return sum(self.char_width(cid) for cid in self.decode(s)) - - def to_unichr(self, cid: int) -> str: - raise NotImplementedError - - -class PDFSimpleFont(PDFFont): - def __init__( - self, - descriptor: Mapping[str, Any], - widths: FontWidthDict, - spec: Mapping[str, Any], - ) -> None: - # Font encoding is specified either by a name of - # built-in encoding or a dictionary that describes - # the differences. - if "Encoding" in spec: - encoding = resolve1(spec["Encoding"]) - else: - encoding = LITERAL_STANDARD_ENCODING - if isinstance(encoding, dict): - name = literal_name(encoding.get("BaseEncoding", LITERAL_STANDARD_ENCODING)) - diff = list_value(encoding.get("Differences", [])) - self.cid2unicode = EncodingDB.get_encoding(name, diff) - else: - self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding)) - self.unicode_map: Optional[UnicodeMap] = None - if "ToUnicode" in spec: - strm = stream_value(spec["ToUnicode"]) - self.unicode_map = FileUnicodeMap() - CMapParser(self.unicode_map, BytesIO(strm.get_data())).run() - PDFFont.__init__(self, descriptor, widths) - - def to_unichr(self, cid: int) -> str: - if self.unicode_map: - try: - return self.unicode_map.get_unichr(cid) - except KeyError: - pass - try: - return self.cid2unicode[cid] - except KeyError: - raise PDFUnicodeNotDefined(None, cid) - - -class PDFType1Font(PDFSimpleFont): - def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None: - try: - self.basefont = literal_name(spec["BaseFont"]) - except KeyError: - if settings.STRICT: - raise PDFFontError("BaseFont is missing") - self.basefont = "unknown" - - widths: FontWidthDict - try: - (descriptor, int_widths) = FontMetricsDB.get_metrics(self.basefont) - widths = cast(Dict[str, float], int_widths) # implicit int->float - except KeyError: - descriptor = dict_value(spec.get("FontDescriptor", {})) - firstchar = int_value(spec.get("FirstChar", 0)) - # lastchar = int_value(spec.get('LastChar', 255)) - width_list = list_value(spec.get("Widths", [0] * 256)) - widths = {i + firstchar: resolve1(w) for (i, w) in enumerate(width_list)} - PDFSimpleFont.__init__(self, descriptor, widths, spec) - if "Encoding" not in spec and "FontFile" in descriptor: - # try to recover the missing encoding info from the font file. - self.fontfile = stream_value(descriptor.get("FontFile")) - length1 = int_value(self.fontfile["Length1"]) - data = self.fontfile.get_data()[:length1] - parser = Type1FontHeaderParser(BytesIO(data)) - self.cid2unicode = parser.get_encoding() - - def __repr__(self) -> str: - return "" % self.basefont - - -class PDFTrueTypeFont(PDFType1Font): - def __repr__(self) -> str: - return "" % self.basefont - - -class PDFType3Font(PDFSimpleFont): - def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None: - firstchar = int_value(spec.get("FirstChar", 0)) - # lastchar = int_value(spec.get('LastChar', 0)) - width_list = list_value(spec.get("Widths", [0] * 256)) - widths = {i + firstchar: w for (i, w) in enumerate(width_list)} - if "FontDescriptor" in spec: - descriptor = dict_value(spec["FontDescriptor"]) - else: - descriptor = {"Ascent": 0, "Descent": 0, "FontBBox": spec["FontBBox"]} - PDFSimpleFont.__init__(self, descriptor, widths, spec) - self.matrix = cast(Matrix, tuple(list_value(spec.get("FontMatrix")))) - (_, self.descent, _, self.ascent) = self.bbox - (self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1)) - - def __repr__(self) -> str: - return "" - - -class PDFCIDFont(PDFFont): - default_disp: Union[float, Tuple[Optional[float], float]] - - def __init__( - self, - rsrcmgr: "PDFResourceManager", - spec: Mapping[str, Any], - strict: bool = settings.STRICT, - ) -> None: - try: - self.basefont = literal_name(spec["BaseFont"]) - except KeyError: - if strict: - raise PDFFontError("BaseFont is missing") - self.basefont = "unknown" - self.cidsysteminfo = dict_value(spec.get("CIDSystemInfo", {})) - cid_registry = resolve1(self.cidsysteminfo.get("Registry", b"unknown")).decode( - "latin1", - ) - cid_ordering = resolve1(self.cidsysteminfo.get("Ordering", b"unknown")).decode( - "latin1", - ) - self.cidcoding = f"{cid_registry.strip()}-{cid_ordering.strip()}" - self.cmap: CMapBase = self.get_cmap_from_spec(spec, strict) - - try: - descriptor = dict_value(spec["FontDescriptor"]) - except KeyError: - if strict: - raise PDFFontError("FontDescriptor is missing") - descriptor = {} - ttf = None - if "FontFile2" in descriptor: - self.fontfile = stream_value(descriptor.get("FontFile2")) - ttf = TrueTypeFont(self.basefont, BytesIO(self.fontfile.get_data())) - self.unicode_map: Optional[UnicodeMap] = None - if "ToUnicode" in spec: - if isinstance(spec["ToUnicode"], PDFStream): - strm = stream_value(spec["ToUnicode"]) - self.unicode_map = FileUnicodeMap() - CMapParser(self.unicode_map, BytesIO(strm.get_data())).run() - else: - cmap_name = literal_name(spec["ToUnicode"]) - encoding = literal_name(spec["Encoding"]) - if ( - "Identity" in cid_ordering - or "Identity" in cmap_name - or "Identity" in encoding - ): - self.unicode_map = IdentityUnicodeMap() - elif self.cidcoding in ("Adobe-Identity", "Adobe-UCS"): - if ttf: - try: - self.unicode_map = ttf.create_unicode_map() - except TrueTypeFont.CMapNotFound: - pass - else: - try: - self.unicode_map = CMapDB.get_unicode_map( - self.cidcoding, - self.cmap.is_vertical(), - ) - except CMapDB.CMapNotFound: - pass - - self.vertical = self.cmap.is_vertical() - if self.vertical: - # writing mode: vertical - widths2 = get_widths2(list_value(spec.get("W2", []))) - self.disps = {cid: (vx, vy) for (cid, (_, (vx, vy))) in widths2.items()} - (vy, w) = resolve1(spec.get("DW2", [880, -1000])) - self.default_disp = (None, vy) - widths = {cid: w for (cid, (w, _)) in widths2.items()} - default_width = w - else: - # writing mode: horizontal - self.disps = {} - self.default_disp = 0 - widths = get_widths(list_value(spec.get("W", []))) - default_width = spec.get("DW", 1000) - PDFFont.__init__(self, descriptor, widths, default_width=default_width) - - def get_cmap_from_spec(self, spec: Mapping[str, Any], strict: bool) -> CMapBase: - """Get cmap from font specification - - For certain PDFs, Encoding Type isn't mentioned as an attribute of - Encoding but as an attribute of CMapName, where CMapName is an - attribute of spec['Encoding']. - The horizontal/vertical modes are mentioned with different name - such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'. - """ - cmap_name = self._get_cmap_name(spec, strict) - - try: - return CMapDB.get_cmap(cmap_name) - except CMapDB.CMapNotFound as e: - if strict: - raise PDFFontError(e) - return CMap() - - @staticmethod - def _get_cmap_name(spec: Mapping[str, Any], strict: bool) -> str: - """Get cmap name from font specification""" - cmap_name = "unknown" # default value - - try: - spec_encoding = spec["Encoding"] - if hasattr(spec_encoding, "name"): - cmap_name = literal_name(spec["Encoding"]) - else: - cmap_name = literal_name(spec_encoding["CMapName"]) - except KeyError: - if strict: - raise PDFFontError("Encoding is unspecified") - - if type(cmap_name) is PDFStream: # type: ignore[comparison-overlap] - cmap_name_stream: PDFStream = cast(PDFStream, cmap_name) - if "CMapName" in cmap_name_stream: - cmap_name = cmap_name_stream.get("CMapName").name - elif strict: - raise PDFFontError("CMapName unspecified for encoding") - - return IDENTITY_ENCODER.get(cmap_name, cmap_name) - - def __repr__(self) -> str: - return f"" - - def is_vertical(self) -> bool: - return self.vertical - - def is_multibyte(self) -> bool: - return True - - def decode(self, bytes: bytes) -> Iterable[int]: - return self.cmap.decode(bytes) - - def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]: - """Returns an integer for horizontal fonts, a tuple for vertical fonts.""" - return self.disps.get(cid, self.default_disp) - - def to_unichr(self, cid: int) -> str: - try: - if not self.unicode_map: - raise PDFKeyError(cid) - return self.unicode_map.get_unichr(cid) - except KeyError: - raise PDFUnicodeNotDefined(self.cidcoding, cid) diff --git a/pdf2zh/pdfinterp.py b/pdf2zh/pdfinterp.py index b9d2338..9ea16b6 100644 --- a/pdf2zh/pdfinterp.py +++ b/pdf2zh/pdfinterp.py @@ -1,51 +1,39 @@ import logging -import re -from io import BytesIO -from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast +from typing import Any, Dict, Optional, Sequence, Tuple, cast import numpy as np -from pdf2zh import settings -from pdf2zh.casting import safe_float -from pdf2zh.cmapdb import CMap, CMapBase, CMapDB -from pdf2zh.pdfcolor import PREDEFINED_COLORSPACE, PDFColorSpace -from pdf2zh.pdfdevice import PDFDevice, PDFTextSeq -from pdf2zh.pdfexceptions import PDFException, PDFValueError -from pdf2zh.pdffont import ( - PDFCIDFont, - PDFFont, - PDFFontError, - PDFTrueTypeFont, - PDFType1Font, - PDFType3Font, +from pdfminer import settings +from pdfminer.pdfcolor import PREDEFINED_COLORSPACE, PDFColorSpace +from pdfminer.pdfdevice import PDFDevice +from pdfminer.pdfinterp import ( + PDFPageInterpreter, + PDFResourceManager, + PDFContentParser, + PDFInterpreterError, + Color, + PDFStackT, + LITERAL_FORM, + LITERAL_IMAGE, ) -from pdf2zh.pdfpage import PDFPage -from pdf2zh.pdftypes import ( - LITERALS_ASCII85_DECODE, +from pdfminer.pdffont import PDFFont +from pdfminer.pdfpage import PDFPage +from pdfminer.pdftypes import ( PDFObjRef, - PDFStream, dict_value, list_value, resolve1, stream_value, ) -from pdf2zh.psexceptions import PSEOF, PSTypeError -from pdf2zh.psparser import ( - KWD, - LIT, +from pdfminer.psexceptions import PSEOF +from pdfminer.psparser import ( PSKeyword, - PSLiteral, - PSStackParser, - PSStackType, keyword_name, literal_name, ) -from pdf2zh.utils import ( +from pdfminer.utils import ( MATRIX_IDENTITY, Matrix, - PathSegment, - Point, Rect, - choplist, mult_matrix, apply_matrix_pt, ) @@ -53,316 +41,14 @@ log = logging.getLogger(__name__) -class PDFResourceError(PDFException): - pass +def safe_float(o: Any) -> Optional[float]: + try: + return float(o) + except (TypeError, ValueError): + return None -class PDFInterpreterError(PDFException): - pass - - -LITERAL_PDF = LIT("PDF") -LITERAL_TEXT = LIT("Text") -LITERAL_FONT = LIT("Font") -LITERAL_FORM = LIT("Form") -LITERAL_IMAGE = LIT("Image") - - -class PDFTextState: - matrix: Matrix - linematrix: Point - - def __init__(self) -> None: - self.font: Optional[PDFFont] = None - self.fontsize: float = 0 - self.charspace: float = 0 - self.wordspace: float = 0 - self.scaling: float = 100 - self.leading: float = 0 - self.render: int = 0 - self.rise: float = 0 - self.reset() - # self.matrix is set - # self.linematrix is set - - def __repr__(self) -> str: - return ( - "" - % ( - self.font, - self.fontsize, - self.charspace, - self.wordspace, - self.scaling, - self.leading, - self.render, - self.rise, - self.matrix, - self.linematrix, - ) - ) - - def copy(self) -> "PDFTextState": - obj = PDFTextState() - obj.font = self.font - obj.fontsize = self.fontsize - obj.charspace = self.charspace - obj.wordspace = self.wordspace - obj.scaling = self.scaling - obj.leading = self.leading - obj.render = self.render - obj.rise = self.rise - obj.matrix = self.matrix - obj.linematrix = self.linematrix - return obj - - def reset(self) -> None: - self.matrix = MATRIX_IDENTITY - self.linematrix = (0, 0) - - -Color = Union[ - float, # Greyscale - Tuple[float, float, float], # R, G, B - Tuple[float, float, float, float], # C, M, Y, K -] - - -class PDFGraphicState: - def __init__(self) -> None: - self.linewidth: float = 0 - self.linecap: Optional[object] = None - self.linejoin: Optional[object] = None - self.miterlimit: Optional[object] = None - self.dash: Optional[Tuple[object, object]] = None - self.intent: Optional[object] = None - self.flatness: Optional[object] = None - - # stroking color - self.scolor: Optional[Color] = None - - # non stroking color - self.ncolor: Optional[Color] = None - - def copy(self) -> "PDFGraphicState": - obj = PDFGraphicState() - obj.linewidth = self.linewidth - obj.linecap = self.linecap - obj.linejoin = self.linejoin - obj.miterlimit = self.miterlimit - obj.dash = self.dash - obj.intent = self.intent - obj.flatness = self.flatness - obj.scolor = self.scolor - obj.ncolor = self.ncolor - return obj - - def __repr__(self) -> str: - return ( - "" - % ( - self.linewidth, - self.linecap, - self.linejoin, - self.miterlimit, - self.dash, - self.intent, - self.flatness, - self.scolor, - self.ncolor, - ) - ) - - -class PDFResourceManager: - """Repository of shared resources. - - ResourceManager facilitates reuse of shared resources - such as fonts and images so that large objects are not - allocated multiple times. - """ - - def __init__(self, caching: bool = True) -> None: - self.caching = caching - self._cached_fonts: Dict[object, PDFFont] = {} - - def get_procset(self, procs: Sequence[object]) -> None: - for proc in procs: - if proc is LITERAL_PDF or proc is LITERAL_TEXT: - pass - else: - pass - - def get_cmap(self, cmapname: str, strict: bool = False) -> CMapBase: - try: - return CMapDB.get_cmap(cmapname) - except CMapDB.CMapNotFound: - if strict: - raise - return CMap() - - def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont: - if objid and objid in self._cached_fonts: - font = self._cached_fonts[objid] - else: - # log.debug("get_font: create: objid=%r, spec=%r", objid, spec) - if settings.STRICT: - if spec["Type"] is not LITERAL_FONT: - raise PDFFontError("Type is not /Font") - # Create a Font object. - if "Subtype" in spec: - subtype = literal_name(spec["Subtype"]) - else: - if settings.STRICT: - raise PDFFontError("Font Subtype is not specified.") - subtype = "Type1" - if subtype in ("Type1", "MMType1"): - # Type1 Font - font = PDFType1Font(self, spec) - elif subtype == "TrueType": - # TrueType Font - font = PDFTrueTypeFont(self, spec) - elif subtype == "Type3": - # Type3 Font - font = PDFType3Font(self, spec) - elif subtype in ("CIDFontType0", "CIDFontType2"): - # CID Font - font = PDFCIDFont(self, spec) - elif subtype == "Type0": - # Type0 Font - dfonts = list_value(spec["DescendantFonts"]) - assert dfonts - subspec = dict_value(dfonts[0]).copy() - for k in ("Encoding", "ToUnicode"): - if k in spec: - subspec[k] = resolve1(spec[k]) - font = self.get_font(None, subspec) - else: - if settings.STRICT: - raise PDFFontError("Invalid Font spec: %r" % spec) - font = PDFType1Font(self, spec) # this is so wrong! - if objid and self.caching: - self._cached_fonts[objid] = font - return font - - -class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]): - def __init__(self, streams: Sequence[object]) -> None: - self.streams = streams - self.istream = 0 - # PSStackParser.__init__(fp=None) is safe only because we've overloaded - # all the methods that would attempt to access self.fp without first - # calling self.fillfp(). - PSStackParser.__init__(self, None) # type: ignore[arg-type] - - def fillfp(self) -> None: - if not self.fp: - if self.istream < len(self.streams): - strm = stream_value(self.streams[self.istream]) - self.istream += 1 - else: - raise PSEOF("Unexpected EOF, file truncated?") - self.fp = BytesIO(strm.get_data()) - # if log.isEnabledFor(logging.DEBUG): - # log.debug(f'STREAM DATA {strm.get_data()}') - - def seek(self, pos: int) -> None: - self.fillfp() - PSStackParser.seek(self, pos) - - def fillbuf(self) -> None: - if self.charpos < len(self.buf): - return - while 1: - self.fillfp() - self.bufpos = self.fp.tell() - self.buf = self.fp.read(self.BUFSIZ) - if self.buf: - break - self.fp = None # type: ignore[assignment] - self.charpos = 0 - - def get_inline_data(self, pos: int, target: bytes = b"EI") -> Tuple[int, bytes]: - self.seek(pos) - i = 0 - data = b"" - while i <= len(target): - self.fillbuf() - if i: - ci = self.buf[self.charpos] - c = bytes((ci,)) - data += c - self.charpos += 1 - if ( - len(target) <= i - and c.isspace() - or i < len(target) - and c == (bytes((target[i],))) - ): - i += 1 - else: - i = 0 - else: - try: - j = self.buf.index(target[0], self.charpos) - data += self.buf[self.charpos : j + 1] - self.charpos = j + 1 - i = 1 - except ValueError: - data += self.buf[self.charpos :] - self.charpos = len(self.buf) - data = data[: -(len(target) + 1)] # strip the last part - data = re.sub(rb"(\x0d\x0a|[\x0d\x0a])$", b"", data) - return (pos, data) - - def flush(self) -> None: - self.add_results(*self.popall()) - - KEYWORD_BI = KWD(b"BI") - KEYWORD_ID = KWD(b"ID") - KEYWORD_EI = KWD(b"EI") - - def do_keyword(self, pos: int, token: PSKeyword) -> None: - if token is self.KEYWORD_BI: - # inline image within a content stream - self.start_type(pos, "inline") - elif token is self.KEYWORD_ID: - try: - (_, objs) = self.end_type("inline") - if len(objs) % 2 != 0: - error_msg = f"Invalid dictionary construct: {objs!r}" - raise PSTypeError(error_msg) - d = {literal_name(k): resolve1(v) for (k, v) in choplist(2, objs)} - eos = b"EI" - filter = d.get("F", None) - if filter is not None: - if isinstance(filter, PSLiteral): - filter = [filter] - if filter[0] in LITERALS_ASCII85_DECODE: - eos = b"~>" - (pos, data) = self.get_inline_data(pos + len(b"ID "), target=eos) - if eos != b"EI": # it may be necessary for decoding - data += eos - obj = PDFStream(d, data) - self.push((pos, obj)) - if eos == b"EI": # otherwise it is still in the stream - self.push((pos, self.KEYWORD_EI)) - except PSTypeError: - if settings.STRICT: - raise - else: - self.push((pos, token)) - - -PDFStackT = PSStackType[PDFStream] -"""Types that may appear on the PDF argument stack.""" - - -class PDFPageInterpreter: +class PDFPageInterpreterEx(PDFPageInterpreter): """Processor for the content of a PDF page Reference: PDF Reference, Appendix A, Operator Summary @@ -375,7 +61,7 @@ def __init__( self.device = device self.obj_patch = obj_patch - def dup(self) -> "PDFPageInterpreter": + def dup(self) -> "PDFPageInterpreterEx": return self.__class__(self.rsrcmgr, self.device, self.obj_patch) def init_resources(self, resources: Dict[object, object]) -> None: @@ -409,6 +95,7 @@ def get_colorspace(spec: object) -> Optional[PDFColorSpace]: objid = spec.objid spec = dict_value(spec) self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec) + self.fontmap[fontid].descent = 0 # hack fix descent self.fontid[self.fontmap[fontid]] = fontid elif k == "ColorSpace": for csid, spec in dict_value(v).items(): @@ -421,155 +108,6 @@ def get_colorspace(spec: object) -> Optional[PDFColorSpace]: for xobjid, xobjstrm in dict_value(v).items(): self.xobjmap[xobjid] = xobjstrm - def init_state(self, ctm: Matrix) -> None: - """Initialize the text and graphic states for rendering a page.""" - # gstack: stack for graphical states. - self.gstack: List[Tuple[Matrix, PDFTextState, PDFGraphicState]] = [] - self.ctm = ctm - self.device.set_ctm(self.ctm) - self.textstate = PDFTextState() - self.graphicstate = PDFGraphicState() - self.curpath: List[PathSegment] = [] - # argstack: stack for command arguments. - self.argstack: List[PDFStackT] = [] - # set some global states. - self.scs: Optional[PDFColorSpace] = None - self.ncs: Optional[PDFColorSpace] = None - if self.csmap: - self.scs = self.ncs = next(iter(self.csmap.values())) - - def push(self, obj: PDFStackT) -> None: - self.argstack.append(obj) - - def pop(self, n: int) -> List[PDFStackT]: - if n == 0: - return [] - x = self.argstack[-n:] - self.argstack = self.argstack[:-n] - return x - - def get_current_state(self) -> Tuple[Matrix, PDFTextState, PDFGraphicState]: - return (self.ctm, self.textstate.copy(), self.graphicstate.copy()) - - def set_current_state( - self, - state: Tuple[Matrix, PDFTextState, PDFGraphicState], - ) -> None: - (self.ctm, self.textstate, self.graphicstate) = state - self.device.set_ctm(self.ctm) - - def do_q(self) -> None: - """Save graphics state""" - self.gstack.append(self.get_current_state()) - - def do_Q(self) -> None: - """Restore graphics state""" - if self.gstack: - self.set_current_state(self.gstack.pop()) - - def do_cm( - self, - a1: PDFStackT, - b1: PDFStackT, - c1: PDFStackT, - d1: PDFStackT, - e1: PDFStackT, - f1: PDFStackT, - ) -> None: - """Concatenate matrix to current transformation matrix""" - self.ctm = mult_matrix(cast(Matrix, (a1, b1, c1, d1, e1, f1)), self.ctm) - self.device.set_ctm(self.ctm) - - def do_w(self, linewidth: PDFStackT) -> None: - """Set line width""" - self.graphicstate.linewidth = cast(float, linewidth) - - def do_J(self, linecap: PDFStackT) -> None: - """Set line cap style""" - self.graphicstate.linecap = linecap - - def do_j(self, linejoin: PDFStackT) -> None: - """Set line join style""" - self.graphicstate.linejoin = linejoin - - def do_M(self, miterlimit: PDFStackT) -> None: - """Set miter limit""" - self.graphicstate.miterlimit = miterlimit - - def do_d(self, dash: PDFStackT, phase: PDFStackT) -> None: - """Set line dash pattern""" - self.graphicstate.dash = (dash, phase) - - def do_ri(self, intent: PDFStackT) -> None: - """Set color rendering intent""" - self.graphicstate.intent = intent - - def do_i(self, flatness: PDFStackT) -> None: - """Set flatness tolerance""" - self.graphicstate.flatness = flatness - - def do_gs(self, name: PDFStackT) -> None: - """Set parameters from graphics state parameter dictionary""" - # TODO - - def do_m(self, x: PDFStackT, y: PDFStackT) -> None: - """Begin new subpath""" - self.curpath.append(("m", cast(float, x), cast(float, y))) - - def do_l(self, x: PDFStackT, y: PDFStackT) -> None: - """Append straight line segment to path""" - self.curpath.append(("l", cast(float, x), cast(float, y))) - - def do_c( - self, - x1: PDFStackT, - y1: PDFStackT, - x2: PDFStackT, - y2: PDFStackT, - x3: PDFStackT, - y3: PDFStackT, - ) -> None: - """Append curved segment to path (three control points)""" - self.curpath.append( - ( - "c", - cast(float, x1), - cast(float, y1), - cast(float, x2), - cast(float, y2), - cast(float, x3), - cast(float, y3), - ), - ) - - def do_v(self, x2: PDFStackT, y2: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None: - """Append curved segment to path (initial point replicated)""" - self.curpath.append( - ("v", cast(float, x2), cast(float, y2), cast(float, x3), cast(float, y3)), - ) - - def do_y(self, x1: PDFStackT, y1: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None: - """Append curved segment to path (final point replicated)""" - self.curpath.append( - ("y", cast(float, x1), cast(float, y1), cast(float, x3), cast(float, y3)), - ) - - def do_h(self) -> None: - """Close subpath""" - self.curpath.append(("h",)) - - def do_re(self, x: PDFStackT, y: PDFStackT, w: PDFStackT, h: PDFStackT) -> None: - """Append rectangle to path""" - x = cast(float, x) - y = cast(float, y) - w = cast(float, w) - h = cast(float, h) - self.curpath.append(("m", x, y)) - self.curpath.append(("l", x + w, y)) - self.curpath.append(("l", x + w, y + h)) - self.curpath.append(("l", x, y + h)) - self.curpath.append(("h",)) - def do_S(self) -> None: """Stroke path""" @@ -594,11 +132,6 @@ def is_black(color: Color) -> bool: else: self.curpath = [] - def do_s(self) -> None: - """Close and stroke path""" - self.do_h() - self.do_S() - def do_f(self) -> None: """Fill path using nonzero winding number rule""" # self.device.paint_path(self.graphicstate, False, True, False, self.curpath) @@ -622,85 +155,6 @@ def do_B_a(self) -> None: # self.device.paint_path(self.graphicstate, True, True, True, self.curpath) self.curpath = [] - def do_b(self) -> None: - """Close, fill, and stroke path using nonzero winding number rule""" - self.do_h() - self.do_B() - - def do_b_a(self) -> None: - """Close, fill, and stroke path using even-odd rule""" - self.do_h() - self.do_B_a() - - def do_n(self) -> None: - """End path without filling or stroking""" - self.curpath = [] - - def do_W(self) -> None: - """Set clipping path using nonzero winding number rule""" - - def do_W_a(self) -> None: - """Set clipping path using even-odd rule""" - - def do_CS(self, name: PDFStackT) -> None: - """Set color space for stroking operations - - Introduced in PDF 1.1 - """ - try: - self.scs = self.csmap[literal_name(name)] - except KeyError: - if settings.STRICT: - raise PDFInterpreterError("Undefined ColorSpace: %r" % name) - - def do_cs(self, name: PDFStackT) -> None: - """Set color space for nonstroking operations""" - try: - self.ncs = self.csmap[literal_name(name)] - except KeyError: - if settings.STRICT: - raise PDFInterpreterError("Undefined ColorSpace: %r" % name) - - def do_G(self, gray: PDFStackT) -> None: - """Set gray level for stroking operations""" - self.graphicstate.scolor = cast(float, gray) - self.scs = self.csmap["DeviceGray"] - - def do_g(self, gray: PDFStackT) -> None: - """Set gray level for nonstroking operations""" - self.graphicstate.ncolor = cast(float, gray) - self.ncs = self.csmap["DeviceGray"] - - def do_RG(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None: - """Set RGB color for stroking operations""" - self.graphicstate.scolor = (cast(float, r), cast(float, g), cast(float, b)) - self.scs = self.csmap["DeviceRGB"] - - def do_rg(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None: - """Set RGB color for nonstroking operations""" - self.graphicstate.ncolor = (cast(float, r), cast(float, g), cast(float, b)) - self.ncs = self.csmap["DeviceRGB"] - - def do_K(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT) -> None: - """Set CMYK color for stroking operations""" - self.graphicstate.scolor = ( - cast(float, c), - cast(float, m), - cast(float, y), - cast(float, k), - ) - self.scs = self.csmap["DeviceCMYK"] - - def do_k(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT) -> None: - """Set CMYK color for nonstroking operations""" - self.graphicstate.ncolor = ( - cast(float, c), - cast(float, m), - cast(float, y), - cast(float, k), - ) - self.ncs = self.csmap["DeviceCMYK"] - def do_SCN(self) -> None: """Set color for stroking operations.""" if self.scs: @@ -733,223 +187,6 @@ def do_sc(self) -> None: """Set color for nonstroking operations""" return self.do_scn() - def do_sh(self, name: object) -> None: - """Paint area defined by shading pattern""" - - def do_BT(self) -> None: - """Begin text object - - Initializing the text matrix, Tm, and the text line matrix, Tlm, to - the identity matrix. Text objects cannot be nested; a second BT cannot - appear before an ET. - """ - self.textstate.reset() - - def do_ET(self) -> None: - """End a text object""" - - def do_BX(self) -> None: - """Begin compatibility section""" - - def do_EX(self) -> None: - """End compatibility section""" - - def do_MP(self, tag: PDFStackT) -> None: - """Define marked-content point""" - self.device.do_tag(cast(PSLiteral, tag)) - - def do_DP(self, tag: PDFStackT, props: PDFStackT) -> None: - """Define marked-content point with property list""" - self.device.do_tag(cast(PSLiteral, tag), props) - - def do_BMC(self, tag: PDFStackT) -> None: - """Begin marked-content sequence""" - self.device.begin_tag(cast(PSLiteral, tag)) - - def do_BDC(self, tag: PDFStackT, props: PDFStackT) -> None: - """Begin marked-content sequence with property list""" - self.device.begin_tag(cast(PSLiteral, tag), props) - - def do_EMC(self) -> None: - """End marked-content sequence""" - self.device.end_tag() - - def do_Tc(self, space: PDFStackT) -> None: - """Set character spacing. - - Character spacing is used by the Tj, TJ, and ' operators. - - :param space: a number expressed in unscaled text space units. - """ - self.textstate.charspace = cast(float, space) - - def do_Tw(self, space: PDFStackT) -> None: - """Set the word spacing. - - Word spacing is used by the Tj, TJ, and ' operators. - - :param space: a number expressed in unscaled text space units - """ - self.textstate.wordspace = cast(float, space) - - def do_Tz(self, scale: PDFStackT) -> None: - """Set the horizontal scaling. - - :param scale: is a number specifying the percentage of the normal width - """ - self.textstate.scaling = cast(float, scale) - - def do_TL(self, leading: PDFStackT) -> None: - """Set the text leading. - - Text leading is used only by the T*, ', and " operators. - - :param leading: a number expressed in unscaled text space units - """ - self.textstate.leading = -cast(float, leading) - - def do_Tf(self, fontid: PDFStackT, fontsize: PDFStackT) -> None: - """Set the text font - - :param fontid: the name of a font resource in the Font subdictionary - of the current resource dictionary - :param fontsize: size is a number representing a scale factor. - """ - try: - self.textstate.font = self.fontmap[literal_name(fontid)] - except KeyError: - if settings.STRICT: - raise PDFInterpreterError("Undefined Font id: %r" % fontid) - self.textstate.font = self.rsrcmgr.get_font(None, {}) - self.textstate.fontsize = cast(float, fontsize) - - def do_Tr(self, render: PDFStackT) -> None: - """Set the text rendering mode""" - self.textstate.render = cast(int, render) - - def do_Ts(self, rise: PDFStackT) -> None: - """Set the text rise - - :param rise: a number expressed in unscaled text space units - """ - self.textstate.rise = cast(float, rise) - - def do_Td(self, tx: PDFStackT, ty: PDFStackT) -> None: - """Move to the start of the next line - - Offset from the start of the current line by (tx , ty). - """ - tx_ = safe_float(tx) - ty_ = safe_float(ty) - if tx_ is not None and ty_ is not None: - (a, b, c, d, e, f) = self.textstate.matrix - e_new = tx_ * a + ty_ * c + e - f_new = tx_ * b + ty_ * d + f - self.textstate.matrix = (a, b, c, d, e_new, f_new) - - elif settings.STRICT: - raise PDFValueError(f"Invalid offset ({tx!r}, {ty!r}) for Td") - - self.textstate.linematrix = (0, 0) - - def do_TD(self, tx: PDFStackT, ty: PDFStackT) -> None: - """Move to the start of the next line. - - offset from the start of the current line by (tx , ty). As a side effect, this - operator sets the leading parameter in the text state. - """ - tx_ = safe_float(tx) - ty_ = safe_float(ty) - - if tx_ is not None and ty_ is not None: - (a, b, c, d, e, f) = self.textstate.matrix - e_new = tx_ * a + ty_ * c + e - f_new = tx_ * b + ty_ * d + f - self.textstate.matrix = (a, b, c, d, e_new, f_new) - - elif settings.STRICT: - raise PDFValueError("Invalid offset ({tx}, {ty}) for TD") - - if ty_ is not None: - self.textstate.leading = ty_ - - self.textstate.linematrix = (0, 0) - - def do_Tm( - self, - a: PDFStackT, - b: PDFStackT, - c: PDFStackT, - d: PDFStackT, - e: PDFStackT, - f: PDFStackT, - ) -> None: - """Set text matrix and text line matrix""" - self.textstate.matrix = cast(Matrix, (a, b, c, d, e, f)) - self.textstate.linematrix = (0, 0) - - def do_T_a(self) -> None: - """Move to start of next text line""" - (a, b, c, d, e, f) = self.textstate.matrix - self.textstate.matrix = ( - a, - b, - c, - d, - self.textstate.leading * c + e, - self.textstate.leading * d + f, - ) - self.textstate.linematrix = (0, 0) - - def do_TJ(self, seq: PDFStackT) -> None: - """Show text, allowing individual glyph positioning""" - if self.textstate.font is None: - if settings.STRICT: - raise PDFInterpreterError("No font specified!") - return - assert self.ncs is not None - self.device.render_string( - self.textstate, - cast(PDFTextSeq, seq), - self.ncs, - self.graphicstate.copy(), - ) - - def do_Tj(self, s: PDFStackT) -> None: - """Show text""" - self.do_TJ([s]) - - def do__q(self, s: PDFStackT) -> None: - """Move to next line and show text - - The ' (single quote) operator. - """ - self.do_T_a() - self.do_TJ([s]) - - def do__w(self, aw: PDFStackT, ac: PDFStackT, s: PDFStackT) -> None: - """Set word and character spacing, move to next line, and show text - - The " (double quote) operator. - """ - self.do_Tw(aw) - self.do_Tc(ac) - self.do_TJ([s]) - - def do_BI(self) -> None: - """Begin inline image object""" - - def do_ID(self) -> None: - """Begin inline image data""" - - def do_EI(self, obj: PDFStackT) -> None: - """End inline image object""" - if isinstance(obj, PDFStream) and "W" in obj and "H" in obj: - iobjid = str(id(obj)) - self.device.begin_figure(iobjid, (0, 0, 1, 1), MATRIX_IDENTITY) - self.device.render_image(iobjid, obj) - self.device.end_figure(iobjid) - def do_Do(self, xobjid_arg: PDFStackT) -> None: """Invoke named XObject""" xobjid = literal_name(xobjid_arg) @@ -1055,7 +292,7 @@ def execute(self, streams: Sequence[object]) -> None: return while True: try: - _, (_, obj) = parser.nextobject() + (_, obj) = parser.nextobject() except PSEOF: break if isinstance(obj, PSKeyword): diff --git a/pdf2zh/pdfpage.py b/pdf2zh/pdfpage.py deleted file mode 100644 index e6ac705..0000000 --- a/pdf2zh/pdfpage.py +++ /dev/null @@ -1,196 +0,0 @@ -import itertools -import logging -from typing import Any, BinaryIO, Container, Dict, Iterator, List, Optional, Set, Tuple - -from pdf2zh import settings -from pdf2zh.pdfdocument import ( - PDFDocument, - PDFNoPageLabels, - PDFTextExtractionNotAllowed, -) -from pdf2zh.pdfexceptions import PDFObjectNotFound, PDFValueError -from pdf2zh.pdfparser import PDFParser -from pdf2zh.pdftypes import dict_value, int_value, list_value, resolve1 -from pdf2zh.psparser import LIT -from pdf2zh.utils import parse_rect - -log = logging.getLogger(__name__) - -# some predefined literals and keywords. -LITERAL_PAGE = LIT("Page") -LITERAL_PAGES = LIT("Pages") - - -class PDFPage: - """An object that holds the information about a page. - - A PDFPage object is merely a convenience class that has a set - of keys and values, which describe the properties of a page - and point to its contents. - - Attributes - ---------- - doc: a PDFDocument object. - pageid: any Python object that can uniquely identify the page. - attrs: a dictionary of page attributes. - contents: a list of PDFStream objects that represents the page content. - lastmod: the last modified time of the page. - resources: a dictionary of resources used by the page. - mediabox: the physical size of the page. - cropbox: the crop rectangle of the page. - rotate: the page rotation (in degree). - annots: the page annotations. - beads: a chain that represents natural reading order. - label: the page's label (typically, the logical page number). - - """ - - def __init__( - self, - doc: PDFDocument, - pageid: object, - attrs: object, - label: Optional[str], - ) -> None: - """Initialize a page object. - - doc: a PDFDocument object. - pageid: any Python object that can uniquely identify the page. - attrs: a dictionary of page attributes. - label: page label string. - """ - self.doc = doc - self.pageid = pageid - self.pageno = 0 - self.attrs = dict_value(attrs) - self.label = label - self.lastmod = resolve1(self.attrs.get("LastModified")) - self.resources: Dict[object, object] = resolve1( - self.attrs.get("Resources", dict()), - ) - mediabox_params: List[Any] = [ - resolve1(mediabox_param) for mediabox_param in self.attrs["MediaBox"] - ] - self.mediabox = parse_rect(resolve1(mediabox_params)) - self.cropbox = self.mediabox - if "CropBox" in self.attrs: - try: - self.cropbox = parse_rect(resolve1(self.attrs["CropBox"])) - except PDFValueError: - pass - - self.rotate = (int_value(self.attrs.get("Rotate", 0)) + 360) % 360 - self.annots = self.attrs.get("Annots") - self.beads = self.attrs.get("B") - if "Contents" in self.attrs: - contents = resolve1(self.attrs["Contents"]) - else: - contents = [] - if not isinstance(contents, list): - contents = [contents] - self.contents: List[object] = contents - - def __repr__(self) -> str: - return f"" - - INHERITABLE_ATTRS = {"Resources", "MediaBox", "CropBox", "Rotate"} - - @classmethod - def create_pages(cls, document: PDFDocument) -> Iterator["PDFPage"]: - def depth_first_search( - obj: Any, - parent: Dict[str, Any], - visited: Optional[Set[Any]] = None, - ) -> Iterator[Tuple[int, Dict[Any, Dict[Any, Any]]]]: - if isinstance(obj, int): - object_id = obj - object_properties = dict_value(document.getobj(object_id)).copy() - else: - # This looks broken. obj.objid means obj could be either - # PDFObjRef or PDFStream, but neither is valid for dict_value. - object_id = obj.objid # type: ignore[attr-defined] - object_properties = dict_value(obj).copy() - - # Avoid recursion errors by keeping track of visited nodes - if visited is None: - visited = set() - if object_id in visited: - return - visited.add(object_id) - - for k, v in parent.items(): - if k in cls.INHERITABLE_ATTRS and k not in object_properties: - object_properties[k] = v - - object_type = object_properties.get("Type") - if object_type is None and not settings.STRICT: # See #64 - object_type = object_properties.get("type") - - if object_type is LITERAL_PAGES and "Kids" in object_properties: - # log.debug("Pages: Kids=%r", object_properties["Kids"]) - for child in list_value(object_properties["Kids"]): - yield from depth_first_search(child, object_properties, visited) - - elif object_type is LITERAL_PAGE: - # log.debug("Page: %r", object_properties) - yield (object_id, object_properties) - - try: - page_labels: Iterator[Optional[str]] = document.get_page_labels() - except PDFNoPageLabels: - page_labels = itertools.repeat(None) - - pages = False - if "Pages" in document.catalog: - objects = depth_first_search(document.catalog["Pages"], document.catalog) - for objid, tree in objects: - yield cls(document, objid, tree, next(page_labels)) - pages = True - if not pages: - # fallback when /Pages is missing. - for xref in document.xrefs: - for objid in xref.get_objids(): - try: - obj = document.getobj(objid) - if isinstance(obj, dict) and obj.get("Type") is LITERAL_PAGE: - yield cls(document, objid, obj, next(page_labels)) - except PDFObjectNotFound: - pass - - @classmethod - def get_pages( - cls, - fp: BinaryIO, - pagenos: Optional[Container[int]] = None, - maxpages: int = 0, - password: str = "", - caching: bool = True, - check_extractable: bool = False, - ) -> Iterator["PDFPage"]: - # Create a PDF parser object associated with the file object. - parser = PDFParser(fp) - # Create a PDF document object that stores the document structure. - doc = PDFDocument(parser, password=password, caching=caching) - # Check if the document allows text extraction. - # If not, warn the user and proceed. - if not doc.is_extractable: - if check_extractable: - error_msg = "Text extraction is not allowed: %r" % fp - raise PDFTextExtractionNotAllowed(error_msg) - else: - warning_msg = ( - "The PDF %r contains a metadata field " - "indicating that it should not allow " - "text extraction. Ignoring this field " - "and proceeding. Use the check_extractable " - "if you want to raise an error in this case" % fp - ) - log.warning(warning_msg) - # Process each page contained in the document. - for pageno, page in enumerate(cls.create_pages(doc)): - page.pageno = pageno - if pagenos and (pageno not in pagenos): - continue - yield page - if maxpages and maxpages <= pageno + 1: - break diff --git a/pdf2zh/pdfparser.py b/pdf2zh/pdfparser.py deleted file mode 100644 index 5b02150..0000000 --- a/pdf2zh/pdfparser.py +++ /dev/null @@ -1,166 +0,0 @@ -import logging -from io import BytesIO -from typing import TYPE_CHECKING, BinaryIO, Optional, Union - -from pdf2zh import settings -from pdf2zh.casting import safe_int -from pdf2zh.pdfexceptions import PDFException -from pdf2zh.pdftypes import PDFObjRef, PDFStream, dict_value, int_value -from pdf2zh.psexceptions import PSEOF -from pdf2zh.psparser import KWD, PSKeyword, PSStackParser - -if TYPE_CHECKING: - from pdf2zh.pdfdocument import PDFDocument - -log = logging.getLogger(__name__) - - -class PDFSyntaxError(PDFException): - pass - - -# PDFParser stack holds all the base types plus PDFStream, PDFObjRef, and None -class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]): - """PDFParser fetch PDF objects from a file stream. - It can handle indirect references by referring to - a PDF document set by set_document method. - It also reads XRefs at the end of every PDF file. - - Typical usage: - parser = PDFParser(fp) - parser.read_xref() - parser.read_xref(fallback=True) # optional - parser.set_document(doc) - parser.seek(offset) - parser.nextobject() - - """ - - def __init__(self, fp: BinaryIO) -> None: - PSStackParser.__init__(self, fp) - self.doc: Optional[PDFDocument] = None - self.fallback = False - - def set_document(self, doc: "PDFDocument") -> None: - """Associates the parser with a PDFDocument object.""" - self.doc = doc - - KEYWORD_R = KWD(b"R") - KEYWORD_NULL = KWD(b"null") - KEYWORD_ENDOBJ = KWD(b"endobj") - KEYWORD_STREAM = KWD(b"stream") - KEYWORD_XREF = KWD(b"xref") - KEYWORD_STARTXREF = KWD(b"startxref") - - def do_keyword(self, pos: int, token: PSKeyword) -> None: - """Handles PDF-related keywords.""" - if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF): - self.add_results(*self.pop(1)) - - elif token is self.KEYWORD_ENDOBJ: - self.add_results(*self.pop(4)) - - elif token is self.KEYWORD_NULL: - # null object - self.push((pos, None)) - - elif token is self.KEYWORD_R: - # reference to indirect object - if len(self.curstack) >= 2: - (_, _object_id), _ = self.pop(2) - object_id = safe_int(_object_id) - if object_id is not None: - obj = PDFObjRef(self.doc, object_id) - self.push((pos, obj)) - - elif token is self.KEYWORD_STREAM: - # stream object - ((_, dic),) = self.pop(1) - dic = dict_value(dic) - objlen = 0 - if not self.fallback: - try: - objlen = int_value(dic["Length"]) - except KeyError: - if settings.STRICT: - raise PDFSyntaxError("/Length is undefined: %r" % dic) - self.seek(pos) - try: - (_, line) = self.nextline() # 'stream' - except PSEOF: - if settings.STRICT: - raise PDFSyntaxError("Unexpected EOF") - return - pos += len(line) - self.fp.seek(pos) - data = bytearray(self.fp.read(objlen)) - self.seek(pos + objlen) - while 1: - try: - (linepos, line) = self.nextline() - except PSEOF: - if settings.STRICT: - raise PDFSyntaxError("Unexpected EOF") - break - if b"endstream" in line: - i = line.index(b"endstream") - objlen += i - if self.fallback: - data += line[:i] - break - objlen += len(line) - if self.fallback: - data += line - self.seek(pos + objlen) - # XXX limit objlen not to exceed object boundary - # log.debug( - # "Stream: pos=%d, objlen=%d, dic=%r, data=%r...", - # pos, - # objlen, - # dic, - # data[:10], - # ) - assert self.doc is not None - stream = PDFStream(dic, bytes(data), self.doc.decipher) - self.push((pos, stream)) - - else: - # others - self.push((pos, token)) - - -class PDFStreamParser(PDFParser): - """PDFStreamParser is used to parse PDF content streams - that is contained in each page and has instructions - for rendering the page. A reference to a PDF document is - needed because a PDF content stream can also have - indirect references to other objects in the same document. - """ - - def __init__(self, data: bytes) -> None: - PDFParser.__init__(self, BytesIO(data)) - - def flush(self) -> None: - self.add_results(*self.popall()) - - KEYWORD_OBJ = KWD(b"obj") - - def do_keyword(self, pos: int, token: PSKeyword) -> None: - if token is self.KEYWORD_R: - # reference to indirect object - (_, _object_id), _ = self.pop(2) - object_id = safe_int(_object_id) - if object_id is not None: - obj = PDFObjRef(self.doc, object_id) - self.push((pos, obj)) - return - - elif token in (self.KEYWORD_OBJ, self.KEYWORD_ENDOBJ): - if settings.STRICT: - # See PDF Spec 3.4.6: Only the object values are stored in the - # stream; the obj and endobj keywords are not used. - raise PDFSyntaxError("Keyword endobj found in stream") - return - - # others - self.push((pos, token)) diff --git a/pdf2zh/pdftypes.py b/pdf2zh/pdftypes.py deleted file mode 100644 index 2563fef..0000000 --- a/pdf2zh/pdftypes.py +++ /dev/null @@ -1,397 +0,0 @@ -import io -import logging -import zlib -from typing import ( - TYPE_CHECKING, - Any, - Dict, - Iterable, - List, - Optional, - Protocol, - Tuple, - Union, - cast, -) -from warnings import warn - -from pdf2zh import pdfexceptions, settings -from pdf2zh.ascii85 import ascii85decode, asciihexdecode -from pdf2zh.ccitt import ccittfaxdecode -from pdf2zh.lzw import lzwdecode -from pdf2zh.psparser import LIT, PSObject -from pdf2zh.runlength import rldecode -from pdf2zh.utils import apply_png_predictor - -if TYPE_CHECKING: - from pdf2zh.pdfdocument import PDFDocument - -logger = logging.getLogger(__name__) - -LITERAL_CRYPT = LIT("Crypt") - -# Abbreviation of Filter names in PDF 4.8.6. "Inline Images" -LITERALS_FLATE_DECODE = (LIT("FlateDecode"), LIT("Fl")) -LITERALS_LZW_DECODE = (LIT("LZWDecode"), LIT("LZW")) -LITERALS_ASCII85_DECODE = (LIT("ASCII85Decode"), LIT("A85")) -LITERALS_ASCIIHEX_DECODE = (LIT("ASCIIHexDecode"), LIT("AHx")) -LITERALS_RUNLENGTH_DECODE = (LIT("RunLengthDecode"), LIT("RL")) -LITERALS_CCITTFAX_DECODE = (LIT("CCITTFaxDecode"), LIT("CCF")) -LITERALS_DCT_DECODE = (LIT("DCTDecode"), LIT("DCT")) -LITERALS_JBIG2_DECODE = (LIT("JBIG2Decode"),) -LITERALS_JPX_DECODE = (LIT("JPXDecode"),) - - -class DecipherCallable(Protocol): - """Fully typed a decipher callback, with optional parameter.""" - - def __call__( - self, - objid: int, - genno: int, - data: bytes, - attrs: Optional[Dict[str, Any]] = None, - ) -> bytes: - raise NotImplementedError - - -class PDFObject(PSObject): - pass - - -# Adding aliases for these exceptions for backwards compatibility -PDFException = pdfexceptions.PDFException -PDFTypeError = pdfexceptions.PDFTypeError -PDFValueError = pdfexceptions.PDFValueError -PDFObjectNotFound = pdfexceptions.PDFObjectNotFound -PDFNotImplementedError = pdfexceptions.PDFNotImplementedError - -_DEFAULT = object() - - -class PDFObjRef(PDFObject): - def __init__( - self, - doc: Optional["PDFDocument"], - objid: int, - _: Any = _DEFAULT, - ) -> None: - """Reference to a PDF object. - - :param doc: The PDF document. - :param objid: The object number. - :param _: Unused argument for backwards compatibility. - """ - if _ is not _DEFAULT: - warn( - "The third argument of PDFObjRef is unused and will be removed after " - "2024", - DeprecationWarning, - ) - - if objid == 0: - if settings.STRICT: - raise PDFValueError("PDF object id cannot be 0.") - - self.doc = doc - self.objid = objid - - def __repr__(self) -> str: - return "" % (self.objid) - - def resolve(self, default: object = None) -> Any: - assert self.doc is not None - try: - return self.doc.getobj(self.objid) - except PDFObjectNotFound: - return default - - -def resolve1(x: object, default: object = None) -> Any: - """Resolves an object. - - If this is an array or dictionary, it may still contains - some indirect objects inside. - """ - while isinstance(x, PDFObjRef): - x = x.resolve(default=default) - return x - - -def resolve_all(x: object, default: object = None) -> Any: - """Recursively resolves the given object and all the internals. - - Make sure there is no indirect reference within the nested object. - This procedure might be slow. - """ - while isinstance(x, PDFObjRef): - x = x.resolve(default=default) - if isinstance(x, list): - x = [resolve_all(v, default=default) for v in x] - elif isinstance(x, dict): - for k, v in x.items(): - x[k] = resolve_all(v, default=default) - return x - - -def decipher_all(decipher: DecipherCallable, objid: int, genno: int, x: object) -> Any: - """Recursively deciphers the given object.""" - if isinstance(x, bytes): - if len(x) == 0: - return x - return decipher(objid, genno, x) - if isinstance(x, list): - x = [decipher_all(decipher, objid, genno, v) for v in x] - elif isinstance(x, dict): - for k, v in x.items(): - x[k] = decipher_all(decipher, objid, genno, v) - return x - - -def int_value(x: object) -> int: - x = resolve1(x) - if not isinstance(x, int): - if settings.STRICT: - raise PDFTypeError("Integer required: %r" % x) - return 0 - return x - - -def float_value(x: object) -> float: - x = resolve1(x) - if not isinstance(x, float): - if settings.STRICT: - raise PDFTypeError("Float required: %r" % x) - return 0.0 - return x - - -def num_value(x: object) -> float: - x = resolve1(x) - if not isinstance(x, (int, float)): # == utils.isnumber(x) - if settings.STRICT: - raise PDFTypeError("Int or Float required: %r" % x) - return 0 - return x - - -def uint_value(x: object, n_bits: int) -> int: - """Resolve number and interpret it as a two's-complement unsigned number""" - xi = int_value(x) - if xi > 0: - return xi - else: - return xi + cast(int, 2**n_bits) - - -def str_value(x: object) -> bytes: - x = resolve1(x) - if not isinstance(x, bytes): - if settings.STRICT: - raise PDFTypeError("String required: %r" % x) - return b"" - return x - - -def list_value(x: object) -> Union[List[Any], Tuple[Any, ...]]: - x = resolve1(x) - if not isinstance(x, (list, tuple)): - if settings.STRICT: - raise PDFTypeError("List required: %r" % x) - return [] - return x - - -def dict_value(x: object) -> Dict[Any, Any]: - x = resolve1(x) - if not isinstance(x, dict): - if settings.STRICT: - logger.error("PDFTypeError : Dict required: %r", x) - raise PDFTypeError("Dict required: %r" % x) - return {} - return x - - -def stream_value(x: object) -> "PDFStream": - x = resolve1(x) - if not isinstance(x, PDFStream): - if settings.STRICT: - raise PDFTypeError("PDFStream required: %r" % x) - return PDFStream({}, b"") - return x - - -def decompress_corrupted(data: bytes) -> bytes: - """Called on some data that can't be properly decoded because of CRC checksum - error. Attempt to decode it skipping the CRC. - """ - d = zlib.decompressobj() - f = io.BytesIO(data) - result_str = b"" - buffer = f.read(1) - i = 0 - try: - while buffer: - result_str += d.decompress(buffer) - buffer = f.read(1) - i += 1 - except zlib.error: - # Let the error propagates if we're not yet in the CRC checksum - if i < len(data) - 3: - logger.warning("Data-loss while decompressing corrupted data") - return result_str - - -class PDFStream(PDFObject): - def __init__( - self, - attrs: Dict[str, Any], - rawdata: bytes, - decipher: Optional[DecipherCallable] = None, - ) -> None: - assert isinstance(attrs, dict), str(type(attrs)) - self.attrs = attrs - self.rawdata: Optional[bytes] = rawdata - self.decipher = decipher - self.data: Optional[bytes] = None - self.objid: Optional[int] = None - self.genno: Optional[int] = None - - def set_objid(self, objid: int, genno: int) -> None: - self.objid = objid - self.genno = genno - - def __repr__(self) -> str: - if self.data is None: - assert self.rawdata is not None - return "" % ( - self.objid, - len(self.rawdata), - self.attrs, - ) - else: - assert self.data is not None - return "" % ( - self.objid, - len(self.data), - self.attrs, - ) - - def __contains__(self, name: object) -> bool: - return name in self.attrs - - def __getitem__(self, name: str) -> Any: - return self.attrs[name] - - def get(self, name: str, default: object = None) -> Any: - return self.attrs.get(name, default) - - def get_any(self, names: Iterable[str], default: object = None) -> Any: - for name in names: - if name in self.attrs: - return self.attrs[name] - return default - - def get_filters(self) -> List[Tuple[Any, Any]]: - filters = self.get_any(("F", "Filter")) - params = self.get_any(("DP", "DecodeParms", "FDecodeParms"), {}) - if not filters: - return [] - if not isinstance(filters, list): - filters = [filters] - if not isinstance(params, list): - # Make sure the parameters list is the same as filters. - params = [params] * len(filters) - if settings.STRICT and len(params) != len(filters): - raise PDFException("Parameters len filter mismatch") - - resolved_filters = [resolve1(f) for f in filters] - resolved_params = [resolve1(param) for param in params] - return list(zip(resolved_filters, resolved_params)) - - def decode(self) -> None: - assert self.data is None and self.rawdata is not None, str( - (self.data, self.rawdata), - ) - data = self.rawdata - if self.decipher: - # Handle encryption - assert self.objid is not None - assert self.genno is not None - data = self.decipher(self.objid, self.genno, data, self.attrs) - filters = self.get_filters() - if not filters: - self.data = data - self.rawdata = None - return - for f, params in filters: - if f in LITERALS_FLATE_DECODE: - # will get errors if the document is encrypted. - try: - data = zlib.decompress(data) - - except zlib.error as e: - if settings.STRICT: - error_msg = f"Invalid zlib bytes: {e!r}, {data!r}" - raise PDFException(error_msg) - - try: - data = decompress_corrupted(data) - except zlib.error: - data = b"" - - elif f in LITERALS_LZW_DECODE: - data = lzwdecode(data) - elif f in LITERALS_ASCII85_DECODE: - data = ascii85decode(data) - elif f in LITERALS_ASCIIHEX_DECODE: - data = asciihexdecode(data) - elif f in LITERALS_RUNLENGTH_DECODE: - data = rldecode(data) - elif f in LITERALS_CCITTFAX_DECODE: - data = ccittfaxdecode(data, params) - elif f in LITERALS_DCT_DECODE: - # This is probably a JPG stream - # it does not need to be decoded twice. - # Just return the stream to the user. - pass - elif f in LITERALS_JBIG2_DECODE or f in LITERALS_JPX_DECODE: - pass - elif f == LITERAL_CRYPT: - # not yet.. - raise PDFNotImplementedError("/Crypt filter is unsupported") - else: - raise PDFNotImplementedError("Unsupported filter: %r" % f) - # apply predictors - if params and "Predictor" in params: - pred = int_value(params["Predictor"]) - if pred == 1: - # no predictor - pass - elif pred >= 10: - # PNG predictor - colors = int_value(params.get("Colors", 1)) - columns = int_value(params.get("Columns", 1)) - raw_bits_per_component = params.get("BitsPerComponent", 8) - bitspercomponent = int_value(raw_bits_per_component) - data = apply_png_predictor( - pred, - colors, - columns, - bitspercomponent, - data, - ) - else: - error_msg = "Unsupported predictor: %r" % pred - raise PDFNotImplementedError(error_msg) - self.data = data - self.rawdata = None - - def get_data(self) -> bytes: - if self.data is None: - self.decode() - assert self.data is not None - return self.data - - def get_rawdata(self) -> Optional[bytes]: - return self.rawdata diff --git a/pdf2zh/psexceptions.py b/pdf2zh/psexceptions.py deleted file mode 100644 index b8291dc..0000000 --- a/pdf2zh/psexceptions.py +++ /dev/null @@ -1,18 +0,0 @@ -class PSException(Exception): - pass - - -class PSEOF(PSException): - pass - - -class PSSyntaxError(PSException): - pass - - -class PSTypeError(PSException): - pass - - -class PSValueError(PSException): - pass diff --git a/pdf2zh/psparser.py b/pdf2zh/psparser.py deleted file mode 100644 index 1249153..0000000 --- a/pdf2zh/psparser.py +++ /dev/null @@ -1,656 +0,0 @@ -#!/usr/bin/env python3 -import io -import logging -import re -from typing import ( - Any, - BinaryIO, - Dict, - Generic, - Iterator, - List, - Optional, - Tuple, - Type, - TypeVar, - Union, -) - -from pdf2zh import psexceptions, settings -from pdf2zh.utils import choplist - -log = logging.getLogger(__name__) - - -# Adding aliases for these exceptions for backwards compatibility -PSException = psexceptions.PSException -PSEOF = psexceptions.PSEOF -PSSyntaxError = psexceptions.PSSyntaxError -PSTypeError = psexceptions.PSTypeError -PSValueError = psexceptions.PSValueError - - -class PSObject: - """Base class for all PS or PDF-related data types.""" - - -class PSLiteral(PSObject): - """A class that represents a PostScript literal. - - Postscript literals are used as identifiers, such as - variable names, property names and dictionary keys. - Literals are case sensitive and denoted by a preceding - slash sign (e.g. "/Name") - - Note: Do not create an instance of PSLiteral directly. - Always use PSLiteralTable.intern(). - """ - - NameType = Union[str, bytes] - - def __init__(self, name: NameType) -> None: - self.name = name - - def __repr__(self) -> str: - name = self.name - return "/%r" % name - - -class PSKeyword(PSObject): - """A class that represents a PostScript keyword. - - PostScript keywords are a dozen of predefined words. - Commands and directives in PostScript are expressed by keywords. - They are also used to denote the content boundaries. - - Note: Do not create an instance of PSKeyword directly. - Always use PSKeywordTable.intern(). - """ - - def __init__(self, name: bytes) -> None: - self.name = name - - def __repr__(self) -> str: - name = self.name - return "/%r" % name - - -_SymbolT = TypeVar("_SymbolT", PSLiteral, PSKeyword) - - -class PSSymbolTable(Generic[_SymbolT]): - """A utility class for storing PSLiteral/PSKeyword objects. - - Interned objects can be checked its identity with "is" operator. - """ - - def __init__(self, klass: Type[_SymbolT]) -> None: - self.dict: Dict[PSLiteral.NameType, _SymbolT] = {} - self.klass: Type[_SymbolT] = klass - - def intern(self, name: PSLiteral.NameType) -> _SymbolT: - if name in self.dict: - lit = self.dict[name] - else: - # Type confusion issue: PSKeyword always takes bytes as name - # PSLiteral uses either str or bytes - lit = self.klass(name) # type: ignore[arg-type] - self.dict[name] = lit - return lit - - -PSLiteralTable = PSSymbolTable(PSLiteral) -PSKeywordTable = PSSymbolTable(PSKeyword) -LIT = PSLiteralTable.intern -KWD = PSKeywordTable.intern -KEYWORD_PROC_BEGIN = KWD(b"{") -KEYWORD_PROC_END = KWD(b"}") -KEYWORD_ARRAY_BEGIN = KWD(b"[") -KEYWORD_ARRAY_END = KWD(b"]") -KEYWORD_DICT_BEGIN = KWD(b"<<") -KEYWORD_DICT_END = KWD(b">>") - - -def literal_name(x: Any) -> str: - if isinstance(x, PSLiteral): - if isinstance(x.name, str): - return x.name - try: - return str(x.name, "utf-8") - except UnicodeDecodeError: - return str(x.name) - else: - if settings.STRICT: - raise PSTypeError(f"Literal required: {x!r}") - return str(x) - - -def keyword_name(x: Any) -> Any: - if not isinstance(x, PSKeyword): - if settings.STRICT: - raise PSTypeError("Keyword required: %r" % x) - else: - name = x - else: - name = str(x.name, "utf-8", "ignore") - return name - - -EOL = re.compile(rb"[\r\n]") -SPC = re.compile(rb"\s") -NONSPC = re.compile(rb"\S") -HEX = re.compile(rb"[0-9a-fA-F]") -END_LITERAL = re.compile(rb"[#/%\[\]()<>{}\s]") -END_HEX_STRING = re.compile(rb"[^\s0-9a-fA-F]") -HEX_PAIR = re.compile(rb"[0-9a-fA-F]{2}|.") -END_NUMBER = re.compile(rb"[^0-9]") -END_KEYWORD = re.compile(rb"[#/%\[\]()<>{}\s]") -END_STRING = re.compile(rb"[()\134]") -OCT_STRING = re.compile(rb"[0-7]") -ESC_STRING = { - b"b": 8, - b"t": 9, - b"n": 10, - b"f": 12, - b"r": 13, - b"(": 40, - b")": 41, - b"\\": 92, -} - - -PSBaseParserToken = Union[float, bool, PSLiteral, PSKeyword, bytes] - - -class PSBaseParser: - """Most basic PostScript parser that performs only tokenization.""" - - BUFSIZ = 4096 - - def __init__(self, fp: BinaryIO) -> None: - self.fp = fp - self.seek(0) - - def __repr__(self) -> str: - return "<%s: %r, bufpos=%d>" % (self.__class__.__name__, self.fp, self.bufpos) - - def flush(self) -> None: - pass - - def close(self) -> None: - self.flush() - - def tell(self) -> int: - return self.bufpos + self.charpos - - def poll(self, pos: Optional[int] = None, n: int = 80) -> None: - pos0 = self.fp.tell() - if not pos: - pos = self.bufpos + self.charpos - self.fp.seek(pos) - # log.debug("poll(%d): %r", pos, self.fp.read(n)) - self.fp.seek(pos0) - - def seek(self, pos: int) -> None: - """Seeks the parser to the given position.""" - # log.debug("seek: %r", pos) - self.fp.seek(pos) - # reset the status for nextline() - self.bufpos = pos - self.buf = b"" - self.charpos = 0 - # reset the status for nexttoken() - self._parse1 = self._parse_main - self._curtoken = b"" - self._curtokenpos = 0 - self._tokens: List[Tuple[int, PSBaseParserToken]] = [] - - def fillbuf(self) -> None: - if self.charpos < len(self.buf): - return - # fetch next chunk. - self.bufpos = self.fp.tell() - self.buf = self.fp.read(self.BUFSIZ) - if not self.buf: - raise PSEOF("Unexpected EOF") - self.charpos = 0 - - def nextline(self) -> Tuple[int, bytes]: - """Fetches a next line that ends either with \\r or \\n.""" - linebuf = b"" - linepos = self.bufpos + self.charpos - eol = False - while 1: - self.fillbuf() - if eol: - c = self.buf[self.charpos : self.charpos + 1] - # handle b'\r\n' - if c == b"\n": - linebuf += c - self.charpos += 1 - break - m = EOL.search(self.buf, self.charpos) - if m: - linebuf += self.buf[self.charpos : m.end(0)] - self.charpos = m.end(0) - if linebuf[-1:] == b"\r": - eol = True - else: - break - else: - linebuf += self.buf[self.charpos :] - self.charpos = len(self.buf) - # log.debug("nextline: %r, %r", linepos, linebuf) - - return (linepos, linebuf) - - def revreadlines(self) -> Iterator[bytes]: - """Fetches a next line backword. - - This is used to locate the trailers at the end of a file. - """ - self.fp.seek(0, io.SEEK_END) - pos = self.fp.tell() - buf = b"" - while pos > 0: - prevpos = pos - pos = max(0, pos - self.BUFSIZ) - self.fp.seek(pos) - s = self.fp.read(prevpos - pos) - if not s: - break - while 1: - n = max(s.rfind(b"\r"), s.rfind(b"\n")) - if n == -1: - buf = s + buf - break - yield s[n:] + buf - s = s[:n] - buf = b"" - - def _parse_main(self, s: bytes, i: int) -> int: - m = NONSPC.search(s, i) - if not m: - return len(s) - j = m.start(0) - c = s[j : j + 1] - self._curtokenpos = self.bufpos + j - if c == b"%": - self._curtoken = b"%" - self._parse1 = self._parse_comment - return j + 1 - elif c == b"/": - self._curtoken = b"" - self._parse1 = self._parse_literal - return j + 1 - elif c in b"-+" or c.isdigit(): - self._curtoken = c - self._parse1 = self._parse_number - return j + 1 - elif c == b".": - self._curtoken = c - self._parse1 = self._parse_float - return j + 1 - elif c.isalpha(): - self._curtoken = c - self._parse1 = self._parse_keyword - return j + 1 - elif c == b"(": - self._curtoken = b"" - self.paren = 1 - self._parse1 = self._parse_string - return j + 1 - elif c == b"<": - self._curtoken = b"" - self._parse1 = self._parse_wopen - return j + 1 - elif c == b">": - self._curtoken = b"" - self._parse1 = self._parse_wclose - return j + 1 - elif c == b"\x00": - return j + 1 - else: - self._add_token(KWD(c)) - return j + 1 - - def _add_token(self, obj: PSBaseParserToken) -> None: - self._tokens.append((self._curtokenpos, obj)) - - def _parse_comment(self, s: bytes, i: int) -> int: - m = EOL.search(s, i) - if not m: - self._curtoken += s[i:] - return len(s) - j = m.start(0) - self._curtoken += s[i:j] - self._parse1 = self._parse_main - # We ignore comments. - # self._tokens.append(self._curtoken) - return j - - def _parse_literal(self, s: bytes, i: int) -> int: - m = END_LITERAL.search(s, i) - if not m: - self._curtoken += s[i:] - return len(s) - j = m.start(0) - self._curtoken += s[i:j] - c = s[j : j + 1] - if c == b"#": - self.hex = b"" - self._parse1 = self._parse_literal_hex - return j + 1 - try: - name: Union[str, bytes] = str(self._curtoken, "utf-8") - except Exception: - name = self._curtoken - self._add_token(LIT(name)) - self._parse1 = self._parse_main - return j - - def _parse_literal_hex(self, s: bytes, i: int) -> int: - c = s[i : i + 1] - if HEX.match(c) and len(self.hex) < 2: - self.hex += c - return i + 1 - if self.hex: - self._curtoken += bytes((int(self.hex, 16),)) - self._parse1 = self._parse_literal - return i - - def _parse_number(self, s: bytes, i: int) -> int: - m = END_NUMBER.search(s, i) - if not m: - self._curtoken += s[i:] - return len(s) - j = m.start(0) - self._curtoken += s[i:j] - c = s[j : j + 1] - if c == b".": - self._curtoken += c - self._parse1 = self._parse_float - return j + 1 - try: - self._add_token(int(self._curtoken)) - except ValueError: - pass - self._parse1 = self._parse_main - return j - - def _parse_float(self, s: bytes, i: int) -> int: - m = END_NUMBER.search(s, i) - if not m: - self._curtoken += s[i:] - return len(s) - j = m.start(0) - self._curtoken += s[i:j] - try: - self._add_token(float(self._curtoken)) - except ValueError: - pass - self._parse1 = self._parse_main - return j - - def _parse_keyword(self, s: bytes, i: int) -> int: - m = END_KEYWORD.search(s, i) - if m: - j = m.start(0) - self._curtoken += s[i:j] - else: - # Use the rest of the stream if no non-keyword character is found. This - # can happen if the keyword is the final bytes of the stream - # (https://github.com/pdf2zh/pdf2zh.six/issues/884). - j = len(s) - self._curtoken += s[i:] - if self._curtoken == b"true": - token: Union[bool, PSKeyword] = True - elif self._curtoken == b"false": - token = False - else: - token = KWD(self._curtoken) - self._add_token(token) - self._parse1 = self._parse_main - return j - - def _parse_string(self, s: bytes, i: int) -> int: - m = END_STRING.search(s, i) - if not m: - self._curtoken += s[i:] - return len(s) - j = m.start(0) - self._curtoken += s[i:j] - c = s[j : j + 1] - if c == b"\\": - self.oct = b"" - self._parse1 = self._parse_string_1 - return j + 1 - if c == b"(": - self.paren += 1 - self._curtoken += c - return j + 1 - if c == b")": - self.paren -= 1 - if self.paren: - # WTF, they said balanced parens need no special treatment. - self._curtoken += c - return j + 1 - self._add_token(self._curtoken) - self._parse1 = self._parse_main - return j + 1 - - def _parse_string_1(self, s: bytes, i: int) -> int: - """Parse literal strings - - PDF Reference 3.2.3 - """ - c = s[i : i + 1] - if OCT_STRING.match(c) and len(self.oct) < 3: - self.oct += c - return i + 1 - - elif self.oct: - chrcode = int(self.oct, 8) - assert chrcode < 256, "Invalid octal %s (%d)" % (repr(self.oct), chrcode) - self._curtoken += bytes((chrcode,)) - self._parse1 = self._parse_string - return i - - elif c in ESC_STRING: - self._curtoken += bytes((ESC_STRING[c],)) - - elif c == b"\r" and len(s) > i + 1 and s[i + 1 : i + 2] == b"\n": - # If current and next character is \r\n skip both because enters - # after a \ are ignored - i += 1 - - # default action - self._parse1 = self._parse_string - return i + 1 - - def _parse_wopen(self, s: bytes, i: int) -> int: - c = s[i : i + 1] - if c == b"<": - self._add_token(KEYWORD_DICT_BEGIN) - self._parse1 = self._parse_main - i += 1 - else: - self._parse1 = self._parse_hexstring - return i - - def _parse_wclose(self, s: bytes, i: int) -> int: - c = s[i : i + 1] - if c == b">": - self._add_token(KEYWORD_DICT_END) - i += 1 - self._parse1 = self._parse_main - return i - - def _parse_hexstring(self, s: bytes, i: int) -> int: - m = END_HEX_STRING.search(s, i) - if not m: - self._curtoken += s[i:] - return len(s) - j = m.start(0) - self._curtoken += s[i:j] - token = HEX_PAIR.sub( - lambda m: bytes((int(m.group(0), 16),)), - SPC.sub(b"", self._curtoken), - ) - self._add_token(token) - self._parse1 = self._parse_main - return j - - def nexttoken(self) -> Tuple[int, PSBaseParserToken]: - while not self._tokens: - self.fillbuf() - self.charpos = self._parse1(self.buf, self.charpos) - token = self._tokens.pop(0) - # log.debug("nexttoken: %r", token) - return token - - -# Stack slots may by occupied by any of: -# * the name of a literal -# * the PSBaseParserToken types -# * list (via KEYWORD_ARRAY) -# * dict (via KEYWORD_DICT) -# * subclass-specific extensions (e.g. PDFStream, PDFObjRef) via ExtraT -ExtraT = TypeVar("ExtraT") -PSStackType = Union[str, float, bool, PSLiteral, bytes, List, Dict, ExtraT] -PSStackEntry = Tuple[int, PSStackType[ExtraT]] - - -class PSStackParser(PSBaseParser, Generic[ExtraT]): - def __init__(self, fp: BinaryIO) -> None: - PSBaseParser.__init__(self, fp) - self.reset() - - def reset(self) -> None: - self.context: List[Tuple[int, Optional[str], List[PSStackEntry[ExtraT]]]] = [] - self.curtype: Optional[str] = None - self.curstack: List[PSStackEntry[ExtraT]] = [] - self.results: List[PSStackEntry[ExtraT]] = [] - - def seek(self, pos: int) -> None: - PSBaseParser.seek(self, pos) - self.reset() - - def push(self, *objs: PSStackEntry[ExtraT]) -> None: - self.curstack.extend(objs) - - def pop(self, n: int) -> List[PSStackEntry[ExtraT]]: - objs = self.curstack[-n:] - self.curstack[-n:] = [] - return objs - - def popall(self) -> List[PSStackEntry[ExtraT]]: - objs = self.curstack - self.curstack = [] - return objs - - def add_results(self, *objs: PSStackEntry[ExtraT]) -> None: - # try: - # log.debug("add_results: %r", objs) - # except Exception: - # log.debug("add_results: (unprintable object)") - self.results.extend(objs) - - def start_type(self, pos: int, type: str) -> None: - self.context.append((pos, self.curtype, self.curstack)) - (self.curtype, self.curstack) = (type, []) - # log.debug("start_type: pos=%r, type=%r", pos, type) - - def end_type(self, type: str) -> Tuple[int, List[PSStackType[ExtraT]]]: - if self.curtype != type: - raise PSTypeError(f"Type mismatch: {self.curtype!r} != {type!r}") - objs = [obj for (_, obj) in self.curstack] - (pos, self.curtype, self.curstack) = self.context.pop() - # log.debug("end_type: pos=%r, type=%r, objs=%r", pos, type, objs) - return (pos, objs) - - def do_keyword(self, pos: int, token: PSKeyword) -> None: - pass - - def nextobject(self) -> PSStackEntry[ExtraT]: - """Yields a list of objects. - - Arrays and dictionaries are represented as Python lists and - dictionaries. - - :return: keywords, literals, strings, numbers, arrays and dictionaries. - """ - end = None - while not self.results: - (pos, token) = self.nexttoken() - if isinstance(token, (int, float, bool, str, bytes, PSLiteral)): - # normal token - self.push((pos, token)) - elif token == KEYWORD_ARRAY_BEGIN: - # begin array - self.start_type(pos, "a") - elif token == KEYWORD_ARRAY_END: - # end array - try: - self.push(self.end_type("a")) - except PSTypeError: - if settings.STRICT: - raise - elif token == KEYWORD_DICT_BEGIN: - # begin dictionary - self.start_type(pos, "d") - elif token == KEYWORD_DICT_END: - # end dictionary - try: - (pos, objs) = self.end_type("d") - if len(objs) % 2 != 0: - error_msg = "Invalid dictionary construct: %r" % objs - raise PSSyntaxError(error_msg) - d = { - literal_name(k): v - for (k, v) in choplist(2, objs) - if v is not None - } - self.push((pos, d)) - except PSTypeError: - if settings.STRICT: - raise - elif token == KEYWORD_PROC_BEGIN: - # begin proc - self.start_type(pos, "p") - elif token == KEYWORD_PROC_END: - # end proc - try: - self.push(self.end_type("p")) - except PSTypeError: - if settings.STRICT: - raise - elif isinstance(token, PSKeyword): - # log.debug( - # "do_keyword: pos=%r, token=%r, stack=%r", - # pos, - # token, - # self.curstack, - # ) - if token.name == b"endobj": - end = pos + 7 - self.do_keyword(pos, token) - else: - log.error( - "unknown token: pos=%r, token=%r, stack=%r", - pos, - token, - self.curstack, - ) - self.do_keyword(pos, token) - raise PSException - if self.context: - continue - else: - self.flush() - obj = self.results.pop(0) - # try: - # log.debug("nextobject: %r", obj) - # except Exception: - # log.debug("nextobject: (unprintable object)") - return end, obj diff --git a/pdf2zh/py.typed b/pdf2zh/py.typed deleted file mode 100644 index e69de29..0000000 diff --git a/pdf2zh/runlength.py b/pdf2zh/runlength.py deleted file mode 100644 index 2774e2a..0000000 --- a/pdf2zh/runlength.py +++ /dev/null @@ -1,39 +0,0 @@ -# -# RunLength decoder (Adobe version) implementation based on PDF Reference -# version 1.4 section 3.3.4. -# -# * public domain * -# - - -def rldecode(data: bytes) -> bytes: - """RunLength decoder (Adobe version) implementation based on PDF Reference - version 1.4 section 3.3.4: - The RunLengthDecode filter decodes data that has been encoded in a - simple byte-oriented format based on run length. The encoded data - is a sequence of runs, where each run consists of a length byte - followed by 1 to 128 bytes of data. If the length byte is in the - range 0 to 127, the following length + 1 (1 to 128) bytes are - copied literally during decompression. If length is in the range - 129 to 255, the following single byte is to be copied 257 - length - (2 to 128) times during decompression. A length value of 128 - denotes EOD. - """ - decoded = b"" - i = 0 - while i < len(data): - length = data[i] - if length == 128: - break - - if length >= 0 and length < 128: - for j in range(i + 1, (i + 1) + (length + 1)): - decoded += bytes((data[j],)) - i = (i + 1) + (length + 1) - - if length > 128: - run = bytes((data[i + 1],)) * (257 - length) - decoded += run - i = (i + 1) + 1 - - return decoded diff --git a/pdf2zh/settings.py b/pdf2zh/settings.py deleted file mode 100644 index 810077a..0000000 --- a/pdf2zh/settings.py +++ /dev/null @@ -1 +0,0 @@ -STRICT = False diff --git a/pdf2zh/translator.py b/pdf2zh/translator.py index 047b95d..ae272b6 100644 --- a/pdf2zh/translator.py +++ b/pdf2zh/translator.py @@ -7,6 +7,7 @@ import time from datetime import UTC, datetime from json import dumps, loads +import unicodedata import deepl import ollama @@ -16,6 +17,10 @@ from azure.core.credentials import AzureKeyCredential +def remove_control_characters(s): + return "".join(ch for ch in s if unicodedata.category(ch)[0] != "C") + + class BaseTranslator: def __init__(self, service, lang_out, lang_in, model): self.service = service @@ -56,7 +61,7 @@ def translate(self, text): raise ValueError("Empty translation result") else: result = html.unescape(re_result[0]) - return result + return remove_control_characters(result) class TencentTranslator(BaseTranslator): diff --git a/pdf2zh/utils.py b/pdf2zh/utils.py deleted file mode 100644 index ad5643b..0000000 --- a/pdf2zh/utils.py +++ /dev/null @@ -1,834 +0,0 @@ -"""Miscellaneous Routines.""" - -import io -import pathlib -import string -import struct -from html import escape -from typing import ( - TYPE_CHECKING, - Any, - BinaryIO, - Callable, - Dict, - Generic, - Iterable, - Iterator, - List, - Optional, - Set, - TextIO, - Tuple, - TypeVar, - Union, - cast, -) - -from pdf2zh.pdfexceptions import PDFTypeError, PDFValueError - -if TYPE_CHECKING: - from pdf2zh.layout import LTComponent - -import charset_normalizer # For str encoding detection - -# from sys import maxint as INF doesn't work anymore under Python3, but PDF -# still uses 32 bits ints -INF = (1 << 31) - 1 - - -FileOrName = Union[pathlib.PurePath, str, io.IOBase] -AnyIO = Union[TextIO, BinaryIO] - - -class open_filename: - """Context manager that allows opening a filename - (str or pathlib.PurePath type is supported) and closes it on exit, - (just like `open`), but does nothing for file-like objects. - """ - - def __init__(self, filename: FileOrName, *args: Any, **kwargs: Any) -> None: - if isinstance(filename, pathlib.PurePath): - filename = str(filename) - if isinstance(filename, str): - self.file_handler: AnyIO = open(filename, *args, **kwargs) - self.closing = True - elif isinstance(filename, io.IOBase): - self.file_handler = cast(AnyIO, filename) - self.closing = False - else: - raise PDFTypeError("Unsupported input type: %s" % type(filename)) - - def __enter__(self) -> AnyIO: - return self.file_handler - - def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None: - if self.closing: - self.file_handler.close() - - -def make_compat_bytes(in_str: str) -> bytes: - """Converts to bytes, encoding to unicode.""" - assert isinstance(in_str, str), str(type(in_str)) - return in_str.encode() - - -def make_compat_str(o: object) -> str: - """Converts everything to string, if bytes guessing the encoding.""" - if isinstance(o, bytes): - enc = charset_normalizer.detect(o) - try: - return o.decode(enc["encoding"]) - except UnicodeDecodeError: - return str(o) - else: - return str(o) - - -def shorten_str(s: str, size: int) -> str: - if size < 7: - return s[:size] - if len(s) > size: - length = (size - 5) // 2 - return f"{s[:length]} ... {s[-length:]}" - else: - return s - - -def compatible_encode_method( - bytesorstring: Union[bytes, str], - encoding: str = "utf-8", - erraction: str = "ignore", -) -> str: - """When Py2 str.encode is called, it often means bytes.encode in Py3. - - This does either. - """ - if isinstance(bytesorstring, str): - return bytesorstring - assert isinstance(bytesorstring, bytes), str(type(bytesorstring)) - return bytesorstring.decode(encoding, erraction) - - -def paeth_predictor(left: int, above: int, upper_left: int) -> int: - # From http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html - # Initial estimate - p = left + above - upper_left - # Distances to a,b,c - pa = abs(p - left) - pb = abs(p - above) - pc = abs(p - upper_left) - - # Return nearest of a,b,c breaking ties in order a,b,c - if pa <= pb and pa <= pc: - return left - elif pb <= pc: - return above - else: - return upper_left - - -def apply_png_predictor( - pred: int, - colors: int, - columns: int, - bitspercomponent: int, - data: bytes, -) -> bytes: - """Reverse the effect of the PNG predictor - - Documentation: http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html - """ - if bitspercomponent not in [8, 1]: - msg = "Unsupported `bitspercomponent': %d" % bitspercomponent - raise PDFValueError(msg) - - nbytes = colors * columns * bitspercomponent // 8 - bpp = colors * bitspercomponent // 8 # number of bytes per complete pixel - buf = [] - line_above = list(b"\x00" * columns) - for scanline_i in range(0, len(data), nbytes + 1): - filter_type = data[scanline_i] - line_encoded = data[scanline_i + 1 : scanline_i + 1 + nbytes] - raw = [] - - if filter_type == 0: - # Filter type 0: None - raw = list(line_encoded) - - elif filter_type == 1: - # Filter type 1: Sub - # To reverse the effect of the Sub() filter after decompression, - # output the following value: - # Raw(x) = Sub(x) + Raw(x - bpp) - # (computed mod 256), where Raw() refers to the bytes already - # decoded. - for j, sub_x in enumerate(line_encoded): - if j - bpp < 0: - raw_x_bpp = 0 - else: - raw_x_bpp = int(raw[j - bpp]) - raw_x = (sub_x + raw_x_bpp) & 255 - raw.append(raw_x) - - elif filter_type == 2: - # Filter type 2: Up - # To reverse the effect of the Up() filter after decompression, - # output the following value: - # Raw(x) = Up(x) + Prior(x) - # (computed mod 256), where Prior() refers to the decoded bytes of - # the prior scanline. - for up_x, prior_x in zip(line_encoded, line_above): - raw_x = (up_x + prior_x) & 255 - raw.append(raw_x) - - elif filter_type == 3: - # Filter type 3: Average - # To reverse the effect of the Average() filter after - # decompression, output the following value: - # Raw(x) = Average(x) + floor((Raw(x-bpp)+Prior(x))/2) - # where the result is computed mod 256, but the prediction is - # calculated in the same way as for encoding. Raw() refers to the - # bytes already decoded, and Prior() refers to the decoded bytes of - # the prior scanline. - for j, average_x in enumerate(line_encoded): - if j - bpp < 0: - raw_x_bpp = 0 - else: - raw_x_bpp = int(raw[j - bpp]) - prior_x = int(line_above[j]) - raw_x = (average_x + (raw_x_bpp + prior_x) // 2) & 255 - raw.append(raw_x) - - elif filter_type == 4: - # Filter type 4: Paeth - # To reverse the effect of the Paeth() filter after decompression, - # output the following value: - # Raw(x) = Paeth(x) - # + PaethPredictor(Raw(x-bpp), Prior(x), Prior(x-bpp)) - # (computed mod 256), where Raw() and Prior() refer to bytes - # already decoded. Exactly the same PaethPredictor() function is - # used by both encoder and decoder. - for j, paeth_x in enumerate(line_encoded): - if j - bpp < 0: - raw_x_bpp = 0 - prior_x_bpp = 0 - else: - raw_x_bpp = int(raw[j - bpp]) - prior_x_bpp = int(line_above[j - bpp]) - prior_x = int(line_above[j]) - paeth = paeth_predictor(raw_x_bpp, prior_x, prior_x_bpp) - raw_x = (paeth_x + paeth) & 255 - raw.append(raw_x) - - else: - raise PDFValueError("Unsupported predictor value: %d" % filter_type) - - buf.extend(raw) - line_above = raw - return bytes(buf) - - -Point = Tuple[float, float] -Rect = Tuple[float, float, float, float] -Matrix = Tuple[float, float, float, float, float, float] -PathSegment = Union[ - Tuple[str], # Literal['h'] - Tuple[str, float, float], # Literal['m', 'l'] - Tuple[str, float, float, float, float], # Literal['v', 'y'] - Tuple[str, float, float, float, float, float, float], -] # Literal['c'] - -# Matrix operations -MATRIX_IDENTITY: Matrix = (1, 0, 0, 1, 0, 0) - - -def parse_rect(o: Any) -> Rect: - try: - (x0, y0, x1, y1) = o - return float(x0), float(y0), float(x1), float(y1) - except ValueError: - raise PDFValueError("Could not parse rectangle") - - -def mult_matrix(m1: Matrix, m0: Matrix) -> Matrix: - (a1, b1, c1, d1, e1, f1) = m1 - (a0, b0, c0, d0, e0, f0) = m0 - """Returns the multiplication of two matrices.""" - return ( - a0 * a1 + c0 * b1, - b0 * a1 + d0 * b1, - a0 * c1 + c0 * d1, - b0 * c1 + d0 * d1, - a0 * e1 + c0 * f1 + e0, - b0 * e1 + d0 * f1 + f0, - ) - - -def translate_matrix(m: Matrix, v: Point) -> Matrix: - """Translates a matrix by (x, y).""" - (a, b, c, d, e, f) = m - (x, y) = v - return a, b, c, d, x * a + y * c + e, x * b + y * d + f - - -def apply_matrix_pt(m: Matrix, v: Point) -> Point: - (a, b, c, d, e, f) = m - (x, y) = v - """Applies a matrix to a point.""" - return a * x + c * y + e, b * x + d * y + f - - -def apply_matrix_norm(m: Matrix, v: Point) -> Point: - """Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))""" - (a, b, c, d, e, f) = m - (p, q) = v - return a * p + c * q, b * p + d * q - - -def matrix_scale(m: Matrix) -> float: - (a, b, c, d, e, f) = m - return (a**2 + c**2) ** 0.5 - - -# Utility functions - - -def isnumber(x: object) -> bool: - return isinstance(x, (int, float)) - - -_T = TypeVar("_T") - - -def uniq(objs: Iterable[_T]) -> Iterator[_T]: - """Eliminates duplicated elements.""" - done = set() - for obj in objs: - if obj in done: - continue - done.add(obj) - yield obj - - -def fsplit(pred: Callable[[_T], bool], objs: Iterable[_T]) -> Tuple[List[_T], List[_T]]: - """Split a list into two classes according to the predicate.""" - t = [] - f = [] - for obj in objs: - if pred(obj): - t.append(obj) - else: - f.append(obj) - return t, f - - -def drange(v0: float, v1: float, d: int) -> range: - """Returns a discrete range.""" - return range(int(v0) // d, int(v1 + d) // d) - - -def get_bound(pts: Iterable[Point]) -> Rect: - """Compute a minimal rectangle that covers all the points.""" - limit: Rect = (INF, INF, -INF, -INF) - (x0, y0, x1, y1) = limit - for x, y in pts: - x0 = min(x0, x) - y0 = min(y0, y) - x1 = max(x1, x) - y1 = max(y1, y) - return x0, y0, x1, y1 - - -def pick( - seq: Iterable[_T], - func: Callable[[_T], float], - maxobj: Optional[_T] = None, -) -> Optional[_T]: - """Picks the object obj where func(obj) has the highest value.""" - maxscore = None - for obj in seq: - score = func(obj) - if maxscore is None or maxscore < score: - (maxscore, maxobj) = (score, obj) - return maxobj - - -def choplist(n: int, seq: Iterable[_T]) -> Iterator[Tuple[_T, ...]]: - """Groups every n elements of the list.""" - r = [] - for x in seq: - r.append(x) - if len(r) == n: - yield tuple(r) - r = [] - - -def nunpack(s: bytes, default: int = 0) -> int: - """Unpacks 1 to 4 or 8 byte integers (big endian).""" - length = len(s) - if not length: - return default - elif length == 1: - return ord(s) - elif length == 2: - return cast(int, struct.unpack(">H", s)[0]) - elif length == 3: - return cast(int, struct.unpack(">L", b"\x00" + s)[0]) - elif length == 4: - return cast(int, struct.unpack(">L", s)[0]) - elif length == 8: - return cast(int, struct.unpack(">Q", s)[0]) - else: - raise PDFTypeError("invalid length: %d" % length) - - -PDFDocEncoding = "".join( - chr(x) - for x in ( - 0x0000, - 0x0001, - 0x0002, - 0x0003, - 0x0004, - 0x0005, - 0x0006, - 0x0007, - 0x0008, - 0x0009, - 0x000A, - 0x000B, - 0x000C, - 0x000D, - 0x000E, - 0x000F, - 0x0010, - 0x0011, - 0x0012, - 0x0013, - 0x0014, - 0x0015, - 0x0017, - 0x0017, - 0x02D8, - 0x02C7, - 0x02C6, - 0x02D9, - 0x02DD, - 0x02DB, - 0x02DA, - 0x02DC, - 0x0020, - 0x0021, - 0x0022, - 0x0023, - 0x0024, - 0x0025, - 0x0026, - 0x0027, - 0x0028, - 0x0029, - 0x002A, - 0x002B, - 0x002C, - 0x002D, - 0x002E, - 0x002F, - 0x0030, - 0x0031, - 0x0032, - 0x0033, - 0x0034, - 0x0035, - 0x0036, - 0x0037, - 0x0038, - 0x0039, - 0x003A, - 0x003B, - 0x003C, - 0x003D, - 0x003E, - 0x003F, - 0x0040, - 0x0041, - 0x0042, - 0x0043, - 0x0044, - 0x0045, - 0x0046, - 0x0047, - 0x0048, - 0x0049, - 0x004A, - 0x004B, - 0x004C, - 0x004D, - 0x004E, - 0x004F, - 0x0050, - 0x0051, - 0x0052, - 0x0053, - 0x0054, - 0x0055, - 0x0056, - 0x0057, - 0x0058, - 0x0059, - 0x005A, - 0x005B, - 0x005C, - 0x005D, - 0x005E, - 0x005F, - 0x0060, - 0x0061, - 0x0062, - 0x0063, - 0x0064, - 0x0065, - 0x0066, - 0x0067, - 0x0068, - 0x0069, - 0x006A, - 0x006B, - 0x006C, - 0x006D, - 0x006E, - 0x006F, - 0x0070, - 0x0071, - 0x0072, - 0x0073, - 0x0074, - 0x0075, - 0x0076, - 0x0077, - 0x0078, - 0x0079, - 0x007A, - 0x007B, - 0x007C, - 0x007D, - 0x007E, - 0x0000, - 0x2022, - 0x2020, - 0x2021, - 0x2026, - 0x2014, - 0x2013, - 0x0192, - 0x2044, - 0x2039, - 0x203A, - 0x2212, - 0x2030, - 0x201E, - 0x201C, - 0x201D, - 0x2018, - 0x2019, - 0x201A, - 0x2122, - 0xFB01, - 0xFB02, - 0x0141, - 0x0152, - 0x0160, - 0x0178, - 0x017D, - 0x0131, - 0x0142, - 0x0153, - 0x0161, - 0x017E, - 0x0000, - 0x20AC, - 0x00A1, - 0x00A2, - 0x00A3, - 0x00A4, - 0x00A5, - 0x00A6, - 0x00A7, - 0x00A8, - 0x00A9, - 0x00AA, - 0x00AB, - 0x00AC, - 0x0000, - 0x00AE, - 0x00AF, - 0x00B0, - 0x00B1, - 0x00B2, - 0x00B3, - 0x00B4, - 0x00B5, - 0x00B6, - 0x00B7, - 0x00B8, - 0x00B9, - 0x00BA, - 0x00BB, - 0x00BC, - 0x00BD, - 0x00BE, - 0x00BF, - 0x00C0, - 0x00C1, - 0x00C2, - 0x00C3, - 0x00C4, - 0x00C5, - 0x00C6, - 0x00C7, - 0x00C8, - 0x00C9, - 0x00CA, - 0x00CB, - 0x00CC, - 0x00CD, - 0x00CE, - 0x00CF, - 0x00D0, - 0x00D1, - 0x00D2, - 0x00D3, - 0x00D4, - 0x00D5, - 0x00D6, - 0x00D7, - 0x00D8, - 0x00D9, - 0x00DA, - 0x00DB, - 0x00DC, - 0x00DD, - 0x00DE, - 0x00DF, - 0x00E0, - 0x00E1, - 0x00E2, - 0x00E3, - 0x00E4, - 0x00E5, - 0x00E6, - 0x00E7, - 0x00E8, - 0x00E9, - 0x00EA, - 0x00EB, - 0x00EC, - 0x00ED, - 0x00EE, - 0x00EF, - 0x00F0, - 0x00F1, - 0x00F2, - 0x00F3, - 0x00F4, - 0x00F5, - 0x00F6, - 0x00F7, - 0x00F8, - 0x00F9, - 0x00FA, - 0x00FB, - 0x00FC, - 0x00FD, - 0x00FE, - 0x00FF, - ) -) - - -def decode_text(s: bytes) -> str: - """Decodes a PDFDocEncoding string to Unicode.""" - if s.startswith(b"\xfe\xff"): - return str(s[2:], "utf-16be", "ignore") - else: - return "".join(PDFDocEncoding[c] for c in s) - - -def enc(x: str) -> str: - """Encodes a string for SGML/XML/HTML""" - if isinstance(x, bytes): - return "" - return escape(x) - - -def bbox2str(bbox: Rect) -> str: - (x0, y0, x1, y1) = bbox - return f"{x0:.3f},{y0:.3f},{x1:.3f},{y1:.3f}" - - -def matrix2str(m: Matrix) -> str: - (a, b, c, d, e, f) = m - return f"[{a:.2f},{b:.2f},{c:.2f},{d:.2f}, ({e:.2f},{f:.2f})]" - - -def vecBetweenBoxes(obj1: "LTComponent", obj2: "LTComponent") -> Point: - """A distance function between two TextBoxes. - - Consider the bounding rectangle for obj1 and obj2. - Return vector between 2 boxes boundaries if they don't overlap, otherwise - returns vector betweeen boxes centers - - +------+..........+ (x1, y1) - | obj1 | : - +------+www+------+ - : | obj2 | - (x0, y0) +..........+------+ - """ - (x0, y0) = (min(obj1.x0, obj2.x0), min(obj1.y0, obj2.y0)) - (x1, y1) = (max(obj1.x1, obj2.x1), max(obj1.y1, obj2.y1)) - (ow, oh) = (x1 - x0, y1 - y0) - (iw, ih) = (ow - obj1.width - obj2.width, oh - obj1.height - obj2.height) - if iw < 0 and ih < 0: - # if one is inside another we compute euclidean distance - (xc1, yc1) = ((obj1.x0 + obj1.x1) / 2, (obj1.y0 + obj1.y1) / 2) - (xc2, yc2) = ((obj2.x0 + obj2.x1) / 2, (obj2.y0 + obj2.y1) / 2) - return xc1 - xc2, yc1 - yc2 - else: - return max(0, iw), max(0, ih) - - -LTComponentT = TypeVar("LTComponentT", bound="LTComponent") - - -class Plane(Generic[LTComponentT]): - """A set-like data structure for objects placed on a plane. - - Can efficiently find objects in a certain rectangular area. - It maintains two parallel lists of objects, each of - which is sorted by its x or y coordinate. - """ - - def __init__(self, bbox: Rect, gridsize: int = 50) -> None: - self._seq: List[LTComponentT] = [] # preserve the object order. - self._objs: Set[LTComponentT] = set() - self._grid: Dict[Point, List[LTComponentT]] = {} - self.gridsize = gridsize - (self.x0, self.y0, self.x1, self.y1) = bbox - - def __repr__(self) -> str: - return "" % list(self) - - def __iter__(self) -> Iterator[LTComponentT]: - return (obj for obj in self._seq if obj in self._objs) - - def __len__(self) -> int: - return len(self._objs) - - def __contains__(self, obj: object) -> bool: - return obj in self._objs - - def _getrange(self, bbox: Rect) -> Iterator[Point]: - (x0, y0, x1, y1) = bbox - if x1 <= self.x0 or self.x1 <= x0 or y1 <= self.y0 or self.y1 <= y0: - return - x0 = max(self.x0, x0) - y0 = max(self.y0, y0) - x1 = min(self.x1, x1) - y1 = min(self.y1, y1) - for grid_y in drange(y0, y1, self.gridsize): - for grid_x in drange(x0, x1, self.gridsize): - yield (grid_x, grid_y) - - def extend(self, objs: Iterable[LTComponentT]) -> None: - for obj in objs: - self.add(obj) - - def add(self, obj: LTComponentT) -> None: - """Place an object.""" - for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)): - if k not in self._grid: - r: List[LTComponentT] = [] - self._grid[k] = r - else: - r = self._grid[k] - r.append(obj) - self._seq.append(obj) - self._objs.add(obj) - - def remove(self, obj: LTComponentT) -> None: - """Displace an object.""" - for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)): - try: - self._grid[k].remove(obj) - except (KeyError, ValueError): - pass - self._objs.remove(obj) - - def find(self, bbox: Rect) -> Iterator[LTComponentT]: - """Finds objects that are in a certain area.""" - (x0, y0, x1, y1) = bbox - done = set() - for k in self._getrange(bbox): - if k not in self._grid: - continue - for obj in self._grid[k]: - if obj in done: - continue - done.add(obj) - if obj.x1 <= x0 or x1 <= obj.x0 or obj.y1 <= y0 or y1 <= obj.y0: - continue - yield obj - - -ROMAN_ONES = ["i", "x", "c", "m"] -ROMAN_FIVES = ["v", "l", "d"] - - -def format_int_roman(value: int) -> str: - """Format a number as lowercase Roman numerals.""" - assert 0 < value < 4000 - result: List[str] = [] - index = 0 - - while value != 0: - value, remainder = divmod(value, 10) - if remainder == 9: - result.insert(0, ROMAN_ONES[index]) - result.insert(1, ROMAN_ONES[index + 1]) - elif remainder == 4: - result.insert(0, ROMAN_ONES[index]) - result.insert(1, ROMAN_FIVES[index]) - else: - over_five = remainder >= 5 - if over_five: - result.insert(0, ROMAN_FIVES[index]) - remainder -= 5 - result.insert(1 if over_five else 0, ROMAN_ONES[index] * remainder) - index += 1 - - return "".join(result) - - -def format_int_alpha(value: int) -> str: - """Format a number as lowercase letters a-z, aa-zz, etc.""" - assert value > 0 - result: List[str] = [] - - while value != 0: - value, remainder = divmod(value - 1, len(string.ascii_lowercase)) - result.append(string.ascii_lowercase[remainder]) - - result.reverse() - return "".join(result) - - -def get_device(): - """Get the device to use for computation.""" - try: - import torch - - if torch.cuda.is_available(): - return "cuda:0" - except ImportError: - pass - - return "cpu" diff --git a/pyproject.toml b/pyproject.toml index e95c3f2..b61d955 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ dependencies = [ "onnx", "onnxruntime", "opencv-python-headless", + "pdfminer.six>=20240706", ] [project.optional-dependencies] diff --git a/setup.cfg b/setup.cfg index 053bd42..d4304f1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,4 +1,4 @@ [flake8] max-line-length = 120 -ignore = E203,W503,E261 +ignore = E203,E261,E501,W503,E741 exclude = .git,build,dist,docs \ No newline at end of file