" % self.attrs.get("CMapName")
-
- def get_unichr(self, cid: int) -> str:
- # log.debug("get_unichr: %r, %r", self, cid)
- return self.cid2unichr[cid]
-
- def dump(self, out: TextIO = sys.stdout) -> None:
- for k, v in sorted(self.cid2unichr.items()):
- out.write("cid %d = unicode %r\n" % (k, v))
-
-
-class IdentityUnicodeMap(UnicodeMap):
- def get_unichr(self, cid: int) -> str:
- """Interpret character id as unicode codepoint"""
- # log.debug("get_unichr: %r, %r", self, cid)
- return chr(cid)
-
-
-class FileCMap(CMap):
- def add_code2cid(self, code: str, cid: int) -> None:
- assert isinstance(code, str) and isinstance(cid, int), str(
- (type(code), type(cid)),
- )
- d = self.code2cid
- for c in code[:-1]:
- ci = ord(c)
- if ci in d:
- d = cast(Dict[int, object], d[ci])
- else:
- t: Dict[int, object] = {}
- d[ci] = t
- d = t
- ci = ord(code[-1])
- d[ci] = cid
-
-
-class FileUnicodeMap(UnicodeMap):
- def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
- assert isinstance(cid, int), str(type(cid))
- if isinstance(code, PSLiteral):
- # Interpret as an Adobe glyph name.
- assert isinstance(code.name, str)
- unichr = name2unicode(code.name)
- elif isinstance(code, bytes):
- # Interpret as UTF-16BE.
- unichr = code.decode("UTF-16BE", "ignore")
- elif isinstance(code, int):
- unichr = chr(code)
- else:
- raise PDFTypeError(code)
-
- # A0 = non-breaking space, some weird fonts can have a collision on a cid here.
- if unichr == "\u00a0" and self.cid2unichr.get(cid) == " ":
- return
- self.cid2unichr[cid] = unichr
-
-
-class PyCMap(CMap):
- def __init__(self, name: str, module: Any) -> None:
- super().__init__(CMapName=name)
- self.code2cid = module.CODE2CID
- if module.IS_VERTICAL:
- self.attrs["WMode"] = 1
-
-
-class PyUnicodeMap(UnicodeMap):
- def __init__(self, name: str, module: Any, vertical: bool) -> None:
- super().__init__(CMapName=name)
- if vertical:
- self.cid2unichr = module.CID2UNICHR_V
- self.attrs["WMode"] = 1
- else:
- self.cid2unichr = module.CID2UNICHR_H
-
-
-class CMapDB:
- _cmap_cache: Dict[str, PyCMap] = {}
- _umap_cache: Dict[str, List[PyUnicodeMap]] = {}
-
- class CMapNotFound(CMapError):
- pass
-
- @classmethod
- def _load_data(cls, name: str) -> Any:
- name = name.replace("\0", "")
- filename = "%s.pickle.gz" % name
- # log.debug("loading: %r", name)
- cmap_paths = (
- os.environ.get("CMAP_PATH", "/usr/share/pdf2zh/"),
- os.path.join(os.path.dirname(__file__), "cmap"),
- )
- for directory in cmap_paths:
- path = os.path.join(directory, filename)
- if os.path.exists(path):
- gzfile = gzip.open(path)
- try:
- return type(str(name), (), pickle.loads(gzfile.read()))
- finally:
- gzfile.close()
- raise CMapDB.CMapNotFound(name)
-
- @classmethod
- def get_cmap(cls, name: str) -> CMapBase:
- if name == "Identity-H":
- return IdentityCMap(WMode=0)
- elif name == "Identity-V":
- return IdentityCMap(WMode=1)
- elif name == "OneByteIdentityH":
- return IdentityCMapByte(WMode=0)
- elif name == "OneByteIdentityV":
- return IdentityCMapByte(WMode=1)
- try:
- return cls._cmap_cache[name]
- except KeyError:
- pass
- data = cls._load_data(name)
- cls._cmap_cache[name] = cmap = PyCMap(name, data)
- return cmap
-
- @classmethod
- def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap:
- try:
- return cls._umap_cache[name][vertical]
- except KeyError:
- pass
- data = cls._load_data("to-unicode-%s" % name)
- cls._umap_cache[name] = [PyUnicodeMap(name, data, v) for v in (False, True)]
- return cls._umap_cache[name][vertical]
-
-
-class CMapParser(PSStackParser[PSKeyword]):
- def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None:
- PSStackParser.__init__(self, fp)
- self.cmap = cmap
- # some ToUnicode maps don't have "begincmap" keyword.
- self._in_cmap = True
- self._warnings: Set[str] = set()
-
- def run(self) -> None:
- try:
- self.nextobject()
- except PSEOF:
- pass
-
- KEYWORD_BEGINCMAP = KWD(b"begincmap")
- KEYWORD_ENDCMAP = KWD(b"endcmap")
- KEYWORD_USECMAP = KWD(b"usecmap")
- KEYWORD_DEF = KWD(b"def")
- KEYWORD_BEGINCODESPACERANGE = KWD(b"begincodespacerange")
- KEYWORD_ENDCODESPACERANGE = KWD(b"endcodespacerange")
- KEYWORD_BEGINCIDRANGE = KWD(b"begincidrange")
- KEYWORD_ENDCIDRANGE = KWD(b"endcidrange")
- KEYWORD_BEGINCIDCHAR = KWD(b"begincidchar")
- KEYWORD_ENDCIDCHAR = KWD(b"endcidchar")
- KEYWORD_BEGINBFRANGE = KWD(b"beginbfrange")
- KEYWORD_ENDBFRANGE = KWD(b"endbfrange")
- KEYWORD_BEGINBFCHAR = KWD(b"beginbfchar")
- KEYWORD_ENDBFCHAR = KWD(b"endbfchar")
- KEYWORD_BEGINNOTDEFRANGE = KWD(b"beginnotdefrange")
- KEYWORD_ENDNOTDEFRANGE = KWD(b"endnotdefrange")
-
- def do_keyword(self, pos: int, token: PSKeyword) -> None:
- """ToUnicode CMaps
-
- See Section 5.9.2 - ToUnicode CMaps of the PDF Reference.
- """
- if token is self.KEYWORD_BEGINCMAP:
- self._in_cmap = True
- self.popall()
- return
-
- elif token is self.KEYWORD_ENDCMAP:
- self._in_cmap = False
- return
-
- if not self._in_cmap:
- return
-
- if token is self.KEYWORD_DEF:
- try:
- ((_, k), (_, v)) = self.pop(2)
- self.cmap.set_attr(literal_name(k), v)
- except PSSyntaxError:
- pass
- return
-
- if token is self.KEYWORD_USECMAP:
- try:
- ((_, cmapname),) = self.pop(1)
- self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
- except PSSyntaxError:
- pass
- except CMapDB.CMapNotFound:
- pass
- return
-
- if token is self.KEYWORD_BEGINCODESPACERANGE:
- self.popall()
- return
- if token is self.KEYWORD_ENDCODESPACERANGE:
- self.popall()
- return
-
- if token is self.KEYWORD_BEGINCIDRANGE:
- self.popall()
- return
-
- if token is self.KEYWORD_ENDCIDRANGE:
- objs = [obj for (__, obj) in self.popall()]
- for start_byte, end_byte, cid in choplist(3, objs):
- if not isinstance(start_byte, bytes):
- self._warn_once("The start object of begincidrange is not a byte.")
- continue
- if not isinstance(end_byte, bytes):
- self._warn_once("The end object of begincidrange is not a byte.")
- continue
- if not isinstance(cid, int):
- self._warn_once("The cid object of begincidrange is not a byte.")
- continue
- if len(start_byte) != len(end_byte):
- self._warn_once(
- "The start and end byte of begincidrange have "
- "different lengths.",
- )
- continue
- start_prefix = start_byte[:-4]
- end_prefix = end_byte[:-4]
- if start_prefix != end_prefix:
- self._warn_once(
- "The prefix of the start and end byte of "
- "begincidrange are not the same.",
- )
- continue
- svar = start_byte[-4:]
- evar = end_byte[-4:]
- start = nunpack(svar)
- end = nunpack(evar)
- vlen = len(svar)
- for i in range(end - start + 1):
- x = start_prefix + struct.pack(">L", start + i)[-vlen:]
- self.cmap.add_cid2unichr(cid + i, x)
- return
-
- if token is self.KEYWORD_BEGINCIDCHAR:
- self.popall()
- return
-
- if token is self.KEYWORD_ENDCIDCHAR:
- objs = [obj for (__, obj) in self.popall()]
- for cid, code in choplist(2, objs):
- if isinstance(code, bytes) and isinstance(cid, int):
- self.cmap.add_cid2unichr(cid, code)
- return
-
- if token is self.KEYWORD_BEGINBFRANGE:
- self.popall()
- return
-
- if token is self.KEYWORD_ENDBFRANGE:
- objs = [obj for (__, obj) in self.popall()]
- for start_byte, end_byte, code in choplist(3, objs):
- if not isinstance(start_byte, bytes):
- self._warn_once("The start object is not a byte.")
- continue
- if not isinstance(end_byte, bytes):
- self._warn_once("The end object is not a byte.")
- continue
- if len(start_byte) != len(end_byte):
- self._warn_once("The start and end byte have different lengths.")
- continue
- start = nunpack(start_byte)
- end = nunpack(end_byte)
- if isinstance(code, list):
- if len(code) != end - start + 1:
- self._warn_once(
- "The difference between the start and end "
- "offsets does not match the code length.",
- )
- for cid, unicode_value in zip(range(start, end + 1), code):
- self.cmap.add_cid2unichr(cid, unicode_value)
- else:
- assert isinstance(code, bytes)
- var = code[-4:]
- base = nunpack(var)
- prefix = code[:-4]
- vlen = len(var)
- for i in range(end - start + 1):
- x = prefix + struct.pack(">L", base + i)[-vlen:]
- self.cmap.add_cid2unichr(start + i, x)
- return
-
- if token is self.KEYWORD_BEGINBFCHAR:
- self.popall()
- return
-
- if token is self.KEYWORD_ENDBFCHAR:
- objs = [obj for (__, obj) in self.popall()]
- for cid, code in choplist(2, objs):
- if isinstance(cid, bytes) and isinstance(code, bytes):
- self.cmap.add_cid2unichr(nunpack(cid), code)
- return
-
- if token is self.KEYWORD_BEGINNOTDEFRANGE:
- self.popall()
- return
-
- if token is self.KEYWORD_ENDNOTDEFRANGE:
- self.popall()
- return
-
- self.push((pos, token))
-
- def _warn_once(self, msg: str) -> None:
- """Warn once for each unique message"""
- if msg not in self._warnings:
- self._warnings.add(msg)
- base_msg = (
- "Ignoring (part of) ToUnicode map because the PDF data "
- "does not conform to the format. This could result in "
- "(cid) values in the output. "
- )
- log.warning(base_msg + msg)
diff --git a/pdf2zh/converter.py b/pdf2zh/converter.py
index 4f93269..ec54792 100644
--- a/pdf2zh/converter.py
+++ b/pdf2zh/converter.py
@@ -1,61 +1,16 @@
-from pdf2zh.utils import (
- AnyIO,
- Matrix,
- PathSegment,
- Point,
- Rect,
- apply_matrix_pt,
- bbox2str,
- enc,
- make_compat_str,
- mult_matrix,
- matrix_scale,
-)
-from pdf2zh.pdftypes import PDFStream
-from pdf2zh.pdfpage import PDFPage
-from pdf2zh.pdfinterp import PDFGraphicState, PDFResourceManager
-from pdf2zh.pdffont import PDFFont, PDFUnicodeNotDefined, PDFCIDFont
-from pdf2zh.pdfexceptions import PDFValueError
-from pdf2zh.pdfdevice import PDFTextDevice
-from pdf2zh.pdfcolor import PDFColorSpace
-from pdf2zh.layout import (
- LAParams,
- LTAnno,
+from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager
+from pdfminer.pdffont import PDFCIDFont
+from pdfminer.converter import PDFConverter
+from pdfminer.pdffont import PDFUnicodeNotDefined
+from pdfminer.utils import apply_matrix_pt, mult_matrix
+from pdfminer.layout import (
LTChar,
- LTComponent,
- LTCurve,
LTFigure,
- LTImage,
- LTItem,
- LTLayoutContainer,
LTLine,
LTPage,
- LTRect,
- LTText,
- LTTextBox,
- LTTextBoxVertical,
- LTTextGroup,
- LTTextLine,
- TextGroupElement,
)
-from pdf2zh.image import ImageWriter
-from pdf2zh import utils
-import io
import logging
import re
-from typing import (
- BinaryIO,
- Dict,
- Generic,
- List,
- Optional,
- Sequence,
- TextIO,
- Tuple,
- TypeVar,
- Union,
- cast,
-)
import concurrent.futures
import numpy as np
import unicodedata
@@ -72,47 +27,27 @@
TencentTranslator,
)
-
-def remove_control_characters(s):
- return "".join(ch for ch in s if unicodedata.category(ch)[0] != "C")
-
-
log = logging.getLogger(__name__)
-class PDFLayoutAnalyzer(PDFTextDevice):
- cur_item: LTLayoutContainer
- ctm: Matrix
-
+class PDFConverterEx(PDFConverter):
def __init__(
self,
rsrcmgr: PDFResourceManager,
- pageno: int = 1,
- laparams: Optional[LAParams] = None,
) -> None:
- PDFTextDevice.__init__(self, rsrcmgr)
- self.pageno = pageno
- self.laparams = laparams
- self._stack: List[LTLayoutContainer] = []
+ PDFConverter.__init__(self, rsrcmgr, None, "utf-8", 1, None)
- def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
- # (x0, y0, x1, y1) = page.mediabox
+ def begin_page(self, page, ctm) -> None:
(x0, y0, x1, y1) = page.cropbox
(x0, y0) = apply_matrix_pt(ctm, (x0, y0))
(x1, y1) = apply_matrix_pt(ctm, (x1, y1))
mediabox = (0, 0, abs(x0 - x1), abs(y0 - y1))
self.cur_item = LTPage(page.pageno, mediabox)
- def end_page(self, page: PDFPage):
- assert not self._stack, str(len(self._stack))
- assert isinstance(self.cur_item, LTPage), str(type(self.cur_item))
- # 取消默认排版分析
- # if self.laparams is not None:
- # self.cur_item.analyze(self.laparams)
- self.pageno += 1
+ def end_page(self, page):
return self.receive_layout(self.cur_item)
- def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
+ def begin_figure(self, name, bbox, matrix) -> None:
self._stack.append(self.cur_item)
self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))
self.cur_item.pageid = self._stack[-1].pageid
@@ -124,142 +59,15 @@ def end_figure(self, _: str) -> None:
self.cur_item.add(fig)
return self.receive_layout(fig)
- def render_image(self, name: str, stream: PDFStream) -> None:
- assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
- item = LTImage(
- name,
- stream,
- (self.cur_item.x0, self.cur_item.y0, self.cur_item.x1, self.cur_item.y1),
- )
- self.cur_item.add(item)
-
- def paint_path(
- self,
- gstate: PDFGraphicState,
- stroke: bool,
- fill: bool,
- evenodd: bool,
- path: Sequence[PathSegment],
- ) -> None:
- """Paint paths described in section 4.4 of the PDF reference manual"""
- shape = "".join(x[0] for x in path)
-
- if shape[:1] != "m":
- # Per PDF Reference Section 4.4.1, "path construction operators may
- # be invoked in any sequence, but the first one invoked must be m
- # or re to begin a new subpath." Since pdf2zh.six already
- # converts all `re` (rectangle) operators to their equivelent
- # `mlllh` representation, paths ingested by `.paint_path(...)` that
- # do not begin with the `m` operator are invalid.
- pass
-
- elif shape.count("m") > 1:
- # recurse if there are multiple m's in this shape
- for m in re.finditer(r"m[^m]+", shape):
- subpath = path[m.start(0) : m.end(0)]
- self.paint_path(gstate, stroke, fill, evenodd, subpath)
-
- else:
- # Although the 'h' command does not not literally provide a
- # point-position, its position is (by definition) equal to the
- # subpath's starting point.
- #
- # And, per Section 4.4's Table 4.9, all other path commands place
- # their point-position in their final two arguments. (Any preceding
- # arguments represent control points on Bézier curves.)
- raw_pts = [
- cast(Point, p[-2:] if p[0] != "h" else path[0][-2:]) for p in path
- ]
- pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts]
-
- operators = [str(operation[0]) for operation in path]
- transformed_points = [
- [
- apply_matrix_pt(self.ctm, (float(operand1), float(operand2)))
- for operand1, operand2 in zip(operation[1::2], operation[2::2])
- ]
- for operation in path
- ]
- transformed_path = [
- cast(PathSegment, (o, *p))
- for o, p in zip(operators, transformed_points)
- ]
-
- if shape in {"mlh", "ml"}:
- # single line segment
- #
- # Note: 'ml', in conditional above, is a frequent anomaly
- # that we want to support.
- line = LTLine(
- gstate.linewidth * matrix_scale(self.ctm),
- pts[0],
- pts[1],
- stroke,
- fill,
- evenodd,
- gstate.scolor,
- gstate.ncolor,
- original_path=transformed_path,
- dashing_style=gstate.dash,
- )
- self.cur_item.add(line)
-
- elif shape in {"mlllh", "mllll"}:
- (x0, y0), (x1, y1), (x2, y2), (x3, y3), _ = pts
-
- is_closed_loop = pts[0] == pts[4]
- has_square_coordinates = (
- x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0
- ) or (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)
- if is_closed_loop and has_square_coordinates:
- rect = LTRect(
- gstate.linewidth * matrix_scale(self.ctm),
- (*pts[0], *pts[2]),
- stroke,
- fill,
- evenodd,
- gstate.scolor,
- gstate.ncolor,
- transformed_path,
- gstate.dash,
- )
- self.cur_item.add(rect)
- else:
- curve = LTCurve(
- gstate.linewidth * matrix_scale(self.ctm),
- pts,
- stroke,
- fill,
- evenodd,
- gstate.scolor,
- gstate.ncolor,
- transformed_path,
- gstate.dash,
- )
- self.cur_item.add(curve)
- else:
- curve = LTCurve(
- gstate.linewidth * matrix_scale(self.ctm),
- pts,
- stroke,
- fill,
- evenodd,
- gstate.scolor,
- gstate.ncolor,
- transformed_path,
- gstate.dash,
- )
- self.cur_item.add(curve)
-
def render_char(
self,
- matrix: Matrix,
- font: PDFFont,
+ matrix,
+ font,
fontsize: float,
scaling: float,
rise: float,
cid: int,
- ncs: PDFColorSpace,
+ ncs,
graphicstate: PDFGraphicState,
) -> float:
try:
@@ -283,78 +91,14 @@ def render_char(
)
self.cur_item.add(item)
item.cid = cid # hack 插入原字符编码
+ item.font = font # hack 插入原字符字体
return item.adv
- def handle_undefined_char(self, font: PDFFont, cid: int) -> str:
- # log.debug("undefined: %r, %r", font, cid)
- return "(cid:%d)" % cid
-
- def receive_layout(self, ltpage: LTPage) -> None:
- pass
-
-
-class PDFPageAggregator(PDFLayoutAnalyzer):
- def __init__(
- self,
- rsrcmgr: PDFResourceManager,
- pageno: int = 1,
- laparams: Optional[LAParams] = None,
- ) -> None:
- PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
- self.result: Optional[LTPage] = None
-
- def receive_layout(self, ltpage: LTPage) -> None:
- self.result = ltpage
-
- def get_result(self) -> LTPage:
- assert self.result is not None
- return self.result
-
-
-# Some PDFConverter children support only binary I/O
-IOType = TypeVar("IOType", TextIO, BinaryIO, AnyIO)
-
-
-class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]):
- def __init__(
- self,
- rsrcmgr: PDFResourceManager,
- outfp: IOType,
- codec: str = "utf-8",
- pageno: int = 1,
- laparams: Optional[LAParams] = None,
- ) -> None:
- PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
- self.outfp: IOType = outfp
- self.codec = codec
- self.outfp_binary = self._is_binary_stream(self.outfp)
-
- @staticmethod
- def _is_binary_stream(outfp: AnyIO) -> bool:
- """Test if an stream is binary or not"""
- if "b" in getattr(outfp, "mode", ""):
- return True
- elif hasattr(outfp, "mode"):
- # output stream has a mode, but it does not contain 'b'
- return False
- elif isinstance(outfp, io.BytesIO):
- return True
- elif isinstance(outfp, io.StringIO) or isinstance(outfp, io.TextIOBase):
- return False
-
- return True
-
-class TextConverter(PDFConverter[AnyIO]):
+class TranslateConverter(PDFConverterEx):
def __init__(
self,
- rsrcmgr: PDFResourceManager,
- outfp: AnyIO,
- codec: str = "utf-8",
- pageno: int = 1,
- laparams: Optional[LAParams] = None,
- showpageno: bool = False,
- imagewriter: Optional[ImageWriter] = None,
+ rsrcmgr,
vfont: str = None,
vchar: str = None,
thread: int = 0,
@@ -363,9 +107,7 @@ def __init__(
lang_out: str = "",
service: str = "",
) -> None:
- super().__init__(rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
- self.showpageno = showpageno
- self.imagewriter = imagewriter
+ super().__init__(rsrcmgr)
self.vfont = vfont
self.vchar = vchar
self.thread = thread
@@ -402,13 +144,6 @@ def __init__(
else:
raise ValueError("Unsupported translation service")
- def write_text(self, text: str) -> None:
- text = utils.compatible_encode_method(text, self.codec, "ignore")
- if self.outfp_binary:
- cast(BinaryIO, self.outfp).write(text.encode())
- else:
- cast(TextIO, self.outfp).write(text)
-
# fmt: off
def receive_layout(self, ltpage: LTPage):
xt = None # 上一个字符
@@ -589,7 +324,6 @@ def worker(s): # 多线程翻译
new = cache.load_paragraph(hash_key, hash_key_paragraph) # 查询缓存
if new is None:
new = self.translator.translate(s)
- new = remove_control_characters(new)
cache.write_paragraph(hash_key, hash_key_paragraph, new)
return new
except BaseException as e:
@@ -708,677 +442,3 @@ def raw_string(fcur, cstk): # 编码字符串
ops += f"ET q 1 0 0 1 {l.pts[0][0]:f} {l.pts[0][1]:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
ops = f"BT {ops}ET "
return ops
-
- # Some dummy functions to save memory/CPU when all that is wanted
- # is text. This stops all the image and drawing output from being
- # recorded and taking up RAM.
- def render_image(self, name: str, stream: PDFStream) -> None:
- if self.imagewriter is not None:
- PDFConverter.render_image(self, name, stream)
-
- # def paint_path(
- # self,
- # gstate: PDFGraphicState,
- # stroke: bool,
- # fill: bool,
- # evenodd: bool,
- # path: Sequence[PathSegment],
- # ) -> None:
- # pass
-
-
-class HTMLConverter(PDFConverter[AnyIO]):
- RECT_COLORS = {
- "figure": "yellow",
- "textline": "magenta",
- "textbox": "cyan",
- "textgroup": "red",
- "curve": "black",
- "page": "gray",
- }
-
- TEXT_COLORS = {
- "textbox": "blue",
- "char": "black",
- }
-
- def __init__(
- self,
- rsrcmgr: PDFResourceManager,
- outfp: AnyIO,
- codec: str = "utf-8",
- pageno: int = 1,
- laparams: Optional[LAParams] = None,
- scale: float = 1,
- fontscale: float = 1.0,
- layoutmode: str = "normal",
- showpageno: bool = True,
- pagemargin: int = 50,
- imagewriter: Optional[ImageWriter] = None,
- debug: int = 0,
- rect_colors: Optional[Dict[str, str]] = None,
- text_colors: Optional[Dict[str, str]] = None,
- ) -> None:
- PDFConverter.__init__(
- self,
- rsrcmgr,
- outfp,
- codec=codec,
- pageno=pageno,
- laparams=laparams,
- )
-
- # write() assumes a codec for binary I/O, or no codec for text I/O.
- if self.outfp_binary and not self.codec:
- raise PDFValueError("Codec is required for a binary I/O output")
- if not self.outfp_binary and self.codec:
- raise PDFValueError("Codec must not be specified for a text I/O output")
-
- if text_colors is None:
- text_colors = {"char": "black"}
- if rect_colors is None:
- rect_colors = {"curve": "black", "page": "gray"}
-
- self.scale = scale
- self.fontscale = fontscale
- self.layoutmode = layoutmode
- self.showpageno = showpageno
- self.pagemargin = pagemargin
- self.imagewriter = imagewriter
- self.rect_colors = rect_colors
- self.text_colors = text_colors
- if debug:
- self.rect_colors.update(self.RECT_COLORS)
- self.text_colors.update(self.TEXT_COLORS)
- self._yoffset: float = self.pagemargin
- self._font: Optional[Tuple[str, float]] = None
- self._fontstack: List[Optional[Tuple[str, float]]] = []
- self.write_header()
-
- def write(self, text: str) -> None:
- if self.codec:
- cast(BinaryIO, self.outfp).write(text.encode(self.codec))
- else:
- cast(TextIO, self.outfp).write(text)
-
- def write_header(self) -> None:
- self.write("\n")
- if self.codec:
- s = (
- '\n' % self.codec
- )
- else:
- s = '\n'
- self.write(s)
- self.write("\n")
-
- def write_footer(self) -> None:
- page_links = [f'{i}' for i in range(1, self.pageno)]
- s = 'Page: %s
\n' % ", ".join(
- page_links,
- )
- self.write(s)
- self.write("\n")
-
- def write_text(self, text: str) -> None:
- self.write(enc(text))
-
- def place_rect(
- self,
- color: str,
- borderwidth: int,
- x: float,
- y: float,
- w: float,
- h: float,
- ) -> None:
- color2 = self.rect_colors.get(color)
- if color2 is not None:
- s = (
- '\n'
- % (
- color2,
- borderwidth,
- x * self.scale,
- (self._yoffset - y) * self.scale,
- w * self.scale,
- h * self.scale,
- )
- )
- self.write(s)
-
- def place_border(self, color: str, borderwidth: int, item: LTComponent) -> None:
- self.place_rect(color, borderwidth, item.x0, item.y1, item.width, item.height)
-
- def place_image(
- self,
- item: LTImage,
- borderwidth: int,
- x: float,
- y: float,
- w: float,
- h: float,
- ) -> None:
- if self.imagewriter is not None:
- name = self.imagewriter.export_image(item)
- s = (
- '\n'
- % (
- enc(name),
- borderwidth,
- x * self.scale,
- (self._yoffset - y) * self.scale,
- w * self.scale,
- h * self.scale,
- )
- )
- self.write(s)
-
- def place_text(
- self,
- color: str,
- text: str,
- x: float,
- y: float,
- size: float,
- ) -> None:
- color2 = self.text_colors.get(color)
- if color2 is not None:
- s = (
- ''
- % (
- color2,
- x * self.scale,
- (self._yoffset - y) * self.scale,
- size * self.scale * self.fontscale,
- )
- )
- self.write(s)
- self.write_text(text)
- self.write("\n")
-
- def begin_div(
- self,
- color: str,
- borderwidth: int,
- x: float,
- y: float,
- w: float,
- h: float,
- writing_mode: str = "False",
- ) -> None:
- self._fontstack.append(self._font)
- self._font = None
- s = (
- ''
- % (
- color,
- borderwidth,
- writing_mode,
- x * self.scale,
- (self._yoffset - y) * self.scale,
- w * self.scale,
- h * self.scale,
- )
- )
- self.write(s)
-
- def end_div(self, color: str) -> None:
- if self._font is not None:
- self.write("")
- self._font = self._fontstack.pop()
- self.write("
")
-
- def put_text(self, text: str, fontname: str, fontsize: float) -> None:
- font = (fontname, fontsize)
- if font != self._font:
- if self._font is not None:
- self.write("")
- # Remove subset tag from fontname, see PDF Reference 5.5.3
- fontname_without_subset_tag = fontname.split("+")[-1]
- self.write(
- ''
- % (fontname_without_subset_tag, fontsize * self.scale * self.fontscale),
- )
- self._font = font
- self.write_text(text)
-
- def put_newline(self) -> None:
- self.write("
")
-
- def receive_layout(self, ltpage: LTPage) -> None:
- def show_group(item: Union[LTTextGroup, TextGroupElement]) -> None:
- if isinstance(item, LTTextGroup):
- self.place_border("textgroup", 1, item)
- for child in item:
- show_group(child)
-
- def render(item: LTItem) -> None:
- child: LTItem
- if isinstance(item, LTPage):
- self._yoffset += item.y1
- self.place_border("page", 1, item)
- if self.showpageno:
- self.write(
- '\n',
- )
- for child in item:
- render(child)
- if item.groups is not None:
- for group in item.groups:
- show_group(group)
- elif isinstance(item, LTCurve):
- self.place_border("curve", 1, item)
- elif isinstance(item, LTFigure):
- self.begin_div("figure", 1, item.x0, item.y1, item.width, item.height)
- for child in item:
- render(child)
- self.end_div("figure")
- elif isinstance(item, LTImage):
- self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
- elif self.layoutmode == "exact":
- if isinstance(item, LTTextLine):
- self.place_border("textline", 1, item)
- for child in item:
- render(child)
- elif isinstance(item, LTTextBox):
- self.place_border("textbox", 1, item)
- self.place_text(
- "textbox",
- str(item.index + 1),
- item.x0,
- item.y1,
- 20,
- )
- for child in item:
- render(child)
- elif isinstance(item, LTChar):
- self.place_border("char", 1, item)
- self.place_text(
- "char",
- item.get_text(),
- item.x0,
- item.y1,
- item.size,
- )
- elif isinstance(item, LTTextLine):
- for child in item:
- render(child)
- if self.layoutmode != "loose":
- self.put_newline()
- elif isinstance(item, LTTextBox):
- self.begin_div(
- "textbox",
- 1,
- item.x0,
- item.y1,
- item.width,
- item.height,
- item.get_writing_mode(),
- )
- for child in item:
- render(child)
- self.end_div("textbox")
- elif isinstance(item, LTChar):
- fontname = make_compat_str(item.fontname)
- self.put_text(item.get_text(), fontname, item.size)
- elif isinstance(item, LTText):
- self.write_text(item.get_text())
-
- render(ltpage)
- self._yoffset += self.pagemargin
-
- def close(self) -> None:
- self.write_footer()
-
-
-class XMLConverter(PDFConverter[AnyIO]):
- CONTROL = re.compile("[\x00-\x08\x0b-\x0c\x0e-\x1f]")
-
- def __init__(
- self,
- rsrcmgr: PDFResourceManager,
- outfp: AnyIO,
- codec: str = "utf-8",
- pageno: int = 1,
- laparams: Optional[LAParams] = None,
- imagewriter: Optional[ImageWriter] = None,
- stripcontrol: bool = False,
- ) -> None:
- PDFConverter.__init__(
- self,
- rsrcmgr,
- outfp,
- codec=codec,
- pageno=pageno,
- laparams=laparams,
- )
-
- # write() assumes a codec for binary I/O, or no codec for text I/O.
- if self.outfp_binary == (not self.codec):
- raise PDFValueError("Codec is required for a binary I/O output")
-
- self.imagewriter = imagewriter
- self.stripcontrol = stripcontrol
- self.write_header()
-
- def write(self, text: str) -> None:
- if self.codec:
- cast(BinaryIO, self.outfp).write(text.encode(self.codec))
- else:
- cast(TextIO, self.outfp).write(text)
-
- def write_header(self) -> None:
- if self.codec:
- self.write('\n' % self.codec)
- else:
- self.write('\n')
- self.write("\n")
-
- def write_footer(self) -> None:
- self.write("\n")
-
- def write_text(self, text: str) -> None:
- if self.stripcontrol:
- text = self.CONTROL.sub("", text)
- self.write(enc(text))
-
- def receive_layout(self, ltpage: LTPage) -> None:
- def show_group(item: LTItem) -> None:
- if isinstance(item, LTTextBox):
- self.write(
- '\n'
- % (item.index, bbox2str(item.bbox)),
- )
- elif isinstance(item, LTTextGroup):
- self.write('\n' % bbox2str(item.bbox))
- for child in item:
- show_group(child)
- self.write("\n")
-
- def render(item: LTItem) -> None:
- child: LTItem
- if isinstance(item, LTPage):
- s = '\n' % (
- item.pageid,
- bbox2str(item.bbox),
- item.rotate,
- )
- self.write(s)
- for child in item:
- render(child)
- if item.groups is not None:
- self.write("\n")
- for group in item.groups:
- show_group(group)
- self.write("\n")
- self.write("\n")
- elif isinstance(item, LTLine):
- s = '\n' % (
- item.linewidth,
- bbox2str(item.bbox),
- )
- self.write(s)
- elif isinstance(item, LTRect):
- s = '\n' % (
- item.linewidth,
- bbox2str(item.bbox),
- )
- self.write(s)
- elif isinstance(item, LTCurve):
- s = '\n' % (
- item.linewidth,
- bbox2str(item.bbox),
- item.get_pts(),
- )
- self.write(s)
- elif isinstance(item, LTFigure):
- s = f'\n")
- elif isinstance(item, LTTextLine):
- self.write('\n' % bbox2str(item.bbox))
- for child in item:
- render(child)
- self.write("\n")
- elif isinstance(item, LTTextBox):
- wmode = ""
- if isinstance(item, LTTextBoxVertical):
- wmode = ' wmode="vertical"'
- s = '\n' % (
- item.index,
- bbox2str(item.bbox),
- wmode,
- )
- self.write(s)
- for child in item:
- render(child)
- self.write("\n")
- elif isinstance(item, LTChar):
- s = (
- ''
- % (
- enc(item.fontname),
- bbox2str(item.bbox),
- item.ncs.name,
- item.graphicstate.ncolor,
- item.size,
- )
- )
- self.write(s)
- self.write_text(item.get_text())
- self.write("\n")
- elif isinstance(item, LTText):
- self.write("%s\n" % item.get_text())
- elif isinstance(item, LTImage):
- if self.imagewriter is not None:
- name = self.imagewriter.export_image(item)
- self.write(
- '\n'
- % (enc(name), item.width, item.height),
- )
- else:
- self.write(
- '\n'
- % (item.width, item.height),
- )
- else:
- assert False, str(("Unhandled", item))
-
- render(ltpage)
-
- def close(self) -> None:
- self.write_footer()
-
-
-class HOCRConverter(PDFConverter[AnyIO]):
- """Extract an hOCR representation from explicit text information within a PDF."""
-
- # Where text is being extracted from a variety of types of PDF within a
- # business process, those PDFs where the text is only present in image
- # form will need to be analysed using an OCR tool which will typically
- # output hOCR. This converter extracts the explicit text information from
- # those PDFs that do have it and uses it to genxerate a basic hOCR
- # representation that is designed to be used in conjunction with the image
- # of the PDF in the same way as genuine OCR output would be, but without the
- # inevitable OCR errors.
-
- # The converter does not handle images, diagrams or text colors.
-
- # In the examples processed by the contributor it was necessary to set
- # LAParams.all_texts to True.
-
- CONTROL = re.compile(r"[\x00-\x08\x0b-\x0c\x0e-\x1f]")
-
- def __init__(
- self,
- rsrcmgr: PDFResourceManager,
- outfp: AnyIO,
- codec: str = "utf8",
- pageno: int = 1,
- laparams: Optional[LAParams] = None,
- stripcontrol: bool = False,
- ):
- PDFConverter.__init__(
- self,
- rsrcmgr,
- outfp,
- codec=codec,
- pageno=pageno,
- laparams=laparams,
- )
- self.stripcontrol = stripcontrol
- self.within_chars = False
- self.write_header()
-
- def bbox_repr(self, bbox: Rect) -> str:
- (in_x0, in_y0, in_x1, in_y1) = bbox
- # PDF y-coordinates are the other way round from hOCR coordinates
- out_x0 = int(in_x0)
- out_y0 = int(self.page_bbox[3] - in_y1)
- out_x1 = int(in_x1)
- out_y1 = int(self.page_bbox[3] - in_y0)
- return f"bbox {out_x0} {out_y0} {out_x1} {out_y1}"
-
- def write(self, text: str) -> None:
- if self.codec:
- encoded_text = text.encode(self.codec)
- cast(BinaryIO, self.outfp).write(encoded_text)
- else:
- cast(TextIO, self.outfp).write(text)
-
- def write_header(self) -> None:
- if self.codec:
- self.write(
- "\n" % self.codec,
- )
- else:
- self.write(
- "\n",
- )
- self.write("\n")
- self.write("\n")
- self.write(
- "\n",
- )
- self.write(
- "\n",
- )
- self.write(
- " \n",
- )
- self.write("\n")
- self.write("\n")
-
- def write_footer(self) -> None:
- self.write("\n")
- self.write(
- "\n",
- )
-
- def write_text(self, text: str) -> None:
- if self.stripcontrol:
- text = self.CONTROL.sub("", text)
- self.write(text)
-
- def write_word(self) -> None:
- if len(self.working_text) > 0:
- bold_and_italic_styles = ""
- if "Italic" in self.working_font:
- bold_and_italic_styles = "font-style: italic; "
- if "Bold" in self.working_font:
- bold_and_italic_styles += "font-weight: bold; "
- self.write(
- "%s"
- % (
- (
- self.working_font,
- self.working_size,
- bold_and_italic_styles,
- self.bbox_repr(self.working_bbox),
- self.working_font,
- self.working_size,
- self.working_text.strip(),
- )
- ),
- )
- self.within_chars = False
-
- def receive_layout(self, ltpage: LTPage) -> None:
- def render(item: LTItem) -> None:
- if self.within_chars and isinstance(item, LTAnno):
- self.write_word()
- if isinstance(item, LTPage):
- self.page_bbox = item.bbox
- self.write(
- "\n"
- % (item.pageid, self.bbox_repr(item.bbox)),
- )
- for child in item:
- render(child)
- self.write("
\n")
- elif isinstance(item, LTTextLine):
- self.write(
- "" % (self.bbox_repr(item.bbox)),
- )
- for child_line in item:
- render(child_line)
- self.write("\n")
- elif isinstance(item, LTTextBox):
- self.write(
- "\n"
- % (item.index, self.bbox_repr(item.bbox)),
- )
- for child in item:
- render(child)
- self.write("
\n")
- elif isinstance(item, LTChar):
- if not self.within_chars:
- self.within_chars = True
- self.working_text = item.get_text()
- self.working_bbox = item.bbox
- self.working_font = item.fontname
- self.working_size = item.size
- elif len(item.get_text().strip()) == 0:
- self.write_word()
- self.write(item.get_text())
- else:
- if (
- self.working_bbox[1] != item.bbox[1]
- or self.working_font != item.fontname
- or self.working_size != item.size
- ):
- self.write_word()
- self.working_bbox = item.bbox
- self.working_font = item.fontname
- self.working_size = item.size
- self.working_text += item.get_text()
- self.working_bbox = (
- self.working_bbox[0],
- self.working_bbox[1],
- item.bbox[2],
- self.working_bbox[3],
- )
-
- render(ltpage)
-
- def close(self) -> None:
- self.write_footer()
diff --git a/pdf2zh/data_structures.py b/pdf2zh/data_structures.py
deleted file mode 100644
index cbce5e3..0000000
--- a/pdf2zh/data_structures.py
+++ /dev/null
@@ -1,52 +0,0 @@
-from typing import Any, Iterable, List, Optional, Tuple
-
-from pdf2zh import settings
-from pdf2zh.pdfparser import PDFSyntaxError
-from pdf2zh.pdftypes import dict_value, int_value, list_value
-from pdf2zh.utils import choplist
-
-
-class NumberTree:
- """A PDF number tree.
-
- See Section 3.8.6 of the PDF Reference.
- """
-
- def __init__(self, obj: Any):
- self._obj = dict_value(obj)
- self.nums: Optional[Iterable[Any]] = None
- self.kids: Optional[Iterable[Any]] = None
- self.limits: Optional[Iterable[Any]] = None
-
- if "Nums" in self._obj:
- self.nums = list_value(self._obj["Nums"])
- if "Kids" in self._obj:
- self.kids = list_value(self._obj["Kids"])
- if "Limits" in self._obj:
- self.limits = list_value(self._obj["Limits"])
-
- def _parse(self) -> List[Tuple[int, Any]]:
- items = []
- if self.nums: # Leaf node
- for k, v in choplist(2, self.nums):
- items.append((int_value(k), v))
-
- if self.kids: # Root or intermediate node
- for child_ref in self.kids:
- items += NumberTree(child_ref)._parse()
-
- return items
-
- values: List[Tuple[int, Any]] # workaround decorators unsupported by mypy
-
- @property # type: ignore[no-redef,misc]
- def values(self) -> List[Tuple[int, Any]]:
- values = self._parse()
-
- if settings.STRICT:
- if not all(a[0] <= b[0] for a, b in zip(values, values[1:])):
- raise PDFSyntaxError("Number tree elements are out of order")
- else:
- values.sort(key=lambda t: t[0])
-
- return values
diff --git a/pdf2zh/encodingdb.py b/pdf2zh/encodingdb.py
deleted file mode 100644
index ee6a106..0000000
--- a/pdf2zh/encodingdb.py
+++ /dev/null
@@ -1,127 +0,0 @@
-import logging
-import re
-from typing import Dict, Iterable, Optional, cast
-
-from pdf2zh.glyphlist import glyphname2unicode
-from pdf2zh.latin_enc import ENCODING
-from pdf2zh.pdfexceptions import PDFKeyError
-from pdf2zh.psparser import PSLiteral
-
-HEXADECIMAL = re.compile(r"[0-9a-fA-F]+")
-
-log = logging.getLogger(__name__)
-
-
-def name2unicode(name: str) -> str:
- """Converts Adobe glyph names to Unicode numbers.
-
- In contrast to the specification, this raises a KeyError instead of return
- an empty string when the key is unknown.
- This way the caller must explicitly define what to do
- when there is not a match.
-
- Reference:
- https://github.com/adobe-type-tools/agl-specification#2-the-mapping
-
- :returns unicode character if name resembles something,
- otherwise a KeyError
- """
- if not isinstance(name, str):
- raise PDFKeyError(
- 'Could not convert unicode name "%s" to character because '
- "it should be of type str but is of type %s" % (name, type(name)),
- )
-
- name = name.split(".")[0]
- components = name.split("_")
-
- if len(components) > 1:
- return "".join(map(name2unicode, components))
-
- elif name in glyphname2unicode:
- return glyphname2unicode[name]
-
- elif name.startswith("uni"):
- name_without_uni = name.strip("uni")
-
- if HEXADECIMAL.match(name_without_uni) and len(name_without_uni) % 4 == 0:
- unicode_digits = [
- int(name_without_uni[i : i + 4], base=16)
- for i in range(0, len(name_without_uni), 4)
- ]
- for digit in unicode_digits:
- raise_key_error_for_invalid_unicode(digit)
- characters = map(chr, unicode_digits)
- return "".join(characters)
-
- elif name.startswith("u"):
- name_without_u = name.strip("u")
-
- if HEXADECIMAL.match(name_without_u) and 4 <= len(name_without_u) <= 6:
- unicode_digit = int(name_without_u, base=16)
- raise_key_error_for_invalid_unicode(unicode_digit)
- return chr(unicode_digit)
-
- raise PDFKeyError(
- 'Could not convert unicode name "%s" to character because '
- "it does not match specification" % name,
- )
-
-
-def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None:
- """Unicode values should not be in the range D800 through DFFF because
- that is used for surrogate pairs in UTF-16
-
- :raises KeyError if unicode digit is invalid
- """
- if 55295 < unicode_digit < 57344:
- raise PDFKeyError(
- "Unicode digit %d is invalid because "
- "it is in the range D800 through DFFF" % unicode_digit,
- )
-
-
-class EncodingDB:
- std2unicode: Dict[int, str] = {}
- mac2unicode: Dict[int, str] = {}
- win2unicode: Dict[int, str] = {}
- pdf2unicode: Dict[int, str] = {}
- for name, std, mac, win, pdf in ENCODING:
- c = name2unicode(name)
- if std:
- std2unicode[std] = c
- if mac:
- mac2unicode[mac] = c
- if win:
- win2unicode[win] = c
- if pdf:
- pdf2unicode[pdf] = c
-
- encodings = {
- "StandardEncoding": std2unicode,
- "MacRomanEncoding": mac2unicode,
- "WinAnsiEncoding": win2unicode,
- "PDFDocEncoding": pdf2unicode,
- }
-
- @classmethod
- def get_encoding(
- cls,
- name: str,
- diff: Optional[Iterable[object]] = None,
- ) -> Dict[int, str]:
- cid2unicode = cls.encodings.get(name, cls.std2unicode)
- if diff:
- cid2unicode = cid2unicode.copy()
- cid = 0
- for x in diff:
- if isinstance(x, int):
- cid = x
- elif isinstance(x, PSLiteral):
- try:
- cid2unicode[cid] = name2unicode(cast(str, x.name))
- except (KeyError, ValueError):
- # log.debug(str(e))
- pass
- cid += 1
- return cid2unicode
diff --git a/pdf2zh/fontmetrics.py b/pdf2zh/fontmetrics.py
deleted file mode 100644
index c95c1c1..0000000
--- a/pdf2zh/fontmetrics.py
+++ /dev/null
@@ -1,4464 +0,0 @@
-"""Font metrics for the Adobe core 14 fonts.
-
-Font metrics are used to compute the boundary of each character
-written with a proportional font.
-
-The following data were extracted from the AFM files:
-
- http://www.ctan.org/tex-archive/fonts/adobe/afm/
-
-"""
-
-# BEGIN Verbatim copy of the license part
-
-#
-# Adobe Core 35 AFM Files with 314 Glyph Entries - ReadMe
-#
-# This file and the 35 PostScript(R) AFM files it accompanies may be
-# used, copied, and distributed for any purpose and without charge,
-# with or without modification, provided that all copyright notices
-# are retained; that the AFM files are not distributed without this
-# file; that all modifications to this file or any of the AFM files
-# are prominently noted in the modified file(s); and that this
-# paragraph is not modified. Adobe Systems has no responsibility or
-# obligation to support the use of the AFM files.
-#
-
-# END Verbatim copy of the license part
-
-# flake8: noqa
-from typing import Dict
-
-
-def convert_font_metrics(path: str) -> None:
- """Convert an AFM file to a mapping of font metrics.
-
- See below for the output.
- """
- fonts = {}
- with open(path) as fileinput:
- for line in fileinput.readlines():
- f = line.strip().split(" ")
- if not f:
- continue
- k = f[0]
- if k == "FontName":
- fontname = f[1]
- props = {"FontName": fontname, "Flags": 0}
- chars: Dict[int, int] = {}
- fonts[fontname] = (props, chars)
- elif k == "C":
- cid = int(f[1])
- if 0 <= cid and cid <= 255:
- width = int(f[4])
- chars[cid] = width
- elif k in ("CapHeight", "XHeight", "ItalicAngle", "Ascender", "Descender"):
- k = {"Ascender": "Ascent", "Descender": "Descent"}.get(k, k)
- props[k] = float(f[1])
- elif k in ("FontName", "FamilyName", "Weight"):
- k = {"FamilyName": "FontFamily", "Weight": "FontWeight"}.get(k, k)
- props[k] = f[1]
- elif k == "IsFixedPitch":
- if f[1].lower() == "true":
- props["Flags"] = 64
- elif k == "FontBBox":
- props[k] = tuple(map(float, f[1:5]))
- print("# -*- python -*-")
- print("FONT_METRICS = {")
- for fontname, (props, chars) in fonts.items():
- print(f" {fontname!r}: {(props, chars)!r},")
- print("}")
-
-
-FONT_METRICS = {
- "Courier": (
- {
- "FontName": "Courier",
- "Descent": -194.0,
- "FontBBox": (-6.0, -249.0, 639.0, 803.0),
- "FontWeight": "Medium",
- "CapHeight": 572.0,
- "FontFamily": "Courier",
- "Flags": 64,
- "XHeight": 434.0,
- "ItalicAngle": 0.0,
- "Ascent": 627.0,
- },
- {
- " ": 600,
- "!": 600,
- '"': 600,
- "#": 600,
- "$": 600,
- "%": 600,
- "&": 600,
- "'": 600,
- "(": 600,
- ")": 600,
- "*": 600,
- "+": 600,
- ",": 600,
- "-": 600,
- ".": 600,
- "/": 600,
- "0": 600,
- "1": 600,
- "2": 600,
- "3": 600,
- "4": 600,
- "5": 600,
- "6": 600,
- "7": 600,
- "8": 600,
- "9": 600,
- ":": 600,
- ";": 600,
- "<": 600,
- "=": 600,
- ">": 600,
- "?": 600,
- "@": 600,
- "A": 600,
- "B": 600,
- "C": 600,
- "D": 600,
- "E": 600,
- "F": 600,
- "G": 600,
- "H": 600,
- "I": 600,
- "J": 600,
- "K": 600,
- "L": 600,
- "M": 600,
- "N": 600,
- "O": 600,
- "P": 600,
- "Q": 600,
- "R": 600,
- "S": 600,
- "T": 600,
- "U": 600,
- "V": 600,
- "W": 600,
- "X": 600,
- "Y": 600,
- "Z": 600,
- "[": 600,
- "\\": 600,
- "]": 600,
- "^": 600,
- "_": 600,
- "`": 600,
- "a": 600,
- "b": 600,
- "c": 600,
- "d": 600,
- "e": 600,
- "f": 600,
- "g": 600,
- "h": 600,
- "i": 600,
- "j": 600,
- "k": 600,
- "l": 600,
- "m": 600,
- "n": 600,
- "o": 600,
- "p": 600,
- "q": 600,
- "r": 600,
- "s": 600,
- "t": 600,
- "u": 600,
- "v": 600,
- "w": 600,
- "x": 600,
- "y": 600,
- "z": 600,
- "{": 600,
- "|": 600,
- "}": 600,
- "~": 600,
- "\xa1": 600,
- "\xa2": 600,
- "\xa3": 600,
- "\xa4": 600,
- "\xa5": 600,
- "\xa6": 600,
- "\xa7": 600,
- "\xa8": 600,
- "\xa9": 600,
- "\xaa": 600,
- "\xab": 600,
- "\xac": 600,
- "\xae": 600,
- "\xaf": 600,
- "\xb0": 600,
- "\xb1": 600,
- "\xb2": 600,
- "\xb3": 600,
- "\xb4": 600,
- "\xb5": 600,
- "\xb6": 600,
- "\xb7": 600,
- "\xb8": 600,
- "\xb9": 600,
- "\xba": 600,
- "\xbb": 600,
- "\xbc": 600,
- "\xbd": 600,
- "\xbe": 600,
- "\xbf": 600,
- "\xc0": 600,
- "\xc1": 600,
- "\xc2": 600,
- "\xc3": 600,
- "\xc4": 600,
- "\xc5": 600,
- "\xc6": 600,
- "\xc7": 600,
- "\xc8": 600,
- "\xc9": 600,
- "\xca": 600,
- "\xcb": 600,
- "\xcc": 600,
- "\xcd": 600,
- "\xce": 600,
- "\xcf": 600,
- "\xd0": 600,
- "\xd1": 600,
- "\xd2": 600,
- "\xd3": 600,
- "\xd4": 600,
- "\xd5": 600,
- "\xd6": 600,
- "\xd7": 600,
- "\xd8": 600,
- "\xd9": 600,
- "\xda": 600,
- "\xdb": 600,
- "\xdc": 600,
- "\xdd": 600,
- "\xde": 600,
- "\xdf": 600,
- "\xe0": 600,
- "\xe1": 600,
- "\xe2": 600,
- "\xe3": 600,
- "\xe4": 600,
- "\xe5": 600,
- "\xe6": 600,
- "\xe7": 600,
- "\xe8": 600,
- "\xe9": 600,
- "\xea": 600,
- "\xeb": 600,
- "\xec": 600,
- "\xed": 600,
- "\xee": 600,
- "\xef": 600,
- "\xf0": 600,
- "\xf1": 600,
- "\xf2": 600,
- "\xf3": 600,
- "\xf4": 600,
- "\xf5": 600,
- "\xf6": 600,
- "\xf7": 600,
- "\xf8": 600,
- "\xf9": 600,
- "\xfa": 600,
- "\xfb": 600,
- "\xfc": 600,
- "\xfd": 600,
- "\xfe": 600,
- "\xff": 600,
- "\u0100": 600,
- "\u0101": 600,
- "\u0102": 600,
- "\u0103": 600,
- "\u0104": 600,
- "\u0105": 600,
- "\u0106": 600,
- "\u0107": 600,
- "\u010c": 600,
- "\u010d": 600,
- "\u010e": 600,
- "\u010f": 600,
- "\u0110": 600,
- "\u0111": 600,
- "\u0112": 600,
- "\u0113": 600,
- "\u0116": 600,
- "\u0117": 600,
- "\u0118": 600,
- "\u0119": 600,
- "\u011a": 600,
- "\u011b": 600,
- "\u011e": 600,
- "\u011f": 600,
- "\u0122": 600,
- "\u0123": 600,
- "\u012a": 600,
- "\u012b": 600,
- "\u012e": 600,
- "\u012f": 600,
- "\u0130": 600,
- "\u0131": 600,
- "\u0136": 600,
- "\u0137": 600,
- "\u0139": 600,
- "\u013a": 600,
- "\u013b": 600,
- "\u013c": 600,
- "\u013d": 600,
- "\u013e": 600,
- "\u0141": 600,
- "\u0142": 600,
- "\u0143": 600,
- "\u0144": 600,
- "\u0145": 600,
- "\u0146": 600,
- "\u0147": 600,
- "\u0148": 600,
- "\u014c": 600,
- "\u014d": 600,
- "\u0150": 600,
- "\u0151": 600,
- "\u0152": 600,
- "\u0153": 600,
- "\u0154": 600,
- "\u0155": 600,
- "\u0156": 600,
- "\u0157": 600,
- "\u0158": 600,
- "\u0159": 600,
- "\u015a": 600,
- "\u015b": 600,
- "\u015e": 600,
- "\u015f": 600,
- "\u0160": 600,
- "\u0161": 600,
- "\u0162": 600,
- "\u0163": 600,
- "\u0164": 600,
- "\u0165": 600,
- "\u016a": 600,
- "\u016b": 600,
- "\u016e": 600,
- "\u016f": 600,
- "\u0170": 600,
- "\u0171": 600,
- "\u0172": 600,
- "\u0173": 600,
- "\u0178": 600,
- "\u0179": 600,
- "\u017a": 600,
- "\u017b": 600,
- "\u017c": 600,
- "\u017d": 600,
- "\u017e": 600,
- "\u0192": 600,
- "\u0218": 600,
- "\u0219": 600,
- "\u02c6": 600,
- "\u02c7": 600,
- "\u02d8": 600,
- "\u02d9": 600,
- "\u02da": 600,
- "\u02db": 600,
- "\u02dc": 600,
- "\u02dd": 600,
- "\u2013": 600,
- "\u2014": 600,
- "\u2018": 600,
- "\u2019": 600,
- "\u201a": 600,
- "\u201c": 600,
- "\u201d": 600,
- "\u201e": 600,
- "\u2020": 600,
- "\u2021": 600,
- "\u2022": 600,
- "\u2026": 600,
- "\u2030": 600,
- "\u2039": 600,
- "\u203a": 600,
- "\u2044": 600,
- "\u2122": 600,
- "\u2202": 600,
- "\u2206": 600,
- "\u2211": 600,
- "\u2212": 600,
- "\u221a": 600,
- "\u2260": 600,
- "\u2264": 600,
- "\u2265": 600,
- "\u25ca": 600,
- "\uf6c3": 600,
- "\ufb01": 600,
- "\ufb02": 600,
- },
- ),
- "Courier-Bold": (
- {
- "FontName": "Courier-Bold",
- "Descent": -194.0,
- "FontBBox": (-88.0, -249.0, 697.0, 811.0),
- "FontWeight": "Bold",
- "CapHeight": 572.0,
- "FontFamily": "Courier",
- "Flags": 64,
- "XHeight": 434.0,
- "ItalicAngle": 0.0,
- "Ascent": 627.0,
- },
- {
- " ": 600,
- "!": 600,
- '"': 600,
- "#": 600,
- "$": 600,
- "%": 600,
- "&": 600,
- "'": 600,
- "(": 600,
- ")": 600,
- "*": 600,
- "+": 600,
- ",": 600,
- "-": 600,
- ".": 600,
- "/": 600,
- "0": 600,
- "1": 600,
- "2": 600,
- "3": 600,
- "4": 600,
- "5": 600,
- "6": 600,
- "7": 600,
- "8": 600,
- "9": 600,
- ":": 600,
- ";": 600,
- "<": 600,
- "=": 600,
- ">": 600,
- "?": 600,
- "@": 600,
- "A": 600,
- "B": 600,
- "C": 600,
- "D": 600,
- "E": 600,
- "F": 600,
- "G": 600,
- "H": 600,
- "I": 600,
- "J": 600,
- "K": 600,
- "L": 600,
- "M": 600,
- "N": 600,
- "O": 600,
- "P": 600,
- "Q": 600,
- "R": 600,
- "S": 600,
- "T": 600,
- "U": 600,
- "V": 600,
- "W": 600,
- "X": 600,
- "Y": 600,
- "Z": 600,
- "[": 600,
- "\\": 600,
- "]": 600,
- "^": 600,
- "_": 600,
- "`": 600,
- "a": 600,
- "b": 600,
- "c": 600,
- "d": 600,
- "e": 600,
- "f": 600,
- "g": 600,
- "h": 600,
- "i": 600,
- "j": 600,
- "k": 600,
- "l": 600,
- "m": 600,
- "n": 600,
- "o": 600,
- "p": 600,
- "q": 600,
- "r": 600,
- "s": 600,
- "t": 600,
- "u": 600,
- "v": 600,
- "w": 600,
- "x": 600,
- "y": 600,
- "z": 600,
- "{": 600,
- "|": 600,
- "}": 600,
- "~": 600,
- "\xa1": 600,
- "\xa2": 600,
- "\xa3": 600,
- "\xa4": 600,
- "\xa5": 600,
- "\xa6": 600,
- "\xa7": 600,
- "\xa8": 600,
- "\xa9": 600,
- "\xaa": 600,
- "\xab": 600,
- "\xac": 600,
- "\xae": 600,
- "\xaf": 600,
- "\xb0": 600,
- "\xb1": 600,
- "\xb2": 600,
- "\xb3": 600,
- "\xb4": 600,
- "\xb5": 600,
- "\xb6": 600,
- "\xb7": 600,
- "\xb8": 600,
- "\xb9": 600,
- "\xba": 600,
- "\xbb": 600,
- "\xbc": 600,
- "\xbd": 600,
- "\xbe": 600,
- "\xbf": 600,
- "\xc0": 600,
- "\xc1": 600,
- "\xc2": 600,
- "\xc3": 600,
- "\xc4": 600,
- "\xc5": 600,
- "\xc6": 600,
- "\xc7": 600,
- "\xc8": 600,
- "\xc9": 600,
- "\xca": 600,
- "\xcb": 600,
- "\xcc": 600,
- "\xcd": 600,
- "\xce": 600,
- "\xcf": 600,
- "\xd0": 600,
- "\xd1": 600,
- "\xd2": 600,
- "\xd3": 600,
- "\xd4": 600,
- "\xd5": 600,
- "\xd6": 600,
- "\xd7": 600,
- "\xd8": 600,
- "\xd9": 600,
- "\xda": 600,
- "\xdb": 600,
- "\xdc": 600,
- "\xdd": 600,
- "\xde": 600,
- "\xdf": 600,
- "\xe0": 600,
- "\xe1": 600,
- "\xe2": 600,
- "\xe3": 600,
- "\xe4": 600,
- "\xe5": 600,
- "\xe6": 600,
- "\xe7": 600,
- "\xe8": 600,
- "\xe9": 600,
- "\xea": 600,
- "\xeb": 600,
- "\xec": 600,
- "\xed": 600,
- "\xee": 600,
- "\xef": 600,
- "\xf0": 600,
- "\xf1": 600,
- "\xf2": 600,
- "\xf3": 600,
- "\xf4": 600,
- "\xf5": 600,
- "\xf6": 600,
- "\xf7": 600,
- "\xf8": 600,
- "\xf9": 600,
- "\xfa": 600,
- "\xfb": 600,
- "\xfc": 600,
- "\xfd": 600,
- "\xfe": 600,
- "\xff": 600,
- "\u0100": 600,
- "\u0101": 600,
- "\u0102": 600,
- "\u0103": 600,
- "\u0104": 600,
- "\u0105": 600,
- "\u0106": 600,
- "\u0107": 600,
- "\u010c": 600,
- "\u010d": 600,
- "\u010e": 600,
- "\u010f": 600,
- "\u0110": 600,
- "\u0111": 600,
- "\u0112": 600,
- "\u0113": 600,
- "\u0116": 600,
- "\u0117": 600,
- "\u0118": 600,
- "\u0119": 600,
- "\u011a": 600,
- "\u011b": 600,
- "\u011e": 600,
- "\u011f": 600,
- "\u0122": 600,
- "\u0123": 600,
- "\u012a": 600,
- "\u012b": 600,
- "\u012e": 600,
- "\u012f": 600,
- "\u0130": 600,
- "\u0131": 600,
- "\u0136": 600,
- "\u0137": 600,
- "\u0139": 600,
- "\u013a": 600,
- "\u013b": 600,
- "\u013c": 600,
- "\u013d": 600,
- "\u013e": 600,
- "\u0141": 600,
- "\u0142": 600,
- "\u0143": 600,
- "\u0144": 600,
- "\u0145": 600,
- "\u0146": 600,
- "\u0147": 600,
- "\u0148": 600,
- "\u014c": 600,
- "\u014d": 600,
- "\u0150": 600,
- "\u0151": 600,
- "\u0152": 600,
- "\u0153": 600,
- "\u0154": 600,
- "\u0155": 600,
- "\u0156": 600,
- "\u0157": 600,
- "\u0158": 600,
- "\u0159": 600,
- "\u015a": 600,
- "\u015b": 600,
- "\u015e": 600,
- "\u015f": 600,
- "\u0160": 600,
- "\u0161": 600,
- "\u0162": 600,
- "\u0163": 600,
- "\u0164": 600,
- "\u0165": 600,
- "\u016a": 600,
- "\u016b": 600,
- "\u016e": 600,
- "\u016f": 600,
- "\u0170": 600,
- "\u0171": 600,
- "\u0172": 600,
- "\u0173": 600,
- "\u0178": 600,
- "\u0179": 600,
- "\u017a": 600,
- "\u017b": 600,
- "\u017c": 600,
- "\u017d": 600,
- "\u017e": 600,
- "\u0192": 600,
- "\u0218": 600,
- "\u0219": 600,
- "\u02c6": 600,
- "\u02c7": 600,
- "\u02d8": 600,
- "\u02d9": 600,
- "\u02da": 600,
- "\u02db": 600,
- "\u02dc": 600,
- "\u02dd": 600,
- "\u2013": 600,
- "\u2014": 600,
- "\u2018": 600,
- "\u2019": 600,
- "\u201a": 600,
- "\u201c": 600,
- "\u201d": 600,
- "\u201e": 600,
- "\u2020": 600,
- "\u2021": 600,
- "\u2022": 600,
- "\u2026": 600,
- "\u2030": 600,
- "\u2039": 600,
- "\u203a": 600,
- "\u2044": 600,
- "\u2122": 600,
- "\u2202": 600,
- "\u2206": 600,
- "\u2211": 600,
- "\u2212": 600,
- "\u221a": 600,
- "\u2260": 600,
- "\u2264": 600,
- "\u2265": 600,
- "\u25ca": 600,
- "\uf6c3": 600,
- "\ufb01": 600,
- "\ufb02": 600,
- },
- ),
- "Courier-BoldOblique": (
- {
- "FontName": "Courier-BoldOblique",
- "Descent": -194.0,
- "FontBBox": (-49.0, -249.0, 758.0, 811.0),
- "FontWeight": "Bold",
- "CapHeight": 572.0,
- "FontFamily": "Courier",
- "Flags": 64,
- "XHeight": 434.0,
- "ItalicAngle": -11.0,
- "Ascent": 627.0,
- },
- {
- " ": 600,
- "!": 600,
- '"': 600,
- "#": 600,
- "$": 600,
- "%": 600,
- "&": 600,
- "'": 600,
- "(": 600,
- ")": 600,
- "*": 600,
- "+": 600,
- ",": 600,
- "-": 600,
- ".": 600,
- "/": 600,
- "0": 600,
- "1": 600,
- "2": 600,
- "3": 600,
- "4": 600,
- "5": 600,
- "6": 600,
- "7": 600,
- "8": 600,
- "9": 600,
- ":": 600,
- ";": 600,
- "<": 600,
- "=": 600,
- ">": 600,
- "?": 600,
- "@": 600,
- "A": 600,
- "B": 600,
- "C": 600,
- "D": 600,
- "E": 600,
- "F": 600,
- "G": 600,
- "H": 600,
- "I": 600,
- "J": 600,
- "K": 600,
- "L": 600,
- "M": 600,
- "N": 600,
- "O": 600,
- "P": 600,
- "Q": 600,
- "R": 600,
- "S": 600,
- "T": 600,
- "U": 600,
- "V": 600,
- "W": 600,
- "X": 600,
- "Y": 600,
- "Z": 600,
- "[": 600,
- "\\": 600,
- "]": 600,
- "^": 600,
- "_": 600,
- "`": 600,
- "a": 600,
- "b": 600,
- "c": 600,
- "d": 600,
- "e": 600,
- "f": 600,
- "g": 600,
- "h": 600,
- "i": 600,
- "j": 600,
- "k": 600,
- "l": 600,
- "m": 600,
- "n": 600,
- "o": 600,
- "p": 600,
- "q": 600,
- "r": 600,
- "s": 600,
- "t": 600,
- "u": 600,
- "v": 600,
- "w": 600,
- "x": 600,
- "y": 600,
- "z": 600,
- "{": 600,
- "|": 600,
- "}": 600,
- "~": 600,
- "\xa1": 600,
- "\xa2": 600,
- "\xa3": 600,
- "\xa4": 600,
- "\xa5": 600,
- "\xa6": 600,
- "\xa7": 600,
- "\xa8": 600,
- "\xa9": 600,
- "\xaa": 600,
- "\xab": 600,
- "\xac": 600,
- "\xae": 600,
- "\xaf": 600,
- "\xb0": 600,
- "\xb1": 600,
- "\xb2": 600,
- "\xb3": 600,
- "\xb4": 600,
- "\xb5": 600,
- "\xb6": 600,
- "\xb7": 600,
- "\xb8": 600,
- "\xb9": 600,
- "\xba": 600,
- "\xbb": 600,
- "\xbc": 600,
- "\xbd": 600,
- "\xbe": 600,
- "\xbf": 600,
- "\xc0": 600,
- "\xc1": 600,
- "\xc2": 600,
- "\xc3": 600,
- "\xc4": 600,
- "\xc5": 600,
- "\xc6": 600,
- "\xc7": 600,
- "\xc8": 600,
- "\xc9": 600,
- "\xca": 600,
- "\xcb": 600,
- "\xcc": 600,
- "\xcd": 600,
- "\xce": 600,
- "\xcf": 600,
- "\xd0": 600,
- "\xd1": 600,
- "\xd2": 600,
- "\xd3": 600,
- "\xd4": 600,
- "\xd5": 600,
- "\xd6": 600,
- "\xd7": 600,
- "\xd8": 600,
- "\xd9": 600,
- "\xda": 600,
- "\xdb": 600,
- "\xdc": 600,
- "\xdd": 600,
- "\xde": 600,
- "\xdf": 600,
- "\xe0": 600,
- "\xe1": 600,
- "\xe2": 600,
- "\xe3": 600,
- "\xe4": 600,
- "\xe5": 600,
- "\xe6": 600,
- "\xe7": 600,
- "\xe8": 600,
- "\xe9": 600,
- "\xea": 600,
- "\xeb": 600,
- "\xec": 600,
- "\xed": 600,
- "\xee": 600,
- "\xef": 600,
- "\xf0": 600,
- "\xf1": 600,
- "\xf2": 600,
- "\xf3": 600,
- "\xf4": 600,
- "\xf5": 600,
- "\xf6": 600,
- "\xf7": 600,
- "\xf8": 600,
- "\xf9": 600,
- "\xfa": 600,
- "\xfb": 600,
- "\xfc": 600,
- "\xfd": 600,
- "\xfe": 600,
- "\xff": 600,
- "\u0100": 600,
- "\u0101": 600,
- "\u0102": 600,
- "\u0103": 600,
- "\u0104": 600,
- "\u0105": 600,
- "\u0106": 600,
- "\u0107": 600,
- "\u010c": 600,
- "\u010d": 600,
- "\u010e": 600,
- "\u010f": 600,
- "\u0110": 600,
- "\u0111": 600,
- "\u0112": 600,
- "\u0113": 600,
- "\u0116": 600,
- "\u0117": 600,
- "\u0118": 600,
- "\u0119": 600,
- "\u011a": 600,
- "\u011b": 600,
- "\u011e": 600,
- "\u011f": 600,
- "\u0122": 600,
- "\u0123": 600,
- "\u012a": 600,
- "\u012b": 600,
- "\u012e": 600,
- "\u012f": 600,
- "\u0130": 600,
- "\u0131": 600,
- "\u0136": 600,
- "\u0137": 600,
- "\u0139": 600,
- "\u013a": 600,
- "\u013b": 600,
- "\u013c": 600,
- "\u013d": 600,
- "\u013e": 600,
- "\u0141": 600,
- "\u0142": 600,
- "\u0143": 600,
- "\u0144": 600,
- "\u0145": 600,
- "\u0146": 600,
- "\u0147": 600,
- "\u0148": 600,
- "\u014c": 600,
- "\u014d": 600,
- "\u0150": 600,
- "\u0151": 600,
- "\u0152": 600,
- "\u0153": 600,
- "\u0154": 600,
- "\u0155": 600,
- "\u0156": 600,
- "\u0157": 600,
- "\u0158": 600,
- "\u0159": 600,
- "\u015a": 600,
- "\u015b": 600,
- "\u015e": 600,
- "\u015f": 600,
- "\u0160": 600,
- "\u0161": 600,
- "\u0162": 600,
- "\u0163": 600,
- "\u0164": 600,
- "\u0165": 600,
- "\u016a": 600,
- "\u016b": 600,
- "\u016e": 600,
- "\u016f": 600,
- "\u0170": 600,
- "\u0171": 600,
- "\u0172": 600,
- "\u0173": 600,
- "\u0178": 600,
- "\u0179": 600,
- "\u017a": 600,
- "\u017b": 600,
- "\u017c": 600,
- "\u017d": 600,
- "\u017e": 600,
- "\u0192": 600,
- "\u0218": 600,
- "\u0219": 600,
- "\u02c6": 600,
- "\u02c7": 600,
- "\u02d8": 600,
- "\u02d9": 600,
- "\u02da": 600,
- "\u02db": 600,
- "\u02dc": 600,
- "\u02dd": 600,
- "\u2013": 600,
- "\u2014": 600,
- "\u2018": 600,
- "\u2019": 600,
- "\u201a": 600,
- "\u201c": 600,
- "\u201d": 600,
- "\u201e": 600,
- "\u2020": 600,
- "\u2021": 600,
- "\u2022": 600,
- "\u2026": 600,
- "\u2030": 600,
- "\u2039": 600,
- "\u203a": 600,
- "\u2044": 600,
- "\u2122": 600,
- "\u2202": 600,
- "\u2206": 600,
- "\u2211": 600,
- "\u2212": 600,
- "\u221a": 600,
- "\u2260": 600,
- "\u2264": 600,
- "\u2265": 600,
- "\u25ca": 600,
- "\uf6c3": 600,
- "\ufb01": 600,
- "\ufb02": 600,
- },
- ),
- "Courier-Oblique": (
- {
- "FontName": "Courier-Oblique",
- "Descent": -194.0,
- "FontBBox": (-49.0, -249.0, 749.0, 803.0),
- "FontWeight": "Medium",
- "CapHeight": 572.0,
- "FontFamily": "Courier",
- "Flags": 64,
- "XHeight": 434.0,
- "ItalicAngle": -11.0,
- "Ascent": 627.0,
- },
- {
- " ": 600,
- "!": 600,
- '"': 600,
- "#": 600,
- "$": 600,
- "%": 600,
- "&": 600,
- "'": 600,
- "(": 600,
- ")": 600,
- "*": 600,
- "+": 600,
- ",": 600,
- "-": 600,
- ".": 600,
- "/": 600,
- "0": 600,
- "1": 600,
- "2": 600,
- "3": 600,
- "4": 600,
- "5": 600,
- "6": 600,
- "7": 600,
- "8": 600,
- "9": 600,
- ":": 600,
- ";": 600,
- "<": 600,
- "=": 600,
- ">": 600,
- "?": 600,
- "@": 600,
- "A": 600,
- "B": 600,
- "C": 600,
- "D": 600,
- "E": 600,
- "F": 600,
- "G": 600,
- "H": 600,
- "I": 600,
- "J": 600,
- "K": 600,
- "L": 600,
- "M": 600,
- "N": 600,
- "O": 600,
- "P": 600,
- "Q": 600,
- "R": 600,
- "S": 600,
- "T": 600,
- "U": 600,
- "V": 600,
- "W": 600,
- "X": 600,
- "Y": 600,
- "Z": 600,
- "[": 600,
- "\\": 600,
- "]": 600,
- "^": 600,
- "_": 600,
- "`": 600,
- "a": 600,
- "b": 600,
- "c": 600,
- "d": 600,
- "e": 600,
- "f": 600,
- "g": 600,
- "h": 600,
- "i": 600,
- "j": 600,
- "k": 600,
- "l": 600,
- "m": 600,
- "n": 600,
- "o": 600,
- "p": 600,
- "q": 600,
- "r": 600,
- "s": 600,
- "t": 600,
- "u": 600,
- "v": 600,
- "w": 600,
- "x": 600,
- "y": 600,
- "z": 600,
- "{": 600,
- "|": 600,
- "}": 600,
- "~": 600,
- "\xa1": 600,
- "\xa2": 600,
- "\xa3": 600,
- "\xa4": 600,
- "\xa5": 600,
- "\xa6": 600,
- "\xa7": 600,
- "\xa8": 600,
- "\xa9": 600,
- "\xaa": 600,
- "\xab": 600,
- "\xac": 600,
- "\xae": 600,
- "\xaf": 600,
- "\xb0": 600,
- "\xb1": 600,
- "\xb2": 600,
- "\xb3": 600,
- "\xb4": 600,
- "\xb5": 600,
- "\xb6": 600,
- "\xb7": 600,
- "\xb8": 600,
- "\xb9": 600,
- "\xba": 600,
- "\xbb": 600,
- "\xbc": 600,
- "\xbd": 600,
- "\xbe": 600,
- "\xbf": 600,
- "\xc0": 600,
- "\xc1": 600,
- "\xc2": 600,
- "\xc3": 600,
- "\xc4": 600,
- "\xc5": 600,
- "\xc6": 600,
- "\xc7": 600,
- "\xc8": 600,
- "\xc9": 600,
- "\xca": 600,
- "\xcb": 600,
- "\xcc": 600,
- "\xcd": 600,
- "\xce": 600,
- "\xcf": 600,
- "\xd0": 600,
- "\xd1": 600,
- "\xd2": 600,
- "\xd3": 600,
- "\xd4": 600,
- "\xd5": 600,
- "\xd6": 600,
- "\xd7": 600,
- "\xd8": 600,
- "\xd9": 600,
- "\xda": 600,
- "\xdb": 600,
- "\xdc": 600,
- "\xdd": 600,
- "\xde": 600,
- "\xdf": 600,
- "\xe0": 600,
- "\xe1": 600,
- "\xe2": 600,
- "\xe3": 600,
- "\xe4": 600,
- "\xe5": 600,
- "\xe6": 600,
- "\xe7": 600,
- "\xe8": 600,
- "\xe9": 600,
- "\xea": 600,
- "\xeb": 600,
- "\xec": 600,
- "\xed": 600,
- "\xee": 600,
- "\xef": 600,
- "\xf0": 600,
- "\xf1": 600,
- "\xf2": 600,
- "\xf3": 600,
- "\xf4": 600,
- "\xf5": 600,
- "\xf6": 600,
- "\xf7": 600,
- "\xf8": 600,
- "\xf9": 600,
- "\xfa": 600,
- "\xfb": 600,
- "\xfc": 600,
- "\xfd": 600,
- "\xfe": 600,
- "\xff": 600,
- "\u0100": 600,
- "\u0101": 600,
- "\u0102": 600,
- "\u0103": 600,
- "\u0104": 600,
- "\u0105": 600,
- "\u0106": 600,
- "\u0107": 600,
- "\u010c": 600,
- "\u010d": 600,
- "\u010e": 600,
- "\u010f": 600,
- "\u0110": 600,
- "\u0111": 600,
- "\u0112": 600,
- "\u0113": 600,
- "\u0116": 600,
- "\u0117": 600,
- "\u0118": 600,
- "\u0119": 600,
- "\u011a": 600,
- "\u011b": 600,
- "\u011e": 600,
- "\u011f": 600,
- "\u0122": 600,
- "\u0123": 600,
- "\u012a": 600,
- "\u012b": 600,
- "\u012e": 600,
- "\u012f": 600,
- "\u0130": 600,
- "\u0131": 600,
- "\u0136": 600,
- "\u0137": 600,
- "\u0139": 600,
- "\u013a": 600,
- "\u013b": 600,
- "\u013c": 600,
- "\u013d": 600,
- "\u013e": 600,
- "\u0141": 600,
- "\u0142": 600,
- "\u0143": 600,
- "\u0144": 600,
- "\u0145": 600,
- "\u0146": 600,
- "\u0147": 600,
- "\u0148": 600,
- "\u014c": 600,
- "\u014d": 600,
- "\u0150": 600,
- "\u0151": 600,
- "\u0152": 600,
- "\u0153": 600,
- "\u0154": 600,
- "\u0155": 600,
- "\u0156": 600,
- "\u0157": 600,
- "\u0158": 600,
- "\u0159": 600,
- "\u015a": 600,
- "\u015b": 600,
- "\u015e": 600,
- "\u015f": 600,
- "\u0160": 600,
- "\u0161": 600,
- "\u0162": 600,
- "\u0163": 600,
- "\u0164": 600,
- "\u0165": 600,
- "\u016a": 600,
- "\u016b": 600,
- "\u016e": 600,
- "\u016f": 600,
- "\u0170": 600,
- "\u0171": 600,
- "\u0172": 600,
- "\u0173": 600,
- "\u0178": 600,
- "\u0179": 600,
- "\u017a": 600,
- "\u017b": 600,
- "\u017c": 600,
- "\u017d": 600,
- "\u017e": 600,
- "\u0192": 600,
- "\u0218": 600,
- "\u0219": 600,
- "\u02c6": 600,
- "\u02c7": 600,
- "\u02d8": 600,
- "\u02d9": 600,
- "\u02da": 600,
- "\u02db": 600,
- "\u02dc": 600,
- "\u02dd": 600,
- "\u2013": 600,
- "\u2014": 600,
- "\u2018": 600,
- "\u2019": 600,
- "\u201a": 600,
- "\u201c": 600,
- "\u201d": 600,
- "\u201e": 600,
- "\u2020": 600,
- "\u2021": 600,
- "\u2022": 600,
- "\u2026": 600,
- "\u2030": 600,
- "\u2039": 600,
- "\u203a": 600,
- "\u2044": 600,
- "\u2122": 600,
- "\u2202": 600,
- "\u2206": 600,
- "\u2211": 600,
- "\u2212": 600,
- "\u221a": 600,
- "\u2260": 600,
- "\u2264": 600,
- "\u2265": 600,
- "\u25ca": 600,
- "\uf6c3": 600,
- "\ufb01": 600,
- "\ufb02": 600,
- },
- ),
- "Helvetica": (
- {
- "FontName": "Helvetica",
- "Descent": -207.0,
- "FontBBox": (-166.0, -225.0, 1000.0, 931.0),
- "FontWeight": "Medium",
- "CapHeight": 718.0,
- "FontFamily": "Helvetica",
- "Flags": 0,
- "XHeight": 523.0,
- "ItalicAngle": 0.0,
- "Ascent": 718.0,
- },
- {
- " ": 278,
- "!": 278,
- '"': 355,
- "#": 556,
- "$": 556,
- "%": 889,
- "&": 667,
- "'": 191,
- "(": 333,
- ")": 333,
- "*": 389,
- "+": 584,
- ",": 278,
- "-": 333,
- ".": 278,
- "/": 278,
- "0": 556,
- "1": 556,
- "2": 556,
- "3": 556,
- "4": 556,
- "5": 556,
- "6": 556,
- "7": 556,
- "8": 556,
- "9": 556,
- ":": 278,
- ";": 278,
- "<": 584,
- "=": 584,
- ">": 584,
- "?": 556,
- "@": 1015,
- "A": 667,
- "B": 667,
- "C": 722,
- "D": 722,
- "E": 667,
- "F": 611,
- "G": 778,
- "H": 722,
- "I": 278,
- "J": 500,
- "K": 667,
- "L": 556,
- "M": 833,
- "N": 722,
- "O": 778,
- "P": 667,
- "Q": 778,
- "R": 722,
- "S": 667,
- "T": 611,
- "U": 722,
- "V": 667,
- "W": 944,
- "X": 667,
- "Y": 667,
- "Z": 611,
- "[": 278,
- "\\": 278,
- "]": 278,
- "^": 469,
- "_": 556,
- "`": 333,
- "a": 556,
- "b": 556,
- "c": 500,
- "d": 556,
- "e": 556,
- "f": 278,
- "g": 556,
- "h": 556,
- "i": 222,
- "j": 222,
- "k": 500,
- "l": 222,
- "m": 833,
- "n": 556,
- "o": 556,
- "p": 556,
- "q": 556,
- "r": 333,
- "s": 500,
- "t": 278,
- "u": 556,
- "v": 500,
- "w": 722,
- "x": 500,
- "y": 500,
- "z": 500,
- "{": 334,
- "|": 260,
- "}": 334,
- "~": 584,
- "\xa1": 333,
- "\xa2": 556,
- "\xa3": 556,
- "\xa4": 556,
- "\xa5": 556,
- "\xa6": 260,
- "\xa7": 556,
- "\xa8": 333,
- "\xa9": 737,
- "\xaa": 370,
- "\xab": 556,
- "\xac": 584,
- "\xae": 737,
- "\xaf": 333,
- "\xb0": 400,
- "\xb1": 584,
- "\xb2": 333,
- "\xb3": 333,
- "\xb4": 333,
- "\xb5": 556,
- "\xb6": 537,
- "\xb7": 278,
- "\xb8": 333,
- "\xb9": 333,
- "\xba": 365,
- "\xbb": 556,
- "\xbc": 834,
- "\xbd": 834,
- "\xbe": 834,
- "\xbf": 611,
- "\xc0": 667,
- "\xc1": 667,
- "\xc2": 667,
- "\xc3": 667,
- "\xc4": 667,
- "\xc5": 667,
- "\xc6": 1000,
- "\xc7": 722,
- "\xc8": 667,
- "\xc9": 667,
- "\xca": 667,
- "\xcb": 667,
- "\xcc": 278,
- "\xcd": 278,
- "\xce": 278,
- "\xcf": 278,
- "\xd0": 722,
- "\xd1": 722,
- "\xd2": 778,
- "\xd3": 778,
- "\xd4": 778,
- "\xd5": 778,
- "\xd6": 778,
- "\xd7": 584,
- "\xd8": 778,
- "\xd9": 722,
- "\xda": 722,
- "\xdb": 722,
- "\xdc": 722,
- "\xdd": 667,
- "\xde": 667,
- "\xdf": 611,
- "\xe0": 556,
- "\xe1": 556,
- "\xe2": 556,
- "\xe3": 556,
- "\xe4": 556,
- "\xe5": 556,
- "\xe6": 889,
- "\xe7": 500,
- "\xe8": 556,
- "\xe9": 556,
- "\xea": 556,
- "\xeb": 556,
- "\xec": 278,
- "\xed": 278,
- "\xee": 278,
- "\xef": 278,
- "\xf0": 556,
- "\xf1": 556,
- "\xf2": 556,
- "\xf3": 556,
- "\xf4": 556,
- "\xf5": 556,
- "\xf6": 556,
- "\xf7": 584,
- "\xf8": 611,
- "\xf9": 556,
- "\xfa": 556,
- "\xfb": 556,
- "\xfc": 556,
- "\xfd": 500,
- "\xfe": 556,
- "\xff": 500,
- "\u0100": 667,
- "\u0101": 556,
- "\u0102": 667,
- "\u0103": 556,
- "\u0104": 667,
- "\u0105": 556,
- "\u0106": 722,
- "\u0107": 500,
- "\u010c": 722,
- "\u010d": 500,
- "\u010e": 722,
- "\u010f": 643,
- "\u0110": 722,
- "\u0111": 556,
- "\u0112": 667,
- "\u0113": 556,
- "\u0116": 667,
- "\u0117": 556,
- "\u0118": 667,
- "\u0119": 556,
- "\u011a": 667,
- "\u011b": 556,
- "\u011e": 778,
- "\u011f": 556,
- "\u0122": 778,
- "\u0123": 556,
- "\u012a": 278,
- "\u012b": 278,
- "\u012e": 278,
- "\u012f": 222,
- "\u0130": 278,
- "\u0131": 278,
- "\u0136": 667,
- "\u0137": 500,
- "\u0139": 556,
- "\u013a": 222,
- "\u013b": 556,
- "\u013c": 222,
- "\u013d": 556,
- "\u013e": 299,
- "\u0141": 556,
- "\u0142": 222,
- "\u0143": 722,
- "\u0144": 556,
- "\u0145": 722,
- "\u0146": 556,
- "\u0147": 722,
- "\u0148": 556,
- "\u014c": 778,
- "\u014d": 556,
- "\u0150": 778,
- "\u0151": 556,
- "\u0152": 1000,
- "\u0153": 944,
- "\u0154": 722,
- "\u0155": 333,
- "\u0156": 722,
- "\u0157": 333,
- "\u0158": 722,
- "\u0159": 333,
- "\u015a": 667,
- "\u015b": 500,
- "\u015e": 667,
- "\u015f": 500,
- "\u0160": 667,
- "\u0161": 500,
- "\u0162": 611,
- "\u0163": 278,
- "\u0164": 611,
- "\u0165": 317,
- "\u016a": 722,
- "\u016b": 556,
- "\u016e": 722,
- "\u016f": 556,
- "\u0170": 722,
- "\u0171": 556,
- "\u0172": 722,
- "\u0173": 556,
- "\u0178": 667,
- "\u0179": 611,
- "\u017a": 500,
- "\u017b": 611,
- "\u017c": 500,
- "\u017d": 611,
- "\u017e": 500,
- "\u0192": 556,
- "\u0218": 667,
- "\u0219": 500,
- "\u02c6": 333,
- "\u02c7": 333,
- "\u02d8": 333,
- "\u02d9": 333,
- "\u02da": 333,
- "\u02db": 333,
- "\u02dc": 333,
- "\u02dd": 333,
- "\u2013": 556,
- "\u2014": 1000,
- "\u2018": 222,
- "\u2019": 222,
- "\u201a": 222,
- "\u201c": 333,
- "\u201d": 333,
- "\u201e": 333,
- "\u2020": 556,
- "\u2021": 556,
- "\u2022": 350,
- "\u2026": 1000,
- "\u2030": 1000,
- "\u2039": 333,
- "\u203a": 333,
- "\u2044": 167,
- "\u2122": 1000,
- "\u2202": 476,
- "\u2206": 612,
- "\u2211": 600,
- "\u2212": 584,
- "\u221a": 453,
- "\u2260": 549,
- "\u2264": 549,
- "\u2265": 549,
- "\u25ca": 471,
- "\uf6c3": 250,
- "\ufb01": 500,
- "\ufb02": 500,
- },
- ),
- "Helvetica-Bold": (
- {
- "FontName": "Helvetica-Bold",
- "Descent": -207.0,
- "FontBBox": (-170.0, -228.0, 1003.0, 962.0),
- "FontWeight": "Bold",
- "CapHeight": 718.0,
- "FontFamily": "Helvetica",
- "Flags": 0,
- "XHeight": 532.0,
- "ItalicAngle": 0.0,
- "Ascent": 718.0,
- },
- {
- " ": 278,
- "!": 333,
- '"': 474,
- "#": 556,
- "$": 556,
- "%": 889,
- "&": 722,
- "'": 238,
- "(": 333,
- ")": 333,
- "*": 389,
- "+": 584,
- ",": 278,
- "-": 333,
- ".": 278,
- "/": 278,
- "0": 556,
- "1": 556,
- "2": 556,
- "3": 556,
- "4": 556,
- "5": 556,
- "6": 556,
- "7": 556,
- "8": 556,
- "9": 556,
- ":": 333,
- ";": 333,
- "<": 584,
- "=": 584,
- ">": 584,
- "?": 611,
- "@": 975,
- "A": 722,
- "B": 722,
- "C": 722,
- "D": 722,
- "E": 667,
- "F": 611,
- "G": 778,
- "H": 722,
- "I": 278,
- "J": 556,
- "K": 722,
- "L": 611,
- "M": 833,
- "N": 722,
- "O": 778,
- "P": 667,
- "Q": 778,
- "R": 722,
- "S": 667,
- "T": 611,
- "U": 722,
- "V": 667,
- "W": 944,
- "X": 667,
- "Y": 667,
- "Z": 611,
- "[": 333,
- "\\": 278,
- "]": 333,
- "^": 584,
- "_": 556,
- "`": 333,
- "a": 556,
- "b": 611,
- "c": 556,
- "d": 611,
- "e": 556,
- "f": 333,
- "g": 611,
- "h": 611,
- "i": 278,
- "j": 278,
- "k": 556,
- "l": 278,
- "m": 889,
- "n": 611,
- "o": 611,
- "p": 611,
- "q": 611,
- "r": 389,
- "s": 556,
- "t": 333,
- "u": 611,
- "v": 556,
- "w": 778,
- "x": 556,
- "y": 556,
- "z": 500,
- "{": 389,
- "|": 280,
- "}": 389,
- "~": 584,
- "\xa1": 333,
- "\xa2": 556,
- "\xa3": 556,
- "\xa4": 556,
- "\xa5": 556,
- "\xa6": 280,
- "\xa7": 556,
- "\xa8": 333,
- "\xa9": 737,
- "\xaa": 370,
- "\xab": 556,
- "\xac": 584,
- "\xae": 737,
- "\xaf": 333,
- "\xb0": 400,
- "\xb1": 584,
- "\xb2": 333,
- "\xb3": 333,
- "\xb4": 333,
- "\xb5": 611,
- "\xb6": 556,
- "\xb7": 278,
- "\xb8": 333,
- "\xb9": 333,
- "\xba": 365,
- "\xbb": 556,
- "\xbc": 834,
- "\xbd": 834,
- "\xbe": 834,
- "\xbf": 611,
- "\xc0": 722,
- "\xc1": 722,
- "\xc2": 722,
- "\xc3": 722,
- "\xc4": 722,
- "\xc5": 722,
- "\xc6": 1000,
- "\xc7": 722,
- "\xc8": 667,
- "\xc9": 667,
- "\xca": 667,
- "\xcb": 667,
- "\xcc": 278,
- "\xcd": 278,
- "\xce": 278,
- "\xcf": 278,
- "\xd0": 722,
- "\xd1": 722,
- "\xd2": 778,
- "\xd3": 778,
- "\xd4": 778,
- "\xd5": 778,
- "\xd6": 778,
- "\xd7": 584,
- "\xd8": 778,
- "\xd9": 722,
- "\xda": 722,
- "\xdb": 722,
- "\xdc": 722,
- "\xdd": 667,
- "\xde": 667,
- "\xdf": 611,
- "\xe0": 556,
- "\xe1": 556,
- "\xe2": 556,
- "\xe3": 556,
- "\xe4": 556,
- "\xe5": 556,
- "\xe6": 889,
- "\xe7": 556,
- "\xe8": 556,
- "\xe9": 556,
- "\xea": 556,
- "\xeb": 556,
- "\xec": 278,
- "\xed": 278,
- "\xee": 278,
- "\xef": 278,
- "\xf0": 611,
- "\xf1": 611,
- "\xf2": 611,
- "\xf3": 611,
- "\xf4": 611,
- "\xf5": 611,
- "\xf6": 611,
- "\xf7": 584,
- "\xf8": 611,
- "\xf9": 611,
- "\xfa": 611,
- "\xfb": 611,
- "\xfc": 611,
- "\xfd": 556,
- "\xfe": 611,
- "\xff": 556,
- "\u0100": 722,
- "\u0101": 556,
- "\u0102": 722,
- "\u0103": 556,
- "\u0104": 722,
- "\u0105": 556,
- "\u0106": 722,
- "\u0107": 556,
- "\u010c": 722,
- "\u010d": 556,
- "\u010e": 722,
- "\u010f": 743,
- "\u0110": 722,
- "\u0111": 611,
- "\u0112": 667,
- "\u0113": 556,
- "\u0116": 667,
- "\u0117": 556,
- "\u0118": 667,
- "\u0119": 556,
- "\u011a": 667,
- "\u011b": 556,
- "\u011e": 778,
- "\u011f": 611,
- "\u0122": 778,
- "\u0123": 611,
- "\u012a": 278,
- "\u012b": 278,
- "\u012e": 278,
- "\u012f": 278,
- "\u0130": 278,
- "\u0131": 278,
- "\u0136": 722,
- "\u0137": 556,
- "\u0139": 611,
- "\u013a": 278,
- "\u013b": 611,
- "\u013c": 278,
- "\u013d": 611,
- "\u013e": 400,
- "\u0141": 611,
- "\u0142": 278,
- "\u0143": 722,
- "\u0144": 611,
- "\u0145": 722,
- "\u0146": 611,
- "\u0147": 722,
- "\u0148": 611,
- "\u014c": 778,
- "\u014d": 611,
- "\u0150": 778,
- "\u0151": 611,
- "\u0152": 1000,
- "\u0153": 944,
- "\u0154": 722,
- "\u0155": 389,
- "\u0156": 722,
- "\u0157": 389,
- "\u0158": 722,
- "\u0159": 389,
- "\u015a": 667,
- "\u015b": 556,
- "\u015e": 667,
- "\u015f": 556,
- "\u0160": 667,
- "\u0161": 556,
- "\u0162": 611,
- "\u0163": 333,
- "\u0164": 611,
- "\u0165": 389,
- "\u016a": 722,
- "\u016b": 611,
- "\u016e": 722,
- "\u016f": 611,
- "\u0170": 722,
- "\u0171": 611,
- "\u0172": 722,
- "\u0173": 611,
- "\u0178": 667,
- "\u0179": 611,
- "\u017a": 500,
- "\u017b": 611,
- "\u017c": 500,
- "\u017d": 611,
- "\u017e": 500,
- "\u0192": 556,
- "\u0218": 667,
- "\u0219": 556,
- "\u02c6": 333,
- "\u02c7": 333,
- "\u02d8": 333,
- "\u02d9": 333,
- "\u02da": 333,
- "\u02db": 333,
- "\u02dc": 333,
- "\u02dd": 333,
- "\u2013": 556,
- "\u2014": 1000,
- "\u2018": 278,
- "\u2019": 278,
- "\u201a": 278,
- "\u201c": 500,
- "\u201d": 500,
- "\u201e": 500,
- "\u2020": 556,
- "\u2021": 556,
- "\u2022": 350,
- "\u2026": 1000,
- "\u2030": 1000,
- "\u2039": 333,
- "\u203a": 333,
- "\u2044": 167,
- "\u2122": 1000,
- "\u2202": 494,
- "\u2206": 612,
- "\u2211": 600,
- "\u2212": 584,
- "\u221a": 549,
- "\u2260": 549,
- "\u2264": 549,
- "\u2265": 549,
- "\u25ca": 494,
- "\uf6c3": 250,
- "\ufb01": 611,
- "\ufb02": 611,
- },
- ),
- "Helvetica-BoldOblique": (
- {
- "FontName": "Helvetica-BoldOblique",
- "Descent": -207.0,
- "FontBBox": (-175.0, -228.0, 1114.0, 962.0),
- "FontWeight": "Bold",
- "CapHeight": 718.0,
- "FontFamily": "Helvetica",
- "Flags": 0,
- "XHeight": 532.0,
- "ItalicAngle": -12.0,
- "Ascent": 718.0,
- },
- {
- " ": 278,
- "!": 333,
- '"': 474,
- "#": 556,
- "$": 556,
- "%": 889,
- "&": 722,
- "'": 238,
- "(": 333,
- ")": 333,
- "*": 389,
- "+": 584,
- ",": 278,
- "-": 333,
- ".": 278,
- "/": 278,
- "0": 556,
- "1": 556,
- "2": 556,
- "3": 556,
- "4": 556,
- "5": 556,
- "6": 556,
- "7": 556,
- "8": 556,
- "9": 556,
- ":": 333,
- ";": 333,
- "<": 584,
- "=": 584,
- ">": 584,
- "?": 611,
- "@": 975,
- "A": 722,
- "B": 722,
- "C": 722,
- "D": 722,
- "E": 667,
- "F": 611,
- "G": 778,
- "H": 722,
- "I": 278,
- "J": 556,
- "K": 722,
- "L": 611,
- "M": 833,
- "N": 722,
- "O": 778,
- "P": 667,
- "Q": 778,
- "R": 722,
- "S": 667,
- "T": 611,
- "U": 722,
- "V": 667,
- "W": 944,
- "X": 667,
- "Y": 667,
- "Z": 611,
- "[": 333,
- "\\": 278,
- "]": 333,
- "^": 584,
- "_": 556,
- "`": 333,
- "a": 556,
- "b": 611,
- "c": 556,
- "d": 611,
- "e": 556,
- "f": 333,
- "g": 611,
- "h": 611,
- "i": 278,
- "j": 278,
- "k": 556,
- "l": 278,
- "m": 889,
- "n": 611,
- "o": 611,
- "p": 611,
- "q": 611,
- "r": 389,
- "s": 556,
- "t": 333,
- "u": 611,
- "v": 556,
- "w": 778,
- "x": 556,
- "y": 556,
- "z": 500,
- "{": 389,
- "|": 280,
- "}": 389,
- "~": 584,
- "\xa1": 333,
- "\xa2": 556,
- "\xa3": 556,
- "\xa4": 556,
- "\xa5": 556,
- "\xa6": 280,
- "\xa7": 556,
- "\xa8": 333,
- "\xa9": 737,
- "\xaa": 370,
- "\xab": 556,
- "\xac": 584,
- "\xae": 737,
- "\xaf": 333,
- "\xb0": 400,
- "\xb1": 584,
- "\xb2": 333,
- "\xb3": 333,
- "\xb4": 333,
- "\xb5": 611,
- "\xb6": 556,
- "\xb7": 278,
- "\xb8": 333,
- "\xb9": 333,
- "\xba": 365,
- "\xbb": 556,
- "\xbc": 834,
- "\xbd": 834,
- "\xbe": 834,
- "\xbf": 611,
- "\xc0": 722,
- "\xc1": 722,
- "\xc2": 722,
- "\xc3": 722,
- "\xc4": 722,
- "\xc5": 722,
- "\xc6": 1000,
- "\xc7": 722,
- "\xc8": 667,
- "\xc9": 667,
- "\xca": 667,
- "\xcb": 667,
- "\xcc": 278,
- "\xcd": 278,
- "\xce": 278,
- "\xcf": 278,
- "\xd0": 722,
- "\xd1": 722,
- "\xd2": 778,
- "\xd3": 778,
- "\xd4": 778,
- "\xd5": 778,
- "\xd6": 778,
- "\xd7": 584,
- "\xd8": 778,
- "\xd9": 722,
- "\xda": 722,
- "\xdb": 722,
- "\xdc": 722,
- "\xdd": 667,
- "\xde": 667,
- "\xdf": 611,
- "\xe0": 556,
- "\xe1": 556,
- "\xe2": 556,
- "\xe3": 556,
- "\xe4": 556,
- "\xe5": 556,
- "\xe6": 889,
- "\xe7": 556,
- "\xe8": 556,
- "\xe9": 556,
- "\xea": 556,
- "\xeb": 556,
- "\xec": 278,
- "\xed": 278,
- "\xee": 278,
- "\xef": 278,
- "\xf0": 611,
- "\xf1": 611,
- "\xf2": 611,
- "\xf3": 611,
- "\xf4": 611,
- "\xf5": 611,
- "\xf6": 611,
- "\xf7": 584,
- "\xf8": 611,
- "\xf9": 611,
- "\xfa": 611,
- "\xfb": 611,
- "\xfc": 611,
- "\xfd": 556,
- "\xfe": 611,
- "\xff": 556,
- "\u0100": 722,
- "\u0101": 556,
- "\u0102": 722,
- "\u0103": 556,
- "\u0104": 722,
- "\u0105": 556,
- "\u0106": 722,
- "\u0107": 556,
- "\u010c": 722,
- "\u010d": 556,
- "\u010e": 722,
- "\u010f": 743,
- "\u0110": 722,
- "\u0111": 611,
- "\u0112": 667,
- "\u0113": 556,
- "\u0116": 667,
- "\u0117": 556,
- "\u0118": 667,
- "\u0119": 556,
- "\u011a": 667,
- "\u011b": 556,
- "\u011e": 778,
- "\u011f": 611,
- "\u0122": 778,
- "\u0123": 611,
- "\u012a": 278,
- "\u012b": 278,
- "\u012e": 278,
- "\u012f": 278,
- "\u0130": 278,
- "\u0131": 278,
- "\u0136": 722,
- "\u0137": 556,
- "\u0139": 611,
- "\u013a": 278,
- "\u013b": 611,
- "\u013c": 278,
- "\u013d": 611,
- "\u013e": 400,
- "\u0141": 611,
- "\u0142": 278,
- "\u0143": 722,
- "\u0144": 611,
- "\u0145": 722,
- "\u0146": 611,
- "\u0147": 722,
- "\u0148": 611,
- "\u014c": 778,
- "\u014d": 611,
- "\u0150": 778,
- "\u0151": 611,
- "\u0152": 1000,
- "\u0153": 944,
- "\u0154": 722,
- "\u0155": 389,
- "\u0156": 722,
- "\u0157": 389,
- "\u0158": 722,
- "\u0159": 389,
- "\u015a": 667,
- "\u015b": 556,
- "\u015e": 667,
- "\u015f": 556,
- "\u0160": 667,
- "\u0161": 556,
- "\u0162": 611,
- "\u0163": 333,
- "\u0164": 611,
- "\u0165": 389,
- "\u016a": 722,
- "\u016b": 611,
- "\u016e": 722,
- "\u016f": 611,
- "\u0170": 722,
- "\u0171": 611,
- "\u0172": 722,
- "\u0173": 611,
- "\u0178": 667,
- "\u0179": 611,
- "\u017a": 500,
- "\u017b": 611,
- "\u017c": 500,
- "\u017d": 611,
- "\u017e": 500,
- "\u0192": 556,
- "\u0218": 667,
- "\u0219": 556,
- "\u02c6": 333,
- "\u02c7": 333,
- "\u02d8": 333,
- "\u02d9": 333,
- "\u02da": 333,
- "\u02db": 333,
- "\u02dc": 333,
- "\u02dd": 333,
- "\u2013": 556,
- "\u2014": 1000,
- "\u2018": 278,
- "\u2019": 278,
- "\u201a": 278,
- "\u201c": 500,
- "\u201d": 500,
- "\u201e": 500,
- "\u2020": 556,
- "\u2021": 556,
- "\u2022": 350,
- "\u2026": 1000,
- "\u2030": 1000,
- "\u2039": 333,
- "\u203a": 333,
- "\u2044": 167,
- "\u2122": 1000,
- "\u2202": 494,
- "\u2206": 612,
- "\u2211": 600,
- "\u2212": 584,
- "\u221a": 549,
- "\u2260": 549,
- "\u2264": 549,
- "\u2265": 549,
- "\u25ca": 494,
- "\uf6c3": 250,
- "\ufb01": 611,
- "\ufb02": 611,
- },
- ),
- "Helvetica-Oblique": (
- {
- "FontName": "Helvetica-Oblique",
- "Descent": -207.0,
- "FontBBox": (-171.0, -225.0, 1116.0, 931.0),
- "FontWeight": "Medium",
- "CapHeight": 718.0,
- "FontFamily": "Helvetica",
- "Flags": 0,
- "XHeight": 523.0,
- "ItalicAngle": -12.0,
- "Ascent": 718.0,
- },
- {
- " ": 278,
- "!": 278,
- '"': 355,
- "#": 556,
- "$": 556,
- "%": 889,
- "&": 667,
- "'": 191,
- "(": 333,
- ")": 333,
- "*": 389,
- "+": 584,
- ",": 278,
- "-": 333,
- ".": 278,
- "/": 278,
- "0": 556,
- "1": 556,
- "2": 556,
- "3": 556,
- "4": 556,
- "5": 556,
- "6": 556,
- "7": 556,
- "8": 556,
- "9": 556,
- ":": 278,
- ";": 278,
- "<": 584,
- "=": 584,
- ">": 584,
- "?": 556,
- "@": 1015,
- "A": 667,
- "B": 667,
- "C": 722,
- "D": 722,
- "E": 667,
- "F": 611,
- "G": 778,
- "H": 722,
- "I": 278,
- "J": 500,
- "K": 667,
- "L": 556,
- "M": 833,
- "N": 722,
- "O": 778,
- "P": 667,
- "Q": 778,
- "R": 722,
- "S": 667,
- "T": 611,
- "U": 722,
- "V": 667,
- "W": 944,
- "X": 667,
- "Y": 667,
- "Z": 611,
- "[": 278,
- "\\": 278,
- "]": 278,
- "^": 469,
- "_": 556,
- "`": 333,
- "a": 556,
- "b": 556,
- "c": 500,
- "d": 556,
- "e": 556,
- "f": 278,
- "g": 556,
- "h": 556,
- "i": 222,
- "j": 222,
- "k": 500,
- "l": 222,
- "m": 833,
- "n": 556,
- "o": 556,
- "p": 556,
- "q": 556,
- "r": 333,
- "s": 500,
- "t": 278,
- "u": 556,
- "v": 500,
- "w": 722,
- "x": 500,
- "y": 500,
- "z": 500,
- "{": 334,
- "|": 260,
- "}": 334,
- "~": 584,
- "\xa1": 333,
- "\xa2": 556,
- "\xa3": 556,
- "\xa4": 556,
- "\xa5": 556,
- "\xa6": 260,
- "\xa7": 556,
- "\xa8": 333,
- "\xa9": 737,
- "\xaa": 370,
- "\xab": 556,
- "\xac": 584,
- "\xae": 737,
- "\xaf": 333,
- "\xb0": 400,
- "\xb1": 584,
- "\xb2": 333,
- "\xb3": 333,
- "\xb4": 333,
- "\xb5": 556,
- "\xb6": 537,
- "\xb7": 278,
- "\xb8": 333,
- "\xb9": 333,
- "\xba": 365,
- "\xbb": 556,
- "\xbc": 834,
- "\xbd": 834,
- "\xbe": 834,
- "\xbf": 611,
- "\xc0": 667,
- "\xc1": 667,
- "\xc2": 667,
- "\xc3": 667,
- "\xc4": 667,
- "\xc5": 667,
- "\xc6": 1000,
- "\xc7": 722,
- "\xc8": 667,
- "\xc9": 667,
- "\xca": 667,
- "\xcb": 667,
- "\xcc": 278,
- "\xcd": 278,
- "\xce": 278,
- "\xcf": 278,
- "\xd0": 722,
- "\xd1": 722,
- "\xd2": 778,
- "\xd3": 778,
- "\xd4": 778,
- "\xd5": 778,
- "\xd6": 778,
- "\xd7": 584,
- "\xd8": 778,
- "\xd9": 722,
- "\xda": 722,
- "\xdb": 722,
- "\xdc": 722,
- "\xdd": 667,
- "\xde": 667,
- "\xdf": 611,
- "\xe0": 556,
- "\xe1": 556,
- "\xe2": 556,
- "\xe3": 556,
- "\xe4": 556,
- "\xe5": 556,
- "\xe6": 889,
- "\xe7": 500,
- "\xe8": 556,
- "\xe9": 556,
- "\xea": 556,
- "\xeb": 556,
- "\xec": 278,
- "\xed": 278,
- "\xee": 278,
- "\xef": 278,
- "\xf0": 556,
- "\xf1": 556,
- "\xf2": 556,
- "\xf3": 556,
- "\xf4": 556,
- "\xf5": 556,
- "\xf6": 556,
- "\xf7": 584,
- "\xf8": 611,
- "\xf9": 556,
- "\xfa": 556,
- "\xfb": 556,
- "\xfc": 556,
- "\xfd": 500,
- "\xfe": 556,
- "\xff": 500,
- "\u0100": 667,
- "\u0101": 556,
- "\u0102": 667,
- "\u0103": 556,
- "\u0104": 667,
- "\u0105": 556,
- "\u0106": 722,
- "\u0107": 500,
- "\u010c": 722,
- "\u010d": 500,
- "\u010e": 722,
- "\u010f": 643,
- "\u0110": 722,
- "\u0111": 556,
- "\u0112": 667,
- "\u0113": 556,
- "\u0116": 667,
- "\u0117": 556,
- "\u0118": 667,
- "\u0119": 556,
- "\u011a": 667,
- "\u011b": 556,
- "\u011e": 778,
- "\u011f": 556,
- "\u0122": 778,
- "\u0123": 556,
- "\u012a": 278,
- "\u012b": 278,
- "\u012e": 278,
- "\u012f": 222,
- "\u0130": 278,
- "\u0131": 278,
- "\u0136": 667,
- "\u0137": 500,
- "\u0139": 556,
- "\u013a": 222,
- "\u013b": 556,
- "\u013c": 222,
- "\u013d": 556,
- "\u013e": 299,
- "\u0141": 556,
- "\u0142": 222,
- "\u0143": 722,
- "\u0144": 556,
- "\u0145": 722,
- "\u0146": 556,
- "\u0147": 722,
- "\u0148": 556,
- "\u014c": 778,
- "\u014d": 556,
- "\u0150": 778,
- "\u0151": 556,
- "\u0152": 1000,
- "\u0153": 944,
- "\u0154": 722,
- "\u0155": 333,
- "\u0156": 722,
- "\u0157": 333,
- "\u0158": 722,
- "\u0159": 333,
- "\u015a": 667,
- "\u015b": 500,
- "\u015e": 667,
- "\u015f": 500,
- "\u0160": 667,
- "\u0161": 500,
- "\u0162": 611,
- "\u0163": 278,
- "\u0164": 611,
- "\u0165": 317,
- "\u016a": 722,
- "\u016b": 556,
- "\u016e": 722,
- "\u016f": 556,
- "\u0170": 722,
- "\u0171": 556,
- "\u0172": 722,
- "\u0173": 556,
- "\u0178": 667,
- "\u0179": 611,
- "\u017a": 500,
- "\u017b": 611,
- "\u017c": 500,
- "\u017d": 611,
- "\u017e": 500,
- "\u0192": 556,
- "\u0218": 667,
- "\u0219": 500,
- "\u02c6": 333,
- "\u02c7": 333,
- "\u02d8": 333,
- "\u02d9": 333,
- "\u02da": 333,
- "\u02db": 333,
- "\u02dc": 333,
- "\u02dd": 333,
- "\u2013": 556,
- "\u2014": 1000,
- "\u2018": 222,
- "\u2019": 222,
- "\u201a": 222,
- "\u201c": 333,
- "\u201d": 333,
- "\u201e": 333,
- "\u2020": 556,
- "\u2021": 556,
- "\u2022": 350,
- "\u2026": 1000,
- "\u2030": 1000,
- "\u2039": 333,
- "\u203a": 333,
- "\u2044": 167,
- "\u2122": 1000,
- "\u2202": 476,
- "\u2206": 612,
- "\u2211": 600,
- "\u2212": 584,
- "\u221a": 453,
- "\u2260": 549,
- "\u2264": 549,
- "\u2265": 549,
- "\u25ca": 471,
- "\uf6c3": 250,
- "\ufb01": 500,
- "\ufb02": 500,
- },
- ),
- "Symbol": (
- {
- "FontName": "Symbol",
- "FontBBox": (-180.0, -293.0, 1090.0, 1010.0),
- "FontWeight": "Medium",
- "FontFamily": "Symbol",
- "Flags": 0,
- "ItalicAngle": 0.0,
- },
- {
- " ": 250,
- "!": 333,
- "#": 500,
- "%": 833,
- "&": 778,
- "(": 333,
- ")": 333,
- "+": 549,
- ",": 250,
- ".": 250,
- "/": 278,
- "0": 500,
- "1": 500,
- "2": 500,
- "3": 500,
- "4": 500,
- "5": 500,
- "6": 500,
- "7": 500,
- "8": 500,
- "9": 500,
- ":": 278,
- ";": 278,
- "<": 549,
- "=": 549,
- ">": 549,
- "?": 444,
- "[": 333,
- "]": 333,
- "_": 500,
- "{": 480,
- "|": 200,
- "}": 480,
- "\xac": 713,
- "\xb0": 400,
- "\xb1": 549,
- "\xb5": 576,
- "\xd7": 549,
- "\xf7": 549,
- "\u0192": 500,
- "\u0391": 722,
- "\u0392": 667,
- "\u0393": 603,
- "\u0395": 611,
- "\u0396": 611,
- "\u0397": 722,
- "\u0398": 741,
- "\u0399": 333,
- "\u039a": 722,
- "\u039b": 686,
- "\u039c": 889,
- "\u039d": 722,
- "\u039e": 645,
- "\u039f": 722,
- "\u03a0": 768,
- "\u03a1": 556,
- "\u03a3": 592,
- "\u03a4": 611,
- "\u03a5": 690,
- "\u03a6": 763,
- "\u03a7": 722,
- "\u03a8": 795,
- "\u03b1": 631,
- "\u03b2": 549,
- "\u03b3": 411,
- "\u03b4": 494,
- "\u03b5": 439,
- "\u03b6": 494,
- "\u03b7": 603,
- "\u03b8": 521,
- "\u03b9": 329,
- "\u03ba": 549,
- "\u03bb": 549,
- "\u03bd": 521,
- "\u03be": 493,
- "\u03bf": 549,
- "\u03c0": 549,
- "\u03c1": 549,
- "\u03c2": 439,
- "\u03c3": 603,
- "\u03c4": 439,
- "\u03c5": 576,
- "\u03c6": 521,
- "\u03c7": 549,
- "\u03c8": 686,
- "\u03c9": 686,
- "\u03d1": 631,
- "\u03d2": 620,
- "\u03d5": 603,
- "\u03d6": 713,
- "\u2022": 460,
- "\u2026": 1000,
- "\u2032": 247,
- "\u2033": 411,
- "\u2044": 167,
- "\u20ac": 750,
- "\u2111": 686,
- "\u2118": 987,
- "\u211c": 795,
- "\u2126": 768,
- "\u2135": 823,
- "\u2190": 987,
- "\u2191": 603,
- "\u2192": 987,
- "\u2193": 603,
- "\u2194": 1042,
- "\u21b5": 658,
- "\u21d0": 987,
- "\u21d1": 603,
- "\u21d2": 987,
- "\u21d3": 603,
- "\u21d4": 1042,
- "\u2200": 713,
- "\u2202": 494,
- "\u2203": 549,
- "\u2205": 823,
- "\u2206": 612,
- "\u2207": 713,
- "\u2208": 713,
- "\u2209": 713,
- "\u220b": 439,
- "\u220f": 823,
- "\u2211": 713,
- "\u2212": 549,
- "\u2217": 500,
- "\u221a": 549,
- "\u221d": 713,
- "\u221e": 713,
- "\u2220": 768,
- "\u2227": 603,
- "\u2228": 603,
- "\u2229": 768,
- "\u222a": 768,
- "\u222b": 274,
- "\u2234": 863,
- "\u223c": 549,
- "\u2245": 549,
- "\u2248": 549,
- "\u2260": 549,
- "\u2261": 549,
- "\u2264": 549,
- "\u2265": 549,
- "\u2282": 713,
- "\u2283": 713,
- "\u2284": 713,
- "\u2286": 713,
- "\u2287": 713,
- "\u2295": 768,
- "\u2297": 768,
- "\u22a5": 658,
- "\u22c5": 250,
- "\u2320": 686,
- "\u2321": 686,
- "\u2329": 329,
- "\u232a": 329,
- "\u25ca": 494,
- "\u2660": 753,
- "\u2663": 753,
- "\u2665": 753,
- "\u2666": 753,
- "\uf6d9": 790,
- "\uf6da": 790,
- "\uf6db": 890,
- "\uf8e5": 500,
- "\uf8e6": 603,
- "\uf8e7": 1000,
- "\uf8e8": 790,
- "\uf8e9": 790,
- "\uf8ea": 786,
- "\uf8eb": 384,
- "\uf8ec": 384,
- "\uf8ed": 384,
- "\uf8ee": 384,
- "\uf8ef": 384,
- "\uf8f0": 384,
- "\uf8f1": 494,
- "\uf8f2": 494,
- "\uf8f3": 494,
- "\uf8f4": 494,
- "\uf8f5": 686,
- "\uf8f6": 384,
- "\uf8f7": 384,
- "\uf8f8": 384,
- "\uf8f9": 384,
- "\uf8fa": 384,
- "\uf8fb": 384,
- "\uf8fc": 494,
- "\uf8fd": 494,
- "\uf8fe": 494,
- "\uf8ff": 790,
- },
- ),
- "Times-Bold": (
- {
- "FontName": "Times-Bold",
- "Descent": -217.0,
- "FontBBox": (-168.0, -218.0, 1000.0, 935.0),
- "FontWeight": "Bold",
- "CapHeight": 676.0,
- "FontFamily": "Times",
- "Flags": 0,
- "XHeight": 461.0,
- "ItalicAngle": 0.0,
- "Ascent": 683.0,
- },
- {
- " ": 250,
- "!": 333,
- '"': 555,
- "#": 500,
- "$": 500,
- "%": 1000,
- "&": 833,
- "'": 278,
- "(": 333,
- ")": 333,
- "*": 500,
- "+": 570,
- ",": 250,
- "-": 333,
- ".": 250,
- "/": 278,
- "0": 500,
- "1": 500,
- "2": 500,
- "3": 500,
- "4": 500,
- "5": 500,
- "6": 500,
- "7": 500,
- "8": 500,
- "9": 500,
- ":": 333,
- ";": 333,
- "<": 570,
- "=": 570,
- ">": 570,
- "?": 500,
- "@": 930,
- "A": 722,
- "B": 667,
- "C": 722,
- "D": 722,
- "E": 667,
- "F": 611,
- "G": 778,
- "H": 778,
- "I": 389,
- "J": 500,
- "K": 778,
- "L": 667,
- "M": 944,
- "N": 722,
- "O": 778,
- "P": 611,
- "Q": 778,
- "R": 722,
- "S": 556,
- "T": 667,
- "U": 722,
- "V": 722,
- "W": 1000,
- "X": 722,
- "Y": 722,
- "Z": 667,
- "[": 333,
- "\\": 278,
- "]": 333,
- "^": 581,
- "_": 500,
- "`": 333,
- "a": 500,
- "b": 556,
- "c": 444,
- "d": 556,
- "e": 444,
- "f": 333,
- "g": 500,
- "h": 556,
- "i": 278,
- "j": 333,
- "k": 556,
- "l": 278,
- "m": 833,
- "n": 556,
- "o": 500,
- "p": 556,
- "q": 556,
- "r": 444,
- "s": 389,
- "t": 333,
- "u": 556,
- "v": 500,
- "w": 722,
- "x": 500,
- "y": 500,
- "z": 444,
- "{": 394,
- "|": 220,
- "}": 394,
- "~": 520,
- "\xa1": 333,
- "\xa2": 500,
- "\xa3": 500,
- "\xa4": 500,
- "\xa5": 500,
- "\xa6": 220,
- "\xa7": 500,
- "\xa8": 333,
- "\xa9": 747,
- "\xaa": 300,
- "\xab": 500,
- "\xac": 570,
- "\xae": 747,
- "\xaf": 333,
- "\xb0": 400,
- "\xb1": 570,
- "\xb2": 300,
- "\xb3": 300,
- "\xb4": 333,
- "\xb5": 556,
- "\xb6": 540,
- "\xb7": 250,
- "\xb8": 333,
- "\xb9": 300,
- "\xba": 330,
- "\xbb": 500,
- "\xbc": 750,
- "\xbd": 750,
- "\xbe": 750,
- "\xbf": 500,
- "\xc0": 722,
- "\xc1": 722,
- "\xc2": 722,
- "\xc3": 722,
- "\xc4": 722,
- "\xc5": 722,
- "\xc6": 1000,
- "\xc7": 722,
- "\xc8": 667,
- "\xc9": 667,
- "\xca": 667,
- "\xcb": 667,
- "\xcc": 389,
- "\xcd": 389,
- "\xce": 389,
- "\xcf": 389,
- "\xd0": 722,
- "\xd1": 722,
- "\xd2": 778,
- "\xd3": 778,
- "\xd4": 778,
- "\xd5": 778,
- "\xd6": 778,
- "\xd7": 570,
- "\xd8": 778,
- "\xd9": 722,
- "\xda": 722,
- "\xdb": 722,
- "\xdc": 722,
- "\xdd": 722,
- "\xde": 611,
- "\xdf": 556,
- "\xe0": 500,
- "\xe1": 500,
- "\xe2": 500,
- "\xe3": 500,
- "\xe4": 500,
- "\xe5": 500,
- "\xe6": 722,
- "\xe7": 444,
- "\xe8": 444,
- "\xe9": 444,
- "\xea": 444,
- "\xeb": 444,
- "\xec": 278,
- "\xed": 278,
- "\xee": 278,
- "\xef": 278,
- "\xf0": 500,
- "\xf1": 556,
- "\xf2": 500,
- "\xf3": 500,
- "\xf4": 500,
- "\xf5": 500,
- "\xf6": 500,
- "\xf7": 570,
- "\xf8": 500,
- "\xf9": 556,
- "\xfa": 556,
- "\xfb": 556,
- "\xfc": 556,
- "\xfd": 500,
- "\xfe": 556,
- "\xff": 500,
- "\u0100": 722,
- "\u0101": 500,
- "\u0102": 722,
- "\u0103": 500,
- "\u0104": 722,
- "\u0105": 500,
- "\u0106": 722,
- "\u0107": 444,
- "\u010c": 722,
- "\u010d": 444,
- "\u010e": 722,
- "\u010f": 672,
- "\u0110": 722,
- "\u0111": 556,
- "\u0112": 667,
- "\u0113": 444,
- "\u0116": 667,
- "\u0117": 444,
- "\u0118": 667,
- "\u0119": 444,
- "\u011a": 667,
- "\u011b": 444,
- "\u011e": 778,
- "\u011f": 500,
- "\u0122": 778,
- "\u0123": 500,
- "\u012a": 389,
- "\u012b": 278,
- "\u012e": 389,
- "\u012f": 278,
- "\u0130": 389,
- "\u0131": 278,
- "\u0136": 778,
- "\u0137": 556,
- "\u0139": 667,
- "\u013a": 278,
- "\u013b": 667,
- "\u013c": 278,
- "\u013d": 667,
- "\u013e": 394,
- "\u0141": 667,
- "\u0142": 278,
- "\u0143": 722,
- "\u0144": 556,
- "\u0145": 722,
- "\u0146": 556,
- "\u0147": 722,
- "\u0148": 556,
- "\u014c": 778,
- "\u014d": 500,
- "\u0150": 778,
- "\u0151": 500,
- "\u0152": 1000,
- "\u0153": 722,
- "\u0154": 722,
- "\u0155": 444,
- "\u0156": 722,
- "\u0157": 444,
- "\u0158": 722,
- "\u0159": 444,
- "\u015a": 556,
- "\u015b": 389,
- "\u015e": 556,
- "\u015f": 389,
- "\u0160": 556,
- "\u0161": 389,
- "\u0162": 667,
- "\u0163": 333,
- "\u0164": 667,
- "\u0165": 416,
- "\u016a": 722,
- "\u016b": 556,
- "\u016e": 722,
- "\u016f": 556,
- "\u0170": 722,
- "\u0171": 556,
- "\u0172": 722,
- "\u0173": 556,
- "\u0178": 722,
- "\u0179": 667,
- "\u017a": 444,
- "\u017b": 667,
- "\u017c": 444,
- "\u017d": 667,
- "\u017e": 444,
- "\u0192": 500,
- "\u0218": 556,
- "\u0219": 389,
- "\u02c6": 333,
- "\u02c7": 333,
- "\u02d8": 333,
- "\u02d9": 333,
- "\u02da": 333,
- "\u02db": 333,
- "\u02dc": 333,
- "\u02dd": 333,
- "\u2013": 500,
- "\u2014": 1000,
- "\u2018": 333,
- "\u2019": 333,
- "\u201a": 333,
- "\u201c": 500,
- "\u201d": 500,
- "\u201e": 500,
- "\u2020": 500,
- "\u2021": 500,
- "\u2022": 350,
- "\u2026": 1000,
- "\u2030": 1000,
- "\u2039": 333,
- "\u203a": 333,
- "\u2044": 167,
- "\u2122": 1000,
- "\u2202": 494,
- "\u2206": 612,
- "\u2211": 600,
- "\u2212": 570,
- "\u221a": 549,
- "\u2260": 549,
- "\u2264": 549,
- "\u2265": 549,
- "\u25ca": 494,
- "\uf6c3": 250,
- "\ufb01": 556,
- "\ufb02": 556,
- },
- ),
- "Times-BoldItalic": (
- {
- "FontName": "Times-BoldItalic",
- "Descent": -217.0,
- "FontBBox": (-200.0, -218.0, 996.0, 921.0),
- "FontWeight": "Bold",
- "CapHeight": 669.0,
- "FontFamily": "Times",
- "Flags": 0,
- "XHeight": 462.0,
- "ItalicAngle": -15.0,
- "Ascent": 683.0,
- },
- {
- " ": 250,
- "!": 389,
- '"': 555,
- "#": 500,
- "$": 500,
- "%": 833,
- "&": 778,
- "'": 278,
- "(": 333,
- ")": 333,
- "*": 500,
- "+": 570,
- ",": 250,
- "-": 333,
- ".": 250,
- "/": 278,
- "0": 500,
- "1": 500,
- "2": 500,
- "3": 500,
- "4": 500,
- "5": 500,
- "6": 500,
- "7": 500,
- "8": 500,
- "9": 500,
- ":": 333,
- ";": 333,
- "<": 570,
- "=": 570,
- ">": 570,
- "?": 500,
- "@": 832,
- "A": 667,
- "B": 667,
- "C": 667,
- "D": 722,
- "E": 667,
- "F": 667,
- "G": 722,
- "H": 778,
- "I": 389,
- "J": 500,
- "K": 667,
- "L": 611,
- "M": 889,
- "N": 722,
- "O": 722,
- "P": 611,
- "Q": 722,
- "R": 667,
- "S": 556,
- "T": 611,
- "U": 722,
- "V": 667,
- "W": 889,
- "X": 667,
- "Y": 611,
- "Z": 611,
- "[": 333,
- "\\": 278,
- "]": 333,
- "^": 570,
- "_": 500,
- "`": 333,
- "a": 500,
- "b": 500,
- "c": 444,
- "d": 500,
- "e": 444,
- "f": 333,
- "g": 500,
- "h": 556,
- "i": 278,
- "j": 278,
- "k": 500,
- "l": 278,
- "m": 778,
- "n": 556,
- "o": 500,
- "p": 500,
- "q": 500,
- "r": 389,
- "s": 389,
- "t": 278,
- "u": 556,
- "v": 444,
- "w": 667,
- "x": 500,
- "y": 444,
- "z": 389,
- "{": 348,
- "|": 220,
- "}": 348,
- "~": 570,
- "\xa1": 389,
- "\xa2": 500,
- "\xa3": 500,
- "\xa4": 500,
- "\xa5": 500,
- "\xa6": 220,
- "\xa7": 500,
- "\xa8": 333,
- "\xa9": 747,
- "\xaa": 266,
- "\xab": 500,
- "\xac": 606,
- "\xae": 747,
- "\xaf": 333,
- "\xb0": 400,
- "\xb1": 570,
- "\xb2": 300,
- "\xb3": 300,
- "\xb4": 333,
- "\xb5": 576,
- "\xb6": 500,
- "\xb7": 250,
- "\xb8": 333,
- "\xb9": 300,
- "\xba": 300,
- "\xbb": 500,
- "\xbc": 750,
- "\xbd": 750,
- "\xbe": 750,
- "\xbf": 500,
- "\xc0": 667,
- "\xc1": 667,
- "\xc2": 667,
- "\xc3": 667,
- "\xc4": 667,
- "\xc5": 667,
- "\xc6": 944,
- "\xc7": 667,
- "\xc8": 667,
- "\xc9": 667,
- "\xca": 667,
- "\xcb": 667,
- "\xcc": 389,
- "\xcd": 389,
- "\xce": 389,
- "\xcf": 389,
- "\xd0": 722,
- "\xd1": 722,
- "\xd2": 722,
- "\xd3": 722,
- "\xd4": 722,
- "\xd5": 722,
- "\xd6": 722,
- "\xd7": 570,
- "\xd8": 722,
- "\xd9": 722,
- "\xda": 722,
- "\xdb": 722,
- "\xdc": 722,
- "\xdd": 611,
- "\xde": 611,
- "\xdf": 500,
- "\xe0": 500,
- "\xe1": 500,
- "\xe2": 500,
- "\xe3": 500,
- "\xe4": 500,
- "\xe5": 500,
- "\xe6": 722,
- "\xe7": 444,
- "\xe8": 444,
- "\xe9": 444,
- "\xea": 444,
- "\xeb": 444,
- "\xec": 278,
- "\xed": 278,
- "\xee": 278,
- "\xef": 278,
- "\xf0": 500,
- "\xf1": 556,
- "\xf2": 500,
- "\xf3": 500,
- "\xf4": 500,
- "\xf5": 500,
- "\xf6": 500,
- "\xf7": 570,
- "\xf8": 500,
- "\xf9": 556,
- "\xfa": 556,
- "\xfb": 556,
- "\xfc": 556,
- "\xfd": 444,
- "\xfe": 500,
- "\xff": 444,
- "\u0100": 667,
- "\u0101": 500,
- "\u0102": 667,
- "\u0103": 500,
- "\u0104": 667,
- "\u0105": 500,
- "\u0106": 667,
- "\u0107": 444,
- "\u010c": 667,
- "\u010d": 444,
- "\u010e": 722,
- "\u010f": 608,
- "\u0110": 722,
- "\u0111": 500,
- "\u0112": 667,
- "\u0113": 444,
- "\u0116": 667,
- "\u0117": 444,
- "\u0118": 667,
- "\u0119": 444,
- "\u011a": 667,
- "\u011b": 444,
- "\u011e": 722,
- "\u011f": 500,
- "\u0122": 722,
- "\u0123": 500,
- "\u012a": 389,
- "\u012b": 278,
- "\u012e": 389,
- "\u012f": 278,
- "\u0130": 389,
- "\u0131": 278,
- "\u0136": 667,
- "\u0137": 500,
- "\u0139": 611,
- "\u013a": 278,
- "\u013b": 611,
- "\u013c": 278,
- "\u013d": 611,
- "\u013e": 382,
- "\u0141": 611,
- "\u0142": 278,
- "\u0143": 722,
- "\u0144": 556,
- "\u0145": 722,
- "\u0146": 556,
- "\u0147": 722,
- "\u0148": 556,
- "\u014c": 722,
- "\u014d": 500,
- "\u0150": 722,
- "\u0151": 500,
- "\u0152": 944,
- "\u0153": 722,
- "\u0154": 667,
- "\u0155": 389,
- "\u0156": 667,
- "\u0157": 389,
- "\u0158": 667,
- "\u0159": 389,
- "\u015a": 556,
- "\u015b": 389,
- "\u015e": 556,
- "\u015f": 389,
- "\u0160": 556,
- "\u0161": 389,
- "\u0162": 611,
- "\u0163": 278,
- "\u0164": 611,
- "\u0165": 366,
- "\u016a": 722,
- "\u016b": 556,
- "\u016e": 722,
- "\u016f": 556,
- "\u0170": 722,
- "\u0171": 556,
- "\u0172": 722,
- "\u0173": 556,
- "\u0178": 611,
- "\u0179": 611,
- "\u017a": 389,
- "\u017b": 611,
- "\u017c": 389,
- "\u017d": 611,
- "\u017e": 389,
- "\u0192": 500,
- "\u0218": 556,
- "\u0219": 389,
- "\u02c6": 333,
- "\u02c7": 333,
- "\u02d8": 333,
- "\u02d9": 333,
- "\u02da": 333,
- "\u02db": 333,
- "\u02dc": 333,
- "\u02dd": 333,
- "\u2013": 500,
- "\u2014": 1000,
- "\u2018": 333,
- "\u2019": 333,
- "\u201a": 333,
- "\u201c": 500,
- "\u201d": 500,
- "\u201e": 500,
- "\u2020": 500,
- "\u2021": 500,
- "\u2022": 350,
- "\u2026": 1000,
- "\u2030": 1000,
- "\u2039": 333,
- "\u203a": 333,
- "\u2044": 167,
- "\u2122": 1000,
- "\u2202": 494,
- "\u2206": 612,
- "\u2211": 600,
- "\u2212": 606,
- "\u221a": 549,
- "\u2260": 549,
- "\u2264": 549,
- "\u2265": 549,
- "\u25ca": 494,
- "\uf6c3": 250,
- "\ufb01": 556,
- "\ufb02": 556,
- },
- ),
- "Times-Italic": (
- {
- "FontName": "Times-Italic",
- "Descent": -217.0,
- "FontBBox": (-169.0, -217.0, 1010.0, 883.0),
- "FontWeight": "Medium",
- "CapHeight": 653.0,
- "FontFamily": "Times",
- "Flags": 0,
- "XHeight": 441.0,
- "ItalicAngle": -15.5,
- "Ascent": 683.0,
- },
- {
- " ": 250,
- "!": 333,
- '"': 420,
- "#": 500,
- "$": 500,
- "%": 833,
- "&": 778,
- "'": 214,
- "(": 333,
- ")": 333,
- "*": 500,
- "+": 675,
- ",": 250,
- "-": 333,
- ".": 250,
- "/": 278,
- "0": 500,
- "1": 500,
- "2": 500,
- "3": 500,
- "4": 500,
- "5": 500,
- "6": 500,
- "7": 500,
- "8": 500,
- "9": 500,
- ":": 333,
- ";": 333,
- "<": 675,
- "=": 675,
- ">": 675,
- "?": 500,
- "@": 920,
- "A": 611,
- "B": 611,
- "C": 667,
- "D": 722,
- "E": 611,
- "F": 611,
- "G": 722,
- "H": 722,
- "I": 333,
- "J": 444,
- "K": 667,
- "L": 556,
- "M": 833,
- "N": 667,
- "O": 722,
- "P": 611,
- "Q": 722,
- "R": 611,
- "S": 500,
- "T": 556,
- "U": 722,
- "V": 611,
- "W": 833,
- "X": 611,
- "Y": 556,
- "Z": 556,
- "[": 389,
- "\\": 278,
- "]": 389,
- "^": 422,
- "_": 500,
- "`": 333,
- "a": 500,
- "b": 500,
- "c": 444,
- "d": 500,
- "e": 444,
- "f": 278,
- "g": 500,
- "h": 500,
- "i": 278,
- "j": 278,
- "k": 444,
- "l": 278,
- "m": 722,
- "n": 500,
- "o": 500,
- "p": 500,
- "q": 500,
- "r": 389,
- "s": 389,
- "t": 278,
- "u": 500,
- "v": 444,
- "w": 667,
- "x": 444,
- "y": 444,
- "z": 389,
- "{": 400,
- "|": 275,
- "}": 400,
- "~": 541,
- "\xa1": 389,
- "\xa2": 500,
- "\xa3": 500,
- "\xa4": 500,
- "\xa5": 500,
- "\xa6": 275,
- "\xa7": 500,
- "\xa8": 333,
- "\xa9": 760,
- "\xaa": 276,
- "\xab": 500,
- "\xac": 675,
- "\xae": 760,
- "\xaf": 333,
- "\xb0": 400,
- "\xb1": 675,
- "\xb2": 300,
- "\xb3": 300,
- "\xb4": 333,
- "\xb5": 500,
- "\xb6": 523,
- "\xb7": 250,
- "\xb8": 333,
- "\xb9": 300,
- "\xba": 310,
- "\xbb": 500,
- "\xbc": 750,
- "\xbd": 750,
- "\xbe": 750,
- "\xbf": 500,
- "\xc0": 611,
- "\xc1": 611,
- "\xc2": 611,
- "\xc3": 611,
- "\xc4": 611,
- "\xc5": 611,
- "\xc6": 889,
- "\xc7": 667,
- "\xc8": 611,
- "\xc9": 611,
- "\xca": 611,
- "\xcb": 611,
- "\xcc": 333,
- "\xcd": 333,
- "\xce": 333,
- "\xcf": 333,
- "\xd0": 722,
- "\xd1": 667,
- "\xd2": 722,
- "\xd3": 722,
- "\xd4": 722,
- "\xd5": 722,
- "\xd6": 722,
- "\xd7": 675,
- "\xd8": 722,
- "\xd9": 722,
- "\xda": 722,
- "\xdb": 722,
- "\xdc": 722,
- "\xdd": 556,
- "\xde": 611,
- "\xdf": 500,
- "\xe0": 500,
- "\xe1": 500,
- "\xe2": 500,
- "\xe3": 500,
- "\xe4": 500,
- "\xe5": 500,
- "\xe6": 667,
- "\xe7": 444,
- "\xe8": 444,
- "\xe9": 444,
- "\xea": 444,
- "\xeb": 444,
- "\xec": 278,
- "\xed": 278,
- "\xee": 278,
- "\xef": 278,
- "\xf0": 500,
- "\xf1": 500,
- "\xf2": 500,
- "\xf3": 500,
- "\xf4": 500,
- "\xf5": 500,
- "\xf6": 500,
- "\xf7": 675,
- "\xf8": 500,
- "\xf9": 500,
- "\xfa": 500,
- "\xfb": 500,
- "\xfc": 500,
- "\xfd": 444,
- "\xfe": 500,
- "\xff": 444,
- "\u0100": 611,
- "\u0101": 500,
- "\u0102": 611,
- "\u0103": 500,
- "\u0104": 611,
- "\u0105": 500,
- "\u0106": 667,
- "\u0107": 444,
- "\u010c": 667,
- "\u010d": 444,
- "\u010e": 722,
- "\u010f": 544,
- "\u0110": 722,
- "\u0111": 500,
- "\u0112": 611,
- "\u0113": 444,
- "\u0116": 611,
- "\u0117": 444,
- "\u0118": 611,
- "\u0119": 444,
- "\u011a": 611,
- "\u011b": 444,
- "\u011e": 722,
- "\u011f": 500,
- "\u0122": 722,
- "\u0123": 500,
- "\u012a": 333,
- "\u012b": 278,
- "\u012e": 333,
- "\u012f": 278,
- "\u0130": 333,
- "\u0131": 278,
- "\u0136": 667,
- "\u0137": 444,
- "\u0139": 556,
- "\u013a": 278,
- "\u013b": 556,
- "\u013c": 278,
- "\u013d": 611,
- "\u013e": 300,
- "\u0141": 556,
- "\u0142": 278,
- "\u0143": 667,
- "\u0144": 500,
- "\u0145": 667,
- "\u0146": 500,
- "\u0147": 667,
- "\u0148": 500,
- "\u014c": 722,
- "\u014d": 500,
- "\u0150": 722,
- "\u0151": 500,
- "\u0152": 944,
- "\u0153": 667,
- "\u0154": 611,
- "\u0155": 389,
- "\u0156": 611,
- "\u0157": 389,
- "\u0158": 611,
- "\u0159": 389,
- "\u015a": 500,
- "\u015b": 389,
- "\u015e": 500,
- "\u015f": 389,
- "\u0160": 500,
- "\u0161": 389,
- "\u0162": 556,
- "\u0163": 278,
- "\u0164": 556,
- "\u0165": 300,
- "\u016a": 722,
- "\u016b": 500,
- "\u016e": 722,
- "\u016f": 500,
- "\u0170": 722,
- "\u0171": 500,
- "\u0172": 722,
- "\u0173": 500,
- "\u0178": 556,
- "\u0179": 556,
- "\u017a": 389,
- "\u017b": 556,
- "\u017c": 389,
- "\u017d": 556,
- "\u017e": 389,
- "\u0192": 500,
- "\u0218": 500,
- "\u0219": 389,
- "\u02c6": 333,
- "\u02c7": 333,
- "\u02d8": 333,
- "\u02d9": 333,
- "\u02da": 333,
- "\u02db": 333,
- "\u02dc": 333,
- "\u02dd": 333,
- "\u2013": 500,
- "\u2014": 889,
- "\u2018": 333,
- "\u2019": 333,
- "\u201a": 333,
- "\u201c": 556,
- "\u201d": 556,
- "\u201e": 556,
- "\u2020": 500,
- "\u2021": 500,
- "\u2022": 350,
- "\u2026": 889,
- "\u2030": 1000,
- "\u2039": 333,
- "\u203a": 333,
- "\u2044": 167,
- "\u2122": 980,
- "\u2202": 476,
- "\u2206": 612,
- "\u2211": 600,
- "\u2212": 675,
- "\u221a": 453,
- "\u2260": 549,
- "\u2264": 549,
- "\u2265": 549,
- "\u25ca": 471,
- "\uf6c3": 250,
- "\ufb01": 500,
- "\ufb02": 500,
- },
- ),
- "Times-Roman": (
- {
- "FontName": "Times-Roman",
- "Descent": -217.0,
- "FontBBox": (-168.0, -218.0, 1000.0, 898.0),
- "FontWeight": "Roman",
- "CapHeight": 662.0,
- "FontFamily": "Times",
- "Flags": 0,
- "XHeight": 450.0,
- "ItalicAngle": 0.0,
- "Ascent": 683.0,
- },
- {
- " ": 250,
- "!": 333,
- '"': 408,
- "#": 500,
- "$": 500,
- "%": 833,
- "&": 778,
- "'": 180,
- "(": 333,
- ")": 333,
- "*": 500,
- "+": 564,
- ",": 250,
- "-": 333,
- ".": 250,
- "/": 278,
- "0": 500,
- "1": 500,
- "2": 500,
- "3": 500,
- "4": 500,
- "5": 500,
- "6": 500,
- "7": 500,
- "8": 500,
- "9": 500,
- ":": 278,
- ";": 278,
- "<": 564,
- "=": 564,
- ">": 564,
- "?": 444,
- "@": 921,
- "A": 722,
- "B": 667,
- "C": 667,
- "D": 722,
- "E": 611,
- "F": 556,
- "G": 722,
- "H": 722,
- "I": 333,
- "J": 389,
- "K": 722,
- "L": 611,
- "M": 889,
- "N": 722,
- "O": 722,
- "P": 556,
- "Q": 722,
- "R": 667,
- "S": 556,
- "T": 611,
- "U": 722,
- "V": 722,
- "W": 944,
- "X": 722,
- "Y": 722,
- "Z": 611,
- "[": 333,
- "\\": 278,
- "]": 333,
- "^": 469,
- "_": 500,
- "`": 333,
- "a": 444,
- "b": 500,
- "c": 444,
- "d": 500,
- "e": 444,
- "f": 333,
- "g": 500,
- "h": 500,
- "i": 278,
- "j": 278,
- "k": 500,
- "l": 278,
- "m": 778,
- "n": 500,
- "o": 500,
- "p": 500,
- "q": 500,
- "r": 333,
- "s": 389,
- "t": 278,
- "u": 500,
- "v": 500,
- "w": 722,
- "x": 500,
- "y": 500,
- "z": 444,
- "{": 480,
- "|": 200,
- "}": 480,
- "~": 541,
- "\xa1": 333,
- "\xa2": 500,
- "\xa3": 500,
- "\xa4": 500,
- "\xa5": 500,
- "\xa6": 200,
- "\xa7": 500,
- "\xa8": 333,
- "\xa9": 760,
- "\xaa": 276,
- "\xab": 500,
- "\xac": 564,
- "\xae": 760,
- "\xaf": 333,
- "\xb0": 400,
- "\xb1": 564,
- "\xb2": 300,
- "\xb3": 300,
- "\xb4": 333,
- "\xb5": 500,
- "\xb6": 453,
- "\xb7": 250,
- "\xb8": 333,
- "\xb9": 300,
- "\xba": 310,
- "\xbb": 500,
- "\xbc": 750,
- "\xbd": 750,
- "\xbe": 750,
- "\xbf": 444,
- "\xc0": 722,
- "\xc1": 722,
- "\xc2": 722,
- "\xc3": 722,
- "\xc4": 722,
- "\xc5": 722,
- "\xc6": 889,
- "\xc7": 667,
- "\xc8": 611,
- "\xc9": 611,
- "\xca": 611,
- "\xcb": 611,
- "\xcc": 333,
- "\xcd": 333,
- "\xce": 333,
- "\xcf": 333,
- "\xd0": 722,
- "\xd1": 722,
- "\xd2": 722,
- "\xd3": 722,
- "\xd4": 722,
- "\xd5": 722,
- "\xd6": 722,
- "\xd7": 564,
- "\xd8": 722,
- "\xd9": 722,
- "\xda": 722,
- "\xdb": 722,
- "\xdc": 722,
- "\xdd": 722,
- "\xde": 556,
- "\xdf": 500,
- "\xe0": 444,
- "\xe1": 444,
- "\xe2": 444,
- "\xe3": 444,
- "\xe4": 444,
- "\xe5": 444,
- "\xe6": 667,
- "\xe7": 444,
- "\xe8": 444,
- "\xe9": 444,
- "\xea": 444,
- "\xeb": 444,
- "\xec": 278,
- "\xed": 278,
- "\xee": 278,
- "\xef": 278,
- "\xf0": 500,
- "\xf1": 500,
- "\xf2": 500,
- "\xf3": 500,
- "\xf4": 500,
- "\xf5": 500,
- "\xf6": 500,
- "\xf7": 564,
- "\xf8": 500,
- "\xf9": 500,
- "\xfa": 500,
- "\xfb": 500,
- "\xfc": 500,
- "\xfd": 500,
- "\xfe": 500,
- "\xff": 500,
- "\u0100": 722,
- "\u0101": 444,
- "\u0102": 722,
- "\u0103": 444,
- "\u0104": 722,
- "\u0105": 444,
- "\u0106": 667,
- "\u0107": 444,
- "\u010c": 667,
- "\u010d": 444,
- "\u010e": 722,
- "\u010f": 588,
- "\u0110": 722,
- "\u0111": 500,
- "\u0112": 611,
- "\u0113": 444,
- "\u0116": 611,
- "\u0117": 444,
- "\u0118": 611,
- "\u0119": 444,
- "\u011a": 611,
- "\u011b": 444,
- "\u011e": 722,
- "\u011f": 500,
- "\u0122": 722,
- "\u0123": 500,
- "\u012a": 333,
- "\u012b": 278,
- "\u012e": 333,
- "\u012f": 278,
- "\u0130": 333,
- "\u0131": 278,
- "\u0136": 722,
- "\u0137": 500,
- "\u0139": 611,
- "\u013a": 278,
- "\u013b": 611,
- "\u013c": 278,
- "\u013d": 611,
- "\u013e": 344,
- "\u0141": 611,
- "\u0142": 278,
- "\u0143": 722,
- "\u0144": 500,
- "\u0145": 722,
- "\u0146": 500,
- "\u0147": 722,
- "\u0148": 500,
- "\u014c": 722,
- "\u014d": 500,
- "\u0150": 722,
- "\u0151": 500,
- "\u0152": 889,
- "\u0153": 722,
- "\u0154": 667,
- "\u0155": 333,
- "\u0156": 667,
- "\u0157": 333,
- "\u0158": 667,
- "\u0159": 333,
- "\u015a": 556,
- "\u015b": 389,
- "\u015e": 556,
- "\u015f": 389,
- "\u0160": 556,
- "\u0161": 389,
- "\u0162": 611,
- "\u0163": 278,
- "\u0164": 611,
- "\u0165": 326,
- "\u016a": 722,
- "\u016b": 500,
- "\u016e": 722,
- "\u016f": 500,
- "\u0170": 722,
- "\u0171": 500,
- "\u0172": 722,
- "\u0173": 500,
- "\u0178": 722,
- "\u0179": 611,
- "\u017a": 444,
- "\u017b": 611,
- "\u017c": 444,
- "\u017d": 611,
- "\u017e": 444,
- "\u0192": 500,
- "\u0218": 556,
- "\u0219": 389,
- "\u02c6": 333,
- "\u02c7": 333,
- "\u02d8": 333,
- "\u02d9": 333,
- "\u02da": 333,
- "\u02db": 333,
- "\u02dc": 333,
- "\u02dd": 333,
- "\u2013": 500,
- "\u2014": 1000,
- "\u2018": 333,
- "\u2019": 333,
- "\u201a": 333,
- "\u201c": 444,
- "\u201d": 444,
- "\u201e": 444,
- "\u2020": 500,
- "\u2021": 500,
- "\u2022": 350,
- "\u2026": 1000,
- "\u2030": 1000,
- "\u2039": 333,
- "\u203a": 333,
- "\u2044": 167,
- "\u2122": 980,
- "\u2202": 476,
- "\u2206": 612,
- "\u2211": 600,
- "\u2212": 564,
- "\u221a": 453,
- "\u2260": 549,
- "\u2264": 549,
- "\u2265": 549,
- "\u25ca": 471,
- "\uf6c3": 250,
- "\ufb01": 556,
- "\ufb02": 556,
- },
- ),
- "ZapfDingbats": (
- {
- "FontName": "ZapfDingbats",
- "FontBBox": (-1.0, -143.0, 981.0, 820.0),
- "FontWeight": "Medium",
- "FontFamily": "ITC",
- "Flags": 0,
- "ItalicAngle": 0.0,
- },
- {
- "\x01": 974,
- "\x02": 961,
- "\x03": 980,
- "\x04": 719,
- "\x05": 789,
- "\x06": 494,
- "\x07": 552,
- "\x08": 537,
- "\t": 577,
- "\n": 692,
- "\x0b": 960,
- "\x0c": 939,
- "\r": 549,
- "\x0e": 855,
- "\x0f": 911,
- "\x10": 933,
- "\x11": 945,
- "\x12": 974,
- "\x13": 755,
- "\x14": 846,
- "\x15": 762,
- "\x16": 761,
- "\x17": 571,
- "\x18": 677,
- "\x19": 763,
- "\x1a": 760,
- "\x1b": 759,
- "\x1c": 754,
- "\x1d": 786,
- "\x1e": 788,
- "\x1f": 788,
- " ": 790,
- "!": 793,
- '"': 794,
- "#": 816,
- "$": 823,
- "%": 789,
- "&": 841,
- "'": 823,
- "(": 833,
- ")": 816,
- "*": 831,
- "+": 923,
- ",": 744,
- "-": 723,
- ".": 749,
- "/": 790,
- "0": 792,
- "1": 695,
- "2": 776,
- "3": 768,
- "4": 792,
- "5": 759,
- "6": 707,
- "7": 708,
- "8": 682,
- "9": 701,
- ":": 826,
- ";": 815,
- "<": 789,
- "=": 789,
- ">": 707,
- "?": 687,
- "@": 696,
- "A": 689,
- "B": 786,
- "C": 787,
- "D": 713,
- "E": 791,
- "F": 785,
- "G": 791,
- "H": 873,
- "I": 761,
- "J": 762,
- "K": 759,
- "L": 892,
- "M": 892,
- "N": 788,
- "O": 784,
- "Q": 438,
- "R": 138,
- "S": 277,
- "T": 415,
- "U": 509,
- "V": 410,
- "W": 234,
- "X": 234,
- "Y": 390,
- "Z": 390,
- "[": 276,
- "\\": 276,
- "]": 317,
- "^": 317,
- "_": 334,
- "`": 334,
- "a": 392,
- "b": 392,
- "c": 668,
- "d": 668,
- "e": 732,
- "f": 544,
- "g": 544,
- "h": 910,
- "i": 911,
- "j": 667,
- "k": 760,
- "l": 760,
- "m": 626,
- "n": 694,
- "o": 595,
- "p": 776,
- "u": 690,
- "v": 791,
- "w": 790,
- "x": 788,
- "y": 788,
- "z": 788,
- "{": 788,
- "|": 788,
- "}": 788,
- "~": 788,
- "\x7f": 788,
- "\x80": 788,
- "\x81": 788,
- "\x82": 788,
- "\x83": 788,
- "\x84": 788,
- "\x85": 788,
- "\x86": 788,
- "\x87": 788,
- "\x88": 788,
- "\x89": 788,
- "\x8a": 788,
- "\x8b": 788,
- "\x8c": 788,
- "\x8d": 788,
- "\x8e": 788,
- "\x8f": 788,
- "\x90": 788,
- "\x91": 788,
- "\x92": 788,
- "\x93": 788,
- "\x94": 788,
- "\x95": 788,
- "\x96": 788,
- "\x97": 788,
- "\x98": 788,
- "\x99": 788,
- "\x9a": 788,
- "\x9b": 788,
- "\x9c": 788,
- "\x9d": 788,
- "\x9e": 788,
- "\x9f": 788,
- "\xa0": 894,
- "\xa1": 838,
- "\xa2": 924,
- "\xa3": 1016,
- "\xa4": 458,
- "\xa5": 924,
- "\xa6": 918,
- "\xa7": 927,
- "\xa8": 928,
- "\xa9": 928,
- "\xaa": 834,
- "\xab": 873,
- "\xac": 828,
- "\xad": 924,
- "\xae": 917,
- "\xaf": 930,
- "\xb0": 931,
- "\xb1": 463,
- "\xb2": 883,
- "\xb3": 836,
- "\xb4": 867,
- "\xb5": 696,
- "\xb6": 874,
- "\xb7": 760,
- "\xb8": 946,
- "\xb9": 865,
- "\xba": 967,
- "\xbb": 831,
- "\xbc": 873,
- "\xbd": 927,
- "\xbe": 970,
- "\xbf": 918,
- "\xc0": 748,
- "\xc1": 836,
- "\xc2": 771,
- "\xc3": 888,
- "\xc4": 748,
- "\xc5": 771,
- "\xc6": 888,
- "\xc7": 867,
- "\xc8": 696,
- "\xc9": 874,
- "\xca": 974,
- "\xcb": 762,
- "\xcc": 759,
- "\xcd": 509,
- "\xce": 410,
- },
- ),
-}
-
-# Aliases defined in implementation note 62 in Appecix H. related to section 5.5.1
-# (Type 1 Fonts) in the PDF Reference.
-FONT_METRICS["Arial"] = FONT_METRICS["Helvetica"]
-FONT_METRICS["Arial,Italic"] = FONT_METRICS["Helvetica-Oblique"]
-FONT_METRICS["Arial,Bold"] = FONT_METRICS["Helvetica-Bold"]
-FONT_METRICS["Arial,BoldItalic"] = FONT_METRICS["Helvetica-BoldOblique"]
-FONT_METRICS["CourierNew"] = FONT_METRICS["Courier"]
-FONT_METRICS["CourierNew,Italic"] = FONT_METRICS["Courier-Oblique"]
-FONT_METRICS["CourierNew,Bold"] = FONT_METRICS["Courier-Bold"]
-FONT_METRICS["CourierNew,BoldItalic"] = FONT_METRICS["Courier-BoldOblique"]
-FONT_METRICS["TimesNewRoman"] = FONT_METRICS["Times-Roman"]
-FONT_METRICS["TimesNewRoman,Italic"] = FONT_METRICS["Times-Italic"]
-FONT_METRICS["TimesNewRoman,Bold"] = FONT_METRICS["Times-Bold"]
-FONT_METRICS["TimesNewRoman,BoldItalic"] = FONT_METRICS["Times-BoldItalic"]
diff --git a/pdf2zh/glyphlist.py b/pdf2zh/glyphlist.py
deleted file mode 100644
index 2ee11a5..0000000
--- a/pdf2zh/glyphlist.py
+++ /dev/null
@@ -1,4366 +0,0 @@
-"""Mappings from Adobe glyph names to Unicode characters.
-
-In some CMap tables, Adobe glyph names are used for specifying
-Unicode characters instead of using decimal/hex character code.
-
-The following data was taken by
-
- $ wget https://partners.adobe.com/public/developer/en/opentype/glyphlist.txt
-
-```python
-from pdf2zh.glyphlist import convert_glyphlist
-
-convert_glyphlist("glyphlist.txt")
-"""
-
-# ###################################################################################
-# Copyright (c) 1997,1998,2002,2007 Adobe Systems Incorporated
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this documentation file to use, copy, publish, distribute,
-# sublicense, and/or sell copies of the documentation, and to permit
-# others to do the same, provided that:
-# - No modification, editing or other alteration of this document is
-# allowed; and
-# - The above copyright notice and this permission notice shall be
-# included in all copies of the documentation.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this documentation file, to create their own derivative works
-# from the content of this document to use, copy, publish, distribute,
-# sublicense, and/or sell the derivative works, and to permit others to do
-# the same, provided that the derived work is not represented as being a
-# copy or version of this document.
-#
-# Adobe shall not be liable to any party for any loss of revenue or profit
-# or for indirect, incidental, special, consequential, or other similar
-# damages, whether based on tort (including without limitation negligence
-# or strict liability), contract or other legal or equitable grounds even
-# if Adobe has been advised or had reason to know of the possibility of
-# such damages. The Adobe materials are provided on an "AS IS" basis.
-# Adobe specifically disclaims all express, statutory, or implied
-# warranties relating to the Adobe materials, including but not limited to
-# those concerning merchantability or fitness for a particular purpose or
-# non-infringement of any third party rights regarding the Adobe
-# materials.
-# ###################################################################################
-# Name: Adobe Glyph List
-# Table version: 2.0
-# Date: September 20, 2002
-#
-# See http://partners.adobe.com/asn/developer/typeforum/unicodegn.html
-#
-# Format: Semicolon-delimited fields:
-# (1) glyph name
-# (2) Unicode scalar value
-
-
-def convert_glyphlist(path: str) -> None:
- """Convert a glyph list into a python representation.
-
- See output below.
- """
- state = 0
- with open(path) as fileinput:
- for line in fileinput.readlines():
- line = line.strip()
- if not line or line.startswith("#"):
- if state == 1:
- state = 2
- print("}\n")
- print(line)
- continue
- if state == 0:
- print("\nglyphname2unicode = {")
- state = 1
- (name, x) = line.split(";")
- codes = x.split(" ")
- print(
- " {!r}: u'{}',".format(name, "".join("\\u%s" % code for code in codes)),
- )
-
-
-glyphname2unicode = {
- "A": "\u0041",
- "AE": "\u00c6",
- "AEacute": "\u01fc",
- "AEmacron": "\u01e2",
- "AEsmall": "\uf7e6",
- "Aacute": "\u00c1",
- "Aacutesmall": "\uf7e1",
- "Abreve": "\u0102",
- "Abreveacute": "\u1eae",
- "Abrevecyrillic": "\u04d0",
- "Abrevedotbelow": "\u1eb6",
- "Abrevegrave": "\u1eb0",
- "Abrevehookabove": "\u1eb2",
- "Abrevetilde": "\u1eb4",
- "Acaron": "\u01cd",
- "Acircle": "\u24b6",
- "Acircumflex": "\u00c2",
- "Acircumflexacute": "\u1ea4",
- "Acircumflexdotbelow": "\u1eac",
- "Acircumflexgrave": "\u1ea6",
- "Acircumflexhookabove": "\u1ea8",
- "Acircumflexsmall": "\uf7e2",
- "Acircumflextilde": "\u1eaa",
- "Acute": "\uf6c9",
- "Acutesmall": "\uf7b4",
- "Acyrillic": "\u0410",
- "Adblgrave": "\u0200",
- "Adieresis": "\u00c4",
- "Adieresiscyrillic": "\u04d2",
- "Adieresismacron": "\u01de",
- "Adieresissmall": "\uf7e4",
- "Adotbelow": "\u1ea0",
- "Adotmacron": "\u01e0",
- "Agrave": "\u00c0",
- "Agravesmall": "\uf7e0",
- "Ahookabove": "\u1ea2",
- "Aiecyrillic": "\u04d4",
- "Ainvertedbreve": "\u0202",
- "Alpha": "\u0391",
- "Alphatonos": "\u0386",
- "Amacron": "\u0100",
- "Amonospace": "\uff21",
- "Aogonek": "\u0104",
- "Aring": "\u00c5",
- "Aringacute": "\u01fa",
- "Aringbelow": "\u1e00",
- "Aringsmall": "\uf7e5",
- "Asmall": "\uf761",
- "Atilde": "\u00c3",
- "Atildesmall": "\uf7e3",
- "Aybarmenian": "\u0531",
- "B": "\u0042",
- "Bcircle": "\u24b7",
- "Bdotaccent": "\u1e02",
- "Bdotbelow": "\u1e04",
- "Becyrillic": "\u0411",
- "Benarmenian": "\u0532",
- "Beta": "\u0392",
- "Bhook": "\u0181",
- "Blinebelow": "\u1e06",
- "Bmonospace": "\uff22",
- "Brevesmall": "\uf6f4",
- "Bsmall": "\uf762",
- "Btopbar": "\u0182",
- "C": "\u0043",
- "Caarmenian": "\u053e",
- "Cacute": "\u0106",
- "Caron": "\uf6ca",
- "Caronsmall": "\uf6f5",
- "Ccaron": "\u010c",
- "Ccedilla": "\u00c7",
- "Ccedillaacute": "\u1e08",
- "Ccedillasmall": "\uf7e7",
- "Ccircle": "\u24b8",
- "Ccircumflex": "\u0108",
- "Cdot": "\u010a",
- "Cdotaccent": "\u010a",
- "Cedillasmall": "\uf7b8",
- "Chaarmenian": "\u0549",
- "Cheabkhasiancyrillic": "\u04bc",
- "Checyrillic": "\u0427",
- "Chedescenderabkhasiancyrillic": "\u04be",
- "Chedescendercyrillic": "\u04b6",
- "Chedieresiscyrillic": "\u04f4",
- "Cheharmenian": "\u0543",
- "Chekhakassiancyrillic": "\u04cb",
- "Cheverticalstrokecyrillic": "\u04b8",
- "Chi": "\u03a7",
- "Chook": "\u0187",
- "Circumflexsmall": "\uf6f6",
- "Cmonospace": "\uff23",
- "Coarmenian": "\u0551",
- "Csmall": "\uf763",
- "D": "\u0044",
- "DZ": "\u01f1",
- "DZcaron": "\u01c4",
- "Daarmenian": "\u0534",
- "Dafrican": "\u0189",
- "Dcaron": "\u010e",
- "Dcedilla": "\u1e10",
- "Dcircle": "\u24b9",
- "Dcircumflexbelow": "\u1e12",
- "Dcroat": "\u0110",
- "Ddotaccent": "\u1e0a",
- "Ddotbelow": "\u1e0c",
- "Decyrillic": "\u0414",
- "Deicoptic": "\u03ee",
- "Delta": "\u2206",
- "Deltagreek": "\u0394",
- "Dhook": "\u018a",
- "Dieresis": "\uf6cb",
- "DieresisAcute": "\uf6cc",
- "DieresisGrave": "\uf6cd",
- "Dieresissmall": "\uf7a8",
- "Digammagreek": "\u03dc",
- "Djecyrillic": "\u0402",
- "Dlinebelow": "\u1e0e",
- "Dmonospace": "\uff24",
- "Dotaccentsmall": "\uf6f7",
- "Dslash": "\u0110",
- "Dsmall": "\uf764",
- "Dtopbar": "\u018b",
- "Dz": "\u01f2",
- "Dzcaron": "\u01c5",
- "Dzeabkhasiancyrillic": "\u04e0",
- "Dzecyrillic": "\u0405",
- "Dzhecyrillic": "\u040f",
- "E": "\u0045",
- "Eacute": "\u00c9",
- "Eacutesmall": "\uf7e9",
- "Ebreve": "\u0114",
- "Ecaron": "\u011a",
- "Ecedillabreve": "\u1e1c",
- "Echarmenian": "\u0535",
- "Ecircle": "\u24ba",
- "Ecircumflex": "\u00ca",
- "Ecircumflexacute": "\u1ebe",
- "Ecircumflexbelow": "\u1e18",
- "Ecircumflexdotbelow": "\u1ec6",
- "Ecircumflexgrave": "\u1ec0",
- "Ecircumflexhookabove": "\u1ec2",
- "Ecircumflexsmall": "\uf7ea",
- "Ecircumflextilde": "\u1ec4",
- "Ecyrillic": "\u0404",
- "Edblgrave": "\u0204",
- "Edieresis": "\u00cb",
- "Edieresissmall": "\uf7eb",
- "Edot": "\u0116",
- "Edotaccent": "\u0116",
- "Edotbelow": "\u1eb8",
- "Efcyrillic": "\u0424",
- "Egrave": "\u00c8",
- "Egravesmall": "\uf7e8",
- "Eharmenian": "\u0537",
- "Ehookabove": "\u1eba",
- "Eightroman": "\u2167",
- "Einvertedbreve": "\u0206",
- "Eiotifiedcyrillic": "\u0464",
- "Elcyrillic": "\u041b",
- "Elevenroman": "\u216a",
- "Emacron": "\u0112",
- "Emacronacute": "\u1e16",
- "Emacrongrave": "\u1e14",
- "Emcyrillic": "\u041c",
- "Emonospace": "\uff25",
- "Encyrillic": "\u041d",
- "Endescendercyrillic": "\u04a2",
- "Eng": "\u014a",
- "Enghecyrillic": "\u04a4",
- "Enhookcyrillic": "\u04c7",
- "Eogonek": "\u0118",
- "Eopen": "\u0190",
- "Epsilon": "\u0395",
- "Epsilontonos": "\u0388",
- "Ercyrillic": "\u0420",
- "Ereversed": "\u018e",
- "Ereversedcyrillic": "\u042d",
- "Escyrillic": "\u0421",
- "Esdescendercyrillic": "\u04aa",
- "Esh": "\u01a9",
- "Esmall": "\uf765",
- "Eta": "\u0397",
- "Etarmenian": "\u0538",
- "Etatonos": "\u0389",
- "Eth": "\u00d0",
- "Ethsmall": "\uf7f0",
- "Etilde": "\u1ebc",
- "Etildebelow": "\u1e1a",
- "Euro": "\u20ac",
- "Ezh": "\u01b7",
- "Ezhcaron": "\u01ee",
- "Ezhreversed": "\u01b8",
- "F": "\u0046",
- "Fcircle": "\u24bb",
- "Fdotaccent": "\u1e1e",
- "Feharmenian": "\u0556",
- "Feicoptic": "\u03e4",
- "Fhook": "\u0191",
- "Fitacyrillic": "\u0472",
- "Fiveroman": "\u2164",
- "Fmonospace": "\uff26",
- "Fourroman": "\u2163",
- "Fsmall": "\uf766",
- "G": "\u0047",
- "GBsquare": "\u3387",
- "Gacute": "\u01f4",
- "Gamma": "\u0393",
- "Gammaafrican": "\u0194",
- "Gangiacoptic": "\u03ea",
- "Gbreve": "\u011e",
- "Gcaron": "\u01e6",
- "Gcedilla": "\u0122",
- "Gcircle": "\u24bc",
- "Gcircumflex": "\u011c",
- "Gcommaaccent": "\u0122",
- "Gdot": "\u0120",
- "Gdotaccent": "\u0120",
- "Gecyrillic": "\u0413",
- "Ghadarmenian": "\u0542",
- "Ghemiddlehookcyrillic": "\u0494",
- "Ghestrokecyrillic": "\u0492",
- "Gheupturncyrillic": "\u0490",
- "Ghook": "\u0193",
- "Gimarmenian": "\u0533",
- "Gjecyrillic": "\u0403",
- "Gmacron": "\u1e20",
- "Gmonospace": "\uff27",
- "Grave": "\uf6ce",
- "Gravesmall": "\uf760",
- "Gsmall": "\uf767",
- "Gsmallhook": "\u029b",
- "Gstroke": "\u01e4",
- "H": "\u0048",
- "H18533": "\u25cf",
- "H18543": "\u25aa",
- "H18551": "\u25ab",
- "H22073": "\u25a1",
- "HPsquare": "\u33cb",
- "Haabkhasiancyrillic": "\u04a8",
- "Hadescendercyrillic": "\u04b2",
- "Hardsigncyrillic": "\u042a",
- "Hbar": "\u0126",
- "Hbrevebelow": "\u1e2a",
- "Hcedilla": "\u1e28",
- "Hcircle": "\u24bd",
- "Hcircumflex": "\u0124",
- "Hdieresis": "\u1e26",
- "Hdotaccent": "\u1e22",
- "Hdotbelow": "\u1e24",
- "Hmonospace": "\uff28",
- "Hoarmenian": "\u0540",
- "Horicoptic": "\u03e8",
- "Hsmall": "\uf768",
- "Hungarumlaut": "\uf6cf",
- "Hungarumlautsmall": "\uf6f8",
- "Hzsquare": "\u3390",
- "I": "\u0049",
- "IAcyrillic": "\u042f",
- "IJ": "\u0132",
- "IUcyrillic": "\u042e",
- "Iacute": "\u00cd",
- "Iacutesmall": "\uf7ed",
- "Ibreve": "\u012c",
- "Icaron": "\u01cf",
- "Icircle": "\u24be",
- "Icircumflex": "\u00ce",
- "Icircumflexsmall": "\uf7ee",
- "Icyrillic": "\u0406",
- "Idblgrave": "\u0208",
- "Idieresis": "\u00cf",
- "Idieresisacute": "\u1e2e",
- "Idieresiscyrillic": "\u04e4",
- "Idieresissmall": "\uf7ef",
- "Idot": "\u0130",
- "Idotaccent": "\u0130",
- "Idotbelow": "\u1eca",
- "Iebrevecyrillic": "\u04d6",
- "Iecyrillic": "\u0415",
- "Ifraktur": "\u2111",
- "Igrave": "\u00cc",
- "Igravesmall": "\uf7ec",
- "Ihookabove": "\u1ec8",
- "Iicyrillic": "\u0418",
- "Iinvertedbreve": "\u020a",
- "Iishortcyrillic": "\u0419",
- "Imacron": "\u012a",
- "Imacroncyrillic": "\u04e2",
- "Imonospace": "\uff29",
- "Iniarmenian": "\u053b",
- "Iocyrillic": "\u0401",
- "Iogonek": "\u012e",
- "Iota": "\u0399",
- "Iotaafrican": "\u0196",
- "Iotadieresis": "\u03aa",
- "Iotatonos": "\u038a",
- "Ismall": "\uf769",
- "Istroke": "\u0197",
- "Itilde": "\u0128",
- "Itildebelow": "\u1e2c",
- "Izhitsacyrillic": "\u0474",
- "Izhitsadblgravecyrillic": "\u0476",
- "J": "\u004a",
- "Jaarmenian": "\u0541",
- "Jcircle": "\u24bf",
- "Jcircumflex": "\u0134",
- "Jecyrillic": "\u0408",
- "Jheharmenian": "\u054b",
- "Jmonospace": "\uff2a",
- "Jsmall": "\uf76a",
- "K": "\u004b",
- "KBsquare": "\u3385",
- "KKsquare": "\u33cd",
- "Kabashkircyrillic": "\u04a0",
- "Kacute": "\u1e30",
- "Kacyrillic": "\u041a",
- "Kadescendercyrillic": "\u049a",
- "Kahookcyrillic": "\u04c3",
- "Kappa": "\u039a",
- "Kastrokecyrillic": "\u049e",
- "Kaverticalstrokecyrillic": "\u049c",
- "Kcaron": "\u01e8",
- "Kcedilla": "\u0136",
- "Kcircle": "\u24c0",
- "Kcommaaccent": "\u0136",
- "Kdotbelow": "\u1e32",
- "Keharmenian": "\u0554",
- "Kenarmenian": "\u053f",
- "Khacyrillic": "\u0425",
- "Kheicoptic": "\u03e6",
- "Khook": "\u0198",
- "Kjecyrillic": "\u040c",
- "Klinebelow": "\u1e34",
- "Kmonospace": "\uff2b",
- "Koppacyrillic": "\u0480",
- "Koppagreek": "\u03de",
- "Ksicyrillic": "\u046e",
- "Ksmall": "\uf76b",
- "L": "\u004c",
- "LJ": "\u01c7",
- "LL": "\uf6bf",
- "Lacute": "\u0139",
- "Lambda": "\u039b",
- "Lcaron": "\u013d",
- "Lcedilla": "\u013b",
- "Lcircle": "\u24c1",
- "Lcircumflexbelow": "\u1e3c",
- "Lcommaaccent": "\u013b",
- "Ldot": "\u013f",
- "Ldotaccent": "\u013f",
- "Ldotbelow": "\u1e36",
- "Ldotbelowmacron": "\u1e38",
- "Liwnarmenian": "\u053c",
- "Lj": "\u01c8",
- "Ljecyrillic": "\u0409",
- "Llinebelow": "\u1e3a",
- "Lmonospace": "\uff2c",
- "Lslash": "\u0141",
- "Lslashsmall": "\uf6f9",
- "Lsmall": "\uf76c",
- "M": "\u004d",
- "MBsquare": "\u3386",
- "Macron": "\uf6d0",
- "Macronsmall": "\uf7af",
- "Macute": "\u1e3e",
- "Mcircle": "\u24c2",
- "Mdotaccent": "\u1e40",
- "Mdotbelow": "\u1e42",
- "Menarmenian": "\u0544",
- "Mmonospace": "\uff2d",
- "Msmall": "\uf76d",
- "Mturned": "\u019c",
- "Mu": "\u039c",
- "N": "\u004e",
- "NJ": "\u01ca",
- "Nacute": "\u0143",
- "Ncaron": "\u0147",
- "Ncedilla": "\u0145",
- "Ncircle": "\u24c3",
- "Ncircumflexbelow": "\u1e4a",
- "Ncommaaccent": "\u0145",
- "Ndotaccent": "\u1e44",
- "Ndotbelow": "\u1e46",
- "Nhookleft": "\u019d",
- "Nineroman": "\u2168",
- "Nj": "\u01cb",
- "Njecyrillic": "\u040a",
- "Nlinebelow": "\u1e48",
- "Nmonospace": "\uff2e",
- "Nowarmenian": "\u0546",
- "Nsmall": "\uf76e",
- "Ntilde": "\u00d1",
- "Ntildesmall": "\uf7f1",
- "Nu": "\u039d",
- "O": "\u004f",
- "OE": "\u0152",
- "OEsmall": "\uf6fa",
- "Oacute": "\u00d3",
- "Oacutesmall": "\uf7f3",
- "Obarredcyrillic": "\u04e8",
- "Obarreddieresiscyrillic": "\u04ea",
- "Obreve": "\u014e",
- "Ocaron": "\u01d1",
- "Ocenteredtilde": "\u019f",
- "Ocircle": "\u24c4",
- "Ocircumflex": "\u00d4",
- "Ocircumflexacute": "\u1ed0",
- "Ocircumflexdotbelow": "\u1ed8",
- "Ocircumflexgrave": "\u1ed2",
- "Ocircumflexhookabove": "\u1ed4",
- "Ocircumflexsmall": "\uf7f4",
- "Ocircumflextilde": "\u1ed6",
- "Ocyrillic": "\u041e",
- "Odblacute": "\u0150",
- "Odblgrave": "\u020c",
- "Odieresis": "\u00d6",
- "Odieresiscyrillic": "\u04e6",
- "Odieresissmall": "\uf7f6",
- "Odotbelow": "\u1ecc",
- "Ogoneksmall": "\uf6fb",
- "Ograve": "\u00d2",
- "Ogravesmall": "\uf7f2",
- "Oharmenian": "\u0555",
- "Ohm": "\u2126",
- "Ohookabove": "\u1ece",
- "Ohorn": "\u01a0",
- "Ohornacute": "\u1eda",
- "Ohorndotbelow": "\u1ee2",
- "Ohorngrave": "\u1edc",
- "Ohornhookabove": "\u1ede",
- "Ohorntilde": "\u1ee0",
- "Ohungarumlaut": "\u0150",
- "Oi": "\u01a2",
- "Oinvertedbreve": "\u020e",
- "Omacron": "\u014c",
- "Omacronacute": "\u1e52",
- "Omacrongrave": "\u1e50",
- "Omega": "\u2126",
- "Omegacyrillic": "\u0460",
- "Omegagreek": "\u03a9",
- "Omegaroundcyrillic": "\u047a",
- "Omegatitlocyrillic": "\u047c",
- "Omegatonos": "\u038f",
- "Omicron": "\u039f",
- "Omicrontonos": "\u038c",
- "Omonospace": "\uff2f",
- "Oneroman": "\u2160",
- "Oogonek": "\u01ea",
- "Oogonekmacron": "\u01ec",
- "Oopen": "\u0186",
- "Oslash": "\u00d8",
- "Oslashacute": "\u01fe",
- "Oslashsmall": "\uf7f8",
- "Osmall": "\uf76f",
- "Ostrokeacute": "\u01fe",
- "Otcyrillic": "\u047e",
- "Otilde": "\u00d5",
- "Otildeacute": "\u1e4c",
- "Otildedieresis": "\u1e4e",
- "Otildesmall": "\uf7f5",
- "P": "\u0050",
- "Pacute": "\u1e54",
- "Pcircle": "\u24c5",
- "Pdotaccent": "\u1e56",
- "Pecyrillic": "\u041f",
- "Peharmenian": "\u054a",
- "Pemiddlehookcyrillic": "\u04a6",
- "Phi": "\u03a6",
- "Phook": "\u01a4",
- "Pi": "\u03a0",
- "Piwrarmenian": "\u0553",
- "Pmonospace": "\uff30",
- "Psi": "\u03a8",
- "Psicyrillic": "\u0470",
- "Psmall": "\uf770",
- "Q": "\u0051",
- "Qcircle": "\u24c6",
- "Qmonospace": "\uff31",
- "Qsmall": "\uf771",
- "R": "\u0052",
- "Raarmenian": "\u054c",
- "Racute": "\u0154",
- "Rcaron": "\u0158",
- "Rcedilla": "\u0156",
- "Rcircle": "\u24c7",
- "Rcommaaccent": "\u0156",
- "Rdblgrave": "\u0210",
- "Rdotaccent": "\u1e58",
- "Rdotbelow": "\u1e5a",
- "Rdotbelowmacron": "\u1e5c",
- "Reharmenian": "\u0550",
- "Rfraktur": "\u211c",
- "Rho": "\u03a1",
- "Ringsmall": "\uf6fc",
- "Rinvertedbreve": "\u0212",
- "Rlinebelow": "\u1e5e",
- "Rmonospace": "\uff32",
- "Rsmall": "\uf772",
- "Rsmallinverted": "\u0281",
- "Rsmallinvertedsuperior": "\u02b6",
- "S": "\u0053",
- "SF010000": "\u250c",
- "SF020000": "\u2514",
- "SF030000": "\u2510",
- "SF040000": "\u2518",
- "SF050000": "\u253c",
- "SF060000": "\u252c",
- "SF070000": "\u2534",
- "SF080000": "\u251c",
- "SF090000": "\u2524",
- "SF100000": "\u2500",
- "SF110000": "\u2502",
- "SF190000": "\u2561",
- "SF200000": "\u2562",
- "SF210000": "\u2556",
- "SF220000": "\u2555",
- "SF230000": "\u2563",
- "SF240000": "\u2551",
- "SF250000": "\u2557",
- "SF260000": "\u255d",
- "SF270000": "\u255c",
- "SF280000": "\u255b",
- "SF360000": "\u255e",
- "SF370000": "\u255f",
- "SF380000": "\u255a",
- "SF390000": "\u2554",
- "SF400000": "\u2569",
- "SF410000": "\u2566",
- "SF420000": "\u2560",
- "SF430000": "\u2550",
- "SF440000": "\u256c",
- "SF450000": "\u2567",
- "SF460000": "\u2568",
- "SF470000": "\u2564",
- "SF480000": "\u2565",
- "SF490000": "\u2559",
- "SF500000": "\u2558",
- "SF510000": "\u2552",
- "SF520000": "\u2553",
- "SF530000": "\u256b",
- "SF540000": "\u256a",
- "Sacute": "\u015a",
- "Sacutedotaccent": "\u1e64",
- "Sampigreek": "\u03e0",
- "Scaron": "\u0160",
- "Scarondotaccent": "\u1e66",
- "Scaronsmall": "\uf6fd",
- "Scedilla": "\u015e",
- "Schwa": "\u018f",
- "Schwacyrillic": "\u04d8",
- "Schwadieresiscyrillic": "\u04da",
- "Scircle": "\u24c8",
- "Scircumflex": "\u015c",
- "Scommaaccent": "\u0218",
- "Sdotaccent": "\u1e60",
- "Sdotbelow": "\u1e62",
- "Sdotbelowdotaccent": "\u1e68",
- "Seharmenian": "\u054d",
- "Sevenroman": "\u2166",
- "Shaarmenian": "\u0547",
- "Shacyrillic": "\u0428",
- "Shchacyrillic": "\u0429",
- "Sheicoptic": "\u03e2",
- "Shhacyrillic": "\u04ba",
- "Shimacoptic": "\u03ec",
- "Sigma": "\u03a3",
- "Sixroman": "\u2165",
- "Smonospace": "\uff33",
- "Softsigncyrillic": "\u042c",
- "Ssmall": "\uf773",
- "Stigmagreek": "\u03da",
- "T": "\u0054",
- "Tau": "\u03a4",
- "Tbar": "\u0166",
- "Tcaron": "\u0164",
- "Tcedilla": "\u0162",
- "Tcircle": "\u24c9",
- "Tcircumflexbelow": "\u1e70",
- "Tcommaaccent": "\u0162",
- "Tdotaccent": "\u1e6a",
- "Tdotbelow": "\u1e6c",
- "Tecyrillic": "\u0422",
- "Tedescendercyrillic": "\u04ac",
- "Tenroman": "\u2169",
- "Tetsecyrillic": "\u04b4",
- "Theta": "\u0398",
- "Thook": "\u01ac",
- "Thorn": "\u00de",
- "Thornsmall": "\uf7fe",
- "Threeroman": "\u2162",
- "Tildesmall": "\uf6fe",
- "Tiwnarmenian": "\u054f",
- "Tlinebelow": "\u1e6e",
- "Tmonospace": "\uff34",
- "Toarmenian": "\u0539",
- "Tonefive": "\u01bc",
- "Tonesix": "\u0184",
- "Tonetwo": "\u01a7",
- "Tretroflexhook": "\u01ae",
- "Tsecyrillic": "\u0426",
- "Tshecyrillic": "\u040b",
- "Tsmall": "\uf774",
- "Twelveroman": "\u216b",
- "Tworoman": "\u2161",
- "U": "\u0055",
- "Uacute": "\u00da",
- "Uacutesmall": "\uf7fa",
- "Ubreve": "\u016c",
- "Ucaron": "\u01d3",
- "Ucircle": "\u24ca",
- "Ucircumflex": "\u00db",
- "Ucircumflexbelow": "\u1e76",
- "Ucircumflexsmall": "\uf7fb",
- "Ucyrillic": "\u0423",
- "Udblacute": "\u0170",
- "Udblgrave": "\u0214",
- "Udieresis": "\u00dc",
- "Udieresisacute": "\u01d7",
- "Udieresisbelow": "\u1e72",
- "Udieresiscaron": "\u01d9",
- "Udieresiscyrillic": "\u04f0",
- "Udieresisgrave": "\u01db",
- "Udieresismacron": "\u01d5",
- "Udieresissmall": "\uf7fc",
- "Udotbelow": "\u1ee4",
- "Ugrave": "\u00d9",
- "Ugravesmall": "\uf7f9",
- "Uhookabove": "\u1ee6",
- "Uhorn": "\u01af",
- "Uhornacute": "\u1ee8",
- "Uhorndotbelow": "\u1ef0",
- "Uhorngrave": "\u1eea",
- "Uhornhookabove": "\u1eec",
- "Uhorntilde": "\u1eee",
- "Uhungarumlaut": "\u0170",
- "Uhungarumlautcyrillic": "\u04f2",
- "Uinvertedbreve": "\u0216",
- "Ukcyrillic": "\u0478",
- "Umacron": "\u016a",
- "Umacroncyrillic": "\u04ee",
- "Umacrondieresis": "\u1e7a",
- "Umonospace": "\uff35",
- "Uogonek": "\u0172",
- "Upsilon": "\u03a5",
- "Upsilon1": "\u03d2",
- "Upsilonacutehooksymbolgreek": "\u03d3",
- "Upsilonafrican": "\u01b1",
- "Upsilondieresis": "\u03ab",
- "Upsilondieresishooksymbolgreek": "\u03d4",
- "Upsilonhooksymbol": "\u03d2",
- "Upsilontonos": "\u038e",
- "Uring": "\u016e",
- "Ushortcyrillic": "\u040e",
- "Usmall": "\uf775",
- "Ustraightcyrillic": "\u04ae",
- "Ustraightstrokecyrillic": "\u04b0",
- "Utilde": "\u0168",
- "Utildeacute": "\u1e78",
- "Utildebelow": "\u1e74",
- "V": "\u0056",
- "Vcircle": "\u24cb",
- "Vdotbelow": "\u1e7e",
- "Vecyrillic": "\u0412",
- "Vewarmenian": "\u054e",
- "Vhook": "\u01b2",
- "Vmonospace": "\uff36",
- "Voarmenian": "\u0548",
- "Vsmall": "\uf776",
- "Vtilde": "\u1e7c",
- "W": "\u0057",
- "Wacute": "\u1e82",
- "Wcircle": "\u24cc",
- "Wcircumflex": "\u0174",
- "Wdieresis": "\u1e84",
- "Wdotaccent": "\u1e86",
- "Wdotbelow": "\u1e88",
- "Wgrave": "\u1e80",
- "Wmonospace": "\uff37",
- "Wsmall": "\uf777",
- "X": "\u0058",
- "Xcircle": "\u24cd",
- "Xdieresis": "\u1e8c",
- "Xdotaccent": "\u1e8a",
- "Xeharmenian": "\u053d",
- "Xi": "\u039e",
- "Xmonospace": "\uff38",
- "Xsmall": "\uf778",
- "Y": "\u0059",
- "Yacute": "\u00dd",
- "Yacutesmall": "\uf7fd",
- "Yatcyrillic": "\u0462",
- "Ycircle": "\u24ce",
- "Ycircumflex": "\u0176",
- "Ydieresis": "\u0178",
- "Ydieresissmall": "\uf7ff",
- "Ydotaccent": "\u1e8e",
- "Ydotbelow": "\u1ef4",
- "Yericyrillic": "\u042b",
- "Yerudieresiscyrillic": "\u04f8",
- "Ygrave": "\u1ef2",
- "Yhook": "\u01b3",
- "Yhookabove": "\u1ef6",
- "Yiarmenian": "\u0545",
- "Yicyrillic": "\u0407",
- "Yiwnarmenian": "\u0552",
- "Ymonospace": "\uff39",
- "Ysmall": "\uf779",
- "Ytilde": "\u1ef8",
- "Yusbigcyrillic": "\u046a",
- "Yusbigiotifiedcyrillic": "\u046c",
- "Yuslittlecyrillic": "\u0466",
- "Yuslittleiotifiedcyrillic": "\u0468",
- "Z": "\u005a",
- "Zaarmenian": "\u0536",
- "Zacute": "\u0179",
- "Zcaron": "\u017d",
- "Zcaronsmall": "\uf6ff",
- "Zcircle": "\u24cf",
- "Zcircumflex": "\u1e90",
- "Zdot": "\u017b",
- "Zdotaccent": "\u017b",
- "Zdotbelow": "\u1e92",
- "Zecyrillic": "\u0417",
- "Zedescendercyrillic": "\u0498",
- "Zedieresiscyrillic": "\u04de",
- "Zeta": "\u0396",
- "Zhearmenian": "\u053a",
- "Zhebrevecyrillic": "\u04c1",
- "Zhecyrillic": "\u0416",
- "Zhedescendercyrillic": "\u0496",
- "Zhedieresiscyrillic": "\u04dc",
- "Zlinebelow": "\u1e94",
- "Zmonospace": "\uff3a",
- "Zsmall": "\uf77a",
- "Zstroke": "\u01b5",
- "a": "\u0061",
- "aabengali": "\u0986",
- "aacute": "\u00e1",
- "aadeva": "\u0906",
- "aagujarati": "\u0a86",
- "aagurmukhi": "\u0a06",
- "aamatragurmukhi": "\u0a3e",
- "aarusquare": "\u3303",
- "aavowelsignbengali": "\u09be",
- "aavowelsigndeva": "\u093e",
- "aavowelsigngujarati": "\u0abe",
- "abbreviationmarkarmenian": "\u055f",
- "abbreviationsigndeva": "\u0970",
- "abengali": "\u0985",
- "abopomofo": "\u311a",
- "abreve": "\u0103",
- "abreveacute": "\u1eaf",
- "abrevecyrillic": "\u04d1",
- "abrevedotbelow": "\u1eb7",
- "abrevegrave": "\u1eb1",
- "abrevehookabove": "\u1eb3",
- "abrevetilde": "\u1eb5",
- "acaron": "\u01ce",
- "acircle": "\u24d0",
- "acircumflex": "\u00e2",
- "acircumflexacute": "\u1ea5",
- "acircumflexdotbelow": "\u1ead",
- "acircumflexgrave": "\u1ea7",
- "acircumflexhookabove": "\u1ea9",
- "acircumflextilde": "\u1eab",
- "acute": "\u00b4",
- "acutebelowcmb": "\u0317",
- "acutecmb": "\u0301",
- "acutecomb": "\u0301",
- "acutedeva": "\u0954",
- "acutelowmod": "\u02cf",
- "acutetonecmb": "\u0341",
- "acyrillic": "\u0430",
- "adblgrave": "\u0201",
- "addakgurmukhi": "\u0a71",
- "adeva": "\u0905",
- "adieresis": "\u00e4",
- "adieresiscyrillic": "\u04d3",
- "adieresismacron": "\u01df",
- "adotbelow": "\u1ea1",
- "adotmacron": "\u01e1",
- "ae": "\u00e6",
- "aeacute": "\u01fd",
- "aekorean": "\u3150",
- "aemacron": "\u01e3",
- "afii00208": "\u2015",
- "afii08941": "\u20a4",
- "afii10017": "\u0410",
- "afii10018": "\u0411",
- "afii10019": "\u0412",
- "afii10020": "\u0413",
- "afii10021": "\u0414",
- "afii10022": "\u0415",
- "afii10023": "\u0401",
- "afii10024": "\u0416",
- "afii10025": "\u0417",
- "afii10026": "\u0418",
- "afii10027": "\u0419",
- "afii10028": "\u041a",
- "afii10029": "\u041b",
- "afii10030": "\u041c",
- "afii10031": "\u041d",
- "afii10032": "\u041e",
- "afii10033": "\u041f",
- "afii10034": "\u0420",
- "afii10035": "\u0421",
- "afii10036": "\u0422",
- "afii10037": "\u0423",
- "afii10038": "\u0424",
- "afii10039": "\u0425",
- "afii10040": "\u0426",
- "afii10041": "\u0427",
- "afii10042": "\u0428",
- "afii10043": "\u0429",
- "afii10044": "\u042a",
- "afii10045": "\u042b",
- "afii10046": "\u042c",
- "afii10047": "\u042d",
- "afii10048": "\u042e",
- "afii10049": "\u042f",
- "afii10050": "\u0490",
- "afii10051": "\u0402",
- "afii10052": "\u0403",
- "afii10053": "\u0404",
- "afii10054": "\u0405",
- "afii10055": "\u0406",
- "afii10056": "\u0407",
- "afii10057": "\u0408",
- "afii10058": "\u0409",
- "afii10059": "\u040a",
- "afii10060": "\u040b",
- "afii10061": "\u040c",
- "afii10062": "\u040e",
- "afii10063": "\uf6c4",
- "afii10064": "\uf6c5",
- "afii10065": "\u0430",
- "afii10066": "\u0431",
- "afii10067": "\u0432",
- "afii10068": "\u0433",
- "afii10069": "\u0434",
- "afii10070": "\u0435",
- "afii10071": "\u0451",
- "afii10072": "\u0436",
- "afii10073": "\u0437",
- "afii10074": "\u0438",
- "afii10075": "\u0439",
- "afii10076": "\u043a",
- "afii10077": "\u043b",
- "afii10078": "\u043c",
- "afii10079": "\u043d",
- "afii10080": "\u043e",
- "afii10081": "\u043f",
- "afii10082": "\u0440",
- "afii10083": "\u0441",
- "afii10084": "\u0442",
- "afii10085": "\u0443",
- "afii10086": "\u0444",
- "afii10087": "\u0445",
- "afii10088": "\u0446",
- "afii10089": "\u0447",
- "afii10090": "\u0448",
- "afii10091": "\u0449",
- "afii10092": "\u044a",
- "afii10093": "\u044b",
- "afii10094": "\u044c",
- "afii10095": "\u044d",
- "afii10096": "\u044e",
- "afii10097": "\u044f",
- "afii10098": "\u0491",
- "afii10099": "\u0452",
- "afii10100": "\u0453",
- "afii10101": "\u0454",
- "afii10102": "\u0455",
- "afii10103": "\u0456",
- "afii10104": "\u0457",
- "afii10105": "\u0458",
- "afii10106": "\u0459",
- "afii10107": "\u045a",
- "afii10108": "\u045b",
- "afii10109": "\u045c",
- "afii10110": "\u045e",
- "afii10145": "\u040f",
- "afii10146": "\u0462",
- "afii10147": "\u0472",
- "afii10148": "\u0474",
- "afii10192": "\uf6c6",
- "afii10193": "\u045f",
- "afii10194": "\u0463",
- "afii10195": "\u0473",
- "afii10196": "\u0475",
- "afii10831": "\uf6c7",
- "afii10832": "\uf6c8",
- "afii10846": "\u04d9",
- "afii299": "\u200e",
- "afii300": "\u200f",
- "afii301": "\u200d",
- "afii57381": "\u066a",
- "afii57388": "\u060c",
- "afii57392": "\u0660",
- "afii57393": "\u0661",
- "afii57394": "\u0662",
- "afii57395": "\u0663",
- "afii57396": "\u0664",
- "afii57397": "\u0665",
- "afii57398": "\u0666",
- "afii57399": "\u0667",
- "afii57400": "\u0668",
- "afii57401": "\u0669",
- "afii57403": "\u061b",
- "afii57407": "\u061f",
- "afii57409": "\u0621",
- "afii57410": "\u0622",
- "afii57411": "\u0623",
- "afii57412": "\u0624",
- "afii57413": "\u0625",
- "afii57414": "\u0626",
- "afii57415": "\u0627",
- "afii57416": "\u0628",
- "afii57417": "\u0629",
- "afii57418": "\u062a",
- "afii57419": "\u062b",
- "afii57420": "\u062c",
- "afii57421": "\u062d",
- "afii57422": "\u062e",
- "afii57423": "\u062f",
- "afii57424": "\u0630",
- "afii57425": "\u0631",
- "afii57426": "\u0632",
- "afii57427": "\u0633",
- "afii57428": "\u0634",
- "afii57429": "\u0635",
- "afii57430": "\u0636",
- "afii57431": "\u0637",
- "afii57432": "\u0638",
- "afii57433": "\u0639",
- "afii57434": "\u063a",
- "afii57440": "\u0640",
- "afii57441": "\u0641",
- "afii57442": "\u0642",
- "afii57443": "\u0643",
- "afii57444": "\u0644",
- "afii57445": "\u0645",
- "afii57446": "\u0646",
- "afii57448": "\u0648",
- "afii57449": "\u0649",
- "afii57450": "\u064a",
- "afii57451": "\u064b",
- "afii57452": "\u064c",
- "afii57453": "\u064d",
- "afii57454": "\u064e",
- "afii57455": "\u064f",
- "afii57456": "\u0650",
- "afii57457": "\u0651",
- "afii57458": "\u0652",
- "afii57470": "\u0647",
- "afii57505": "\u06a4",
- "afii57506": "\u067e",
- "afii57507": "\u0686",
- "afii57508": "\u0698",
- "afii57509": "\u06af",
- "afii57511": "\u0679",
- "afii57512": "\u0688",
- "afii57513": "\u0691",
- "afii57514": "\u06ba",
- "afii57519": "\u06d2",
- "afii57534": "\u06d5",
- "afii57636": "\u20aa",
- "afii57645": "\u05be",
- "afii57658": "\u05c3",
- "afii57664": "\u05d0",
- "afii57665": "\u05d1",
- "afii57666": "\u05d2",
- "afii57667": "\u05d3",
- "afii57668": "\u05d4",
- "afii57669": "\u05d5",
- "afii57670": "\u05d6",
- "afii57671": "\u05d7",
- "afii57672": "\u05d8",
- "afii57673": "\u05d9",
- "afii57674": "\u05da",
- "afii57675": "\u05db",
- "afii57676": "\u05dc",
- "afii57677": "\u05dd",
- "afii57678": "\u05de",
- "afii57679": "\u05df",
- "afii57680": "\u05e0",
- "afii57681": "\u05e1",
- "afii57682": "\u05e2",
- "afii57683": "\u05e3",
- "afii57684": "\u05e4",
- "afii57685": "\u05e5",
- "afii57686": "\u05e6",
- "afii57687": "\u05e7",
- "afii57688": "\u05e8",
- "afii57689": "\u05e9",
- "afii57690": "\u05ea",
- "afii57694": "\ufb2a",
- "afii57695": "\ufb2b",
- "afii57700": "\ufb4b",
- "afii57705": "\ufb1f",
- "afii57716": "\u05f0",
- "afii57717": "\u05f1",
- "afii57718": "\u05f2",
- "afii57723": "\ufb35",
- "afii57793": "\u05b4",
- "afii57794": "\u05b5",
- "afii57795": "\u05b6",
- "afii57796": "\u05bb",
- "afii57797": "\u05b8",
- "afii57798": "\u05b7",
- "afii57799": "\u05b0",
- "afii57800": "\u05b2",
- "afii57801": "\u05b1",
- "afii57802": "\u05b3",
- "afii57803": "\u05c2",
- "afii57804": "\u05c1",
- "afii57806": "\u05b9",
- "afii57807": "\u05bc",
- "afii57839": "\u05bd",
- "afii57841": "\u05bf",
- "afii57842": "\u05c0",
- "afii57929": "\u02bc",
- "afii61248": "\u2105",
- "afii61289": "\u2113",
- "afii61352": "\u2116",
- "afii61573": "\u202c",
- "afii61574": "\u202d",
- "afii61575": "\u202e",
- "afii61664": "\u200c",
- "afii63167": "\u066d",
- "afii64937": "\u02bd",
- "agrave": "\u00e0",
- "agujarati": "\u0a85",
- "agurmukhi": "\u0a05",
- "ahiragana": "\u3042",
- "ahookabove": "\u1ea3",
- "aibengali": "\u0990",
- "aibopomofo": "\u311e",
- "aideva": "\u0910",
- "aiecyrillic": "\u04d5",
- "aigujarati": "\u0a90",
- "aigurmukhi": "\u0a10",
- "aimatragurmukhi": "\u0a48",
- "ainarabic": "\u0639",
- "ainfinalarabic": "\ufeca",
- "aininitialarabic": "\ufecb",
- "ainmedialarabic": "\ufecc",
- "ainvertedbreve": "\u0203",
- "aivowelsignbengali": "\u09c8",
- "aivowelsigndeva": "\u0948",
- "aivowelsigngujarati": "\u0ac8",
- "akatakana": "\u30a2",
- "akatakanahalfwidth": "\uff71",
- "akorean": "\u314f",
- "alef": "\u05d0",
- "alefarabic": "\u0627",
- "alefdageshhebrew": "\ufb30",
- "aleffinalarabic": "\ufe8e",
- "alefhamzaabovearabic": "\u0623",
- "alefhamzaabovefinalarabic": "\ufe84",
- "alefhamzabelowarabic": "\u0625",
- "alefhamzabelowfinalarabic": "\ufe88",
- "alefhebrew": "\u05d0",
- "aleflamedhebrew": "\ufb4f",
- "alefmaddaabovearabic": "\u0622",
- "alefmaddaabovefinalarabic": "\ufe82",
- "alefmaksuraarabic": "\u0649",
- "alefmaksurafinalarabic": "\ufef0",
- "alefmaksurainitialarabic": "\ufef3",
- "alefmaksuramedialarabic": "\ufef4",
- "alefpatahhebrew": "\ufb2e",
- "alefqamatshebrew": "\ufb2f",
- "aleph": "\u2135",
- "allequal": "\u224c",
- "alpha": "\u03b1",
- "alphatonos": "\u03ac",
- "amacron": "\u0101",
- "amonospace": "\uff41",
- "ampersand": "\u0026",
- "ampersandmonospace": "\uff06",
- "ampersandsmall": "\uf726",
- "amsquare": "\u33c2",
- "anbopomofo": "\u3122",
- "angbopomofo": "\u3124",
- "angkhankhuthai": "\u0e5a",
- "angle": "\u2220",
- "anglebracketleft": "\u3008",
- "anglebracketleftvertical": "\ufe3f",
- "anglebracketright": "\u3009",
- "anglebracketrightvertical": "\ufe40",
- "angleleft": "\u2329",
- "angleright": "\u232a",
- "angstrom": "\u212b",
- "anoteleia": "\u0387",
- "anudattadeva": "\u0952",
- "anusvarabengali": "\u0982",
- "anusvaradeva": "\u0902",
- "anusvaragujarati": "\u0a82",
- "aogonek": "\u0105",
- "apaatosquare": "\u3300",
- "aparen": "\u249c",
- "apostrophearmenian": "\u055a",
- "apostrophemod": "\u02bc",
- "apple": "\uf8ff",
- "approaches": "\u2250",
- "approxequal": "\u2248",
- "approxequalorimage": "\u2252",
- "approximatelyequal": "\u2245",
- "araeaekorean": "\u318e",
- "araeakorean": "\u318d",
- "arc": "\u2312",
- "arighthalfring": "\u1e9a",
- "aring": "\u00e5",
- "aringacute": "\u01fb",
- "aringbelow": "\u1e01",
- "arrowboth": "\u2194",
- "arrowdashdown": "\u21e3",
- "arrowdashleft": "\u21e0",
- "arrowdashright": "\u21e2",
- "arrowdashup": "\u21e1",
- "arrowdblboth": "\u21d4",
- "arrowdbldown": "\u21d3",
- "arrowdblleft": "\u21d0",
- "arrowdblright": "\u21d2",
- "arrowdblup": "\u21d1",
- "arrowdown": "\u2193",
- "arrowdownleft": "\u2199",
- "arrowdownright": "\u2198",
- "arrowdownwhite": "\u21e9",
- "arrowheaddownmod": "\u02c5",
- "arrowheadleftmod": "\u02c2",
- "arrowheadrightmod": "\u02c3",
- "arrowheadupmod": "\u02c4",
- "arrowhorizex": "\uf8e7",
- "arrowleft": "\u2190",
- "arrowleftdbl": "\u21d0",
- "arrowleftdblstroke": "\u21cd",
- "arrowleftoverright": "\u21c6",
- "arrowleftwhite": "\u21e6",
- "arrowright": "\u2192",
- "arrowrightdblstroke": "\u21cf",
- "arrowrightheavy": "\u279e",
- "arrowrightoverleft": "\u21c4",
- "arrowrightwhite": "\u21e8",
- "arrowtableft": "\u21e4",
- "arrowtabright": "\u21e5",
- "arrowup": "\u2191",
- "arrowupdn": "\u2195",
- "arrowupdnbse": "\u21a8",
- "arrowupdownbase": "\u21a8",
- "arrowupleft": "\u2196",
- "arrowupleftofdown": "\u21c5",
- "arrowupright": "\u2197",
- "arrowupwhite": "\u21e7",
- "arrowvertex": "\uf8e6",
- "asciicircum": "\u005e",
- "asciicircummonospace": "\uff3e",
- "asciitilde": "\u007e",
- "asciitildemonospace": "\uff5e",
- "ascript": "\u0251",
- "ascriptturned": "\u0252",
- "asmallhiragana": "\u3041",
- "asmallkatakana": "\u30a1",
- "asmallkatakanahalfwidth": "\uff67",
- "asterisk": "\u002a",
- "asteriskaltonearabic": "\u066d",
- "asteriskarabic": "\u066d",
- "asteriskmath": "\u2217",
- "asteriskmonospace": "\uff0a",
- "asterisksmall": "\ufe61",
- "asterism": "\u2042",
- "asuperior": "\uf6e9",
- "asymptoticallyequal": "\u2243",
- "at": "\u0040",
- "atilde": "\u00e3",
- "atmonospace": "\uff20",
- "atsmall": "\ufe6b",
- "aturned": "\u0250",
- "aubengali": "\u0994",
- "aubopomofo": "\u3120",
- "audeva": "\u0914",
- "augujarati": "\u0a94",
- "augurmukhi": "\u0a14",
- "aulengthmarkbengali": "\u09d7",
- "aumatragurmukhi": "\u0a4c",
- "auvowelsignbengali": "\u09cc",
- "auvowelsigndeva": "\u094c",
- "auvowelsigngujarati": "\u0acc",
- "avagrahadeva": "\u093d",
- "aybarmenian": "\u0561",
- "ayin": "\u05e2",
- "ayinaltonehebrew": "\ufb20",
- "ayinhebrew": "\u05e2",
- "b": "\u0062",
- "babengali": "\u09ac",
- "backslash": "\u005c",
- "backslashmonospace": "\uff3c",
- "badeva": "\u092c",
- "bagujarati": "\u0aac",
- "bagurmukhi": "\u0a2c",
- "bahiragana": "\u3070",
- "bahtthai": "\u0e3f",
- "bakatakana": "\u30d0",
- "bar": "\u007c",
- "barmonospace": "\uff5c",
- "bbopomofo": "\u3105",
- "bcircle": "\u24d1",
- "bdotaccent": "\u1e03",
- "bdotbelow": "\u1e05",
- "beamedsixteenthnotes": "\u266c",
- "because": "\u2235",
- "becyrillic": "\u0431",
- "beharabic": "\u0628",
- "behfinalarabic": "\ufe90",
- "behinitialarabic": "\ufe91",
- "behiragana": "\u3079",
- "behmedialarabic": "\ufe92",
- "behmeeminitialarabic": "\ufc9f",
- "behmeemisolatedarabic": "\ufc08",
- "behnoonfinalarabic": "\ufc6d",
- "bekatakana": "\u30d9",
- "benarmenian": "\u0562",
- "bet": "\u05d1",
- "beta": "\u03b2",
- "betasymbolgreek": "\u03d0",
- "betdagesh": "\ufb31",
- "betdageshhebrew": "\ufb31",
- "bethebrew": "\u05d1",
- "betrafehebrew": "\ufb4c",
- "bhabengali": "\u09ad",
- "bhadeva": "\u092d",
- "bhagujarati": "\u0aad",
- "bhagurmukhi": "\u0a2d",
- "bhook": "\u0253",
- "bihiragana": "\u3073",
- "bikatakana": "\u30d3",
- "bilabialclick": "\u0298",
- "bindigurmukhi": "\u0a02",
- "birusquare": "\u3331",
- "blackcircle": "\u25cf",
- "blackdiamond": "\u25c6",
- "blackdownpointingtriangle": "\u25bc",
- "blackleftpointingpointer": "\u25c4",
- "blackleftpointingtriangle": "\u25c0",
- "blacklenticularbracketleft": "\u3010",
- "blacklenticularbracketleftvertical": "\ufe3b",
- "blacklenticularbracketright": "\u3011",
- "blacklenticularbracketrightvertical": "\ufe3c",
- "blacklowerlefttriangle": "\u25e3",
- "blacklowerrighttriangle": "\u25e2",
- "blackrectangle": "\u25ac",
- "blackrightpointingpointer": "\u25ba",
- "blackrightpointingtriangle": "\u25b6",
- "blacksmallsquare": "\u25aa",
- "blacksmilingface": "\u263b",
- "blacksquare": "\u25a0",
- "blackstar": "\u2605",
- "blackupperlefttriangle": "\u25e4",
- "blackupperrighttriangle": "\u25e5",
- "blackuppointingsmalltriangle": "\u25b4",
- "blackuppointingtriangle": "\u25b2",
- "blank": "\u2423",
- "blinebelow": "\u1e07",
- "block": "\u2588",
- "bmonospace": "\uff42",
- "bobaimaithai": "\u0e1a",
- "bohiragana": "\u307c",
- "bokatakana": "\u30dc",
- "bparen": "\u249d",
- "bqsquare": "\u33c3",
- "braceex": "\uf8f4",
- "braceleft": "\u007b",
- "braceleftbt": "\uf8f3",
- "braceleftmid": "\uf8f2",
- "braceleftmonospace": "\uff5b",
- "braceleftsmall": "\ufe5b",
- "bracelefttp": "\uf8f1",
- "braceleftvertical": "\ufe37",
- "braceright": "\u007d",
- "bracerightbt": "\uf8fe",
- "bracerightmid": "\uf8fd",
- "bracerightmonospace": "\uff5d",
- "bracerightsmall": "\ufe5c",
- "bracerighttp": "\uf8fc",
- "bracerightvertical": "\ufe38",
- "bracketleft": "\u005b",
- "bracketleftbt": "\uf8f0",
- "bracketleftex": "\uf8ef",
- "bracketleftmonospace": "\uff3b",
- "bracketlefttp": "\uf8ee",
- "bracketright": "\u005d",
- "bracketrightbt": "\uf8fb",
- "bracketrightex": "\uf8fa",
- "bracketrightmonospace": "\uff3d",
- "bracketrighttp": "\uf8f9",
- "breve": "\u02d8",
- "brevebelowcmb": "\u032e",
- "brevecmb": "\u0306",
- "breveinvertedbelowcmb": "\u032f",
- "breveinvertedcmb": "\u0311",
- "breveinverteddoublecmb": "\u0361",
- "bridgebelowcmb": "\u032a",
- "bridgeinvertedbelowcmb": "\u033a",
- "brokenbar": "\u00a6",
- "bstroke": "\u0180",
- "bsuperior": "\uf6ea",
- "btopbar": "\u0183",
- "buhiragana": "\u3076",
- "bukatakana": "\u30d6",
- "bullet": "\u2022",
- "bulletinverse": "\u25d8",
- "bulletoperator": "\u2219",
- "bullseye": "\u25ce",
- "c": "\u0063",
- "caarmenian": "\u056e",
- "cabengali": "\u099a",
- "cacute": "\u0107",
- "cadeva": "\u091a",
- "cagujarati": "\u0a9a",
- "cagurmukhi": "\u0a1a",
- "calsquare": "\u3388",
- "candrabindubengali": "\u0981",
- "candrabinducmb": "\u0310",
- "candrabindudeva": "\u0901",
- "candrabindugujarati": "\u0a81",
- "capslock": "\u21ea",
- "careof": "\u2105",
- "caron": "\u02c7",
- "caronbelowcmb": "\u032c",
- "caroncmb": "\u030c",
- "carriagereturn": "\u21b5",
- "cbopomofo": "\u3118",
- "ccaron": "\u010d",
- "ccedilla": "\u00e7",
- "ccedillaacute": "\u1e09",
- "ccircle": "\u24d2",
- "ccircumflex": "\u0109",
- "ccurl": "\u0255",
- "cdot": "\u010b",
- "cdotaccent": "\u010b",
- "cdsquare": "\u33c5",
- "cedilla": "\u00b8",
- "cedillacmb": "\u0327",
- "cent": "\u00a2",
- "centigrade": "\u2103",
- "centinferior": "\uf6df",
- "centmonospace": "\uffe0",
- "centoldstyle": "\uf7a2",
- "centsuperior": "\uf6e0",
- "chaarmenian": "\u0579",
- "chabengali": "\u099b",
- "chadeva": "\u091b",
- "chagujarati": "\u0a9b",
- "chagurmukhi": "\u0a1b",
- "chbopomofo": "\u3114",
- "cheabkhasiancyrillic": "\u04bd",
- "checkmark": "\u2713",
- "checyrillic": "\u0447",
- "chedescenderabkhasiancyrillic": "\u04bf",
- "chedescendercyrillic": "\u04b7",
- "chedieresiscyrillic": "\u04f5",
- "cheharmenian": "\u0573",
- "chekhakassiancyrillic": "\u04cc",
- "cheverticalstrokecyrillic": "\u04b9",
- "chi": "\u03c7",
- "chieuchacirclekorean": "\u3277",
- "chieuchaparenkorean": "\u3217",
- "chieuchcirclekorean": "\u3269",
- "chieuchkorean": "\u314a",
- "chieuchparenkorean": "\u3209",
- "chochangthai": "\u0e0a",
- "chochanthai": "\u0e08",
- "chochingthai": "\u0e09",
- "chochoethai": "\u0e0c",
- "chook": "\u0188",
- "cieucacirclekorean": "\u3276",
- "cieucaparenkorean": "\u3216",
- "cieuccirclekorean": "\u3268",
- "cieuckorean": "\u3148",
- "cieucparenkorean": "\u3208",
- "cieucuparenkorean": "\u321c",
- "circle": "\u25cb",
- "circlemultiply": "\u2297",
- "circleot": "\u2299",
- "circleplus": "\u2295",
- "circlepostalmark": "\u3036",
- "circlewithlefthalfblack": "\u25d0",
- "circlewithrighthalfblack": "\u25d1",
- "circumflex": "\u02c6",
- "circumflexbelowcmb": "\u032d",
- "circumflexcmb": "\u0302",
- "clear": "\u2327",
- "clickalveolar": "\u01c2",
- "clickdental": "\u01c0",
- "clicklateral": "\u01c1",
- "clickretroflex": "\u01c3",
- "club": "\u2663",
- "clubsuitblack": "\u2663",
- "clubsuitwhite": "\u2667",
- "cmcubedsquare": "\u33a4",
- "cmonospace": "\uff43",
- "cmsquaredsquare": "\u33a0",
- "coarmenian": "\u0581",
- "colon": "\u003a",
- "colonmonetary": "\u20a1",
- "colonmonospace": "\uff1a",
- "colonsign": "\u20a1",
- "colonsmall": "\ufe55",
- "colontriangularhalfmod": "\u02d1",
- "colontriangularmod": "\u02d0",
- "comma": "\u002c",
- "commaabovecmb": "\u0313",
- "commaaboverightcmb": "\u0315",
- "commaaccent": "\uf6c3",
- "commaarabic": "\u060c",
- "commaarmenian": "\u055d",
- "commainferior": "\uf6e1",
- "commamonospace": "\uff0c",
- "commareversedabovecmb": "\u0314",
- "commareversedmod": "\u02bd",
- "commasmall": "\ufe50",
- "commasuperior": "\uf6e2",
- "commaturnedabovecmb": "\u0312",
- "commaturnedmod": "\u02bb",
- "compass": "\u263c",
- "congruent": "\u2245",
- "contourintegral": "\u222e",
- "control": "\u2303",
- "controlACK": "\u0006",
- "controlBEL": "\u0007",
- "controlBS": "\u0008",
- "controlCAN": "\u0018",
- "controlCR": "\u000d",
- "controlDC1": "\u0011",
- "controlDC2": "\u0012",
- "controlDC3": "\u0013",
- "controlDC4": "\u0014",
- "controlDEL": "\u007f",
- "controlDLE": "\u0010",
- "controlEM": "\u0019",
- "controlENQ": "\u0005",
- "controlEOT": "\u0004",
- "controlESC": "\u001b",
- "controlETB": "\u0017",
- "controlETX": "\u0003",
- "controlFF": "\u000c",
- "controlFS": "\u001c",
- "controlGS": "\u001d",
- "controlHT": "\u0009",
- "controlLF": "\u000a",
- "controlNAK": "\u0015",
- "controlRS": "\u001e",
- "controlSI": "\u000f",
- "controlSO": "\u000e",
- "controlSOT": "\u0002",
- "controlSTX": "\u0001",
- "controlSUB": "\u001a",
- "controlSYN": "\u0016",
- "controlUS": "\u001f",
- "controlVT": "\u000b",
- "copyright": "\u00a9",
- "copyrightsans": "\uf8e9",
- "copyrightserif": "\uf6d9",
- "cornerbracketleft": "\u300c",
- "cornerbracketlefthalfwidth": "\uff62",
- "cornerbracketleftvertical": "\ufe41",
- "cornerbracketright": "\u300d",
- "cornerbracketrighthalfwidth": "\uff63",
- "cornerbracketrightvertical": "\ufe42",
- "corporationsquare": "\u337f",
- "cosquare": "\u33c7",
- "coverkgsquare": "\u33c6",
- "cparen": "\u249e",
- "cruzeiro": "\u20a2",
- "cstretched": "\u0297",
- "curlyand": "\u22cf",
- "curlyor": "\u22ce",
- "currency": "\u00a4",
- "cyrBreve": "\uf6d1",
- "cyrFlex": "\uf6d2",
- "cyrbreve": "\uf6d4",
- "cyrflex": "\uf6d5",
- "d": "\u0064",
- "daarmenian": "\u0564",
- "dabengali": "\u09a6",
- "dadarabic": "\u0636",
- "dadeva": "\u0926",
- "dadfinalarabic": "\ufebe",
- "dadinitialarabic": "\ufebf",
- "dadmedialarabic": "\ufec0",
- "dagesh": "\u05bc",
- "dageshhebrew": "\u05bc",
- "dagger": "\u2020",
- "daggerdbl": "\u2021",
- "dagujarati": "\u0aa6",
- "dagurmukhi": "\u0a26",
- "dahiragana": "\u3060",
- "dakatakana": "\u30c0",
- "dalarabic": "\u062f",
- "dalet": "\u05d3",
- "daletdagesh": "\ufb33",
- "daletdageshhebrew": "\ufb33",
- "dalethatafpatah": "\u05d3\u05b2",
- "dalethatafpatahhebrew": "\u05d3\u05b2",
- "dalethatafsegol": "\u05d3\u05b1",
- "dalethatafsegolhebrew": "\u05d3\u05b1",
- "dalethebrew": "\u05d3",
- "dalethiriq": "\u05d3\u05b4",
- "dalethiriqhebrew": "\u05d3\u05b4",
- "daletholam": "\u05d3\u05b9",
- "daletholamhebrew": "\u05d3\u05b9",
- "daletpatah": "\u05d3\u05b7",
- "daletpatahhebrew": "\u05d3\u05b7",
- "daletqamats": "\u05d3\u05b8",
- "daletqamatshebrew": "\u05d3\u05b8",
- "daletqubuts": "\u05d3\u05bb",
- "daletqubutshebrew": "\u05d3\u05bb",
- "daletsegol": "\u05d3\u05b6",
- "daletsegolhebrew": "\u05d3\u05b6",
- "daletsheva": "\u05d3\u05b0",
- "daletshevahebrew": "\u05d3\u05b0",
- "dalettsere": "\u05d3\u05b5",
- "dalettserehebrew": "\u05d3\u05b5",
- "dalfinalarabic": "\ufeaa",
- "dammaarabic": "\u064f",
- "dammalowarabic": "\u064f",
- "dammatanaltonearabic": "\u064c",
- "dammatanarabic": "\u064c",
- "danda": "\u0964",
- "dargahebrew": "\u05a7",
- "dargalefthebrew": "\u05a7",
- "dasiapneumatacyrilliccmb": "\u0485",
- "dblGrave": "\uf6d3",
- "dblanglebracketleft": "\u300a",
- "dblanglebracketleftvertical": "\ufe3d",
- "dblanglebracketright": "\u300b",
- "dblanglebracketrightvertical": "\ufe3e",
- "dblarchinvertedbelowcmb": "\u032b",
- "dblarrowleft": "\u21d4",
- "dblarrowright": "\u21d2",
- "dbldanda": "\u0965",
- "dblgrave": "\uf6d6",
- "dblgravecmb": "\u030f",
- "dblintegral": "\u222c",
- "dbllowline": "\u2017",
- "dbllowlinecmb": "\u0333",
- "dbloverlinecmb": "\u033f",
- "dblprimemod": "\u02ba",
- "dblverticalbar": "\u2016",
- "dblverticallineabovecmb": "\u030e",
- "dbopomofo": "\u3109",
- "dbsquare": "\u33c8",
- "dcaron": "\u010f",
- "dcedilla": "\u1e11",
- "dcircle": "\u24d3",
- "dcircumflexbelow": "\u1e13",
- "dcroat": "\u0111",
- "ddabengali": "\u09a1",
- "ddadeva": "\u0921",
- "ddagujarati": "\u0aa1",
- "ddagurmukhi": "\u0a21",
- "ddalarabic": "\u0688",
- "ddalfinalarabic": "\ufb89",
- "dddhadeva": "\u095c",
- "ddhabengali": "\u09a2",
- "ddhadeva": "\u0922",
- "ddhagujarati": "\u0aa2",
- "ddhagurmukhi": "\u0a22",
- "ddotaccent": "\u1e0b",
- "ddotbelow": "\u1e0d",
- "decimalseparatorarabic": "\u066b",
- "decimalseparatorpersian": "\u066b",
- "decyrillic": "\u0434",
- "degree": "\u00b0",
- "dehihebrew": "\u05ad",
- "dehiragana": "\u3067",
- "deicoptic": "\u03ef",
- "dekatakana": "\u30c7",
- "deleteleft": "\u232b",
- "deleteright": "\u2326",
- "delta": "\u03b4",
- "deltaturned": "\u018d",
- "denominatorminusonenumeratorbengali": "\u09f8",
- "dezh": "\u02a4",
- "dhabengali": "\u09a7",
- "dhadeva": "\u0927",
- "dhagujarati": "\u0aa7",
- "dhagurmukhi": "\u0a27",
- "dhook": "\u0257",
- "dialytikatonos": "\u0385",
- "dialytikatonoscmb": "\u0344",
- "diamond": "\u2666",
- "diamondsuitwhite": "\u2662",
- "dieresis": "\u00a8",
- "dieresisacute": "\uf6d7",
- "dieresisbelowcmb": "\u0324",
- "dieresiscmb": "\u0308",
- "dieresisgrave": "\uf6d8",
- "dieresistonos": "\u0385",
- "dihiragana": "\u3062",
- "dikatakana": "\u30c2",
- "dittomark": "\u3003",
- "divide": "\u00f7",
- "divides": "\u2223",
- "divisionslash": "\u2215",
- "djecyrillic": "\u0452",
- "dkshade": "\u2593",
- "dlinebelow": "\u1e0f",
- "dlsquare": "\u3397",
- "dmacron": "\u0111",
- "dmonospace": "\uff44",
- "dnblock": "\u2584",
- "dochadathai": "\u0e0e",
- "dodekthai": "\u0e14",
- "dohiragana": "\u3069",
- "dokatakana": "\u30c9",
- "dollar": "\u0024",
- "dollarinferior": "\uf6e3",
- "dollarmonospace": "\uff04",
- "dollaroldstyle": "\uf724",
- "dollarsmall": "\ufe69",
- "dollarsuperior": "\uf6e4",
- "dong": "\u20ab",
- "dorusquare": "\u3326",
- "dotaccent": "\u02d9",
- "dotaccentcmb": "\u0307",
- "dotbelowcmb": "\u0323",
- "dotbelowcomb": "\u0323",
- "dotkatakana": "\u30fb",
- "dotlessi": "\u0131",
- "dotlessj": "\uf6be",
- "dotlessjstrokehook": "\u0284",
- "dotmath": "\u22c5",
- "dottedcircle": "\u25cc",
- "doubleyodpatah": "\ufb1f",
- "doubleyodpatahhebrew": "\ufb1f",
- "downtackbelowcmb": "\u031e",
- "downtackmod": "\u02d5",
- "dparen": "\u249f",
- "dsuperior": "\uf6eb",
- "dtail": "\u0256",
- "dtopbar": "\u018c",
- "duhiragana": "\u3065",
- "dukatakana": "\u30c5",
- "dz": "\u01f3",
- "dzaltone": "\u02a3",
- "dzcaron": "\u01c6",
- "dzcurl": "\u02a5",
- "dzeabkhasiancyrillic": "\u04e1",
- "dzecyrillic": "\u0455",
- "dzhecyrillic": "\u045f",
- "e": "\u0065",
- "eacute": "\u00e9",
- "earth": "\u2641",
- "ebengali": "\u098f",
- "ebopomofo": "\u311c",
- "ebreve": "\u0115",
- "ecandradeva": "\u090d",
- "ecandragujarati": "\u0a8d",
- "ecandravowelsigndeva": "\u0945",
- "ecandravowelsigngujarati": "\u0ac5",
- "ecaron": "\u011b",
- "ecedillabreve": "\u1e1d",
- "echarmenian": "\u0565",
- "echyiwnarmenian": "\u0587",
- "ecircle": "\u24d4",
- "ecircumflex": "\u00ea",
- "ecircumflexacute": "\u1ebf",
- "ecircumflexbelow": "\u1e19",
- "ecircumflexdotbelow": "\u1ec7",
- "ecircumflexgrave": "\u1ec1",
- "ecircumflexhookabove": "\u1ec3",
- "ecircumflextilde": "\u1ec5",
- "ecyrillic": "\u0454",
- "edblgrave": "\u0205",
- "edeva": "\u090f",
- "edieresis": "\u00eb",
- "edot": "\u0117",
- "edotaccent": "\u0117",
- "edotbelow": "\u1eb9",
- "eegurmukhi": "\u0a0f",
- "eematragurmukhi": "\u0a47",
- "efcyrillic": "\u0444",
- "egrave": "\u00e8",
- "egujarati": "\u0a8f",
- "eharmenian": "\u0567",
- "ehbopomofo": "\u311d",
- "ehiragana": "\u3048",
- "ehookabove": "\u1ebb",
- "eibopomofo": "\u311f",
- "eight": "\u0038",
- "eightarabic": "\u0668",
- "eightbengali": "\u09ee",
- "eightcircle": "\u2467",
- "eightcircleinversesansserif": "\u2791",
- "eightdeva": "\u096e",
- "eighteencircle": "\u2471",
- "eighteenparen": "\u2485",
- "eighteenperiod": "\u2499",
- "eightgujarati": "\u0aee",
- "eightgurmukhi": "\u0a6e",
- "eighthackarabic": "\u0668",
- "eighthangzhou": "\u3028",
- "eighthnotebeamed": "\u266b",
- "eightideographicparen": "\u3227",
- "eightinferior": "\u2088",
- "eightmonospace": "\uff18",
- "eightoldstyle": "\uf738",
- "eightparen": "\u247b",
- "eightperiod": "\u248f",
- "eightpersian": "\u06f8",
- "eightroman": "\u2177",
- "eightsuperior": "\u2078",
- "eightthai": "\u0e58",
- "einvertedbreve": "\u0207",
- "eiotifiedcyrillic": "\u0465",
- "ekatakana": "\u30a8",
- "ekatakanahalfwidth": "\uff74",
- "ekonkargurmukhi": "\u0a74",
- "ekorean": "\u3154",
- "elcyrillic": "\u043b",
- "element": "\u2208",
- "elevencircle": "\u246a",
- "elevenparen": "\u247e",
- "elevenperiod": "\u2492",
- "elevenroman": "\u217a",
- "ellipsis": "\u2026",
- "ellipsisvertical": "\u22ee",
- "emacron": "\u0113",
- "emacronacute": "\u1e17",
- "emacrongrave": "\u1e15",
- "emcyrillic": "\u043c",
- "emdash": "\u2014",
- "emdashvertical": "\ufe31",
- "emonospace": "\uff45",
- "emphasismarkarmenian": "\u055b",
- "emptyset": "\u2205",
- "enbopomofo": "\u3123",
- "encyrillic": "\u043d",
- "endash": "\u2013",
- "endashvertical": "\ufe32",
- "endescendercyrillic": "\u04a3",
- "eng": "\u014b",
- "engbopomofo": "\u3125",
- "enghecyrillic": "\u04a5",
- "enhookcyrillic": "\u04c8",
- "enspace": "\u2002",
- "eogonek": "\u0119",
- "eokorean": "\u3153",
- "eopen": "\u025b",
- "eopenclosed": "\u029a",
- "eopenreversed": "\u025c",
- "eopenreversedclosed": "\u025e",
- "eopenreversedhook": "\u025d",
- "eparen": "\u24a0",
- "epsilon": "\u03b5",
- "epsilontonos": "\u03ad",
- "equal": "\u003d",
- "equalmonospace": "\uff1d",
- "equalsmall": "\ufe66",
- "equalsuperior": "\u207c",
- "equivalence": "\u2261",
- "erbopomofo": "\u3126",
- "ercyrillic": "\u0440",
- "ereversed": "\u0258",
- "ereversedcyrillic": "\u044d",
- "escyrillic": "\u0441",
- "esdescendercyrillic": "\u04ab",
- "esh": "\u0283",
- "eshcurl": "\u0286",
- "eshortdeva": "\u090e",
- "eshortvowelsigndeva": "\u0946",
- "eshreversedloop": "\u01aa",
- "eshsquatreversed": "\u0285",
- "esmallhiragana": "\u3047",
- "esmallkatakana": "\u30a7",
- "esmallkatakanahalfwidth": "\uff6a",
- "estimated": "\u212e",
- "esuperior": "\uf6ec",
- "eta": "\u03b7",
- "etarmenian": "\u0568",
- "etatonos": "\u03ae",
- "eth": "\u00f0",
- "etilde": "\u1ebd",
- "etildebelow": "\u1e1b",
- "etnahtafoukhhebrew": "\u0591",
- "etnahtafoukhlefthebrew": "\u0591",
- "etnahtahebrew": "\u0591",
- "etnahtalefthebrew": "\u0591",
- "eturned": "\u01dd",
- "eukorean": "\u3161",
- "euro": "\u20ac",
- "evowelsignbengali": "\u09c7",
- "evowelsigndeva": "\u0947",
- "evowelsigngujarati": "\u0ac7",
- "exclam": "\u0021",
- "exclamarmenian": "\u055c",
- "exclamdbl": "\u203c",
- "exclamdown": "\u00a1",
- "exclamdownsmall": "\uf7a1",
- "exclammonospace": "\uff01",
- "exclamsmall": "\uf721",
- "existential": "\u2203",
- "ezh": "\u0292",
- "ezhcaron": "\u01ef",
- "ezhcurl": "\u0293",
- "ezhreversed": "\u01b9",
- "ezhtail": "\u01ba",
- "f": "\u0066",
- "fadeva": "\u095e",
- "fagurmukhi": "\u0a5e",
- "fahrenheit": "\u2109",
- "fathaarabic": "\u064e",
- "fathalowarabic": "\u064e",
- "fathatanarabic": "\u064b",
- "fbopomofo": "\u3108",
- "fcircle": "\u24d5",
- "fdotaccent": "\u1e1f",
- "feharabic": "\u0641",
- "feharmenian": "\u0586",
- "fehfinalarabic": "\ufed2",
- "fehinitialarabic": "\ufed3",
- "fehmedialarabic": "\ufed4",
- "feicoptic": "\u03e5",
- "female": "\u2640",
- "ff": "\ufb00",
- "ffi": "\ufb03",
- "ffl": "\ufb04",
- "fi": "\ufb01",
- "fifteencircle": "\u246e",
- "fifteenparen": "\u2482",
- "fifteenperiod": "\u2496",
- "figuredash": "\u2012",
- "filledbox": "\u25a0",
- "filledrect": "\u25ac",
- "finalkaf": "\u05da",
- "finalkafdagesh": "\ufb3a",
- "finalkafdageshhebrew": "\ufb3a",
- "finalkafhebrew": "\u05da",
- "finalkafqamats": "\u05da\u05b8",
- "finalkafqamatshebrew": "\u05da\u05b8",
- "finalkafsheva": "\u05da\u05b0",
- "finalkafshevahebrew": "\u05da\u05b0",
- "finalmem": "\u05dd",
- "finalmemhebrew": "\u05dd",
- "finalnun": "\u05df",
- "finalnunhebrew": "\u05df",
- "finalpe": "\u05e3",
- "finalpehebrew": "\u05e3",
- "finaltsadi": "\u05e5",
- "finaltsadihebrew": "\u05e5",
- "firsttonechinese": "\u02c9",
- "fisheye": "\u25c9",
- "fitacyrillic": "\u0473",
- "five": "\u0035",
- "fivearabic": "\u0665",
- "fivebengali": "\u09eb",
- "fivecircle": "\u2464",
- "fivecircleinversesansserif": "\u278e",
- "fivedeva": "\u096b",
- "fiveeighths": "\u215d",
- "fivegujarati": "\u0aeb",
- "fivegurmukhi": "\u0a6b",
- "fivehackarabic": "\u0665",
- "fivehangzhou": "\u3025",
- "fiveideographicparen": "\u3224",
- "fiveinferior": "\u2085",
- "fivemonospace": "\uff15",
- "fiveoldstyle": "\uf735",
- "fiveparen": "\u2478",
- "fiveperiod": "\u248c",
- "fivepersian": "\u06f5",
- "fiveroman": "\u2174",
- "fivesuperior": "\u2075",
- "fivethai": "\u0e55",
- "fl": "\ufb02",
- "florin": "\u0192",
- "fmonospace": "\uff46",
- "fmsquare": "\u3399",
- "fofanthai": "\u0e1f",
- "fofathai": "\u0e1d",
- "fongmanthai": "\u0e4f",
- "forall": "\u2200",
- "four": "\u0034",
- "fourarabic": "\u0664",
- "fourbengali": "\u09ea",
- "fourcircle": "\u2463",
- "fourcircleinversesansserif": "\u278d",
- "fourdeva": "\u096a",
- "fourgujarati": "\u0aea",
- "fourgurmukhi": "\u0a6a",
- "fourhackarabic": "\u0664",
- "fourhangzhou": "\u3024",
- "fourideographicparen": "\u3223",
- "fourinferior": "\u2084",
- "fourmonospace": "\uff14",
- "fournumeratorbengali": "\u09f7",
- "fouroldstyle": "\uf734",
- "fourparen": "\u2477",
- "fourperiod": "\u248b",
- "fourpersian": "\u06f4",
- "fourroman": "\u2173",
- "foursuperior": "\u2074",
- "fourteencircle": "\u246d",
- "fourteenparen": "\u2481",
- "fourteenperiod": "\u2495",
- "fourthai": "\u0e54",
- "fourthtonechinese": "\u02cb",
- "fparen": "\u24a1",
- "fraction": "\u2044",
- "franc": "\u20a3",
- "g": "\u0067",
- "gabengali": "\u0997",
- "gacute": "\u01f5",
- "gadeva": "\u0917",
- "gafarabic": "\u06af",
- "gaffinalarabic": "\ufb93",
- "gafinitialarabic": "\ufb94",
- "gafmedialarabic": "\ufb95",
- "gagujarati": "\u0a97",
- "gagurmukhi": "\u0a17",
- "gahiragana": "\u304c",
- "gakatakana": "\u30ac",
- "gamma": "\u03b3",
- "gammalatinsmall": "\u0263",
- "gammasuperior": "\u02e0",
- "gangiacoptic": "\u03eb",
- "gbopomofo": "\u310d",
- "gbreve": "\u011f",
- "gcaron": "\u01e7",
- "gcedilla": "\u0123",
- "gcircle": "\u24d6",
- "gcircumflex": "\u011d",
- "gcommaaccent": "\u0123",
- "gdot": "\u0121",
- "gdotaccent": "\u0121",
- "gecyrillic": "\u0433",
- "gehiragana": "\u3052",
- "gekatakana": "\u30b2",
- "geometricallyequal": "\u2251",
- "gereshaccenthebrew": "\u059c",
- "gereshhebrew": "\u05f3",
- "gereshmuqdamhebrew": "\u059d",
- "germandbls": "\u00df",
- "gershayimaccenthebrew": "\u059e",
- "gershayimhebrew": "\u05f4",
- "getamark": "\u3013",
- "ghabengali": "\u0998",
- "ghadarmenian": "\u0572",
- "ghadeva": "\u0918",
- "ghagujarati": "\u0a98",
- "ghagurmukhi": "\u0a18",
- "ghainarabic": "\u063a",
- "ghainfinalarabic": "\ufece",
- "ghaininitialarabic": "\ufecf",
- "ghainmedialarabic": "\ufed0",
- "ghemiddlehookcyrillic": "\u0495",
- "ghestrokecyrillic": "\u0493",
- "gheupturncyrillic": "\u0491",
- "ghhadeva": "\u095a",
- "ghhagurmukhi": "\u0a5a",
- "ghook": "\u0260",
- "ghzsquare": "\u3393",
- "gihiragana": "\u304e",
- "gikatakana": "\u30ae",
- "gimarmenian": "\u0563",
- "gimel": "\u05d2",
- "gimeldagesh": "\ufb32",
- "gimeldageshhebrew": "\ufb32",
- "gimelhebrew": "\u05d2",
- "gjecyrillic": "\u0453",
- "glottalinvertedstroke": "\u01be",
- "glottalstop": "\u0294",
- "glottalstopinverted": "\u0296",
- "glottalstopmod": "\u02c0",
- "glottalstopreversed": "\u0295",
- "glottalstopreversedmod": "\u02c1",
- "glottalstopreversedsuperior": "\u02e4",
- "glottalstopstroke": "\u02a1",
- "glottalstopstrokereversed": "\u02a2",
- "gmacron": "\u1e21",
- "gmonospace": "\uff47",
- "gohiragana": "\u3054",
- "gokatakana": "\u30b4",
- "gparen": "\u24a2",
- "gpasquare": "\u33ac",
- "gradient": "\u2207",
- "grave": "\u0060",
- "gravebelowcmb": "\u0316",
- "gravecmb": "\u0300",
- "gravecomb": "\u0300",
- "gravedeva": "\u0953",
- "gravelowmod": "\u02ce",
- "gravemonospace": "\uff40",
- "gravetonecmb": "\u0340",
- "greater": "\u003e",
- "greaterequal": "\u2265",
- "greaterequalorless": "\u22db",
- "greatermonospace": "\uff1e",
- "greaterorequivalent": "\u2273",
- "greaterorless": "\u2277",
- "greateroverequal": "\u2267",
- "greatersmall": "\ufe65",
- "gscript": "\u0261",
- "gstroke": "\u01e5",
- "guhiragana": "\u3050",
- "guillemotleft": "\u00ab",
- "guillemotright": "\u00bb",
- "guilsinglleft": "\u2039",
- "guilsinglright": "\u203a",
- "gukatakana": "\u30b0",
- "guramusquare": "\u3318",
- "gysquare": "\u33c9",
- "h": "\u0068",
- "haabkhasiancyrillic": "\u04a9",
- "haaltonearabic": "\u06c1",
- "habengali": "\u09b9",
- "hadescendercyrillic": "\u04b3",
- "hadeva": "\u0939",
- "hagujarati": "\u0ab9",
- "hagurmukhi": "\u0a39",
- "haharabic": "\u062d",
- "hahfinalarabic": "\ufea2",
- "hahinitialarabic": "\ufea3",
- "hahiragana": "\u306f",
- "hahmedialarabic": "\ufea4",
- "haitusquare": "\u332a",
- "hakatakana": "\u30cf",
- "hakatakanahalfwidth": "\uff8a",
- "halantgurmukhi": "\u0a4d",
- "hamzaarabic": "\u0621",
- "hamzadammaarabic": "\u0621\u064f",
- "hamzadammatanarabic": "\u0621\u064c",
- "hamzafathaarabic": "\u0621\u064e",
- "hamzafathatanarabic": "\u0621\u064b",
- "hamzalowarabic": "\u0621",
- "hamzalowkasraarabic": "\u0621\u0650",
- "hamzalowkasratanarabic": "\u0621\u064d",
- "hamzasukunarabic": "\u0621\u0652",
- "hangulfiller": "\u3164",
- "hardsigncyrillic": "\u044a",
- "harpoonleftbarbup": "\u21bc",
- "harpoonrightbarbup": "\u21c0",
- "hasquare": "\u33ca",
- "hatafpatah": "\u05b2",
- "hatafpatah16": "\u05b2",
- "hatafpatah23": "\u05b2",
- "hatafpatah2f": "\u05b2",
- "hatafpatahhebrew": "\u05b2",
- "hatafpatahnarrowhebrew": "\u05b2",
- "hatafpatahquarterhebrew": "\u05b2",
- "hatafpatahwidehebrew": "\u05b2",
- "hatafqamats": "\u05b3",
- "hatafqamats1b": "\u05b3",
- "hatafqamats28": "\u05b3",
- "hatafqamats34": "\u05b3",
- "hatafqamatshebrew": "\u05b3",
- "hatafqamatsnarrowhebrew": "\u05b3",
- "hatafqamatsquarterhebrew": "\u05b3",
- "hatafqamatswidehebrew": "\u05b3",
- "hatafsegol": "\u05b1",
- "hatafsegol17": "\u05b1",
- "hatafsegol24": "\u05b1",
- "hatafsegol30": "\u05b1",
- "hatafsegolhebrew": "\u05b1",
- "hatafsegolnarrowhebrew": "\u05b1",
- "hatafsegolquarterhebrew": "\u05b1",
- "hatafsegolwidehebrew": "\u05b1",
- "hbar": "\u0127",
- "hbopomofo": "\u310f",
- "hbrevebelow": "\u1e2b",
- "hcedilla": "\u1e29",
- "hcircle": "\u24d7",
- "hcircumflex": "\u0125",
- "hdieresis": "\u1e27",
- "hdotaccent": "\u1e23",
- "hdotbelow": "\u1e25",
- "he": "\u05d4",
- "heart": "\u2665",
- "heartsuitblack": "\u2665",
- "heartsuitwhite": "\u2661",
- "hedagesh": "\ufb34",
- "hedageshhebrew": "\ufb34",
- "hehaltonearabic": "\u06c1",
- "heharabic": "\u0647",
- "hehebrew": "\u05d4",
- "hehfinalaltonearabic": "\ufba7",
- "hehfinalalttwoarabic": "\ufeea",
- "hehfinalarabic": "\ufeea",
- "hehhamzaabovefinalarabic": "\ufba5",
- "hehhamzaaboveisolatedarabic": "\ufba4",
- "hehinitialaltonearabic": "\ufba8",
- "hehinitialarabic": "\ufeeb",
- "hehiragana": "\u3078",
- "hehmedialaltonearabic": "\ufba9",
- "hehmedialarabic": "\ufeec",
- "heiseierasquare": "\u337b",
- "hekatakana": "\u30d8",
- "hekatakanahalfwidth": "\uff8d",
- "hekutaarusquare": "\u3336",
- "henghook": "\u0267",
- "herutusquare": "\u3339",
- "het": "\u05d7",
- "hethebrew": "\u05d7",
- "hhook": "\u0266",
- "hhooksuperior": "\u02b1",
- "hieuhacirclekorean": "\u327b",
- "hieuhaparenkorean": "\u321b",
- "hieuhcirclekorean": "\u326d",
- "hieuhkorean": "\u314e",
- "hieuhparenkorean": "\u320d",
- "hihiragana": "\u3072",
- "hikatakana": "\u30d2",
- "hikatakanahalfwidth": "\uff8b",
- "hiriq": "\u05b4",
- "hiriq14": "\u05b4",
- "hiriq21": "\u05b4",
- "hiriq2d": "\u05b4",
- "hiriqhebrew": "\u05b4",
- "hiriqnarrowhebrew": "\u05b4",
- "hiriqquarterhebrew": "\u05b4",
- "hiriqwidehebrew": "\u05b4",
- "hlinebelow": "\u1e96",
- "hmonospace": "\uff48",
- "hoarmenian": "\u0570",
- "hohipthai": "\u0e2b",
- "hohiragana": "\u307b",
- "hokatakana": "\u30db",
- "hokatakanahalfwidth": "\uff8e",
- "holam": "\u05b9",
- "holam19": "\u05b9",
- "holam26": "\u05b9",
- "holam32": "\u05b9",
- "holamhebrew": "\u05b9",
- "holamnarrowhebrew": "\u05b9",
- "holamquarterhebrew": "\u05b9",
- "holamwidehebrew": "\u05b9",
- "honokhukthai": "\u0e2e",
- "hookabovecomb": "\u0309",
- "hookcmb": "\u0309",
- "hookpalatalizedbelowcmb": "\u0321",
- "hookretroflexbelowcmb": "\u0322",
- "hoonsquare": "\u3342",
- "horicoptic": "\u03e9",
- "horizontalbar": "\u2015",
- "horncmb": "\u031b",
- "hotsprings": "\u2668",
- "house": "\u2302",
- "hparen": "\u24a3",
- "hsuperior": "\u02b0",
- "hturned": "\u0265",
- "huhiragana": "\u3075",
- "huiitosquare": "\u3333",
- "hukatakana": "\u30d5",
- "hukatakanahalfwidth": "\uff8c",
- "hungarumlaut": "\u02dd",
- "hungarumlautcmb": "\u030b",
- "hv": "\u0195",
- "hyphen": "\u002d",
- "hypheninferior": "\uf6e5",
- "hyphenmonospace": "\uff0d",
- "hyphensmall": "\ufe63",
- "hyphensuperior": "\uf6e6",
- "hyphentwo": "\u2010",
- "i": "\u0069",
- "iacute": "\u00ed",
- "iacyrillic": "\u044f",
- "ibengali": "\u0987",
- "ibopomofo": "\u3127",
- "ibreve": "\u012d",
- "icaron": "\u01d0",
- "icircle": "\u24d8",
- "icircumflex": "\u00ee",
- "icyrillic": "\u0456",
- "idblgrave": "\u0209",
- "ideographearthcircle": "\u328f",
- "ideographfirecircle": "\u328b",
- "ideographicallianceparen": "\u323f",
- "ideographiccallparen": "\u323a",
- "ideographiccentrecircle": "\u32a5",
- "ideographicclose": "\u3006",
- "ideographiccomma": "\u3001",
- "ideographiccommaleft": "\uff64",
- "ideographiccongratulationparen": "\u3237",
- "ideographiccorrectcircle": "\u32a3",
- "ideographicearthparen": "\u322f",
- "ideographicenterpriseparen": "\u323d",
- "ideographicexcellentcircle": "\u329d",
- "ideographicfestivalparen": "\u3240",
- "ideographicfinancialcircle": "\u3296",
- "ideographicfinancialparen": "\u3236",
- "ideographicfireparen": "\u322b",
- "ideographichaveparen": "\u3232",
- "ideographichighcircle": "\u32a4",
- "ideographiciterationmark": "\u3005",
- "ideographiclaborcircle": "\u3298",
- "ideographiclaborparen": "\u3238",
- "ideographicleftcircle": "\u32a7",
- "ideographiclowcircle": "\u32a6",
- "ideographicmedicinecircle": "\u32a9",
- "ideographicmetalparen": "\u322e",
- "ideographicmoonparen": "\u322a",
- "ideographicnameparen": "\u3234",
- "ideographicperiod": "\u3002",
- "ideographicprintcircle": "\u329e",
- "ideographicreachparen": "\u3243",
- "ideographicrepresentparen": "\u3239",
- "ideographicresourceparen": "\u323e",
- "ideographicrightcircle": "\u32a8",
- "ideographicsecretcircle": "\u3299",
- "ideographicselfparen": "\u3242",
- "ideographicsocietyparen": "\u3233",
- "ideographicspace": "\u3000",
- "ideographicspecialparen": "\u3235",
- "ideographicstockparen": "\u3231",
- "ideographicstudyparen": "\u323b",
- "ideographicsunparen": "\u3230",
- "ideographicsuperviseparen": "\u323c",
- "ideographicwaterparen": "\u322c",
- "ideographicwoodparen": "\u322d",
- "ideographiczero": "\u3007",
- "ideographmetalcircle": "\u328e",
- "ideographmooncircle": "\u328a",
- "ideographnamecircle": "\u3294",
- "ideographsuncircle": "\u3290",
- "ideographwatercircle": "\u328c",
- "ideographwoodcircle": "\u328d",
- "ideva": "\u0907",
- "idieresis": "\u00ef",
- "idieresisacute": "\u1e2f",
- "idieresiscyrillic": "\u04e5",
- "idotbelow": "\u1ecb",
- "iebrevecyrillic": "\u04d7",
- "iecyrillic": "\u0435",
- "ieungacirclekorean": "\u3275",
- "ieungaparenkorean": "\u3215",
- "ieungcirclekorean": "\u3267",
- "ieungkorean": "\u3147",
- "ieungparenkorean": "\u3207",
- "igrave": "\u00ec",
- "igujarati": "\u0a87",
- "igurmukhi": "\u0a07",
- "ihiragana": "\u3044",
- "ihookabove": "\u1ec9",
- "iibengali": "\u0988",
- "iicyrillic": "\u0438",
- "iideva": "\u0908",
- "iigujarati": "\u0a88",
- "iigurmukhi": "\u0a08",
- "iimatragurmukhi": "\u0a40",
- "iinvertedbreve": "\u020b",
- "iishortcyrillic": "\u0439",
- "iivowelsignbengali": "\u09c0",
- "iivowelsigndeva": "\u0940",
- "iivowelsigngujarati": "\u0ac0",
- "ij": "\u0133",
- "ikatakana": "\u30a4",
- "ikatakanahalfwidth": "\uff72",
- "ikorean": "\u3163",
- "ilde": "\u02dc",
- "iluyhebrew": "\u05ac",
- "imacron": "\u012b",
- "imacroncyrillic": "\u04e3",
- "imageorapproximatelyequal": "\u2253",
- "imatragurmukhi": "\u0a3f",
- "imonospace": "\uff49",
- "increment": "\u2206",
- "infinity": "\u221e",
- "iniarmenian": "\u056b",
- "integral": "\u222b",
- "integralbottom": "\u2321",
- "integralbt": "\u2321",
- "integralex": "\uf8f5",
- "integraltop": "\u2320",
- "integraltp": "\u2320",
- "intersection": "\u2229",
- "intisquare": "\u3305",
- "invbullet": "\u25d8",
- "invcircle": "\u25d9",
- "invsmileface": "\u263b",
- "iocyrillic": "\u0451",
- "iogonek": "\u012f",
- "iota": "\u03b9",
- "iotadieresis": "\u03ca",
- "iotadieresistonos": "\u0390",
- "iotalatin": "\u0269",
- "iotatonos": "\u03af",
- "iparen": "\u24a4",
- "irigurmukhi": "\u0a72",
- "ismallhiragana": "\u3043",
- "ismallkatakana": "\u30a3",
- "ismallkatakanahalfwidth": "\uff68",
- "issharbengali": "\u09fa",
- "istroke": "\u0268",
- "isuperior": "\uf6ed",
- "iterationhiragana": "\u309d",
- "iterationkatakana": "\u30fd",
- "itilde": "\u0129",
- "itildebelow": "\u1e2d",
- "iubopomofo": "\u3129",
- "iucyrillic": "\u044e",
- "ivowelsignbengali": "\u09bf",
- "ivowelsigndeva": "\u093f",
- "ivowelsigngujarati": "\u0abf",
- "izhitsacyrillic": "\u0475",
- "izhitsadblgravecyrillic": "\u0477",
- "j": "\u006a",
- "jaarmenian": "\u0571",
- "jabengali": "\u099c",
- "jadeva": "\u091c",
- "jagujarati": "\u0a9c",
- "jagurmukhi": "\u0a1c",
- "jbopomofo": "\u3110",
- "jcaron": "\u01f0",
- "jcircle": "\u24d9",
- "jcircumflex": "\u0135",
- "jcrossedtail": "\u029d",
- "jdotlessstroke": "\u025f",
- "jecyrillic": "\u0458",
- "jeemarabic": "\u062c",
- "jeemfinalarabic": "\ufe9e",
- "jeeminitialarabic": "\ufe9f",
- "jeemmedialarabic": "\ufea0",
- "jeharabic": "\u0698",
- "jehfinalarabic": "\ufb8b",
- "jhabengali": "\u099d",
- "jhadeva": "\u091d",
- "jhagujarati": "\u0a9d",
- "jhagurmukhi": "\u0a1d",
- "jheharmenian": "\u057b",
- "jis": "\u3004",
- "jmonospace": "\uff4a",
- "jparen": "\u24a5",
- "jsuperior": "\u02b2",
- "k": "\u006b",
- "kabashkircyrillic": "\u04a1",
- "kabengali": "\u0995",
- "kacute": "\u1e31",
- "kacyrillic": "\u043a",
- "kadescendercyrillic": "\u049b",
- "kadeva": "\u0915",
- "kaf": "\u05db",
- "kafarabic": "\u0643",
- "kafdagesh": "\ufb3b",
- "kafdageshhebrew": "\ufb3b",
- "kaffinalarabic": "\ufeda",
- "kafhebrew": "\u05db",
- "kafinitialarabic": "\ufedb",
- "kafmedialarabic": "\ufedc",
- "kafrafehebrew": "\ufb4d",
- "kagujarati": "\u0a95",
- "kagurmukhi": "\u0a15",
- "kahiragana": "\u304b",
- "kahookcyrillic": "\u04c4",
- "kakatakana": "\u30ab",
- "kakatakanahalfwidth": "\uff76",
- "kappa": "\u03ba",
- "kappasymbolgreek": "\u03f0",
- "kapyeounmieumkorean": "\u3171",
- "kapyeounphieuphkorean": "\u3184",
- "kapyeounpieupkorean": "\u3178",
- "kapyeounssangpieupkorean": "\u3179",
- "karoriisquare": "\u330d",
- "kashidaautoarabic": "\u0640",
- "kashidaautonosidebearingarabic": "\u0640",
- "kasmallkatakana": "\u30f5",
- "kasquare": "\u3384",
- "kasraarabic": "\u0650",
- "kasratanarabic": "\u064d",
- "kastrokecyrillic": "\u049f",
- "katahiraprolongmarkhalfwidth": "\uff70",
- "kaverticalstrokecyrillic": "\u049d",
- "kbopomofo": "\u310e",
- "kcalsquare": "\u3389",
- "kcaron": "\u01e9",
- "kcedilla": "\u0137",
- "kcircle": "\u24da",
- "kcommaaccent": "\u0137",
- "kdotbelow": "\u1e33",
- "keharmenian": "\u0584",
- "kehiragana": "\u3051",
- "kekatakana": "\u30b1",
- "kekatakanahalfwidth": "\uff79",
- "kenarmenian": "\u056f",
- "kesmallkatakana": "\u30f6",
- "kgreenlandic": "\u0138",
- "khabengali": "\u0996",
- "khacyrillic": "\u0445",
- "khadeva": "\u0916",
- "khagujarati": "\u0a96",
- "khagurmukhi": "\u0a16",
- "khaharabic": "\u062e",
- "khahfinalarabic": "\ufea6",
- "khahinitialarabic": "\ufea7",
- "khahmedialarabic": "\ufea8",
- "kheicoptic": "\u03e7",
- "khhadeva": "\u0959",
- "khhagurmukhi": "\u0a59",
- "khieukhacirclekorean": "\u3278",
- "khieukhaparenkorean": "\u3218",
- "khieukhcirclekorean": "\u326a",
- "khieukhkorean": "\u314b",
- "khieukhparenkorean": "\u320a",
- "khokhaithai": "\u0e02",
- "khokhonthai": "\u0e05",
- "khokhuatthai": "\u0e03",
- "khokhwaithai": "\u0e04",
- "khomutthai": "\u0e5b",
- "khook": "\u0199",
- "khorakhangthai": "\u0e06",
- "khzsquare": "\u3391",
- "kihiragana": "\u304d",
- "kikatakana": "\u30ad",
- "kikatakanahalfwidth": "\uff77",
- "kiroguramusquare": "\u3315",
- "kiromeetorusquare": "\u3316",
- "kirosquare": "\u3314",
- "kiyeokacirclekorean": "\u326e",
- "kiyeokaparenkorean": "\u320e",
- "kiyeokcirclekorean": "\u3260",
- "kiyeokkorean": "\u3131",
- "kiyeokparenkorean": "\u3200",
- "kiyeoksioskorean": "\u3133",
- "kjecyrillic": "\u045c",
- "klinebelow": "\u1e35",
- "klsquare": "\u3398",
- "kmcubedsquare": "\u33a6",
- "kmonospace": "\uff4b",
- "kmsquaredsquare": "\u33a2",
- "kohiragana": "\u3053",
- "kohmsquare": "\u33c0",
- "kokaithai": "\u0e01",
- "kokatakana": "\u30b3",
- "kokatakanahalfwidth": "\uff7a",
- "kooposquare": "\u331e",
- "koppacyrillic": "\u0481",
- "koreanstandardsymbol": "\u327f",
- "koroniscmb": "\u0343",
- "kparen": "\u24a6",
- "kpasquare": "\u33aa",
- "ksicyrillic": "\u046f",
- "ktsquare": "\u33cf",
- "kturned": "\u029e",
- "kuhiragana": "\u304f",
- "kukatakana": "\u30af",
- "kukatakanahalfwidth": "\uff78",
- "kvsquare": "\u33b8",
- "kwsquare": "\u33be",
- "l": "\u006c",
- "labengali": "\u09b2",
- "lacute": "\u013a",
- "ladeva": "\u0932",
- "lagujarati": "\u0ab2",
- "lagurmukhi": "\u0a32",
- "lakkhangyaothai": "\u0e45",
- "lamaleffinalarabic": "\ufefc",
- "lamalefhamzaabovefinalarabic": "\ufef8",
- "lamalefhamzaaboveisolatedarabic": "\ufef7",
- "lamalefhamzabelowfinalarabic": "\ufefa",
- "lamalefhamzabelowisolatedarabic": "\ufef9",
- "lamalefisolatedarabic": "\ufefb",
- "lamalefmaddaabovefinalarabic": "\ufef6",
- "lamalefmaddaaboveisolatedarabic": "\ufef5",
- "lamarabic": "\u0644",
- "lambda": "\u03bb",
- "lambdastroke": "\u019b",
- "lamed": "\u05dc",
- "lameddagesh": "\ufb3c",
- "lameddageshhebrew": "\ufb3c",
- "lamedhebrew": "\u05dc",
- "lamedholam": "\u05dc\u05b9",
- "lamedholamdagesh": "\u05dc\u05b9\u05bc",
- "lamedholamdageshhebrew": "\u05dc\u05b9\u05bc",
- "lamedholamhebrew": "\u05dc\u05b9",
- "lamfinalarabic": "\ufede",
- "lamhahinitialarabic": "\ufcca",
- "laminitialarabic": "\ufedf",
- "lamjeeminitialarabic": "\ufcc9",
- "lamkhahinitialarabic": "\ufccb",
- "lamlamhehisolatedarabic": "\ufdf2",
- "lammedialarabic": "\ufee0",
- "lammeemhahinitialarabic": "\ufd88",
- "lammeeminitialarabic": "\ufccc",
- "lammeemjeeminitialarabic": "\ufedf\ufee4\ufea0",
- "lammeemkhahinitialarabic": "\ufedf\ufee4\ufea8",
- "largecircle": "\u25ef",
- "lbar": "\u019a",
- "lbelt": "\u026c",
- "lbopomofo": "\u310c",
- "lcaron": "\u013e",
- "lcedilla": "\u013c",
- "lcircle": "\u24db",
- "lcircumflexbelow": "\u1e3d",
- "lcommaaccent": "\u013c",
- "ldot": "\u0140",
- "ldotaccent": "\u0140",
- "ldotbelow": "\u1e37",
- "ldotbelowmacron": "\u1e39",
- "leftangleabovecmb": "\u031a",
- "lefttackbelowcmb": "\u0318",
- "less": "\u003c",
- "lessequal": "\u2264",
- "lessequalorgreater": "\u22da",
- "lessmonospace": "\uff1c",
- "lessorequivalent": "\u2272",
- "lessorgreater": "\u2276",
- "lessoverequal": "\u2266",
- "lesssmall": "\ufe64",
- "lezh": "\u026e",
- "lfblock": "\u258c",
- "lhookretroflex": "\u026d",
- "lira": "\u20a4",
- "liwnarmenian": "\u056c",
- "lj": "\u01c9",
- "ljecyrillic": "\u0459",
- "ll": "\uf6c0",
- "lladeva": "\u0933",
- "llagujarati": "\u0ab3",
- "llinebelow": "\u1e3b",
- "llladeva": "\u0934",
- "llvocalicbengali": "\u09e1",
- "llvocalicdeva": "\u0961",
- "llvocalicvowelsignbengali": "\u09e3",
- "llvocalicvowelsigndeva": "\u0963",
- "lmiddletilde": "\u026b",
- "lmonospace": "\uff4c",
- "lmsquare": "\u33d0",
- "lochulathai": "\u0e2c",
- "logicaland": "\u2227",
- "logicalnot": "\u00ac",
- "logicalnotreversed": "\u2310",
- "logicalor": "\u2228",
- "lolingthai": "\u0e25",
- "longs": "\u017f",
- "lowlinecenterline": "\ufe4e",
- "lowlinecmb": "\u0332",
- "lowlinedashed": "\ufe4d",
- "lozenge": "\u25ca",
- "lparen": "\u24a7",
- "lslash": "\u0142",
- "lsquare": "\u2113",
- "lsuperior": "\uf6ee",
- "ltshade": "\u2591",
- "luthai": "\u0e26",
- "lvocalicbengali": "\u098c",
- "lvocalicdeva": "\u090c",
- "lvocalicvowelsignbengali": "\u09e2",
- "lvocalicvowelsigndeva": "\u0962",
- "lxsquare": "\u33d3",
- "m": "\u006d",
- "mabengali": "\u09ae",
- "macron": "\u00af",
- "macronbelowcmb": "\u0331",
- "macroncmb": "\u0304",
- "macronlowmod": "\u02cd",
- "macronmonospace": "\uffe3",
- "macute": "\u1e3f",
- "madeva": "\u092e",
- "magujarati": "\u0aae",
- "magurmukhi": "\u0a2e",
- "mahapakhhebrew": "\u05a4",
- "mahapakhlefthebrew": "\u05a4",
- "mahiragana": "\u307e",
- "maichattawalowleftthai": "\uf895",
- "maichattawalowrightthai": "\uf894",
- "maichattawathai": "\u0e4b",
- "maichattawaupperleftthai": "\uf893",
- "maieklowleftthai": "\uf88c",
- "maieklowrightthai": "\uf88b",
- "maiekthai": "\u0e48",
- "maiekupperleftthai": "\uf88a",
- "maihanakatleftthai": "\uf884",
- "maihanakatthai": "\u0e31",
- "maitaikhuleftthai": "\uf889",
- "maitaikhuthai": "\u0e47",
- "maitholowleftthai": "\uf88f",
- "maitholowrightthai": "\uf88e",
- "maithothai": "\u0e49",
- "maithoupperleftthai": "\uf88d",
- "maitrilowleftthai": "\uf892",
- "maitrilowrightthai": "\uf891",
- "maitrithai": "\u0e4a",
- "maitriupperleftthai": "\uf890",
- "maiyamokthai": "\u0e46",
- "makatakana": "\u30de",
- "makatakanahalfwidth": "\uff8f",
- "male": "\u2642",
- "mansyonsquare": "\u3347",
- "maqafhebrew": "\u05be",
- "mars": "\u2642",
- "masoracirclehebrew": "\u05af",
- "masquare": "\u3383",
- "mbopomofo": "\u3107",
- "mbsquare": "\u33d4",
- "mcircle": "\u24dc",
- "mcubedsquare": "\u33a5",
- "mdotaccent": "\u1e41",
- "mdotbelow": "\u1e43",
- "meemarabic": "\u0645",
- "meemfinalarabic": "\ufee2",
- "meeminitialarabic": "\ufee3",
- "meemmedialarabic": "\ufee4",
- "meemmeeminitialarabic": "\ufcd1",
- "meemmeemisolatedarabic": "\ufc48",
- "meetorusquare": "\u334d",
- "mehiragana": "\u3081",
- "meizierasquare": "\u337e",
- "mekatakana": "\u30e1",
- "mekatakanahalfwidth": "\uff92",
- "mem": "\u05de",
- "memdagesh": "\ufb3e",
- "memdageshhebrew": "\ufb3e",
- "memhebrew": "\u05de",
- "menarmenian": "\u0574",
- "merkhahebrew": "\u05a5",
- "merkhakefulahebrew": "\u05a6",
- "merkhakefulalefthebrew": "\u05a6",
- "merkhalefthebrew": "\u05a5",
- "mhook": "\u0271",
- "mhzsquare": "\u3392",
- "middledotkatakanahalfwidth": "\uff65",
- "middot": "\u00b7",
- "mieumacirclekorean": "\u3272",
- "mieumaparenkorean": "\u3212",
- "mieumcirclekorean": "\u3264",
- "mieumkorean": "\u3141",
- "mieumpansioskorean": "\u3170",
- "mieumparenkorean": "\u3204",
- "mieumpieupkorean": "\u316e",
- "mieumsioskorean": "\u316f",
- "mihiragana": "\u307f",
- "mikatakana": "\u30df",
- "mikatakanahalfwidth": "\uff90",
- "minus": "\u2212",
- "minusbelowcmb": "\u0320",
- "minuscircle": "\u2296",
- "minusmod": "\u02d7",
- "minusplus": "\u2213",
- "minute": "\u2032",
- "miribaarusquare": "\u334a",
- "mirisquare": "\u3349",
- "mlonglegturned": "\u0270",
- "mlsquare": "\u3396",
- "mmcubedsquare": "\u33a3",
- "mmonospace": "\uff4d",
- "mmsquaredsquare": "\u339f",
- "mohiragana": "\u3082",
- "mohmsquare": "\u33c1",
- "mokatakana": "\u30e2",
- "mokatakanahalfwidth": "\uff93",
- "molsquare": "\u33d6",
- "momathai": "\u0e21",
- "moverssquare": "\u33a7",
- "moverssquaredsquare": "\u33a8",
- "mparen": "\u24a8",
- "mpasquare": "\u33ab",
- "mssquare": "\u33b3",
- "msuperior": "\uf6ef",
- "mturned": "\u026f",
- "mu": "\u00b5",
- "mu1": "\u00b5",
- "muasquare": "\u3382",
- "muchgreater": "\u226b",
- "muchless": "\u226a",
- "mufsquare": "\u338c",
- "mugreek": "\u03bc",
- "mugsquare": "\u338d",
- "muhiragana": "\u3080",
- "mukatakana": "\u30e0",
- "mukatakanahalfwidth": "\uff91",
- "mulsquare": "\u3395",
- "multiply": "\u00d7",
- "mumsquare": "\u339b",
- "munahhebrew": "\u05a3",
- "munahlefthebrew": "\u05a3",
- "musicalnote": "\u266a",
- "musicalnotedbl": "\u266b",
- "musicflatsign": "\u266d",
- "musicsharpsign": "\u266f",
- "mussquare": "\u33b2",
- "muvsquare": "\u33b6",
- "muwsquare": "\u33bc",
- "mvmegasquare": "\u33b9",
- "mvsquare": "\u33b7",
- "mwmegasquare": "\u33bf",
- "mwsquare": "\u33bd",
- "n": "\u006e",
- "nabengali": "\u09a8",
- "nabla": "\u2207",
- "nacute": "\u0144",
- "nadeva": "\u0928",
- "nagujarati": "\u0aa8",
- "nagurmukhi": "\u0a28",
- "nahiragana": "\u306a",
- "nakatakana": "\u30ca",
- "nakatakanahalfwidth": "\uff85",
- "napostrophe": "\u0149",
- "nasquare": "\u3381",
- "nbopomofo": "\u310b",
- "nbspace": "\u00a0",
- "ncaron": "\u0148",
- "ncedilla": "\u0146",
- "ncircle": "\u24dd",
- "ncircumflexbelow": "\u1e4b",
- "ncommaaccent": "\u0146",
- "ndotaccent": "\u1e45",
- "ndotbelow": "\u1e47",
- "nehiragana": "\u306d",
- "nekatakana": "\u30cd",
- "nekatakanahalfwidth": "\uff88",
- "newsheqelsign": "\u20aa",
- "nfsquare": "\u338b",
- "ngabengali": "\u0999",
- "ngadeva": "\u0919",
- "ngagujarati": "\u0a99",
- "ngagurmukhi": "\u0a19",
- "ngonguthai": "\u0e07",
- "nhiragana": "\u3093",
- "nhookleft": "\u0272",
- "nhookretroflex": "\u0273",
- "nieunacirclekorean": "\u326f",
- "nieunaparenkorean": "\u320f",
- "nieuncieuckorean": "\u3135",
- "nieuncirclekorean": "\u3261",
- "nieunhieuhkorean": "\u3136",
- "nieunkorean": "\u3134",
- "nieunpansioskorean": "\u3168",
- "nieunparenkorean": "\u3201",
- "nieunsioskorean": "\u3167",
- "nieuntikeutkorean": "\u3166",
- "nihiragana": "\u306b",
- "nikatakana": "\u30cb",
- "nikatakanahalfwidth": "\uff86",
- "nikhahitleftthai": "\uf899",
- "nikhahitthai": "\u0e4d",
- "nine": "\u0039",
- "ninearabic": "\u0669",
- "ninebengali": "\u09ef",
- "ninecircle": "\u2468",
- "ninecircleinversesansserif": "\u2792",
- "ninedeva": "\u096f",
- "ninegujarati": "\u0aef",
- "ninegurmukhi": "\u0a6f",
- "ninehackarabic": "\u0669",
- "ninehangzhou": "\u3029",
- "nineideographicparen": "\u3228",
- "nineinferior": "\u2089",
- "ninemonospace": "\uff19",
- "nineoldstyle": "\uf739",
- "nineparen": "\u247c",
- "nineperiod": "\u2490",
- "ninepersian": "\u06f9",
- "nineroman": "\u2178",
- "ninesuperior": "\u2079",
- "nineteencircle": "\u2472",
- "nineteenparen": "\u2486",
- "nineteenperiod": "\u249a",
- "ninethai": "\u0e59",
- "nj": "\u01cc",
- "njecyrillic": "\u045a",
- "nkatakana": "\u30f3",
- "nkatakanahalfwidth": "\uff9d",
- "nlegrightlong": "\u019e",
- "nlinebelow": "\u1e49",
- "nmonospace": "\uff4e",
- "nmsquare": "\u339a",
- "nnabengali": "\u09a3",
- "nnadeva": "\u0923",
- "nnagujarati": "\u0aa3",
- "nnagurmukhi": "\u0a23",
- "nnnadeva": "\u0929",
- "nohiragana": "\u306e",
- "nokatakana": "\u30ce",
- "nokatakanahalfwidth": "\uff89",
- "nonbreakingspace": "\u00a0",
- "nonenthai": "\u0e13",
- "nonuthai": "\u0e19",
- "noonarabic": "\u0646",
- "noonfinalarabic": "\ufee6",
- "noonghunnaarabic": "\u06ba",
- "noonghunnafinalarabic": "\ufb9f",
- "noonhehinitialarabic": "\ufee7\ufeec",
- "nooninitialarabic": "\ufee7",
- "noonjeeminitialarabic": "\ufcd2",
- "noonjeemisolatedarabic": "\ufc4b",
- "noonmedialarabic": "\ufee8",
- "noonmeeminitialarabic": "\ufcd5",
- "noonmeemisolatedarabic": "\ufc4e",
- "noonnoonfinalarabic": "\ufc8d",
- "notcontains": "\u220c",
- "notelement": "\u2209",
- "notelementof": "\u2209",
- "notequal": "\u2260",
- "notgreater": "\u226f",
- "notgreaternorequal": "\u2271",
- "notgreaternorless": "\u2279",
- "notidentical": "\u2262",
- "notless": "\u226e",
- "notlessnorequal": "\u2270",
- "notparallel": "\u2226",
- "notprecedes": "\u2280",
- "notsubset": "\u2284",
- "notsucceeds": "\u2281",
- "notsuperset": "\u2285",
- "nowarmenian": "\u0576",
- "nparen": "\u24a9",
- "nssquare": "\u33b1",
- "nsuperior": "\u207f",
- "ntilde": "\u00f1",
- "nu": "\u03bd",
- "nuhiragana": "\u306c",
- "nukatakana": "\u30cc",
- "nukatakanahalfwidth": "\uff87",
- "nuktabengali": "\u09bc",
- "nuktadeva": "\u093c",
- "nuktagujarati": "\u0abc",
- "nuktagurmukhi": "\u0a3c",
- "numbersign": "\u0023",
- "numbersignmonospace": "\uff03",
- "numbersignsmall": "\ufe5f",
- "numeralsigngreek": "\u0374",
- "numeralsignlowergreek": "\u0375",
- "numero": "\u2116",
- "nun": "\u05e0",
- "nundagesh": "\ufb40",
- "nundageshhebrew": "\ufb40",
- "nunhebrew": "\u05e0",
- "nvsquare": "\u33b5",
- "nwsquare": "\u33bb",
- "nyabengali": "\u099e",
- "nyadeva": "\u091e",
- "nyagujarati": "\u0a9e",
- "nyagurmukhi": "\u0a1e",
- "o": "\u006f",
- "oacute": "\u00f3",
- "oangthai": "\u0e2d",
- "obarred": "\u0275",
- "obarredcyrillic": "\u04e9",
- "obarreddieresiscyrillic": "\u04eb",
- "obengali": "\u0993",
- "obopomofo": "\u311b",
- "obreve": "\u014f",
- "ocandradeva": "\u0911",
- "ocandragujarati": "\u0a91",
- "ocandravowelsigndeva": "\u0949",
- "ocandravowelsigngujarati": "\u0ac9",
- "ocaron": "\u01d2",
- "ocircle": "\u24de",
- "ocircumflex": "\u00f4",
- "ocircumflexacute": "\u1ed1",
- "ocircumflexdotbelow": "\u1ed9",
- "ocircumflexgrave": "\u1ed3",
- "ocircumflexhookabove": "\u1ed5",
- "ocircumflextilde": "\u1ed7",
- "ocyrillic": "\u043e",
- "odblacute": "\u0151",
- "odblgrave": "\u020d",
- "odeva": "\u0913",
- "odieresis": "\u00f6",
- "odieresiscyrillic": "\u04e7",
- "odotbelow": "\u1ecd",
- "oe": "\u0153",
- "oekorean": "\u315a",
- "ogonek": "\u02db",
- "ogonekcmb": "\u0328",
- "ograve": "\u00f2",
- "ogujarati": "\u0a93",
- "oharmenian": "\u0585",
- "ohiragana": "\u304a",
- "ohookabove": "\u1ecf",
- "ohorn": "\u01a1",
- "ohornacute": "\u1edb",
- "ohorndotbelow": "\u1ee3",
- "ohorngrave": "\u1edd",
- "ohornhookabove": "\u1edf",
- "ohorntilde": "\u1ee1",
- "ohungarumlaut": "\u0151",
- "oi": "\u01a3",
- "oinvertedbreve": "\u020f",
- "okatakana": "\u30aa",
- "okatakanahalfwidth": "\uff75",
- "okorean": "\u3157",
- "olehebrew": "\u05ab",
- "omacron": "\u014d",
- "omacronacute": "\u1e53",
- "omacrongrave": "\u1e51",
- "omdeva": "\u0950",
- "omega": "\u03c9",
- "omega1": "\u03d6",
- "omegacyrillic": "\u0461",
- "omegalatinclosed": "\u0277",
- "omegaroundcyrillic": "\u047b",
- "omegatitlocyrillic": "\u047d",
- "omegatonos": "\u03ce",
- "omgujarati": "\u0ad0",
- "omicron": "\u03bf",
- "omicrontonos": "\u03cc",
- "omonospace": "\uff4f",
- "one": "\u0031",
- "onearabic": "\u0661",
- "onebengali": "\u09e7",
- "onecircle": "\u2460",
- "onecircleinversesansserif": "\u278a",
- "onedeva": "\u0967",
- "onedotenleader": "\u2024",
- "oneeighth": "\u215b",
- "onefitted": "\uf6dc",
- "onegujarati": "\u0ae7",
- "onegurmukhi": "\u0a67",
- "onehackarabic": "\u0661",
- "onehalf": "\u00bd",
- "onehangzhou": "\u3021",
- "oneideographicparen": "\u3220",
- "oneinferior": "\u2081",
- "onemonospace": "\uff11",
- "onenumeratorbengali": "\u09f4",
- "oneoldstyle": "\uf731",
- "oneparen": "\u2474",
- "oneperiod": "\u2488",
- "onepersian": "\u06f1",
- "onequarter": "\u00bc",
- "oneroman": "\u2170",
- "onesuperior": "\u00b9",
- "onethai": "\u0e51",
- "onethird": "\u2153",
- "oogonek": "\u01eb",
- "oogonekmacron": "\u01ed",
- "oogurmukhi": "\u0a13",
- "oomatragurmukhi": "\u0a4b",
- "oopen": "\u0254",
- "oparen": "\u24aa",
- "openbullet": "\u25e6",
- "option": "\u2325",
- "ordfeminine": "\u00aa",
- "ordmasculine": "\u00ba",
- "orthogonal": "\u221f",
- "oshortdeva": "\u0912",
- "oshortvowelsigndeva": "\u094a",
- "oslash": "\u00f8",
- "oslashacute": "\u01ff",
- "osmallhiragana": "\u3049",
- "osmallkatakana": "\u30a9",
- "osmallkatakanahalfwidth": "\uff6b",
- "ostrokeacute": "\u01ff",
- "osuperior": "\uf6f0",
- "otcyrillic": "\u047f",
- "otilde": "\u00f5",
- "otildeacute": "\u1e4d",
- "otildedieresis": "\u1e4f",
- "oubopomofo": "\u3121",
- "overline": "\u203e",
- "overlinecenterline": "\ufe4a",
- "overlinecmb": "\u0305",
- "overlinedashed": "\ufe49",
- "overlinedblwavy": "\ufe4c",
- "overlinewavy": "\ufe4b",
- "overscore": "\u00af",
- "ovowelsignbengali": "\u09cb",
- "ovowelsigndeva": "\u094b",
- "ovowelsigngujarati": "\u0acb",
- "p": "\u0070",
- "paampssquare": "\u3380",
- "paasentosquare": "\u332b",
- "pabengali": "\u09aa",
- "pacute": "\u1e55",
- "padeva": "\u092a",
- "pagedown": "\u21df",
- "pageup": "\u21de",
- "pagujarati": "\u0aaa",
- "pagurmukhi": "\u0a2a",
- "pahiragana": "\u3071",
- "paiyannoithai": "\u0e2f",
- "pakatakana": "\u30d1",
- "palatalizationcyrilliccmb": "\u0484",
- "palochkacyrillic": "\u04c0",
- "pansioskorean": "\u317f",
- "paragraph": "\u00b6",
- "parallel": "\u2225",
- "parenleft": "\u0028",
- "parenleftaltonearabic": "\ufd3e",
- "parenleftbt": "\uf8ed",
- "parenleftex": "\uf8ec",
- "parenleftinferior": "\u208d",
- "parenleftmonospace": "\uff08",
- "parenleftsmall": "\ufe59",
- "parenleftsuperior": "\u207d",
- "parenlefttp": "\uf8eb",
- "parenleftvertical": "\ufe35",
- "parenright": "\u0029",
- "parenrightaltonearabic": "\ufd3f",
- "parenrightbt": "\uf8f8",
- "parenrightex": "\uf8f7",
- "parenrightinferior": "\u208e",
- "parenrightmonospace": "\uff09",
- "parenrightsmall": "\ufe5a",
- "parenrightsuperior": "\u207e",
- "parenrighttp": "\uf8f6",
- "parenrightvertical": "\ufe36",
- "partialdiff": "\u2202",
- "paseqhebrew": "\u05c0",
- "pashtahebrew": "\u0599",
- "pasquare": "\u33a9",
- "patah": "\u05b7",
- "patah11": "\u05b7",
- "patah1d": "\u05b7",
- "patah2a": "\u05b7",
- "patahhebrew": "\u05b7",
- "patahnarrowhebrew": "\u05b7",
- "patahquarterhebrew": "\u05b7",
- "patahwidehebrew": "\u05b7",
- "pazerhebrew": "\u05a1",
- "pbopomofo": "\u3106",
- "pcircle": "\u24df",
- "pdotaccent": "\u1e57",
- "pe": "\u05e4",
- "pecyrillic": "\u043f",
- "pedagesh": "\ufb44",
- "pedageshhebrew": "\ufb44",
- "peezisquare": "\u333b",
- "pefinaldageshhebrew": "\ufb43",
- "peharabic": "\u067e",
- "peharmenian": "\u057a",
- "pehebrew": "\u05e4",
- "pehfinalarabic": "\ufb57",
- "pehinitialarabic": "\ufb58",
- "pehiragana": "\u307a",
- "pehmedialarabic": "\ufb59",
- "pekatakana": "\u30da",
- "pemiddlehookcyrillic": "\u04a7",
- "perafehebrew": "\ufb4e",
- "percent": "\u0025",
- "percentarabic": "\u066a",
- "percentmonospace": "\uff05",
- "percentsmall": "\ufe6a",
- "period": "\u002e",
- "periodarmenian": "\u0589",
- "periodcentered": "\u00b7",
- "periodhalfwidth": "\uff61",
- "periodinferior": "\uf6e7",
- "periodmonospace": "\uff0e",
- "periodsmall": "\ufe52",
- "periodsuperior": "\uf6e8",
- "perispomenigreekcmb": "\u0342",
- "perpendicular": "\u22a5",
- "perthousand": "\u2030",
- "peseta": "\u20a7",
- "pfsquare": "\u338a",
- "phabengali": "\u09ab",
- "phadeva": "\u092b",
- "phagujarati": "\u0aab",
- "phagurmukhi": "\u0a2b",
- "phi": "\u03c6",
- "phi1": "\u03d5",
- "phieuphacirclekorean": "\u327a",
- "phieuphaparenkorean": "\u321a",
- "phieuphcirclekorean": "\u326c",
- "phieuphkorean": "\u314d",
- "phieuphparenkorean": "\u320c",
- "philatin": "\u0278",
- "phinthuthai": "\u0e3a",
- "phisymbolgreek": "\u03d5",
- "phook": "\u01a5",
- "phophanthai": "\u0e1e",
- "phophungthai": "\u0e1c",
- "phosamphaothai": "\u0e20",
- "pi": "\u03c0",
- "pieupacirclekorean": "\u3273",
- "pieupaparenkorean": "\u3213",
- "pieupcieuckorean": "\u3176",
- "pieupcirclekorean": "\u3265",
- "pieupkiyeokkorean": "\u3172",
- "pieupkorean": "\u3142",
- "pieupparenkorean": "\u3205",
- "pieupsioskiyeokkorean": "\u3174",
- "pieupsioskorean": "\u3144",
- "pieupsiostikeutkorean": "\u3175",
- "pieupthieuthkorean": "\u3177",
- "pieuptikeutkorean": "\u3173",
- "pihiragana": "\u3074",
- "pikatakana": "\u30d4",
- "pisymbolgreek": "\u03d6",
- "piwrarmenian": "\u0583",
- "plus": "\u002b",
- "plusbelowcmb": "\u031f",
- "pluscircle": "\u2295",
- "plusminus": "\u00b1",
- "plusmod": "\u02d6",
- "plusmonospace": "\uff0b",
- "plussmall": "\ufe62",
- "plussuperior": "\u207a",
- "pmonospace": "\uff50",
- "pmsquare": "\u33d8",
- "pohiragana": "\u307d",
- "pointingindexdownwhite": "\u261f",
- "pointingindexleftwhite": "\u261c",
- "pointingindexrightwhite": "\u261e",
- "pointingindexupwhite": "\u261d",
- "pokatakana": "\u30dd",
- "poplathai": "\u0e1b",
- "postalmark": "\u3012",
- "postalmarkface": "\u3020",
- "pparen": "\u24ab",
- "precedes": "\u227a",
- "prescription": "\u211e",
- "primemod": "\u02b9",
- "primereversed": "\u2035",
- "product": "\u220f",
- "projective": "\u2305",
- "prolongedkana": "\u30fc",
- "propellor": "\u2318",
- "propersubset": "\u2282",
- "propersuperset": "\u2283",
- "proportion": "\u2237",
- "proportional": "\u221d",
- "psi": "\u03c8",
- "psicyrillic": "\u0471",
- "psilipneumatacyrilliccmb": "\u0486",
- "pssquare": "\u33b0",
- "puhiragana": "\u3077",
- "pukatakana": "\u30d7",
- "pvsquare": "\u33b4",
- "pwsquare": "\u33ba",
- "q": "\u0071",
- "qadeva": "\u0958",
- "qadmahebrew": "\u05a8",
- "qafarabic": "\u0642",
- "qaffinalarabic": "\ufed6",
- "qafinitialarabic": "\ufed7",
- "qafmedialarabic": "\ufed8",
- "qamats": "\u05b8",
- "qamats10": "\u05b8",
- "qamats1a": "\u05b8",
- "qamats1c": "\u05b8",
- "qamats27": "\u05b8",
- "qamats29": "\u05b8",
- "qamats33": "\u05b8",
- "qamatsde": "\u05b8",
- "qamatshebrew": "\u05b8",
- "qamatsnarrowhebrew": "\u05b8",
- "qamatsqatanhebrew": "\u05b8",
- "qamatsqatannarrowhebrew": "\u05b8",
- "qamatsqatanquarterhebrew": "\u05b8",
- "qamatsqatanwidehebrew": "\u05b8",
- "qamatsquarterhebrew": "\u05b8",
- "qamatswidehebrew": "\u05b8",
- "qarneyparahebrew": "\u059f",
- "qbopomofo": "\u3111",
- "qcircle": "\u24e0",
- "qhook": "\u02a0",
- "qmonospace": "\uff51",
- "qof": "\u05e7",
- "qofdagesh": "\ufb47",
- "qofdageshhebrew": "\ufb47",
- "qofhatafpatah": "\u05e7\u05b2",
- "qofhatafpatahhebrew": "\u05e7\u05b2",
- "qofhatafsegol": "\u05e7\u05b1",
- "qofhatafsegolhebrew": "\u05e7\u05b1",
- "qofhebrew": "\u05e7",
- "qofhiriq": "\u05e7\u05b4",
- "qofhiriqhebrew": "\u05e7\u05b4",
- "qofholam": "\u05e7\u05b9",
- "qofholamhebrew": "\u05e7\u05b9",
- "qofpatah": "\u05e7\u05b7",
- "qofpatahhebrew": "\u05e7\u05b7",
- "qofqamats": "\u05e7\u05b8",
- "qofqamatshebrew": "\u05e7\u05b8",
- "qofqubuts": "\u05e7\u05bb",
- "qofqubutshebrew": "\u05e7\u05bb",
- "qofsegol": "\u05e7\u05b6",
- "qofsegolhebrew": "\u05e7\u05b6",
- "qofsheva": "\u05e7\u05b0",
- "qofshevahebrew": "\u05e7\u05b0",
- "qoftsere": "\u05e7\u05b5",
- "qoftserehebrew": "\u05e7\u05b5",
- "qparen": "\u24ac",
- "quarternote": "\u2669",
- "qubuts": "\u05bb",
- "qubuts18": "\u05bb",
- "qubuts25": "\u05bb",
- "qubuts31": "\u05bb",
- "qubutshebrew": "\u05bb",
- "qubutsnarrowhebrew": "\u05bb",
- "qubutsquarterhebrew": "\u05bb",
- "qubutswidehebrew": "\u05bb",
- "question": "\u003f",
- "questionarabic": "\u061f",
- "questionarmenian": "\u055e",
- "questiondown": "\u00bf",
- "questiondownsmall": "\uf7bf",
- "questiongreek": "\u037e",
- "questionmonospace": "\uff1f",
- "questionsmall": "\uf73f",
- "quotedbl": "\u0022",
- "quotedblbase": "\u201e",
- "quotedblleft": "\u201c",
- "quotedblmonospace": "\uff02",
- "quotedblprime": "\u301e",
- "quotedblprimereversed": "\u301d",
- "quotedblright": "\u201d",
- "quoteleft": "\u2018",
- "quoteleftreversed": "\u201b",
- "quotereversed": "\u201b",
- "quoteright": "\u2019",
- "quoterightn": "\u0149",
- "quotesinglbase": "\u201a",
- "quotesingle": "\u0027",
- "quotesinglemonospace": "\uff07",
- "r": "\u0072",
- "raarmenian": "\u057c",
- "rabengali": "\u09b0",
- "racute": "\u0155",
- "radeva": "\u0930",
- "radical": "\u221a",
- "radicalex": "\uf8e5",
- "radoverssquare": "\u33ae",
- "radoverssquaredsquare": "\u33af",
- "radsquare": "\u33ad",
- "rafe": "\u05bf",
- "rafehebrew": "\u05bf",
- "ragujarati": "\u0ab0",
- "ragurmukhi": "\u0a30",
- "rahiragana": "\u3089",
- "rakatakana": "\u30e9",
- "rakatakanahalfwidth": "\uff97",
- "ralowerdiagonalbengali": "\u09f1",
- "ramiddlediagonalbengali": "\u09f0",
- "ramshorn": "\u0264",
- "ratio": "\u2236",
- "rbopomofo": "\u3116",
- "rcaron": "\u0159",
- "rcedilla": "\u0157",
- "rcircle": "\u24e1",
- "rcommaaccent": "\u0157",
- "rdblgrave": "\u0211",
- "rdotaccent": "\u1e59",
- "rdotbelow": "\u1e5b",
- "rdotbelowmacron": "\u1e5d",
- "referencemark": "\u203b",
- "reflexsubset": "\u2286",
- "reflexsuperset": "\u2287",
- "registered": "\u00ae",
- "registersans": "\uf8e8",
- "registerserif": "\uf6da",
- "reharabic": "\u0631",
- "reharmenian": "\u0580",
- "rehfinalarabic": "\ufeae",
- "rehiragana": "\u308c",
- "rehyehaleflamarabic": "\u0631\ufef3\ufe8e\u0644",
- "rekatakana": "\u30ec",
- "rekatakanahalfwidth": "\uff9a",
- "resh": "\u05e8",
- "reshdageshhebrew": "\ufb48",
- "reshhatafpatah": "\u05e8\u05b2",
- "reshhatafpatahhebrew": "\u05e8\u05b2",
- "reshhatafsegol": "\u05e8\u05b1",
- "reshhatafsegolhebrew": "\u05e8\u05b1",
- "reshhebrew": "\u05e8",
- "reshhiriq": "\u05e8\u05b4",
- "reshhiriqhebrew": "\u05e8\u05b4",
- "reshholam": "\u05e8\u05b9",
- "reshholamhebrew": "\u05e8\u05b9",
- "reshpatah": "\u05e8\u05b7",
- "reshpatahhebrew": "\u05e8\u05b7",
- "reshqamats": "\u05e8\u05b8",
- "reshqamatshebrew": "\u05e8\u05b8",
- "reshqubuts": "\u05e8\u05bb",
- "reshqubutshebrew": "\u05e8\u05bb",
- "reshsegol": "\u05e8\u05b6",
- "reshsegolhebrew": "\u05e8\u05b6",
- "reshsheva": "\u05e8\u05b0",
- "reshshevahebrew": "\u05e8\u05b0",
- "reshtsere": "\u05e8\u05b5",
- "reshtserehebrew": "\u05e8\u05b5",
- "reversedtilde": "\u223d",
- "reviahebrew": "\u0597",
- "reviamugrashhebrew": "\u0597",
- "revlogicalnot": "\u2310",
- "rfishhook": "\u027e",
- "rfishhookreversed": "\u027f",
- "rhabengali": "\u09dd",
- "rhadeva": "\u095d",
- "rho": "\u03c1",
- "rhook": "\u027d",
- "rhookturned": "\u027b",
- "rhookturnedsuperior": "\u02b5",
- "rhosymbolgreek": "\u03f1",
- "rhotichookmod": "\u02de",
- "rieulacirclekorean": "\u3271",
- "rieulaparenkorean": "\u3211",
- "rieulcirclekorean": "\u3263",
- "rieulhieuhkorean": "\u3140",
- "rieulkiyeokkorean": "\u313a",
- "rieulkiyeoksioskorean": "\u3169",
- "rieulkorean": "\u3139",
- "rieulmieumkorean": "\u313b",
- "rieulpansioskorean": "\u316c",
- "rieulparenkorean": "\u3203",
- "rieulphieuphkorean": "\u313f",
- "rieulpieupkorean": "\u313c",
- "rieulpieupsioskorean": "\u316b",
- "rieulsioskorean": "\u313d",
- "rieulthieuthkorean": "\u313e",
- "rieultikeutkorean": "\u316a",
- "rieulyeorinhieuhkorean": "\u316d",
- "rightangle": "\u221f",
- "righttackbelowcmb": "\u0319",
- "righttriangle": "\u22bf",
- "rihiragana": "\u308a",
- "rikatakana": "\u30ea",
- "rikatakanahalfwidth": "\uff98",
- "ring": "\u02da",
- "ringbelowcmb": "\u0325",
- "ringcmb": "\u030a",
- "ringhalfleft": "\u02bf",
- "ringhalfleftarmenian": "\u0559",
- "ringhalfleftbelowcmb": "\u031c",
- "ringhalfleftcentered": "\u02d3",
- "ringhalfright": "\u02be",
- "ringhalfrightbelowcmb": "\u0339",
- "ringhalfrightcentered": "\u02d2",
- "rinvertedbreve": "\u0213",
- "rittorusquare": "\u3351",
- "rlinebelow": "\u1e5f",
- "rlongleg": "\u027c",
- "rlonglegturned": "\u027a",
- "rmonospace": "\uff52",
- "rohiragana": "\u308d",
- "rokatakana": "\u30ed",
- "rokatakanahalfwidth": "\uff9b",
- "roruathai": "\u0e23",
- "rparen": "\u24ad",
- "rrabengali": "\u09dc",
- "rradeva": "\u0931",
- "rragurmukhi": "\u0a5c",
- "rreharabic": "\u0691",
- "rrehfinalarabic": "\ufb8d",
- "rrvocalicbengali": "\u09e0",
- "rrvocalicdeva": "\u0960",
- "rrvocalicgujarati": "\u0ae0",
- "rrvocalicvowelsignbengali": "\u09c4",
- "rrvocalicvowelsigndeva": "\u0944",
- "rrvocalicvowelsigngujarati": "\u0ac4",
- "rsuperior": "\uf6f1",
- "rtblock": "\u2590",
- "rturned": "\u0279",
- "rturnedsuperior": "\u02b4",
- "ruhiragana": "\u308b",
- "rukatakana": "\u30eb",
- "rukatakanahalfwidth": "\uff99",
- "rupeemarkbengali": "\u09f2",
- "rupeesignbengali": "\u09f3",
- "rupiah": "\uf6dd",
- "ruthai": "\u0e24",
- "rvocalicbengali": "\u098b",
- "rvocalicdeva": "\u090b",
- "rvocalicgujarati": "\u0a8b",
- "rvocalicvowelsignbengali": "\u09c3",
- "rvocalicvowelsigndeva": "\u0943",
- "rvocalicvowelsigngujarati": "\u0ac3",
- "s": "\u0073",
- "sabengali": "\u09b8",
- "sacute": "\u015b",
- "sacutedotaccent": "\u1e65",
- "sadarabic": "\u0635",
- "sadeva": "\u0938",
- "sadfinalarabic": "\ufeba",
- "sadinitialarabic": "\ufebb",
- "sadmedialarabic": "\ufebc",
- "sagujarati": "\u0ab8",
- "sagurmukhi": "\u0a38",
- "sahiragana": "\u3055",
- "sakatakana": "\u30b5",
- "sakatakanahalfwidth": "\uff7b",
- "sallallahoualayhewasallamarabic": "\ufdfa",
- "samekh": "\u05e1",
- "samekhdagesh": "\ufb41",
- "samekhdageshhebrew": "\ufb41",
- "samekhhebrew": "\u05e1",
- "saraaathai": "\u0e32",
- "saraaethai": "\u0e41",
- "saraaimaimalaithai": "\u0e44",
- "saraaimaimuanthai": "\u0e43",
- "saraamthai": "\u0e33",
- "saraathai": "\u0e30",
- "saraethai": "\u0e40",
- "saraiileftthai": "\uf886",
- "saraiithai": "\u0e35",
- "saraileftthai": "\uf885",
- "saraithai": "\u0e34",
- "saraothai": "\u0e42",
- "saraueeleftthai": "\uf888",
- "saraueethai": "\u0e37",
- "saraueleftthai": "\uf887",
- "sarauethai": "\u0e36",
- "sarauthai": "\u0e38",
- "sarauuthai": "\u0e39",
- "sbopomofo": "\u3119",
- "scaron": "\u0161",
- "scarondotaccent": "\u1e67",
- "scedilla": "\u015f",
- "schwa": "\u0259",
- "schwacyrillic": "\u04d9",
- "schwadieresiscyrillic": "\u04db",
- "schwahook": "\u025a",
- "scircle": "\u24e2",
- "scircumflex": "\u015d",
- "scommaaccent": "\u0219",
- "sdotaccent": "\u1e61",
- "sdotbelow": "\u1e63",
- "sdotbelowdotaccent": "\u1e69",
- "seagullbelowcmb": "\u033c",
- "second": "\u2033",
- "secondtonechinese": "\u02ca",
- "section": "\u00a7",
- "seenarabic": "\u0633",
- "seenfinalarabic": "\ufeb2",
- "seeninitialarabic": "\ufeb3",
- "seenmedialarabic": "\ufeb4",
- "segol": "\u05b6",
- "segol13": "\u05b6",
- "segol1f": "\u05b6",
- "segol2c": "\u05b6",
- "segolhebrew": "\u05b6",
- "segolnarrowhebrew": "\u05b6",
- "segolquarterhebrew": "\u05b6",
- "segoltahebrew": "\u0592",
- "segolwidehebrew": "\u05b6",
- "seharmenian": "\u057d",
- "sehiragana": "\u305b",
- "sekatakana": "\u30bb",
- "sekatakanahalfwidth": "\uff7e",
- "semicolon": "\u003b",
- "semicolonarabic": "\u061b",
- "semicolonmonospace": "\uff1b",
- "semicolonsmall": "\ufe54",
- "semivoicedmarkkana": "\u309c",
- "semivoicedmarkkanahalfwidth": "\uff9f",
- "sentisquare": "\u3322",
- "sentosquare": "\u3323",
- "seven": "\u0037",
- "sevenarabic": "\u0667",
- "sevenbengali": "\u09ed",
- "sevencircle": "\u2466",
- "sevencircleinversesansserif": "\u2790",
- "sevendeva": "\u096d",
- "seveneighths": "\u215e",
- "sevengujarati": "\u0aed",
- "sevengurmukhi": "\u0a6d",
- "sevenhackarabic": "\u0667",
- "sevenhangzhou": "\u3027",
- "sevenideographicparen": "\u3226",
- "seveninferior": "\u2087",
- "sevenmonospace": "\uff17",
- "sevenoldstyle": "\uf737",
- "sevenparen": "\u247a",
- "sevenperiod": "\u248e",
- "sevenpersian": "\u06f7",
- "sevenroman": "\u2176",
- "sevensuperior": "\u2077",
- "seventeencircle": "\u2470",
- "seventeenparen": "\u2484",
- "seventeenperiod": "\u2498",
- "seventhai": "\u0e57",
- "sfthyphen": "\u00ad",
- "shaarmenian": "\u0577",
- "shabengali": "\u09b6",
- "shacyrillic": "\u0448",
- "shaddaarabic": "\u0651",
- "shaddadammaarabic": "\ufc61",
- "shaddadammatanarabic": "\ufc5e",
- "shaddafathaarabic": "\ufc60",
- "shaddafathatanarabic": "\u0651\u064b",
- "shaddakasraarabic": "\ufc62",
- "shaddakasratanarabic": "\ufc5f",
- "shade": "\u2592",
- "shadedark": "\u2593",
- "shadelight": "\u2591",
- "shademedium": "\u2592",
- "shadeva": "\u0936",
- "shagujarati": "\u0ab6",
- "shagurmukhi": "\u0a36",
- "shalshelethebrew": "\u0593",
- "shbopomofo": "\u3115",
- "shchacyrillic": "\u0449",
- "sheenarabic": "\u0634",
- "sheenfinalarabic": "\ufeb6",
- "sheeninitialarabic": "\ufeb7",
- "sheenmedialarabic": "\ufeb8",
- "sheicoptic": "\u03e3",
- "sheqel": "\u20aa",
- "sheqelhebrew": "\u20aa",
- "sheva": "\u05b0",
- "sheva115": "\u05b0",
- "sheva15": "\u05b0",
- "sheva22": "\u05b0",
- "sheva2e": "\u05b0",
- "shevahebrew": "\u05b0",
- "shevanarrowhebrew": "\u05b0",
- "shevaquarterhebrew": "\u05b0",
- "shevawidehebrew": "\u05b0",
- "shhacyrillic": "\u04bb",
- "shimacoptic": "\u03ed",
- "shin": "\u05e9",
- "shindagesh": "\ufb49",
- "shindageshhebrew": "\ufb49",
- "shindageshshindot": "\ufb2c",
- "shindageshshindothebrew": "\ufb2c",
- "shindageshsindot": "\ufb2d",
- "shindageshsindothebrew": "\ufb2d",
- "shindothebrew": "\u05c1",
- "shinhebrew": "\u05e9",
- "shinshindot": "\ufb2a",
- "shinshindothebrew": "\ufb2a",
- "shinsindot": "\ufb2b",
- "shinsindothebrew": "\ufb2b",
- "shook": "\u0282",
- "sigma": "\u03c3",
- "sigma1": "\u03c2",
- "sigmafinal": "\u03c2",
- "sigmalunatesymbolgreek": "\u03f2",
- "sihiragana": "\u3057",
- "sikatakana": "\u30b7",
- "sikatakanahalfwidth": "\uff7c",
- "siluqhebrew": "\u05bd",
- "siluqlefthebrew": "\u05bd",
- "similar": "\u223c",
- "sindothebrew": "\u05c2",
- "siosacirclekorean": "\u3274",
- "siosaparenkorean": "\u3214",
- "sioscieuckorean": "\u317e",
- "sioscirclekorean": "\u3266",
- "sioskiyeokkorean": "\u317a",
- "sioskorean": "\u3145",
- "siosnieunkorean": "\u317b",
- "siosparenkorean": "\u3206",
- "siospieupkorean": "\u317d",
- "siostikeutkorean": "\u317c",
- "six": "\u0036",
- "sixarabic": "\u0666",
- "sixbengali": "\u09ec",
- "sixcircle": "\u2465",
- "sixcircleinversesansserif": "\u278f",
- "sixdeva": "\u096c",
- "sixgujarati": "\u0aec",
- "sixgurmukhi": "\u0a6c",
- "sixhackarabic": "\u0666",
- "sixhangzhou": "\u3026",
- "sixideographicparen": "\u3225",
- "sixinferior": "\u2086",
- "sixmonospace": "\uff16",
- "sixoldstyle": "\uf736",
- "sixparen": "\u2479",
- "sixperiod": "\u248d",
- "sixpersian": "\u06f6",
- "sixroman": "\u2175",
- "sixsuperior": "\u2076",
- "sixteencircle": "\u246f",
- "sixteencurrencydenominatorbengali": "\u09f9",
- "sixteenparen": "\u2483",
- "sixteenperiod": "\u2497",
- "sixthai": "\u0e56",
- "slash": "\u002f",
- "slashmonospace": "\uff0f",
- "slong": "\u017f",
- "slongdotaccent": "\u1e9b",
- "smileface": "\u263a",
- "smonospace": "\uff53",
- "sofpasuqhebrew": "\u05c3",
- "softhyphen": "\u00ad",
- "softsigncyrillic": "\u044c",
- "sohiragana": "\u305d",
- "sokatakana": "\u30bd",
- "sokatakanahalfwidth": "\uff7f",
- "soliduslongoverlaycmb": "\u0338",
- "solidusshortoverlaycmb": "\u0337",
- "sorusithai": "\u0e29",
- "sosalathai": "\u0e28",
- "sosothai": "\u0e0b",
- "sosuathai": "\u0e2a",
- "space": "\u0020",
- "spacehackarabic": "\u0020",
- "spade": "\u2660",
- "spadesuitblack": "\u2660",
- "spadesuitwhite": "\u2664",
- "sparen": "\u24ae",
- "squarebelowcmb": "\u033b",
- "squarecc": "\u33c4",
- "squarecm": "\u339d",
- "squarediagonalcrosshatchfill": "\u25a9",
- "squarehorizontalfill": "\u25a4",
- "squarekg": "\u338f",
- "squarekm": "\u339e",
- "squarekmcapital": "\u33ce",
- "squareln": "\u33d1",
- "squarelog": "\u33d2",
- "squaremg": "\u338e",
- "squaremil": "\u33d5",
- "squaremm": "\u339c",
- "squaremsquared": "\u33a1",
- "squareorthogonalcrosshatchfill": "\u25a6",
- "squareupperlefttolowerrightfill": "\u25a7",
- "squareupperrighttolowerleftfill": "\u25a8",
- "squareverticalfill": "\u25a5",
- "squarewhitewithsmallblack": "\u25a3",
- "srsquare": "\u33db",
- "ssabengali": "\u09b7",
- "ssadeva": "\u0937",
- "ssagujarati": "\u0ab7",
- "ssangcieuckorean": "\u3149",
- "ssanghieuhkorean": "\u3185",
- "ssangieungkorean": "\u3180",
- "ssangkiyeokkorean": "\u3132",
- "ssangnieunkorean": "\u3165",
- "ssangpieupkorean": "\u3143",
- "ssangsioskorean": "\u3146",
- "ssangtikeutkorean": "\u3138",
- "ssuperior": "\uf6f2",
- "sterling": "\u00a3",
- "sterlingmonospace": "\uffe1",
- "strokelongoverlaycmb": "\u0336",
- "strokeshortoverlaycmb": "\u0335",
- "subset": "\u2282",
- "subsetnotequal": "\u228a",
- "subsetorequal": "\u2286",
- "succeeds": "\u227b",
- "suchthat": "\u220b",
- "suhiragana": "\u3059",
- "sukatakana": "\u30b9",
- "sukatakanahalfwidth": "\uff7d",
- "sukunarabic": "\u0652",
- "summation": "\u2211",
- "sun": "\u263c",
- "superset": "\u2283",
- "supersetnotequal": "\u228b",
- "supersetorequal": "\u2287",
- "svsquare": "\u33dc",
- "syouwaerasquare": "\u337c",
- "t": "\u0074",
- "tabengali": "\u09a4",
- "tackdown": "\u22a4",
- "tackleft": "\u22a3",
- "tadeva": "\u0924",
- "tagujarati": "\u0aa4",
- "tagurmukhi": "\u0a24",
- "taharabic": "\u0637",
- "tahfinalarabic": "\ufec2",
- "tahinitialarabic": "\ufec3",
- "tahiragana": "\u305f",
- "tahmedialarabic": "\ufec4",
- "taisyouerasquare": "\u337d",
- "takatakana": "\u30bf",
- "takatakanahalfwidth": "\uff80",
- "tatweelarabic": "\u0640",
- "tau": "\u03c4",
- "tav": "\u05ea",
- "tavdages": "\ufb4a",
- "tavdagesh": "\ufb4a",
- "tavdageshhebrew": "\ufb4a",
- "tavhebrew": "\u05ea",
- "tbar": "\u0167",
- "tbopomofo": "\u310a",
- "tcaron": "\u0165",
- "tccurl": "\u02a8",
- "tcedilla": "\u0163",
- "tcheharabic": "\u0686",
- "tchehfinalarabic": "\ufb7b",
- "tchehinitialarabic": "\ufb7c",
- "tchehmedialarabic": "\ufb7d",
- "tchehmeeminitialarabic": "\ufb7c\ufee4",
- "tcircle": "\u24e3",
- "tcircumflexbelow": "\u1e71",
- "tcommaaccent": "\u0163",
- "tdieresis": "\u1e97",
- "tdotaccent": "\u1e6b",
- "tdotbelow": "\u1e6d",
- "tecyrillic": "\u0442",
- "tedescendercyrillic": "\u04ad",
- "teharabic": "\u062a",
- "tehfinalarabic": "\ufe96",
- "tehhahinitialarabic": "\ufca2",
- "tehhahisolatedarabic": "\ufc0c",
- "tehinitialarabic": "\ufe97",
- "tehiragana": "\u3066",
- "tehjeeminitialarabic": "\ufca1",
- "tehjeemisolatedarabic": "\ufc0b",
- "tehmarbutaarabic": "\u0629",
- "tehmarbutafinalarabic": "\ufe94",
- "tehmedialarabic": "\ufe98",
- "tehmeeminitialarabic": "\ufca4",
- "tehmeemisolatedarabic": "\ufc0e",
- "tehnoonfinalarabic": "\ufc73",
- "tekatakana": "\u30c6",
- "tekatakanahalfwidth": "\uff83",
- "telephone": "\u2121",
- "telephoneblack": "\u260e",
- "telishagedolahebrew": "\u05a0",
- "telishaqetanahebrew": "\u05a9",
- "tencircle": "\u2469",
- "tenideographicparen": "\u3229",
- "tenparen": "\u247d",
- "tenperiod": "\u2491",
- "tenroman": "\u2179",
- "tesh": "\u02a7",
- "tet": "\u05d8",
- "tetdagesh": "\ufb38",
- "tetdageshhebrew": "\ufb38",
- "tethebrew": "\u05d8",
- "tetsecyrillic": "\u04b5",
- "tevirhebrew": "\u059b",
- "tevirlefthebrew": "\u059b",
- "thabengali": "\u09a5",
- "thadeva": "\u0925",
- "thagujarati": "\u0aa5",
- "thagurmukhi": "\u0a25",
- "thalarabic": "\u0630",
- "thalfinalarabic": "\ufeac",
- "thanthakhatlowleftthai": "\uf898",
- "thanthakhatlowrightthai": "\uf897",
- "thanthakhatthai": "\u0e4c",
- "thanthakhatupperleftthai": "\uf896",
- "theharabic": "\u062b",
- "thehfinalarabic": "\ufe9a",
- "thehinitialarabic": "\ufe9b",
- "thehmedialarabic": "\ufe9c",
- "thereexists": "\u2203",
- "therefore": "\u2234",
- "theta": "\u03b8",
- "theta1": "\u03d1",
- "thetasymbolgreek": "\u03d1",
- "thieuthacirclekorean": "\u3279",
- "thieuthaparenkorean": "\u3219",
- "thieuthcirclekorean": "\u326b",
- "thieuthkorean": "\u314c",
- "thieuthparenkorean": "\u320b",
- "thirteencircle": "\u246c",
- "thirteenparen": "\u2480",
- "thirteenperiod": "\u2494",
- "thonangmonthothai": "\u0e11",
- "thook": "\u01ad",
- "thophuthaothai": "\u0e12",
- "thorn": "\u00fe",
- "thothahanthai": "\u0e17",
- "thothanthai": "\u0e10",
- "thothongthai": "\u0e18",
- "thothungthai": "\u0e16",
- "thousandcyrillic": "\u0482",
- "thousandsseparatorarabic": "\u066c",
- "thousandsseparatorpersian": "\u066c",
- "three": "\u0033",
- "threearabic": "\u0663",
- "threebengali": "\u09e9",
- "threecircle": "\u2462",
- "threecircleinversesansserif": "\u278c",
- "threedeva": "\u0969",
- "threeeighths": "\u215c",
- "threegujarati": "\u0ae9",
- "threegurmukhi": "\u0a69",
- "threehackarabic": "\u0663",
- "threehangzhou": "\u3023",
- "threeideographicparen": "\u3222",
- "threeinferior": "\u2083",
- "threemonospace": "\uff13",
- "threenumeratorbengali": "\u09f6",
- "threeoldstyle": "\uf733",
- "threeparen": "\u2476",
- "threeperiod": "\u248a",
- "threepersian": "\u06f3",
- "threequarters": "\u00be",
- "threequartersemdash": "\uf6de",
- "threeroman": "\u2172",
- "threesuperior": "\u00b3",
- "threethai": "\u0e53",
- "thzsquare": "\u3394",
- "tihiragana": "\u3061",
- "tikatakana": "\u30c1",
- "tikatakanahalfwidth": "\uff81",
- "tikeutacirclekorean": "\u3270",
- "tikeutaparenkorean": "\u3210",
- "tikeutcirclekorean": "\u3262",
- "tikeutkorean": "\u3137",
- "tikeutparenkorean": "\u3202",
- "tilde": "\u02dc",
- "tildebelowcmb": "\u0330",
- "tildecmb": "\u0303",
- "tildecomb": "\u0303",
- "tildedoublecmb": "\u0360",
- "tildeoperator": "\u223c",
- "tildeoverlaycmb": "\u0334",
- "tildeverticalcmb": "\u033e",
- "timescircle": "\u2297",
- "tipehahebrew": "\u0596",
- "tipehalefthebrew": "\u0596",
- "tippigurmukhi": "\u0a70",
- "titlocyrilliccmb": "\u0483",
- "tiwnarmenian": "\u057f",
- "tlinebelow": "\u1e6f",
- "tmonospace": "\uff54",
- "toarmenian": "\u0569",
- "tohiragana": "\u3068",
- "tokatakana": "\u30c8",
- "tokatakanahalfwidth": "\uff84",
- "tonebarextrahighmod": "\u02e5",
- "tonebarextralowmod": "\u02e9",
- "tonebarhighmod": "\u02e6",
- "tonebarlowmod": "\u02e8",
- "tonebarmidmod": "\u02e7",
- "tonefive": "\u01bd",
- "tonesix": "\u0185",
- "tonetwo": "\u01a8",
- "tonos": "\u0384",
- "tonsquare": "\u3327",
- "topatakthai": "\u0e0f",
- "tortoiseshellbracketleft": "\u3014",
- "tortoiseshellbracketleftsmall": "\ufe5d",
- "tortoiseshellbracketleftvertical": "\ufe39",
- "tortoiseshellbracketright": "\u3015",
- "tortoiseshellbracketrightsmall": "\ufe5e",
- "tortoiseshellbracketrightvertical": "\ufe3a",
- "totaothai": "\u0e15",
- "tpalatalhook": "\u01ab",
- "tparen": "\u24af",
- "trademark": "\u2122",
- "trademarksans": "\uf8ea",
- "trademarkserif": "\uf6db",
- "tretroflexhook": "\u0288",
- "triagdn": "\u25bc",
- "triaglf": "\u25c4",
- "triagrt": "\u25ba",
- "triagup": "\u25b2",
- "ts": "\u02a6",
- "tsadi": "\u05e6",
- "tsadidagesh": "\ufb46",
- "tsadidageshhebrew": "\ufb46",
- "tsadihebrew": "\u05e6",
- "tsecyrillic": "\u0446",
- "tsere": "\u05b5",
- "tsere12": "\u05b5",
- "tsere1e": "\u05b5",
- "tsere2b": "\u05b5",
- "tserehebrew": "\u05b5",
- "tserenarrowhebrew": "\u05b5",
- "tserequarterhebrew": "\u05b5",
- "tserewidehebrew": "\u05b5",
- "tshecyrillic": "\u045b",
- "tsuperior": "\uf6f3",
- "ttabengali": "\u099f",
- "ttadeva": "\u091f",
- "ttagujarati": "\u0a9f",
- "ttagurmukhi": "\u0a1f",
- "tteharabic": "\u0679",
- "ttehfinalarabic": "\ufb67",
- "ttehinitialarabic": "\ufb68",
- "ttehmedialarabic": "\ufb69",
- "tthabengali": "\u09a0",
- "tthadeva": "\u0920",
- "tthagujarati": "\u0aa0",
- "tthagurmukhi": "\u0a20",
- "tturned": "\u0287",
- "tuhiragana": "\u3064",
- "tukatakana": "\u30c4",
- "tukatakanahalfwidth": "\uff82",
- "tusmallhiragana": "\u3063",
- "tusmallkatakana": "\u30c3",
- "tusmallkatakanahalfwidth": "\uff6f",
- "twelvecircle": "\u246b",
- "twelveparen": "\u247f",
- "twelveperiod": "\u2493",
- "twelveroman": "\u217b",
- "twentycircle": "\u2473",
- "twentyhangzhou": "\u5344",
- "twentyparen": "\u2487",
- "twentyperiod": "\u249b",
- "two": "\u0032",
- "twoarabic": "\u0662",
- "twobengali": "\u09e8",
- "twocircle": "\u2461",
- "twocircleinversesansserif": "\u278b",
- "twodeva": "\u0968",
- "twodotenleader": "\u2025",
- "twodotleader": "\u2025",
- "twodotleadervertical": "\ufe30",
- "twogujarati": "\u0ae8",
- "twogurmukhi": "\u0a68",
- "twohackarabic": "\u0662",
- "twohangzhou": "\u3022",
- "twoideographicparen": "\u3221",
- "twoinferior": "\u2082",
- "twomonospace": "\uff12",
- "twonumeratorbengali": "\u09f5",
- "twooldstyle": "\uf732",
- "twoparen": "\u2475",
- "twoperiod": "\u2489",
- "twopersian": "\u06f2",
- "tworoman": "\u2171",
- "twostroke": "\u01bb",
- "twosuperior": "\u00b2",
- "twothai": "\u0e52",
- "twothirds": "\u2154",
- "u": "\u0075",
- "uacute": "\u00fa",
- "ubar": "\u0289",
- "ubengali": "\u0989",
- "ubopomofo": "\u3128",
- "ubreve": "\u016d",
- "ucaron": "\u01d4",
- "ucircle": "\u24e4",
- "ucircumflex": "\u00fb",
- "ucircumflexbelow": "\u1e77",
- "ucyrillic": "\u0443",
- "udattadeva": "\u0951",
- "udblacute": "\u0171",
- "udblgrave": "\u0215",
- "udeva": "\u0909",
- "udieresis": "\u00fc",
- "udieresisacute": "\u01d8",
- "udieresisbelow": "\u1e73",
- "udieresiscaron": "\u01da",
- "udieresiscyrillic": "\u04f1",
- "udieresisgrave": "\u01dc",
- "udieresismacron": "\u01d6",
- "udotbelow": "\u1ee5",
- "ugrave": "\u00f9",
- "ugujarati": "\u0a89",
- "ugurmukhi": "\u0a09",
- "uhiragana": "\u3046",
- "uhookabove": "\u1ee7",
- "uhorn": "\u01b0",
- "uhornacute": "\u1ee9",
- "uhorndotbelow": "\u1ef1",
- "uhorngrave": "\u1eeb",
- "uhornhookabove": "\u1eed",
- "uhorntilde": "\u1eef",
- "uhungarumlaut": "\u0171",
- "uhungarumlautcyrillic": "\u04f3",
- "uinvertedbreve": "\u0217",
- "ukatakana": "\u30a6",
- "ukatakanahalfwidth": "\uff73",
- "ukcyrillic": "\u0479",
- "ukorean": "\u315c",
- "umacron": "\u016b",
- "umacroncyrillic": "\u04ef",
- "umacrondieresis": "\u1e7b",
- "umatragurmukhi": "\u0a41",
- "umonospace": "\uff55",
- "underscore": "\u005f",
- "underscoredbl": "\u2017",
- "underscoremonospace": "\uff3f",
- "underscorevertical": "\ufe33",
- "underscorewavy": "\ufe4f",
- "union": "\u222a",
- "universal": "\u2200",
- "uogonek": "\u0173",
- "uparen": "\u24b0",
- "upblock": "\u2580",
- "upperdothebrew": "\u05c4",
- "upsilon": "\u03c5",
- "upsilondieresis": "\u03cb",
- "upsilondieresistonos": "\u03b0",
- "upsilonlatin": "\u028a",
- "upsilontonos": "\u03cd",
- "uptackbelowcmb": "\u031d",
- "uptackmod": "\u02d4",
- "uragurmukhi": "\u0a73",
- "uring": "\u016f",
- "ushortcyrillic": "\u045e",
- "usmallhiragana": "\u3045",
- "usmallkatakana": "\u30a5",
- "usmallkatakanahalfwidth": "\uff69",
- "ustraightcyrillic": "\u04af",
- "ustraightstrokecyrillic": "\u04b1",
- "utilde": "\u0169",
- "utildeacute": "\u1e79",
- "utildebelow": "\u1e75",
- "uubengali": "\u098a",
- "uudeva": "\u090a",
- "uugujarati": "\u0a8a",
- "uugurmukhi": "\u0a0a",
- "uumatragurmukhi": "\u0a42",
- "uuvowelsignbengali": "\u09c2",
- "uuvowelsigndeva": "\u0942",
- "uuvowelsigngujarati": "\u0ac2",
- "uvowelsignbengali": "\u09c1",
- "uvowelsigndeva": "\u0941",
- "uvowelsigngujarati": "\u0ac1",
- "v": "\u0076",
- "vadeva": "\u0935",
- "vagujarati": "\u0ab5",
- "vagurmukhi": "\u0a35",
- "vakatakana": "\u30f7",
- "vav": "\u05d5",
- "vavdagesh": "\ufb35",
- "vavdagesh65": "\ufb35",
- "vavdageshhebrew": "\ufb35",
- "vavhebrew": "\u05d5",
- "vavholam": "\ufb4b",
- "vavholamhebrew": "\ufb4b",
- "vavvavhebrew": "\u05f0",
- "vavyodhebrew": "\u05f1",
- "vcircle": "\u24e5",
- "vdotbelow": "\u1e7f",
- "vecyrillic": "\u0432",
- "veharabic": "\u06a4",
- "vehfinalarabic": "\ufb6b",
- "vehinitialarabic": "\ufb6c",
- "vehmedialarabic": "\ufb6d",
- "vekatakana": "\u30f9",
- "venus": "\u2640",
- "verticalbar": "\u007c",
- "verticallineabovecmb": "\u030d",
- "verticallinebelowcmb": "\u0329",
- "verticallinelowmod": "\u02cc",
- "verticallinemod": "\u02c8",
- "vewarmenian": "\u057e",
- "vhook": "\u028b",
- "vikatakana": "\u30f8",
- "viramabengali": "\u09cd",
- "viramadeva": "\u094d",
- "viramagujarati": "\u0acd",
- "visargabengali": "\u0983",
- "visargadeva": "\u0903",
- "visargagujarati": "\u0a83",
- "vmonospace": "\uff56",
- "voarmenian": "\u0578",
- "voicediterationhiragana": "\u309e",
- "voicediterationkatakana": "\u30fe",
- "voicedmarkkana": "\u309b",
- "voicedmarkkanahalfwidth": "\uff9e",
- "vokatakana": "\u30fa",
- "vparen": "\u24b1",
- "vtilde": "\u1e7d",
- "vturned": "\u028c",
- "vuhiragana": "\u3094",
- "vukatakana": "\u30f4",
- "w": "\u0077",
- "wacute": "\u1e83",
- "waekorean": "\u3159",
- "wahiragana": "\u308f",
- "wakatakana": "\u30ef",
- "wakatakanahalfwidth": "\uff9c",
- "wakorean": "\u3158",
- "wasmallhiragana": "\u308e",
- "wasmallkatakana": "\u30ee",
- "wattosquare": "\u3357",
- "wavedash": "\u301c",
- "wavyunderscorevertical": "\ufe34",
- "wawarabic": "\u0648",
- "wawfinalarabic": "\ufeee",
- "wawhamzaabovearabic": "\u0624",
- "wawhamzaabovefinalarabic": "\ufe86",
- "wbsquare": "\u33dd",
- "wcircle": "\u24e6",
- "wcircumflex": "\u0175",
- "wdieresis": "\u1e85",
- "wdotaccent": "\u1e87",
- "wdotbelow": "\u1e89",
- "wehiragana": "\u3091",
- "weierstrass": "\u2118",
- "wekatakana": "\u30f1",
- "wekorean": "\u315e",
- "weokorean": "\u315d",
- "wgrave": "\u1e81",
- "whitebullet": "\u25e6",
- "whitecircle": "\u25cb",
- "whitecircleinverse": "\u25d9",
- "whitecornerbracketleft": "\u300e",
- "whitecornerbracketleftvertical": "\ufe43",
- "whitecornerbracketright": "\u300f",
- "whitecornerbracketrightvertical": "\ufe44",
- "whitediamond": "\u25c7",
- "whitediamondcontainingblacksmalldiamond": "\u25c8",
- "whitedownpointingsmalltriangle": "\u25bf",
- "whitedownpointingtriangle": "\u25bd",
- "whiteleftpointingsmalltriangle": "\u25c3",
- "whiteleftpointingtriangle": "\u25c1",
- "whitelenticularbracketleft": "\u3016",
- "whitelenticularbracketright": "\u3017",
- "whiterightpointingsmalltriangle": "\u25b9",
- "whiterightpointingtriangle": "\u25b7",
- "whitesmallsquare": "\u25ab",
- "whitesmilingface": "\u263a",
- "whitesquare": "\u25a1",
- "whitestar": "\u2606",
- "whitetelephone": "\u260f",
- "whitetortoiseshellbracketleft": "\u3018",
- "whitetortoiseshellbracketright": "\u3019",
- "whiteuppointingsmalltriangle": "\u25b5",
- "whiteuppointingtriangle": "\u25b3",
- "wihiragana": "\u3090",
- "wikatakana": "\u30f0",
- "wikorean": "\u315f",
- "wmonospace": "\uff57",
- "wohiragana": "\u3092",
- "wokatakana": "\u30f2",
- "wokatakanahalfwidth": "\uff66",
- "won": "\u20a9",
- "wonmonospace": "\uffe6",
- "wowaenthai": "\u0e27",
- "wparen": "\u24b2",
- "wring": "\u1e98",
- "wsuperior": "\u02b7",
- "wturned": "\u028d",
- "wynn": "\u01bf",
- "x": "\u0078",
- "xabovecmb": "\u033d",
- "xbopomofo": "\u3112",
- "xcircle": "\u24e7",
- "xdieresis": "\u1e8d",
- "xdotaccent": "\u1e8b",
- "xeharmenian": "\u056d",
- "xi": "\u03be",
- "xmonospace": "\uff58",
- "xparen": "\u24b3",
- "xsuperior": "\u02e3",
- "y": "\u0079",
- "yaadosquare": "\u334e",
- "yabengali": "\u09af",
- "yacute": "\u00fd",
- "yadeva": "\u092f",
- "yaekorean": "\u3152",
- "yagujarati": "\u0aaf",
- "yagurmukhi": "\u0a2f",
- "yahiragana": "\u3084",
- "yakatakana": "\u30e4",
- "yakatakanahalfwidth": "\uff94",
- "yakorean": "\u3151",
- "yamakkanthai": "\u0e4e",
- "yasmallhiragana": "\u3083",
- "yasmallkatakana": "\u30e3",
- "yasmallkatakanahalfwidth": "\uff6c",
- "yatcyrillic": "\u0463",
- "ycircle": "\u24e8",
- "ycircumflex": "\u0177",
- "ydieresis": "\u00ff",
- "ydotaccent": "\u1e8f",
- "ydotbelow": "\u1ef5",
- "yeharabic": "\u064a",
- "yehbarreearabic": "\u06d2",
- "yehbarreefinalarabic": "\ufbaf",
- "yehfinalarabic": "\ufef2",
- "yehhamzaabovearabic": "\u0626",
- "yehhamzaabovefinalarabic": "\ufe8a",
- "yehhamzaaboveinitialarabic": "\ufe8b",
- "yehhamzaabovemedialarabic": "\ufe8c",
- "yehinitialarabic": "\ufef3",
- "yehmedialarabic": "\ufef4",
- "yehmeeminitialarabic": "\ufcdd",
- "yehmeemisolatedarabic": "\ufc58",
- "yehnoonfinalarabic": "\ufc94",
- "yehthreedotsbelowarabic": "\u06d1",
- "yekorean": "\u3156",
- "yen": "\u00a5",
- "yenmonospace": "\uffe5",
- "yeokorean": "\u3155",
- "yeorinhieuhkorean": "\u3186",
- "yerahbenyomohebrew": "\u05aa",
- "yerahbenyomolefthebrew": "\u05aa",
- "yericyrillic": "\u044b",
- "yerudieresiscyrillic": "\u04f9",
- "yesieungkorean": "\u3181",
- "yesieungpansioskorean": "\u3183",
- "yesieungsioskorean": "\u3182",
- "yetivhebrew": "\u059a",
- "ygrave": "\u1ef3",
- "yhook": "\u01b4",
- "yhookabove": "\u1ef7",
- "yiarmenian": "\u0575",
- "yicyrillic": "\u0457",
- "yikorean": "\u3162",
- "yinyang": "\u262f",
- "yiwnarmenian": "\u0582",
- "ymonospace": "\uff59",
- "yod": "\u05d9",
- "yoddagesh": "\ufb39",
- "yoddageshhebrew": "\ufb39",
- "yodhebrew": "\u05d9",
- "yodyodhebrew": "\u05f2",
- "yodyodpatahhebrew": "\ufb1f",
- "yohiragana": "\u3088",
- "yoikorean": "\u3189",
- "yokatakana": "\u30e8",
- "yokatakanahalfwidth": "\uff96",
- "yokorean": "\u315b",
- "yosmallhiragana": "\u3087",
- "yosmallkatakana": "\u30e7",
- "yosmallkatakanahalfwidth": "\uff6e",
- "yotgreek": "\u03f3",
- "yoyaekorean": "\u3188",
- "yoyakorean": "\u3187",
- "yoyakthai": "\u0e22",
- "yoyingthai": "\u0e0d",
- "yparen": "\u24b4",
- "ypogegrammeni": "\u037a",
- "ypogegrammenigreekcmb": "\u0345",
- "yr": "\u01a6",
- "yring": "\u1e99",
- "ysuperior": "\u02b8",
- "ytilde": "\u1ef9",
- "yturned": "\u028e",
- "yuhiragana": "\u3086",
- "yuikorean": "\u318c",
- "yukatakana": "\u30e6",
- "yukatakanahalfwidth": "\uff95",
- "yukorean": "\u3160",
- "yusbigcyrillic": "\u046b",
- "yusbigiotifiedcyrillic": "\u046d",
- "yuslittlecyrillic": "\u0467",
- "yuslittleiotifiedcyrillic": "\u0469",
- "yusmallhiragana": "\u3085",
- "yusmallkatakana": "\u30e5",
- "yusmallkatakanahalfwidth": "\uff6d",
- "yuyekorean": "\u318b",
- "yuyeokorean": "\u318a",
- "yyabengali": "\u09df",
- "yyadeva": "\u095f",
- "z": "\u007a",
- "zaarmenian": "\u0566",
- "zacute": "\u017a",
- "zadeva": "\u095b",
- "zagurmukhi": "\u0a5b",
- "zaharabic": "\u0638",
- "zahfinalarabic": "\ufec6",
- "zahinitialarabic": "\ufec7",
- "zahiragana": "\u3056",
- "zahmedialarabic": "\ufec8",
- "zainarabic": "\u0632",
- "zainfinalarabic": "\ufeb0",
- "zakatakana": "\u30b6",
- "zaqefgadolhebrew": "\u0595",
- "zaqefqatanhebrew": "\u0594",
- "zarqahebrew": "\u0598",
- "zayin": "\u05d6",
- "zayindagesh": "\ufb36",
- "zayindageshhebrew": "\ufb36",
- "zayinhebrew": "\u05d6",
- "zbopomofo": "\u3117",
- "zcaron": "\u017e",
- "zcircle": "\u24e9",
- "zcircumflex": "\u1e91",
- "zcurl": "\u0291",
- "zdot": "\u017c",
- "zdotaccent": "\u017c",
- "zdotbelow": "\u1e93",
- "zecyrillic": "\u0437",
- "zedescendercyrillic": "\u0499",
- "zedieresiscyrillic": "\u04df",
- "zehiragana": "\u305c",
- "zekatakana": "\u30bc",
- "zero": "\u0030",
- "zeroarabic": "\u0660",
- "zerobengali": "\u09e6",
- "zerodeva": "\u0966",
- "zerogujarati": "\u0ae6",
- "zerogurmukhi": "\u0a66",
- "zerohackarabic": "\u0660",
- "zeroinferior": "\u2080",
- "zeromonospace": "\uff10",
- "zerooldstyle": "\uf730",
- "zeropersian": "\u06f0",
- "zerosuperior": "\u2070",
- "zerothai": "\u0e50",
- "zerowidthjoiner": "\ufeff",
- "zerowidthnonjoiner": "\u200c",
- "zerowidthspace": "\u200b",
- "zeta": "\u03b6",
- "zhbopomofo": "\u3113",
- "zhearmenian": "\u056a",
- "zhebrevecyrillic": "\u04c2",
- "zhecyrillic": "\u0436",
- "zhedescendercyrillic": "\u0497",
- "zhedieresiscyrillic": "\u04dd",
- "zihiragana": "\u3058",
- "zikatakana": "\u30b8",
- "zinorhebrew": "\u05ae",
- "zlinebelow": "\u1e95",
- "zmonospace": "\uff5a",
- "zohiragana": "\u305e",
- "zokatakana": "\u30be",
- "zparen": "\u24b5",
- "zretroflexhook": "\u0290",
- "zstroke": "\u01b6",
- "zuhiragana": "\u305a",
- "zukatakana": "\u30ba",
-}
-# --end
diff --git a/pdf2zh/high_level.py b/pdf2zh/high_level.py
index c525454..f696270 100644
--- a/pdf2zh/high_level.py
+++ b/pdf2zh/high_level.py
@@ -1,45 +1,36 @@
"""Functions that can be used for the most common use-cases for pdf2zh.six"""
import logging
-import sys
-from io import StringIO
-from typing import Any, BinaryIO, Container, Iterator, Optional, cast
+from typing import BinaryIO
import numpy as np
import tqdm
from pymupdf import Document
+from pdfminer.pdfpage import PDFPage
+from pdfminer.pdfinterp import PDFResourceManager
+from pdfminer.pdfdocument import PDFDocument
+from pdfminer.pdfparser import PDFParser
+from pdf2zh.converter import TranslateConverter
+from pdf2zh.pdfinterp import PDFPageInterpreterEx
-from pdf2zh.converter import (
- HOCRConverter,
- HTMLConverter,
- PDFPageAggregator,
- TextConverter,
- XMLConverter,
-)
-from pdf2zh.image import ImageWriter
-from pdf2zh.layout import LAParams, LTPage
-from pdf2zh.pdfdevice import PDFDevice, TagExtractor
-from pdf2zh.pdfexceptions import PDFValueError
-from pdf2zh.pdfinterp import PDFPageInterpreter, PDFResourceManager
-from pdf2zh.pdfpage import PDFPage
-from pdf2zh.utils import AnyIO, FileOrName, open_filename, get_device
+
+def get_device():
+ """Get the device to use for computation."""
+ try:
+ import torch
+
+ if torch.cuda.is_available():
+ return "cuda:0"
+ except ImportError:
+ pass
+
+ return "cpu"
def extract_text_to_fp(
inf: BinaryIO,
- outfp: AnyIO,
- output_type: str = "text",
- codec: str = "utf-8",
- laparams: Optional[LAParams] = None,
- maxpages: int = 0,
- pages: Optional[Container[int]] = None,
+ pages=None,
password: str = "",
- scale: float = 1.0,
- rotation: int = 0,
- layoutmode: str = "normal",
- output_dir: Optional[str] = None,
- strip_control: bool = False,
debug: bool = False,
- disable_caching: bool = False,
page_count: int = 0,
vfont: str = "",
vchar: str = "",
@@ -50,126 +41,37 @@ def extract_text_to_fp(
lang_out: str = "",
service: str = "",
callback: object = None,
- **kwargs: Any,
+ **kwarg,
) -> None:
- """Parses text from inf-file and writes to outfp file-like object.
-
- Takes loads of optional arguments but the defaults are somewhat sane.
- Beware laparams: Including an empty LAParams is not the same as passing
- None!
-
- :param inf: a file-like object to read PDF structure from, such as a
- file handler (using the builtin `open()` function) or a `BytesIO`.
- :param outfp: a file-like object to write the text to.
- :param output_type: May be 'text', 'xml', 'html', 'hocr', 'tag'.
- Only 'text' works properly.
- :param codec: Text decoding codec
- :param laparams: An LAParams object from pdf2zh.layout. Default is None
- but may not layout correctly.
- :param maxpages: How many pages to stop parsing after
- :param page_numbers: zero-indexed page numbers to operate on.
- :param password: For encrypted PDFs, the password to decrypt.
- :param scale: Scale factor
- :param rotation: Rotation factor
- :param layoutmode: Default is 'normal', see
- pdf2zh.converter.HTMLConverter
- :param output_dir: If given, creates an ImageWriter for extracted images.
- :param strip_control: Does what it says on the tin
- :param debug: Output more logging data
- :param disable_caching: Does what it says on the tin
- :param other:
- :return: nothing, acting as it does on two streams. Use StringIO to get
- strings.
- """
if debug:
logging.getLogger().setLevel(logging.DEBUG)
- imagewriter = None
- if output_dir:
- imagewriter = ImageWriter(output_dir)
-
- rsrcmgr = PDFResourceManager(caching=not disable_caching)
- device: Optional[PDFDevice] = None
+ rsrcmgr = PDFResourceManager()
layout = {}
-
- if output_type != "text" and outfp == sys.stdout:
- outfp = sys.stdout.buffer
-
- if output_type == "text":
- device = TextConverter(
- rsrcmgr,
- outfp,
- codec=codec,
- laparams=laparams,
- imagewriter=imagewriter,
- vfont=vfont,
- vchar=vchar,
- thread=thread,
- layout=layout,
- lang_in=lang_in,
- lang_out=lang_out,
- service=service,
- )
-
- elif output_type == "xml":
- device = XMLConverter(
- rsrcmgr,
- outfp,
- codec=codec,
- laparams=laparams,
- imagewriter=imagewriter,
- stripcontrol=strip_control,
- )
-
- elif output_type == "html":
- device = HTMLConverter(
- rsrcmgr,
- outfp,
- codec=codec,
- scale=scale,
- layoutmode=layoutmode,
- laparams=laparams,
- imagewriter=imagewriter,
- )
-
- elif output_type == "hocr":
- device = HOCRConverter(
- rsrcmgr,
- outfp,
- codec=codec,
- laparams=laparams,
- stripcontrol=strip_control,
- )
-
- elif output_type == "tag":
- # Binary I/O is required, but we have no good way to test it here.
- device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec)
-
- else:
- msg = f"Output type can be text, html, xml or tag but is {output_type}"
- raise PDFValueError(msg)
+ device = TranslateConverter(
+ rsrcmgr, vfont, vchar, thread, layout, lang_in, lang_out, service
+ )
assert device is not None
obj_patch = {}
- interpreter = PDFPageInterpreter(rsrcmgr, device, obj_patch)
+ interpreter = PDFPageInterpreterEx(rsrcmgr, device, obj_patch)
if pages:
total_pages = len(pages)
else:
total_pages = page_count
+
+ parser = PDFParser(inf)
+ doc = PDFDocument(parser, password=password)
with tqdm.tqdm(
- PDFPage.get_pages(
- inf,
- pages,
- maxpages=maxpages,
- password=password,
- caching=not disable_caching,
- ),
+ enumerate(PDFPage.create_pages(doc)),
total=total_pages,
- position=0,
) as progress:
- for page in progress:
+ for pageno, page in progress:
+ if pages and (pageno not in pages):
+ continue
if callback:
callback(progress)
+ page.pageno = pageno
pix = doc_en[page.pageno].get_pixmap()
image = np.fromstring(pix.samples, np.uint8).reshape(
pix.height, pix.width, 3
@@ -202,8 +104,6 @@ def extract_text_to_fp(
)
box[y0:y1, x0:x1] = 0
layout[page.pageno] = box
- # print(page.number,page_layout)
- page.rotate = (page.rotate + rotation) % 360
# 新建一个 xref 存放新指令流
page.page_xref = doc_en.get_new_xref() # hack 插入页面的新 xref
doc_en.update_object(page.page_xref, "<<>>")
@@ -213,86 +113,3 @@ def extract_text_to_fp(
device.close()
return obj_patch
-
-
-def extract_text(
- pdf_file: FileOrName,
- password: str = "",
- page_numbers: Optional[Container[int]] = None,
- maxpages: int = 0,
- caching: bool = True,
- codec: str = "utf-8",
- laparams: Optional[LAParams] = None,
-) -> str:
- """Parse and return the text contained in a PDF file.
-
- :param pdf_file: Either a file path or a file-like object for the PDF file
- to be worked on.
- :param password: For encrypted PDFs, the password to decrypt.
- :param page_numbers: List of zero-indexed page numbers to extract.
- :param maxpages: The maximum number of pages to parse
- :param caching: If resources should be cached
- :param codec: Text decoding codec
- :param laparams: An LAParams object from pdf2zh.layout. If None, uses
- some default settings that often work well.
- :return: a string containing all of the text extracted.
- """
- if laparams is None:
- laparams = LAParams()
-
- with open_filename(pdf_file, "rb") as fp, StringIO() as output_string:
- fp = cast(BinaryIO, fp) # we opened in binary mode
- rsrcmgr = PDFResourceManager(caching=caching)
- device = TextConverter(rsrcmgr, output_string, codec=codec, laparams=laparams)
- interpreter = PDFPageInterpreter(rsrcmgr, device)
-
- for page in PDFPage.get_pages(
- fp,
- page_numbers,
- maxpages=maxpages,
- password=password,
- caching=caching,
- ):
- interpreter.process_page(page)
-
- return output_string.getvalue()
-
-
-def extract_pages(
- pdf_file: FileOrName,
- password: str = "",
- page_numbers: Optional[Container[int]] = None,
- maxpages: int = 0,
- caching: bool = True,
- laparams: Optional[LAParams] = None,
-) -> Iterator[LTPage]:
- """Extract and yield LTPage objects
-
- :param pdf_file: Either a file path or a file-like object for the PDF file
- to be worked on.
- :param password: For encrypted PDFs, the password to decrypt.
- :param page_numbers: List of zero-indexed page numbers to extract.
- :param maxpages: The maximum number of pages to parse
- :param caching: If resources should be cached
- :param laparams: An LAParams object from pdf2zh.layout. If None, uses
- some default settings that often work well.
- :return: LTPage objects
- """
- if laparams is None:
- laparams = LAParams()
-
- with open_filename(pdf_file, "rb") as fp:
- fp = cast(BinaryIO, fp) # we opened in binary mode
- resource_manager = PDFResourceManager(caching=caching)
- device = PDFPageAggregator(resource_manager, laparams=laparams)
- interpreter = PDFPageInterpreter(resource_manager, device)
- for page in PDFPage.get_pages(
- fp,
- page_numbers,
- maxpages=maxpages,
- password=password,
- caching=caching,
- ):
- interpreter.process_page(page)
- layout = device.get_result()
- yield layout
diff --git a/pdf2zh/image.py b/pdf2zh/image.py
deleted file mode 100644
index 99e8e8c..0000000
--- a/pdf2zh/image.py
+++ /dev/null
@@ -1,297 +0,0 @@
-import os
-import os.path
-import struct
-from io import BytesIO
-from typing import BinaryIO, Tuple
-
-try:
- from typing import Literal
-except ImportError:
- # Literal was introduced in Python 3.8
- from typing_extensions import Literal # type: ignore[assignment]
-
-from pdf2zh.jbig2 import JBIG2StreamReader, JBIG2StreamWriter
-from pdf2zh.layout import LTImage
-from pdf2zh.pdfcolor import (
- LITERAL_DEVICE_CMYK,
- LITERAL_DEVICE_GRAY,
- LITERAL_DEVICE_RGB,
- LITERAL_INLINE_DEVICE_GRAY,
- LITERAL_INLINE_DEVICE_RGB,
-)
-from pdf2zh.pdfexceptions import PDFValueError
-from pdf2zh.pdftypes import (
- LITERALS_DCT_DECODE,
- LITERALS_FLATE_DECODE,
- LITERALS_JBIG2_DECODE,
- LITERALS_JPX_DECODE,
-)
-
-PIL_ERROR_MESSAGE = (
- "Could not import Pillow. This dependency of pdf2zh.six is not "
- "installed by default. You need it to to save jpg images to a file. Install it "
- "with `pip install 'pdf2zh.six[image]'`"
-)
-
-
-def align32(x: int) -> int:
- return ((x + 3) // 4) * 4
-
-
-class BMPWriter:
- def __init__(self, fp: BinaryIO, bits: int, width: int, height: int) -> None:
- self.fp = fp
- self.bits = bits
- self.width = width
- self.height = height
- if bits == 1:
- ncols = 2
- elif bits == 8:
- ncols = 256
- elif bits == 24:
- ncols = 0
- else:
- raise PDFValueError(bits)
- self.linesize = align32((self.width * self.bits + 7) // 8)
- self.datasize = self.linesize * self.height
- headersize = 14 + 40 + ncols * 4
- info = struct.pack(
- " None:
- self.fp.seek(self.pos1 - (y + 1) * self.linesize)
- self.fp.write(data)
-
-
-class ImageWriter:
- """Write image to a file
-
- Supports various image types: JPEG, JBIG2 and bitmaps
- """
-
- def __init__(self, outdir: str) -> None:
- self.outdir = outdir
- if not os.path.exists(self.outdir):
- os.makedirs(self.outdir)
-
- def export_image(self, image: LTImage) -> str:
- """Save an LTImage to disk"""
- (width, height) = image.srcsize
-
- filters = image.stream.get_filters()
-
- if filters[-1][0] in LITERALS_DCT_DECODE:
- name = self._save_jpeg(image)
-
- elif filters[-1][0] in LITERALS_JPX_DECODE:
- name = self._save_jpeg2000(image)
-
- elif self._is_jbig2_iamge(image):
- name = self._save_jbig2(image)
-
- elif image.bits == 1:
- name = self._save_bmp(image, width, height, (width + 7) // 8, image.bits)
-
- elif image.bits == 8 and (
- LITERAL_DEVICE_RGB in image.colorspace
- or LITERAL_INLINE_DEVICE_RGB in image.colorspace
- ):
- name = self._save_bmp(image, width, height, width * 3, image.bits * 3)
-
- elif image.bits == 8 and (
- LITERAL_DEVICE_GRAY in image.colorspace
- or LITERAL_INLINE_DEVICE_GRAY in image.colorspace
- ):
- name = self._save_bmp(image, width, height, width, image.bits)
-
- elif len(filters) == 1 and filters[0][0] in LITERALS_FLATE_DECODE:
- name = self._save_bytes(image)
-
- else:
- name = self._save_raw(image)
-
- return name
-
- def _save_jpeg(self, image: LTImage) -> str:
- """Save a JPEG encoded image"""
- data = image.stream.get_data()
-
- name, path = self._create_unique_image_name(image, ".jpg")
- with open(path, "wb") as fp:
- if LITERAL_DEVICE_CMYK in image.colorspace:
- try:
- from PIL import Image, ImageChops # type: ignore[import]
- except ImportError:
- raise ImportError(PIL_ERROR_MESSAGE)
-
- ifp = BytesIO(data)
- i = Image.open(ifp)
- i = ImageChops.invert(i)
- i = i.convert("RGB")
- i.save(fp, "JPEG")
- else:
- fp.write(data)
-
- return name
-
- def _save_jpeg2000(self, image: LTImage) -> str:
- """Save a JPEG 2000 encoded image"""
- data = image.stream.get_data()
-
- name, path = self._create_unique_image_name(image, ".jp2")
- with open(path, "wb") as fp:
- try:
- from PIL import Image # type: ignore[import]
- except ImportError:
- raise ImportError(PIL_ERROR_MESSAGE)
-
- # if we just write the raw data, most image programs
- # that I have tried cannot open the file. However,
- # open and saving with PIL produces a file that
- # seems to be easily opened by other programs
- ifp = BytesIO(data)
- i = Image.open(ifp)
- i.save(fp, "JPEG2000")
- return name
-
- def _save_jbig2(self, image: LTImage) -> str:
- """Save a JBIG2 encoded image"""
- name, path = self._create_unique_image_name(image, ".jb2")
- with open(path, "wb") as fp:
- input_stream = BytesIO()
-
- global_streams = []
- filters = image.stream.get_filters()
- for filter_name, params in filters:
- if filter_name in LITERALS_JBIG2_DECODE:
- global_streams.append(params["JBIG2Globals"].resolve())
-
- if len(global_streams) > 1:
- msg = (
- "There should never be more than one JBIG2Globals "
- "associated with a JBIG2 embedded image"
- )
- raise PDFValueError(msg)
- if len(global_streams) == 1:
- input_stream.write(global_streams[0].get_data().rstrip(b"\n"))
- input_stream.write(image.stream.get_data())
- input_stream.seek(0)
- reader = JBIG2StreamReader(input_stream)
- segments = reader.get_segments()
-
- writer = JBIG2StreamWriter(fp)
- writer.write_file(segments)
- return name
-
- def _save_bmp(
- self,
- image: LTImage,
- width: int,
- height: int,
- bytes_per_line: int,
- bits: int,
- ) -> str:
- """Save a BMP encoded image"""
- name, path = self._create_unique_image_name(image, ".bmp")
- with open(path, "wb") as fp:
- bmp = BMPWriter(fp, bits, width, height)
- data = image.stream.get_data()
- i = 0
- for y in range(height):
- bmp.write_line(y, data[i : i + bytes_per_line])
- i += bytes_per_line
- return name
-
- def _save_bytes(self, image: LTImage) -> str:
- """Save an image without encoding, just bytes"""
- name, path = self._create_unique_image_name(image, ".jpg")
- width, height = image.srcsize
- channels = len(image.stream.get_data()) / width / height / (image.bits / 8)
- with open(path, "wb") as fp:
- try:
- from PIL import (
- Image, # type: ignore[import]
- ImageOps,
- )
- except ImportError:
- raise ImportError(PIL_ERROR_MESSAGE)
-
- mode: Literal["1", "L", "RGB", "CMYK"]
- if image.bits == 1:
- mode = "1"
- elif image.bits == 8 and channels == 1:
- mode = "L"
- elif image.bits == 8 and channels == 3:
- mode = "RGB"
- elif image.bits == 8 and channels == 4:
- mode = "CMYK"
-
- img = Image.frombytes(mode, image.srcsize, image.stream.get_data(), "raw")
- if mode == "L":
- img = ImageOps.invert(img)
-
- img.save(fp)
-
- return name
-
- def _save_raw(self, image: LTImage) -> str:
- """Save an image with unknown encoding"""
- ext = ".%d.%dx%d.img" % (image.bits, image.srcsize[0], image.srcsize[1])
- name, path = self._create_unique_image_name(image, ext)
-
- with open(path, "wb") as fp:
- fp.write(image.stream.get_data())
- return name
-
- @staticmethod
- def _is_jbig2_iamge(image: LTImage) -> bool:
- filters = image.stream.get_filters()
- for filter_name, params in filters:
- if filter_name in LITERALS_JBIG2_DECODE:
- return True
- return False
-
- def _create_unique_image_name(self, image: LTImage, ext: str) -> Tuple[str, str]:
- name = image.name + ext
- path = os.path.join(self.outdir, name)
- img_index = 0
- while os.path.exists(path):
- name = "%s.%d%s" % (image.name, img_index, ext)
- path = os.path.join(self.outdir, name)
- img_index += 1
- return name, path
diff --git a/pdf2zh/jbig2.py b/pdf2zh/jbig2.py
deleted file mode 100644
index 594abbf..0000000
--- a/pdf2zh/jbig2.py
+++ /dev/null
@@ -1,373 +0,0 @@
-import math
-import os
-from struct import calcsize, pack, unpack
-from typing import BinaryIO, Dict, Iterable, List, Optional, Tuple, Union, cast
-
-from pdf2zh.pdfexceptions import PDFValueError
-
-# segment structure base
-SEG_STRUCT = [
- (">L", "number"),
- (">B", "flags"),
- (">B", "retention_flags"),
- (">B", "page_assoc"),
- (">L", "data_length"),
-]
-
-# segment header literals
-HEADER_FLAG_DEFERRED = 0b10000000
-HEADER_FLAG_PAGE_ASSOC_LONG = 0b01000000
-
-SEG_TYPE_MASK = 0b00111111
-
-REF_COUNT_SHORT_MASK = 0b11100000
-REF_COUNT_LONG_MASK = 0x1FFFFFFF
-REF_COUNT_LONG = 7
-
-DATA_LEN_UNKNOWN = 0xFFFFFFFF
-
-# segment types
-SEG_TYPE_IMMEDIATE_GEN_REGION = 38
-SEG_TYPE_END_OF_PAGE = 49
-SEG_TYPE_END_OF_FILE = 51
-
-# file literals
-FILE_HEADER_ID = b"\x97\x4a\x42\x32\x0d\x0a\x1a\x0a"
-FILE_HEAD_FLAG_SEQUENTIAL = 0b00000001
-
-
-def bit_set(bit_pos: int, value: int) -> bool:
- return bool((value >> bit_pos) & 1)
-
-
-def check_flag(flag: int, value: int) -> bool:
- return bool(flag & value)
-
-
-def masked_value(mask: int, value: int) -> int:
- for bit_pos in range(31):
- if bit_set(bit_pos, mask):
- return (value & mask) >> bit_pos
-
- raise PDFValueError("Invalid mask or value")
-
-
-def mask_value(mask: int, value: int) -> int:
- for bit_pos in range(31):
- if bit_set(bit_pos, mask):
- return (value & (mask >> bit_pos)) << bit_pos
-
- raise PDFValueError("Invalid mask or value")
-
-
-def unpack_int(format: str, buffer: bytes) -> int:
- assert format in {">B", ">I", ">L"}
- [result] = cast(Tuple[int], unpack(format, buffer))
- return result
-
-
-JBIG2SegmentFlags = Dict[str, Union[int, bool]]
-JBIG2RetentionFlags = Dict[str, Union[int, List[int], List[bool]]]
-JBIG2Segment = Dict[
- str,
- Union[bool, int, bytes, JBIG2SegmentFlags, JBIG2RetentionFlags],
-]
-
-
-class JBIG2StreamReader:
- """Read segments from a JBIG2 byte stream"""
-
- def __init__(self, stream: BinaryIO) -> None:
- self.stream = stream
-
- def get_segments(self) -> List[JBIG2Segment]:
- segments: List[JBIG2Segment] = []
- while not self.is_eof():
- segment: JBIG2Segment = {}
- for field_format, name in SEG_STRUCT:
- field_len = calcsize(field_format)
- field = self.stream.read(field_len)
- if len(field) < field_len:
- segment["_error"] = True
- break
- value = unpack_int(field_format, field)
- parser = getattr(self, "parse_%s" % name, None)
- if callable(parser):
- value = parser(segment, value, field)
- segment[name] = value
-
- if not segment.get("_error"):
- segments.append(segment)
- return segments
-
- def is_eof(self) -> bool:
- if self.stream.read(1) == b"":
- return True
- else:
- self.stream.seek(-1, os.SEEK_CUR)
- return False
-
- def parse_flags(
- self,
- segment: JBIG2Segment,
- flags: int,
- field: bytes,
- ) -> JBIG2SegmentFlags:
- return {
- "deferred": check_flag(HEADER_FLAG_DEFERRED, flags),
- "page_assoc_long": check_flag(HEADER_FLAG_PAGE_ASSOC_LONG, flags),
- "type": masked_value(SEG_TYPE_MASK, flags),
- }
-
- def parse_retention_flags(
- self,
- segment: JBIG2Segment,
- flags: int,
- field: bytes,
- ) -> JBIG2RetentionFlags:
- ref_count = masked_value(REF_COUNT_SHORT_MASK, flags)
- retain_segments = []
- ref_segments = []
-
- if ref_count < REF_COUNT_LONG:
- for bit_pos in range(5):
- retain_segments.append(bit_set(bit_pos, flags))
- else:
- field += self.stream.read(3)
- ref_count = unpack_int(">L", field)
- ref_count = masked_value(REF_COUNT_LONG_MASK, ref_count)
- ret_bytes_count = int(math.ceil((ref_count + 1) / 8))
- for ret_byte_index in range(ret_bytes_count):
- ret_byte = unpack_int(">B", self.stream.read(1))
- for bit_pos in range(7):
- retain_segments.append(bit_set(bit_pos, ret_byte))
-
- seg_num = segment["number"]
- assert isinstance(seg_num, int)
- if seg_num <= 256:
- ref_format = ">B"
- elif seg_num <= 65536:
- ref_format = ">I"
- else:
- ref_format = ">L"
-
- ref_size = calcsize(ref_format)
-
- for ref_index in range(ref_count):
- ref_data = self.stream.read(ref_size)
- ref = unpack_int(ref_format, ref_data)
- ref_segments.append(ref)
-
- return {
- "ref_count": ref_count,
- "retain_segments": retain_segments,
- "ref_segments": ref_segments,
- }
-
- def parse_page_assoc(self, segment: JBIG2Segment, page: int, field: bytes) -> int:
- if cast(JBIG2SegmentFlags, segment["flags"])["page_assoc_long"]:
- field += self.stream.read(3)
- page = unpack_int(">L", field)
- return page
-
- def parse_data_length(
- self,
- segment: JBIG2Segment,
- length: int,
- field: bytes,
- ) -> int:
- if length:
- if (
- cast(JBIG2SegmentFlags, segment["flags"])["type"]
- == SEG_TYPE_IMMEDIATE_GEN_REGION
- ) and (length == DATA_LEN_UNKNOWN):
- raise NotImplementedError(
- "Working with unknown segment length is not implemented yet",
- )
- else:
- segment["raw_data"] = self.stream.read(length)
-
- return length
-
-
-class JBIG2StreamWriter:
- """Write JBIG2 segments to a file in JBIG2 format"""
-
- EMPTY_RETENTION_FLAGS: JBIG2RetentionFlags = {
- "ref_count": 0,
- "ref_segments": cast(List[int], []),
- "retain_segments": cast(List[bool], []),
- }
-
- def __init__(self, stream: BinaryIO) -> None:
- self.stream = stream
-
- def write_segments(
- self,
- segments: Iterable[JBIG2Segment],
- fix_last_page: bool = True,
- ) -> int:
- data_len = 0
- current_page: Optional[int] = None
- seg_num: Optional[int] = None
-
- for segment in segments:
- data = self.encode_segment(segment)
- self.stream.write(data)
- data_len += len(data)
-
- seg_num = cast(Optional[int], segment["number"])
-
- if fix_last_page:
- seg_page = cast(int, segment.get("page_assoc"))
-
- if (
- cast(JBIG2SegmentFlags, segment["flags"])["type"]
- == SEG_TYPE_END_OF_PAGE
- ):
- current_page = None
- elif seg_page:
- current_page = seg_page
-
- if fix_last_page and current_page and (seg_num is not None):
- segment = self.get_eop_segment(seg_num + 1, current_page)
- data = self.encode_segment(segment)
- self.stream.write(data)
- data_len += len(data)
-
- return data_len
-
- def write_file(
- self,
- segments: Iterable[JBIG2Segment],
- fix_last_page: bool = True,
- ) -> int:
- header = FILE_HEADER_ID
- header_flags = FILE_HEAD_FLAG_SEQUENTIAL
- header += pack(">B", header_flags)
- # The embedded JBIG2 files in a PDF always
- # only have one page
- number_of_pages = pack(">L", 1)
- header += number_of_pages
- self.stream.write(header)
- data_len = len(header)
-
- data_len += self.write_segments(segments, fix_last_page)
-
- seg_num = 0
- for segment in segments:
- seg_num = cast(int, segment["number"])
-
- if fix_last_page:
- seg_num_offset = 2
- else:
- seg_num_offset = 1
- eof_segment = self.get_eof_segment(seg_num + seg_num_offset)
- data = self.encode_segment(eof_segment)
-
- self.stream.write(data)
- data_len += len(data)
-
- return data_len
-
- def encode_segment(self, segment: JBIG2Segment) -> bytes:
- data = b""
- for field_format, name in SEG_STRUCT:
- value = segment.get(name)
- encoder = getattr(self, "encode_%s" % name, None)
- if callable(encoder):
- field = encoder(value, segment)
- else:
- field = pack(field_format, value)
- data += field
- return data
-
- def encode_flags(self, value: JBIG2SegmentFlags, segment: JBIG2Segment) -> bytes:
- flags = 0
- if value.get("deferred"):
- flags |= HEADER_FLAG_DEFERRED
-
- if "page_assoc_long" in value:
- flags |= HEADER_FLAG_PAGE_ASSOC_LONG if value["page_assoc_long"] else flags
- else:
- flags |= (
- HEADER_FLAG_PAGE_ASSOC_LONG
- if cast(int, segment.get("page", 0)) > 255
- else flags
- )
-
- flags |= mask_value(SEG_TYPE_MASK, value["type"])
-
- return pack(">B", flags)
-
- def encode_retention_flags(
- self,
- value: JBIG2RetentionFlags,
- segment: JBIG2Segment,
- ) -> bytes:
- flags = []
- flags_format = ">B"
- ref_count = value["ref_count"]
- assert isinstance(ref_count, int)
- retain_segments = cast(List[bool], value.get("retain_segments", []))
-
- if ref_count <= 4:
- flags_byte = mask_value(REF_COUNT_SHORT_MASK, ref_count)
- for ref_index, ref_retain in enumerate(retain_segments):
- if ref_retain:
- flags_byte |= 1 << ref_index
- flags.append(flags_byte)
- else:
- bytes_count = math.ceil((ref_count + 1) / 8)
- flags_format = ">L" + ("B" * bytes_count)
- flags_dword = mask_value(REF_COUNT_SHORT_MASK, REF_COUNT_LONG) << 24
- flags.append(flags_dword)
-
- for byte_index in range(bytes_count):
- ret_byte = 0
- ret_part = retain_segments[byte_index * 8 : byte_index * 8 + 8]
- for bit_pos, ret_seg in enumerate(ret_part):
- ret_byte |= 1 << bit_pos if ret_seg else ret_byte
-
- flags.append(ret_byte)
-
- ref_segments = cast(List[int], value.get("ref_segments", []))
-
- seg_num = cast(int, segment["number"])
- if seg_num <= 256:
- ref_format = "B"
- elif seg_num <= 65536:
- ref_format = "I"
- else:
- ref_format = "L"
-
- for ref in ref_segments:
- flags_format += ref_format
- flags.append(ref)
-
- return pack(flags_format, *flags)
-
- def encode_data_length(self, value: int, segment: JBIG2Segment) -> bytes:
- data = pack(">L", value)
- data += cast(bytes, segment["raw_data"])
- return data
-
- def get_eop_segment(self, seg_number: int, page_number: int) -> JBIG2Segment:
- return {
- "data_length": 0,
- "flags": {"deferred": False, "type": SEG_TYPE_END_OF_PAGE},
- "number": seg_number,
- "page_assoc": page_number,
- "raw_data": b"",
- "retention_flags": JBIG2StreamWriter.EMPTY_RETENTION_FLAGS,
- }
-
- def get_eof_segment(self, seg_number: int) -> JBIG2Segment:
- return {
- "data_length": 0,
- "flags": {"deferred": False, "type": SEG_TYPE_END_OF_FILE},
- "number": seg_number,
- "page_assoc": 0,
- "raw_data": b"",
- "retention_flags": JBIG2StreamWriter.EMPTY_RETENTION_FLAGS,
- }
diff --git a/pdf2zh/latin_enc.py b/pdf2zh/latin_enc.py
deleted file mode 100644
index c5e8305..0000000
--- a/pdf2zh/latin_enc.py
+++ /dev/null
@@ -1,246 +0,0 @@
-"""Standard encoding tables used in PDF.
-
-This table is extracted from PDF Reference Manual 1.6, pp.925
- "D.1 Latin Character Set and Encodings"
-
-"""
-
-from typing import List, Optional, Tuple
-
-EncodingRow = Tuple[str, Optional[int], Optional[int], Optional[int], Optional[int]]
-
-ENCODING: List[EncodingRow] = [
- # (name, std, mac, win, pdf)
- ("A", 65, 65, 65, 65),
- ("AE", 225, 174, 198, 198),
- ("Aacute", None, 231, 193, 193),
- ("Acircumflex", None, 229, 194, 194),
- ("Adieresis", None, 128, 196, 196),
- ("Agrave", None, 203, 192, 192),
- ("Aring", None, 129, 197, 197),
- ("Atilde", None, 204, 195, 195),
- ("B", 66, 66, 66, 66),
- ("C", 67, 67, 67, 67),
- ("Ccedilla", None, 130, 199, 199),
- ("D", 68, 68, 68, 68),
- ("E", 69, 69, 69, 69),
- ("Eacute", None, 131, 201, 201),
- ("Ecircumflex", None, 230, 202, 202),
- ("Edieresis", None, 232, 203, 203),
- ("Egrave", None, 233, 200, 200),
- ("Eth", None, None, 208, 208),
- ("Euro", None, None, 128, 160),
- ("F", 70, 70, 70, 70),
- ("G", 71, 71, 71, 71),
- ("H", 72, 72, 72, 72),
- ("I", 73, 73, 73, 73),
- ("Iacute", None, 234, 205, 205),
- ("Icircumflex", None, 235, 206, 206),
- ("Idieresis", None, 236, 207, 207),
- ("Igrave", None, 237, 204, 204),
- ("J", 74, 74, 74, 74),
- ("K", 75, 75, 75, 75),
- ("L", 76, 76, 76, 76),
- ("Lslash", 232, None, None, 149),
- ("M", 77, 77, 77, 77),
- ("N", 78, 78, 78, 78),
- ("Ntilde", None, 132, 209, 209),
- ("O", 79, 79, 79, 79),
- ("OE", 234, 206, 140, 150),
- ("Oacute", None, 238, 211, 211),
- ("Ocircumflex", None, 239, 212, 212),
- ("Odieresis", None, 133, 214, 214),
- ("Ograve", None, 241, 210, 210),
- ("Oslash", 233, 175, 216, 216),
- ("Otilde", None, 205, 213, 213),
- ("P", 80, 80, 80, 80),
- ("Q", 81, 81, 81, 81),
- ("R", 82, 82, 82, 82),
- ("S", 83, 83, 83, 83),
- ("Scaron", None, None, 138, 151),
- ("T", 84, 84, 84, 84),
- ("Thorn", None, None, 222, 222),
- ("U", 85, 85, 85, 85),
- ("Uacute", None, 242, 218, 218),
- ("Ucircumflex", None, 243, 219, 219),
- ("Udieresis", None, 134, 220, 220),
- ("Ugrave", None, 244, 217, 217),
- ("V", 86, 86, 86, 86),
- ("W", 87, 87, 87, 87),
- ("X", 88, 88, 88, 88),
- ("Y", 89, 89, 89, 89),
- ("Yacute", None, None, 221, 221),
- ("Ydieresis", None, 217, 159, 152),
- ("Z", 90, 90, 90, 90),
- ("Zcaron", None, None, 142, 153),
- ("a", 97, 97, 97, 97),
- ("aacute", None, 135, 225, 225),
- ("acircumflex", None, 137, 226, 226),
- ("acute", 194, 171, 180, 180),
- ("adieresis", None, 138, 228, 228),
- ("ae", 241, 190, 230, 230),
- ("agrave", None, 136, 224, 224),
- ("ampersand", 38, 38, 38, 38),
- ("aring", None, 140, 229, 229),
- ("asciicircum", 94, 94, 94, 94),
- ("asciitilde", 126, 126, 126, 126),
- ("asterisk", 42, 42, 42, 42),
- ("at", 64, 64, 64, 64),
- ("atilde", None, 139, 227, 227),
- ("b", 98, 98, 98, 98),
- ("backslash", 92, 92, 92, 92),
- ("bar", 124, 124, 124, 124),
- ("braceleft", 123, 123, 123, 123),
- ("braceright", 125, 125, 125, 125),
- ("bracketleft", 91, 91, 91, 91),
- ("bracketright", 93, 93, 93, 93),
- ("breve", 198, 249, None, 24),
- ("brokenbar", None, None, 166, 166),
- ("bullet", 183, 165, 149, 128),
- ("c", 99, 99, 99, 99),
- ("caron", 207, 255, None, 25),
- ("ccedilla", None, 141, 231, 231),
- ("cedilla", 203, 252, 184, 184),
- ("cent", 162, 162, 162, 162),
- ("circumflex", 195, 246, 136, 26),
- ("colon", 58, 58, 58, 58),
- ("comma", 44, 44, 44, 44),
- ("copyright", None, 169, 169, 169),
- ("currency", 168, 219, 164, 164),
- ("d", 100, 100, 100, 100),
- ("dagger", 178, 160, 134, 129),
- ("daggerdbl", 179, 224, 135, 130),
- ("degree", None, 161, 176, 176),
- ("dieresis", 200, 172, 168, 168),
- ("divide", None, 214, 247, 247),
- ("dollar", 36, 36, 36, 36),
- ("dotaccent", 199, 250, None, 27),
- ("dotlessi", 245, 245, None, 154),
- ("e", 101, 101, 101, 101),
- ("eacute", None, 142, 233, 233),
- ("ecircumflex", None, 144, 234, 234),
- ("edieresis", None, 145, 235, 235),
- ("egrave", None, 143, 232, 232),
- ("eight", 56, 56, 56, 56),
- ("ellipsis", 188, 201, 133, 131),
- ("emdash", 208, 209, 151, 132),
- ("endash", 177, 208, 150, 133),
- ("equal", 61, 61, 61, 61),
- ("eth", None, None, 240, 240),
- ("exclam", 33, 33, 33, 33),
- ("exclamdown", 161, 193, 161, 161),
- ("f", 102, 102, 102, 102),
- ("fi", 174, 222, None, 147),
- ("five", 53, 53, 53, 53),
- ("fl", 175, 223, None, 148),
- ("florin", 166, 196, 131, 134),
- ("four", 52, 52, 52, 52),
- ("fraction", 164, 218, None, 135),
- ("g", 103, 103, 103, 103),
- ("germandbls", 251, 167, 223, 223),
- ("grave", 193, 96, 96, 96),
- ("greater", 62, 62, 62, 62),
- ("guillemotleft", 171, 199, 171, 171),
- ("guillemotright", 187, 200, 187, 187),
- ("guilsinglleft", 172, 220, 139, 136),
- ("guilsinglright", 173, 221, 155, 137),
- ("h", 104, 104, 104, 104),
- ("hungarumlaut", 205, 253, None, 28),
- ("hyphen", 45, 45, 45, 45),
- ("i", 105, 105, 105, 105),
- ("iacute", None, 146, 237, 237),
- ("icircumflex", None, 148, 238, 238),
- ("idieresis", None, 149, 239, 239),
- ("igrave", None, 147, 236, 236),
- ("j", 106, 106, 106, 106),
- ("k", 107, 107, 107, 107),
- ("l", 108, 108, 108, 108),
- ("less", 60, 60, 60, 60),
- ("logicalnot", None, 194, 172, 172),
- ("lslash", 248, None, None, 155),
- ("m", 109, 109, 109, 109),
- ("macron", 197, 248, 175, 175),
- ("minus", None, None, None, 138),
- ("mu", None, 181, 181, 181),
- ("multiply", None, None, 215, 215),
- ("n", 110, 110, 110, 110),
- ("nbspace", None, 202, 160, None),
- ("nine", 57, 57, 57, 57),
- ("ntilde", None, 150, 241, 241),
- ("numbersign", 35, 35, 35, 35),
- ("o", 111, 111, 111, 111),
- ("oacute", None, 151, 243, 243),
- ("ocircumflex", None, 153, 244, 244),
- ("odieresis", None, 154, 246, 246),
- ("oe", 250, 207, 156, 156),
- ("ogonek", 206, 254, None, 29),
- ("ograve", None, 152, 242, 242),
- ("one", 49, 49, 49, 49),
- ("onehalf", None, None, 189, 189),
- ("onequarter", None, None, 188, 188),
- ("onesuperior", None, None, 185, 185),
- ("ordfeminine", 227, 187, 170, 170),
- ("ordmasculine", 235, 188, 186, 186),
- ("oslash", 249, 191, 248, 248),
- ("otilde", None, 155, 245, 245),
- ("p", 112, 112, 112, 112),
- ("paragraph", 182, 166, 182, 182),
- ("parenleft", 40, 40, 40, 40),
- ("parenright", 41, 41, 41, 41),
- ("percent", 37, 37, 37, 37),
- ("period", 46, 46, 46, 46),
- ("periodcentered", 180, 225, 183, 183),
- ("perthousand", 189, 228, 137, 139),
- ("plus", 43, 43, 43, 43),
- ("plusminus", None, 177, 177, 177),
- ("q", 113, 113, 113, 113),
- ("question", 63, 63, 63, 63),
- ("questiondown", 191, 192, 191, 191),
- ("quotedbl", 34, 34, 34, 34),
- ("quotedblbase", 185, 227, 132, 140),
- ("quotedblleft", 170, 210, 147, 141),
- ("quotedblright", 186, 211, 148, 142),
- ("quoteleft", 96, 212, 145, 143),
- ("quoteright", 39, 213, 146, 144),
- ("quotesinglbase", 184, 226, 130, 145),
- ("quotesingle", 169, 39, 39, 39),
- ("r", 114, 114, 114, 114),
- ("registered", None, 168, 174, 174),
- ("ring", 202, 251, None, 30),
- ("s", 115, 115, 115, 115),
- ("scaron", None, None, 154, 157),
- ("section", 167, 164, 167, 167),
- ("semicolon", 59, 59, 59, 59),
- ("seven", 55, 55, 55, 55),
- ("six", 54, 54, 54, 54),
- ("slash", 47, 47, 47, 47),
- ("space", 32, 32, 32, 32),
- ("space", None, 202, 160, None),
- ("space", None, 202, 173, None),
- ("sterling", 163, 163, 163, 163),
- ("t", 116, 116, 116, 116),
- ("thorn", None, None, 254, 254),
- ("three", 51, 51, 51, 51),
- ("threequarters", None, None, 190, 190),
- ("threesuperior", None, None, 179, 179),
- ("tilde", 196, 247, 152, 31),
- ("trademark", None, 170, 153, 146),
- ("two", 50, 50, 50, 50),
- ("twosuperior", None, None, 178, 178),
- ("u", 117, 117, 117, 117),
- ("uacute", None, 156, 250, 250),
- ("ucircumflex", None, 158, 251, 251),
- ("udieresis", None, 159, 252, 252),
- ("ugrave", None, 157, 249, 249),
- ("underscore", 95, 95, 95, 95),
- ("v", 118, 118, 118, 118),
- ("w", 119, 119, 119, 119),
- ("x", 120, 120, 120, 120),
- ("y", 121, 121, 121, 121),
- ("yacute", None, None, 253, 253),
- ("ydieresis", None, 216, 255, 255),
- ("yen", 165, 180, 165, 165),
- ("z", 122, 122, 122, 122),
- ("zcaron", None, None, 158, 158),
- ("zero", 48, 48, 48, 48),
-]
diff --git a/pdf2zh/layout.py b/pdf2zh/layout.py
deleted file mode 100644
index 0920856..0000000
--- a/pdf2zh/layout.py
+++ /dev/null
@@ -1,993 +0,0 @@
-import heapq
-import logging
-from typing import (
- Dict,
- Generic,
- Iterable,
- Iterator,
- List,
- Optional,
- Sequence,
- Set,
- Tuple,
- TypeVar,
- Union,
- cast,
-)
-
-from pdf2zh.pdfcolor import PDFColorSpace
-from pdf2zh.pdfexceptions import PDFTypeError, PDFValueError
-from pdf2zh.pdffont import PDFFont
-from pdf2zh.pdfinterp import Color, PDFGraphicState
-from pdf2zh.pdftypes import PDFStream
-from pdf2zh.utils import (
- INF,
- LTComponentT,
- Matrix,
- PathSegment,
- Plane,
- Point,
- Rect,
- apply_matrix_pt,
- bbox2str,
- fsplit,
- get_bound,
- matrix2str,
- uniq,
-)
-
-logger = logging.getLogger(__name__)
-
-
-class IndexAssigner:
- def __init__(self, index: int = 0) -> None:
- self.index = index
-
- def run(self, obj: "LTItem") -> None:
- if isinstance(obj, LTTextBox):
- obj.index = self.index
- self.index += 1
- elif isinstance(obj, LTTextGroup):
- for x in obj:
- self.run(x)
-
-
-class LAParams:
- """Parameters for layout analysis
-
- :param line_overlap: If two characters have more overlap than this they
- are considered to be on the same line. The overlap is specified
- relative to the minimum height of both characters.
- :param char_margin: If two characters are closer together than this
- margin they are considered part of the same line. The margin is
- specified relative to the width of the character.
- :param word_margin: If two characters on the same line are further apart
- than this margin then they are considered to be two separate words, and
- an intermediate space will be added for readability. The margin is
- specified relative to the width of the character.
- :param line_margin: If two lines are are close together they are
- considered to be part of the same paragraph. The margin is
- specified relative to the height of a line.
- :param boxes_flow: Specifies how much a horizontal and vertical position
- of a text matters when determining the order of text boxes. The value
- should be within the range of -1.0 (only horizontal position
- matters) to +1.0 (only vertical position matters). You can also pass
- `None` to disable advanced layout analysis, and instead return text
- based on the position of the bottom left corner of the text box.
- :param detect_vertical: If vertical text should be considered during
- layout analysis
- :param all_texts: If layout analysis should be performed on text in
- figures.
- """
-
- def __init__(
- self,
- line_overlap: float = 0.5,
- char_margin: float = 2.0,
- line_margin: float = 0.5,
- word_margin: float = 0.1,
- boxes_flow: Optional[float] = 0.5,
- detect_vertical: bool = False,
- all_texts: bool = False,
- ) -> None:
- self.line_overlap = line_overlap
- self.char_margin = char_margin
- self.line_margin = line_margin
- self.word_margin = word_margin
- self.boxes_flow = boxes_flow
- self.detect_vertical = detect_vertical
- self.all_texts = all_texts
-
- self._validate()
-
- def _validate(self) -> None:
- if self.boxes_flow is not None:
- boxes_flow_err_msg = (
- "LAParam boxes_flow should be None, or a number between -1 and +1"
- )
- if not (
- isinstance(self.boxes_flow, int) or isinstance(self.boxes_flow, float)
- ):
- raise PDFTypeError(boxes_flow_err_msg)
- if not -1 <= self.boxes_flow <= 1:
- raise PDFValueError(boxes_flow_err_msg)
-
- def __repr__(self) -> str:
- return (
- ""
- % (self.char_margin, self.line_margin, self.word_margin, self.all_texts)
- )
-
-
-class LTItem:
- """Interface for things that can be analyzed"""
-
- def analyze(self, laparams: LAParams) -> None:
- """Perform the layout analysis."""
-
-
-class LTText:
- """Interface for things that have text"""
-
- def __repr__(self) -> str:
- return f"<{self.__class__.__name__} {self.get_text()!r}>"
-
- def get_text(self) -> str:
- """Text contained in this object"""
- raise NotImplementedError
-
-
-class LTComponent(LTItem):
- """Object with a bounding box"""
-
- def __init__(self, bbox: Rect) -> None:
- LTItem.__init__(self)
- self.set_bbox(bbox)
-
- def __repr__(self) -> str:
- return f"<{self.__class__.__name__} {bbox2str(self.bbox)}>"
-
- # Disable comparison.
- def __lt__(self, _: object) -> bool:
- raise PDFValueError
-
- def __le__(self, _: object) -> bool:
- raise PDFValueError
-
- def __gt__(self, _: object) -> bool:
- raise PDFValueError
-
- def __ge__(self, _: object) -> bool:
- raise PDFValueError
-
- def set_bbox(self, bbox: Rect) -> None:
- (x0, y0, x1, y1) = bbox
- self.x0 = x0
- self.y0 = y0
- self.x1 = x1
- self.y1 = y1
- self.width = x1 - x0
- self.height = y1 - y0
- self.bbox = bbox
-
- def is_empty(self) -> bool:
- return self.width <= 0 or self.height <= 0
-
- def is_hoverlap(self, obj: "LTComponent") -> bool:
- assert isinstance(obj, LTComponent), str(type(obj))
- return obj.x0 <= self.x1 and self.x0 <= obj.x1
-
- def hdistance(self, obj: "LTComponent") -> float:
- assert isinstance(obj, LTComponent), str(type(obj))
- if self.is_hoverlap(obj):
- return 0
- else:
- return min(abs(self.x0 - obj.x1), abs(self.x1 - obj.x0))
-
- def hoverlap(self, obj: "LTComponent") -> float:
- assert isinstance(obj, LTComponent), str(type(obj))
- if self.is_hoverlap(obj):
- return min(abs(self.x0 - obj.x1), abs(self.x1 - obj.x0))
- else:
- return 0
-
- def is_voverlap(self, obj: "LTComponent") -> bool:
- assert isinstance(obj, LTComponent), str(type(obj))
- return obj.y0 <= self.y1 and self.y0 <= obj.y1
-
- def vdistance(self, obj: "LTComponent") -> float:
- assert isinstance(obj, LTComponent), str(type(obj))
- if self.is_voverlap(obj):
- return 0
- else:
- return min(abs(self.y0 - obj.y1), abs(self.y1 - obj.y0))
-
- def voverlap(self, obj: "LTComponent") -> float:
- assert isinstance(obj, LTComponent), str(type(obj))
- if self.is_voverlap(obj):
- return min(abs(self.y0 - obj.y1), abs(self.y1 - obj.y0))
- else:
- return 0
-
-
-class LTCurve(LTComponent):
- """A generic Bezier curve
-
- The parameter `original_path` contains the original
- pathing information from the pdf (e.g. for reconstructing Bezier Curves).
-
- `dashing_style` contains the Dashing information if any.
- """
-
- def __init__(
- self,
- linewidth: float,
- pts: List[Point],
- stroke: bool = False,
- fill: bool = False,
- evenodd: bool = False,
- stroking_color: Optional[Color] = None,
- non_stroking_color: Optional[Color] = None,
- original_path: Optional[List[PathSegment]] = None,
- dashing_style: Optional[Tuple[object, object]] = None,
- ) -> None:
- LTComponent.__init__(self, get_bound(pts))
- self.pts = pts
- self.linewidth = linewidth
- self.stroke = stroke
- self.fill = fill
- self.evenodd = evenodd
- self.stroking_color = stroking_color
- self.non_stroking_color = non_stroking_color
- self.original_path = original_path
- self.dashing_style = dashing_style
-
- def get_pts(self) -> str:
- return ",".join("%.3f,%.3f" % p for p in self.pts)
-
-
-class LTLine(LTCurve):
- """A single straight line.
-
- Could be used for separating text or figures.
- """
-
- def __init__(
- self,
- linewidth: float,
- p0: Point,
- p1: Point,
- stroke: bool = False,
- fill: bool = False,
- evenodd: bool = False,
- stroking_color: Optional[Color] = None,
- non_stroking_color: Optional[Color] = None,
- original_path: Optional[List[PathSegment]] = None,
- dashing_style: Optional[Tuple[object, object]] = None,
- ) -> None:
- LTCurve.__init__(
- self,
- linewidth,
- [p0, p1],
- stroke,
- fill,
- evenodd,
- stroking_color,
- non_stroking_color,
- original_path,
- dashing_style,
- )
-
-
-class LTRect(LTCurve):
- """A rectangle.
-
- Could be used for framing another pictures or figures.
- """
-
- def __init__(
- self,
- linewidth: float,
- bbox: Rect,
- stroke: bool = False,
- fill: bool = False,
- evenodd: bool = False,
- stroking_color: Optional[Color] = None,
- non_stroking_color: Optional[Color] = None,
- original_path: Optional[List[PathSegment]] = None,
- dashing_style: Optional[Tuple[object, object]] = None,
- ) -> None:
- (x0, y0, x1, y1) = bbox
- LTCurve.__init__(
- self,
- linewidth,
- [(x0, y0), (x1, y0), (x1, y1), (x0, y1)],
- stroke,
- fill,
- evenodd,
- stroking_color,
- non_stroking_color,
- original_path,
- dashing_style,
- )
-
-
-class LTImage(LTComponent):
- """An image object.
-
- Embedded images can be in JPEG, Bitmap or JBIG2.
- """
-
- def __init__(self, name: str, stream: PDFStream, bbox: Rect) -> None:
- LTComponent.__init__(self, bbox)
- self.name = name
- self.stream = stream
- self.srcsize = (stream.get_any(("W", "Width")), stream.get_any(("H", "Height")))
- self.imagemask = stream.get_any(("IM", "ImageMask"))
- self.bits = stream.get_any(("BPC", "BitsPerComponent"), 1)
- self.colorspace = stream.get_any(("CS", "ColorSpace"))
- if not isinstance(self.colorspace, list):
- self.colorspace = [self.colorspace]
-
- def __repr__(self) -> str:
- return f"<{self.__class__.__name__}({self.name}) {bbox2str(self.bbox)} {self.srcsize!r}>"
-
-
-class LTAnno(LTItem, LTText):
- """Actual letter in the text as a Unicode string.
-
- Note that, while a LTChar object has actual boundaries, LTAnno objects does
- not, as these are "virtual" characters, inserted by a layout analyzer
- according to the relationship between two characters (e.g. a space).
- """
-
- def __init__(self, text: str) -> None:
- self._text = text
-
- def get_text(self) -> str:
- return self._text
-
-
-class LTChar(LTComponent, LTText):
- """Actual letter in the text as a Unicode string."""
-
- def __init__(
- self,
- matrix: Matrix,
- font: PDFFont,
- fontsize: float,
- scaling: float,
- rise: float,
- text: str,
- textwidth: float,
- textdisp: Union[float, Tuple[Optional[float], float]],
- ncs: PDFColorSpace,
- graphicstate: PDFGraphicState,
- ) -> None:
- LTText.__init__(self)
- self._text = text
- self.matrix = matrix
- self.font = font
- self.fontname = font.fontname
- self.ncs = ncs
- self.graphicstate = graphicstate
- self.adv = textwidth * fontsize * scaling
- # compute the boundary rectangle.
- if font.is_vertical():
- # vertical
- assert isinstance(textdisp, tuple)
- (vx, vy) = textdisp
- if vx is None:
- vx = fontsize * 0.5
- else:
- vx = vx * fontsize * 0.001
- vy = (1000 - vy) * fontsize * 0.001
- bbox_lower_left = (-vx, vy + rise + self.adv)
- bbox_upper_right = (-vx + fontsize, vy + rise)
- else:
- # horizontal
- descent = 0 # descent = font.get_descent() * fontsize
- bbox_lower_left = (0, descent + rise)
- bbox_upper_right = (self.adv, descent + rise + fontsize)
- (a, b, c, d, e, f) = self.matrix
- self.upright = a * d * scaling > 0 and b * c <= 0
- (x0, y0) = apply_matrix_pt(self.matrix, bbox_lower_left)
- (x1, y1) = apply_matrix_pt(self.matrix, bbox_upper_right)
- if x1 < x0:
- (x0, x1) = (x1, x0)
- if y1 < y0:
- (y0, y1) = (y1, y0)
- LTComponent.__init__(self, (x0, y0, x1, y1))
- if font.is_vertical():
- self.size = self.width
- else:
- self.size = self.height
-
- def __repr__(self) -> str:
- return "<{} {} matrix={} font={} adv={} text={}>".format(
- self.__class__.__name__,
- bbox2str(self.bbox),
- matrix2str(self.matrix),
- repr(self.fontname),
- self.adv,
- repr(self.get_text()),
- )
-
- def get_text(self) -> str:
- return self._text
-
-
-LTItemT = TypeVar("LTItemT", bound=LTItem)
-
-
-class LTContainer(LTComponent, Generic[LTItemT]):
- """Object that can be extended and analyzed"""
-
- def __init__(self, bbox: Rect) -> None:
- LTComponent.__init__(self, bbox)
- self._objs: List[LTItemT] = []
-
- def __iter__(self) -> Iterator[LTItemT]:
- return iter(self._objs)
-
- def __len__(self) -> int:
- return len(self._objs)
-
- def add(self, obj: LTItemT) -> None:
- self._objs.append(obj)
-
- def extend(self, objs: Iterable[LTItemT]) -> None:
- for obj in objs:
- self.add(obj)
-
- def analyze(self, laparams: LAParams) -> None:
- for obj in self._objs:
- obj.analyze(laparams)
-
-
-class LTExpandableContainer(LTContainer[LTItemT]):
- def __init__(self) -> None:
- LTContainer.__init__(self, (+INF, +INF, -INF, -INF))
-
- # Incompatible override: we take an LTComponent (with bounding box), but
- # super() LTContainer only considers LTItem (no bounding box).
- def add(self, obj: LTComponent) -> None: # type: ignore[override]
- LTContainer.add(self, cast(LTItemT, obj))
- self.set_bbox(
- (
- min(self.x0, obj.x0),
- min(self.y0, obj.y0),
- max(self.x1, obj.x1),
- max(self.y1, obj.y1),
- ),
- )
-
-
-class LTTextContainer(LTExpandableContainer[LTItemT], LTText):
- def __init__(self) -> None:
- LTText.__init__(self)
- LTExpandableContainer.__init__(self)
-
- def get_text(self) -> str:
- return "".join(
- cast(LTText, obj).get_text() for obj in self if isinstance(obj, LTText)
- )
-
-
-TextLineElement = Union[LTChar, LTAnno]
-
-
-class LTTextLine(LTTextContainer[TextLineElement]):
- """Contains a list of LTChar objects that represent a single text line.
-
- The characters are aligned either horizontally or vertically, depending on
- the text's writing mode.
- """
-
- def __init__(self, word_margin: float) -> None:
- super().__init__()
- self.word_margin = word_margin
-
- def __repr__(self) -> str:
- return f"<{self.__class__.__name__} {bbox2str(self.bbox)} {self.get_text()!r}>"
-
- def analyze(self, laparams: LAParams) -> None:
- for obj in self._objs:
- obj.analyze(laparams)
- LTContainer.add(self, LTAnno("\n"))
-
- def find_neighbors(
- self,
- plane: Plane[LTComponentT],
- ratio: float,
- ) -> List["LTTextLine"]:
- raise NotImplementedError
-
- def is_empty(self) -> bool:
- return super().is_empty() or self.get_text().isspace()
-
-
-class LTTextLineHorizontal(LTTextLine):
- def __init__(self, word_margin: float) -> None:
- LTTextLine.__init__(self, word_margin)
- self._x1: float = +INF
-
- # Incompatible override: we take an LTComponent (with bounding box), but
- # LTContainer only considers LTItem (no bounding box).
- def add(self, obj: LTComponent) -> None: # type: ignore[override]
- if isinstance(obj, LTChar) and self.word_margin:
- margin = self.word_margin * max(obj.width, obj.height)
- if self._x1 < obj.x0 - margin:
- LTContainer.add(self, LTAnno(" "))
- self._x1 = obj.x1
- super().add(obj)
-
- def find_neighbors(
- self,
- plane: Plane[LTComponentT],
- ratio: float,
- ) -> List[LTTextLine]:
- """Finds neighboring LTTextLineHorizontals in the plane.
-
- Returns a list of other LTTestLineHorizontals in the plane which are
- close to self. "Close" can be controlled by ratio. The returned objects
- will be the same height as self, and also either left-, right-, or
- centrally-aligned.
- """
- d = ratio * self.height
- objs = plane.find((self.x0, self.y0 - d, self.x1, self.y1 + d))
- return [
- obj
- for obj in objs
- if (
- isinstance(obj, LTTextLineHorizontal)
- and self._is_same_height_as(obj, tolerance=d)
- and (
- self._is_left_aligned_with(obj, tolerance=d)
- or self._is_right_aligned_with(obj, tolerance=d)
- or self._is_centrally_aligned_with(obj, tolerance=d)
- )
- )
- ]
-
- def _is_left_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
- """Whether the left-hand edge of `other` is within `tolerance`."""
- return abs(other.x0 - self.x0) <= tolerance
-
- def _is_right_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
- """Whether the right-hand edge of `other` is within `tolerance`."""
- return abs(other.x1 - self.x1) <= tolerance
-
- def _is_centrally_aligned_with(
- self,
- other: LTComponent,
- tolerance: float = 0,
- ) -> bool:
- """Whether the horizontal center of `other` is within `tolerance`."""
- return abs((other.x0 + other.x1) / 2 - (self.x0 + self.x1) / 2) <= tolerance
-
- def _is_same_height_as(self, other: LTComponent, tolerance: float = 0) -> bool:
- return abs(other.height - self.height) <= tolerance
-
-
-class LTTextLineVertical(LTTextLine):
- def __init__(self, word_margin: float) -> None:
- LTTextLine.__init__(self, word_margin)
- self._y0: float = -INF
-
- # Incompatible override: we take an LTComponent (with bounding box), but
- # LTContainer only considers LTItem (no bounding box).
- def add(self, obj: LTComponent) -> None: # type: ignore[override]
- if isinstance(obj, LTChar) and self.word_margin:
- margin = self.word_margin * max(obj.width, obj.height)
- if obj.y1 + margin < self._y0:
- LTContainer.add(self, LTAnno(" "))
- self._y0 = obj.y0
- super().add(obj)
-
- def find_neighbors(
- self,
- plane: Plane[LTComponentT],
- ratio: float,
- ) -> List[LTTextLine]:
- """Finds neighboring LTTextLineVerticals in the plane.
-
- Returns a list of other LTTextLineVerticals in the plane which are
- close to self. "Close" can be controlled by ratio. The returned objects
- will be the same width as self, and also either upper-, lower-, or
- centrally-aligned.
- """
- d = ratio * self.width
- objs = plane.find((self.x0 - d, self.y0, self.x1 + d, self.y1))
- return [
- obj
- for obj in objs
- if (
- isinstance(obj, LTTextLineVertical)
- and self._is_same_width_as(obj, tolerance=d)
- and (
- self._is_lower_aligned_with(obj, tolerance=d)
- or self._is_upper_aligned_with(obj, tolerance=d)
- or self._is_centrally_aligned_with(obj, tolerance=d)
- )
- )
- ]
-
- def _is_lower_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
- """Whether the lower edge of `other` is within `tolerance`."""
- return abs(other.y0 - self.y0) <= tolerance
-
- def _is_upper_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
- """Whether the upper edge of `other` is within `tolerance`."""
- return abs(other.y1 - self.y1) <= tolerance
-
- def _is_centrally_aligned_with(
- self,
- other: LTComponent,
- tolerance: float = 0,
- ) -> bool:
- """Whether the vertical center of `other` is within `tolerance`."""
- return abs((other.y0 + other.y1) / 2 - (self.y0 + self.y1) / 2) <= tolerance
-
- def _is_same_width_as(self, other: LTComponent, tolerance: float) -> bool:
- return abs(other.width - self.width) <= tolerance
-
-
-class LTTextBox(LTTextContainer[LTTextLine]):
- """Represents a group of text chunks in a rectangular area.
-
- Note that this box is created by geometric analysis and does not
- necessarily represents a logical boundary of the text. It contains a list
- of LTTextLine objects.
- """
-
- def __init__(self) -> None:
- LTTextContainer.__init__(self)
- self.index: int = -1
-
- def __repr__(self) -> str:
- return f"<{self.__class__.__name__}({self.index}) {bbox2str(self.bbox)} {self.get_text()!r}>"
-
- def get_writing_mode(self) -> str:
- raise NotImplementedError
-
-
-class LTTextBoxHorizontal(LTTextBox):
- def analyze(self, laparams: LAParams) -> None:
- super().analyze(laparams)
- self._objs.sort(key=lambda obj: -obj.y1)
-
- def get_writing_mode(self) -> str:
- return "lr-tb"
-
-
-class LTTextBoxVertical(LTTextBox):
- def analyze(self, laparams: LAParams) -> None:
- super().analyze(laparams)
- self._objs.sort(key=lambda obj: -obj.x1)
-
- def get_writing_mode(self) -> str:
- return "tb-rl"
-
-
-TextGroupElement = Union[LTTextBox, "LTTextGroup"]
-
-
-class LTTextGroup(LTTextContainer[TextGroupElement]):
- def __init__(self, objs: Iterable[TextGroupElement]) -> None:
- super().__init__()
- self.extend(objs)
-
-
-class LTTextGroupLRTB(LTTextGroup):
- def analyze(self, laparams: LAParams) -> None:
- super().analyze(laparams)
- assert laparams.boxes_flow is not None
- boxes_flow = laparams.boxes_flow
- # reorder the objects from top-left to bottom-right.
- self._objs.sort(
- key=lambda obj: (1 - boxes_flow) * obj.x0
- - (1 + boxes_flow) * (obj.y0 + obj.y1),
- )
-
-
-class LTTextGroupTBRL(LTTextGroup):
- def analyze(self, laparams: LAParams) -> None:
- super().analyze(laparams)
- assert laparams.boxes_flow is not None
- boxes_flow = laparams.boxes_flow
- # reorder the objects from top-right to bottom-left.
- self._objs.sort(
- key=lambda obj: -(1 + boxes_flow) * (obj.x0 + obj.x1)
- - (1 - boxes_flow) * obj.y1,
- )
-
-
-class LTLayoutContainer(LTContainer[LTComponent]):
- def __init__(self, bbox: Rect) -> None:
- LTContainer.__init__(self, bbox)
- self.groups: Optional[List[LTTextGroup]] = None
-
- # group_objects: group text object to textlines.
- def group_objects(
- self,
- laparams: LAParams,
- objs: Iterable[LTComponent],
- ) -> Iterator[LTTextLine]:
- obj0 = None
- line = None
- for obj1 in objs:
- if obj0 is not None:
- # halign: obj0 and obj1 is horizontally aligned.
- #
- # +------+ - - -
- # | obj0 | - - +------+ -
- # | | | obj1 | | (line_overlap)
- # +------+ - - | | -
- # - - - +------+
- #
- # |<--->|
- # (char_margin)
- halign = (
- obj0.is_voverlap(obj1)
- and min(obj0.height, obj1.height) * laparams.line_overlap
- < obj0.voverlap(obj1)
- and obj0.hdistance(obj1)
- < max(obj0.width, obj1.width) * laparams.char_margin
- )
-
- # valign: obj0 and obj1 is vertically aligned.
- #
- # +------+
- # | obj0 |
- # | |
- # +------+ - - -
- # | | | (char_margin)
- # +------+ - -
- # | obj1 |
- # | |
- # +------+
- #
- # |<-->|
- # (line_overlap)
- valign = (
- laparams.detect_vertical
- and obj0.is_hoverlap(obj1)
- and min(obj0.width, obj1.width) * laparams.line_overlap
- < obj0.hoverlap(obj1)
- and obj0.vdistance(obj1)
- < max(obj0.height, obj1.height) * laparams.char_margin
- )
-
- if (halign and isinstance(line, LTTextLineHorizontal)) or (
- valign and isinstance(line, LTTextLineVertical)
- ):
- line.add(obj1)
- elif line is not None:
- yield line
- line = None
- elif valign and not halign:
- line = LTTextLineVertical(laparams.word_margin)
- line.add(obj0)
- line.add(obj1)
- elif halign and not valign:
- line = LTTextLineHorizontal(laparams.word_margin)
- line.add(obj0)
- line.add(obj1)
- else:
- line = LTTextLineHorizontal(laparams.word_margin)
- line.add(obj0)
- yield line
- line = None
- obj0 = obj1
- if line is None:
- line = LTTextLineHorizontal(laparams.word_margin)
- assert obj0 is not None
- line.add(obj0)
- yield line
-
- def group_textlines(
- self,
- laparams: LAParams,
- lines: Iterable[LTTextLine],
- ) -> Iterator[LTTextBox]:
- """Group neighboring lines to textboxes"""
- plane: Plane[LTTextLine] = Plane(self.bbox)
- plane.extend(lines)
- boxes: Dict[LTTextLine, LTTextBox] = {}
- for line in lines:
- neighbors = line.find_neighbors(plane, laparams.line_margin)
- members = [line]
- for obj1 in neighbors:
- members.append(obj1)
- if obj1 in boxes:
- members.extend(boxes.pop(obj1))
- if isinstance(line, LTTextLineHorizontal):
- box: LTTextBox = LTTextBoxHorizontal()
- else:
- box = LTTextBoxVertical()
- for obj in uniq(members):
- box.add(obj)
- boxes[obj] = box
- done = set()
- for line in lines:
- if line not in boxes:
- continue
- box = boxes[line]
- if box in done:
- continue
- done.add(box)
- if not box.is_empty():
- yield box
-
- def group_textboxes(
- self,
- laparams: LAParams,
- boxes: Sequence[LTTextBox],
- ) -> List[LTTextGroup]:
- """Group textboxes hierarchically.
-
- Get pair-wise distances, via dist func defined below, and then merge
- from the closest textbox pair. Once obj1 and obj2 are merged /
- grouped, the resulting group is considered as a new object, and its
- distances to other objects & groups are added to the process queue.
-
- For performance reason, pair-wise distances and object pair info are
- maintained in a heap of (idx, dist, id(obj1), id(obj2), obj1, obj2)
- tuples. It ensures quick access to the smallest element. Note that
- since comparison operators, e.g., __lt__, are disabled for
- LTComponent, id(obj) has to appear before obj in element tuples.
-
- :param laparams: LAParams object.
- :param boxes: All textbox objects to be grouped.
- :return: a list that has only one element, the final top level group.
- """
- ElementT = Union[LTTextBox, LTTextGroup]
- plane: Plane[ElementT] = Plane(self.bbox)
-
- def dist(obj1: LTComponent, obj2: LTComponent) -> float:
- """A distance function between two TextBoxes.
-
- Consider the bounding rectangle for obj1 and obj2.
- Return its area less the areas of obj1 and obj2,
- shown as 'www' below. This value may be negative.
- +------+..........+ (x1, y1)
- | obj1 |wwwwwwwwww:
- +------+www+------+
- :wwwwwwwwww| obj2 |
- (x0, y0) +..........+------+
- """
- x0 = min(obj1.x0, obj2.x0)
- y0 = min(obj1.y0, obj2.y0)
- x1 = max(obj1.x1, obj2.x1)
- y1 = max(obj1.y1, obj2.y1)
- return (
- (x1 - x0) * (y1 - y0)
- - obj1.width * obj1.height
- - obj2.width * obj2.height
- )
-
- def isany(obj1: ElementT, obj2: ElementT) -> Set[ElementT]:
- """Check if there's any other object between obj1 and obj2."""
- x0 = min(obj1.x0, obj2.x0)
- y0 = min(obj1.y0, obj2.y0)
- x1 = max(obj1.x1, obj2.x1)
- y1 = max(obj1.y1, obj2.y1)
- objs = set(plane.find((x0, y0, x1, y1)))
- return objs.difference((obj1, obj2))
-
- dists: List[Tuple[bool, float, int, int, ElementT, ElementT]] = []
- for i in range(len(boxes)):
- box1 = boxes[i]
- for j in range(i + 1, len(boxes)):
- box2 = boxes[j]
- dists.append((False, dist(box1, box2), id(box1), id(box2), box1, box2))
- heapq.heapify(dists)
-
- plane.extend(boxes)
- done = set()
- while len(dists) > 0:
- (skip_isany, d, id1, id2, obj1, obj2) = heapq.heappop(dists)
- # Skip objects that are already merged
- if (id1 not in done) and (id2 not in done):
- if not skip_isany and isany(obj1, obj2):
- heapq.heappush(dists, (True, d, id1, id2, obj1, obj2))
- continue
- if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or isinstance(
- obj2,
- (LTTextBoxVertical, LTTextGroupTBRL),
- ):
- group: LTTextGroup = LTTextGroupTBRL([obj1, obj2])
- else:
- group = LTTextGroupLRTB([obj1, obj2])
- plane.remove(obj1)
- plane.remove(obj2)
- done.update([id1, id2])
-
- for other in plane:
- heapq.heappush(
- dists,
- (False, dist(group, other), id(group), id(other), group, other),
- )
- plane.add(group)
- # By now only groups are in the plane
- return list(cast(LTTextGroup, g) for g in plane)
-
- def analyze(self, laparams: LAParams) -> None:
- # textobjs is a list of LTChar objects, i.e.
- # it has all the individual characters in the page.
- (textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self)
- for obj in otherobjs:
- obj.analyze(laparams)
- if not textobjs:
- return
- textlines = list(self.group_objects(laparams, textobjs))
- (empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines)
- for obj in empties:
- obj.analyze(laparams)
- textboxes = list(self.group_textlines(laparams, textlines))
- if laparams.boxes_flow is None:
- for textbox in textboxes:
- textbox.analyze(laparams)
-
- def getkey(box: LTTextBox) -> Tuple[int, float, float]:
- if isinstance(box, LTTextBoxVertical):
- return (0, -box.x1, -box.y0)
- else:
- return (1, -box.y0, box.x0)
-
- textboxes.sort(key=getkey)
- else:
- self.groups = self.group_textboxes(laparams, textboxes)
- assigner = IndexAssigner()
- for group in self.groups:
- group.analyze(laparams)
- assigner.run(group)
- textboxes.sort(key=lambda box: box.index)
- self._objs = (
- cast(List[LTComponent], textboxes)
- + otherobjs
- + cast(List[LTComponent], empties)
- )
-
-
-class LTFigure(LTLayoutContainer):
- """Represents an area used by PDF Form objects.
-
- PDF Forms can be used to present figures or pictures by embedding yet
- another PDF document within a page. Note that LTFigure objects can appear
- recursively.
- """
-
- def __init__(self, name: str, bbox: Rect, matrix: Matrix) -> None:
- self.name = name
- self.matrix = matrix
- (x, y, w, h) = bbox
- bounds = ((x, y), (x + w, y), (x, y + h), (x + w, y + h))
- bbox = get_bound(apply_matrix_pt(matrix, (p, q)) for (p, q) in bounds)
- LTLayoutContainer.__init__(self, bbox)
-
- def __repr__(self) -> str:
- return f"<{self.__class__.__name__}({self.name}) {bbox2str(self.bbox)} matrix={matrix2str(self.matrix)}>"
-
- def analyze(self, laparams: LAParams) -> None:
- if not laparams.all_texts:
- return
- LTLayoutContainer.analyze(self, laparams)
-
-
-class LTPage(LTLayoutContainer):
- """Represents an entire page.
-
- Like any other LTLayoutContainer, an LTPage can be iterated to obtain child
- objects like LTTextBox, LTFigure, LTImage, LTRect, LTCurve and LTLine.
- """
-
- def __init__(self, pageid: int, bbox: Rect, rotate: float = 0) -> None:
- LTLayoutContainer.__init__(self, bbox)
- self.pageid = pageid
- self.rotate = rotate
-
- def __repr__(self) -> str:
- return f"<{self.__class__.__name__}({self.pageid!r}) {bbox2str(self.bbox)} rotate={self.rotate!r}>"
diff --git a/pdf2zh/lzw.py b/pdf2zh/lzw.py
deleted file mode 100644
index 82a4941..0000000
--- a/pdf2zh/lzw.py
+++ /dev/null
@@ -1,105 +0,0 @@
-import logging
-from io import BytesIO
-from typing import BinaryIO, Iterator, List, Optional, cast
-
-from pdf2zh.pdfexceptions import PDFEOFError, PDFException
-
-logger = logging.getLogger(__name__)
-
-
-class CorruptDataError(PDFException):
- pass
-
-
-class LZWDecoder:
- def __init__(self, fp: BinaryIO) -> None:
- self.fp = fp
- self.buff = 0
- self.bpos = 8
- self.nbits = 9
- # NB: self.table stores None only in indices 256 and 257
- self.table: List[Optional[bytes]] = []
- self.prevbuf: Optional[bytes] = None
-
- def readbits(self, bits: int) -> int:
- v = 0
- while 1:
- # the number of remaining bits we can get from the current buffer.
- r = 8 - self.bpos
- if bits <= r:
- # |-----8-bits-----|
- # |-bpos-|-bits-| |
- # | |----r----|
- v = (v << bits) | ((self.buff >> (r - bits)) & ((1 << bits) - 1))
- self.bpos += bits
- break
- else:
- # |-----8-bits-----|
- # |-bpos-|---bits----...
- # | |----r----|
- v = (v << r) | (self.buff & ((1 << r) - 1))
- bits -= r
- x = self.fp.read(1)
- if not x:
- raise PDFEOFError
- self.buff = ord(x)
- self.bpos = 0
- return v
-
- def feed(self, code: int) -> bytes:
- x = b""
- if code == 256:
- self.table = [bytes((c,)) for c in range(256)] # 0-255
- self.table.append(None) # 256
- self.table.append(None) # 257
- self.prevbuf = b""
- self.nbits = 9
- elif code == 257:
- pass
- elif not self.prevbuf:
- x = self.prevbuf = cast(bytes, self.table[code]) # assume not None
- else:
- if code < len(self.table):
- x = cast(bytes, self.table[code]) # assume not None
- self.table.append(self.prevbuf + x[:1])
- elif code == len(self.table):
- self.table.append(self.prevbuf + self.prevbuf[:1])
- x = cast(bytes, self.table[code])
- else:
- raise CorruptDataError
- table_length = len(self.table)
- if table_length == 511:
- self.nbits = 10
- elif table_length == 1023:
- self.nbits = 11
- elif table_length == 2047:
- self.nbits = 12
- self.prevbuf = x
- return x
-
- def run(self) -> Iterator[bytes]:
- while 1:
- try:
- code = self.readbits(self.nbits)
- except EOFError:
- break
- try:
- x = self.feed(code)
- except CorruptDataError:
- # just ignore corrupt data and stop yielding there
- break
- yield x
-
- # logger.debug(
- # "nbits=%d, code=%d, output=%r, table=%r",
- # self.nbits,
- # code,
- # x,
- # self.table[258:],
- # )
-
-
-def lzwdecode(data: bytes) -> bytes:
- fp = BytesIO(data)
- s = LZWDecoder(fp).run()
- return b"".join(s)
diff --git a/pdf2zh/pdf2zh.py b/pdf2zh/pdf2zh.py
index ac4ddcf..a60c227 100644
--- a/pdf2zh/pdf2zh.py
+++ b/pdf2zh/pdf2zh.py
@@ -6,34 +6,16 @@
from __future__ import annotations
import argparse
-import logging
import os
import sys
from pathlib import Path
-from typing import TYPE_CHECKING, Any, Container, Iterable, List, Optional
+from typing import Any, Container, Iterable, List, Optional
+from pdfminer.pdfexceptions import PDFValueError
import pymupdf
import requests
from pdf2zh import __version__
-from pdf2zh.pdfexceptions import PDFValueError
-
-if TYPE_CHECKING:
- from pdf2zh.layout import LAParams
- from pdf2zh.utils import AnyIO
-
-OUTPUT_TYPES = ((".htm", "html"), (".html", "html"), (".xml", "xml"), (".tag", "tag"))
-
-
-def setup_log() -> None:
- logging.basicConfig()
-
- try:
- import doclayout_yolo
-
- doclayout_yolo.utils.LOGGER.setLevel(logging.WARNING)
- except ImportError:
- pass
def check_files(files: List[str]) -> List[str]:
@@ -47,31 +29,11 @@ def check_files(files: List[str]) -> List[str]:
return missing_files
-def float_or_disabled(x: str) -> Optional[float]:
- if x.lower().strip() == "disabled":
- return None
- try:
- return float(x)
- except ValueError:
- raise argparse.ArgumentTypeError(f"invalid float value: {x}")
-
-
def extract_text(
files: Iterable[str] = [],
- outfile: str = "-",
- laparams: Optional[LAParams] = None,
- output_type: str = "text",
- codec: str = "utf-8",
- strip_control: bool = False,
- maxpages: int = 0,
pages: Optional[Container[int]] = None,
password: str = "",
- scale: float = 1.0,
- rotation: int = 0,
- layoutmode: str = "normal",
- output_dir: Optional[str] = None,
debug: bool = False,
- disable_caching: bool = False,
vfont: str = "",
vchar: str = "",
thread: int = 0,
@@ -81,19 +43,13 @@ def extract_text(
callback: object = None,
output: str = "",
**kwargs: Any,
-) -> AnyIO:
+):
import pdf2zh.high_level
from pdf2zh.doclayout import DocLayoutModel
if not files:
raise PDFValueError("Must provide files to work upon!")
- if output_type == "text" and outfile != "-":
- for override, alttype in OUTPUT_TYPES:
- if outfile.endswith(override):
- output_type = alttype
-
- outfp: AnyIO = sys.stdout
model = DocLayoutModel.load_available()
for file in files:
@@ -300,11 +256,9 @@ def main(args: Optional[List[str]] = None) -> int:
setup_gui(parsed_args.share)
return 0
- setup_log()
extract_text(**vars(parsed_args))
return 0
if __name__ == "__main__":
sys.exit(main())
- sys.exit(main())
diff --git a/pdf2zh/pdfcolor.py b/pdf2zh/pdfcolor.py
deleted file mode 100644
index 08e044e..0000000
--- a/pdf2zh/pdfcolor.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import collections
-from typing import Dict
-
-from pdf2zh.psparser import LIT
-
-LITERAL_DEVICE_GRAY = LIT("DeviceGray")
-LITERAL_DEVICE_RGB = LIT("DeviceRGB")
-LITERAL_DEVICE_CMYK = LIT("DeviceCMYK")
-# Abbreviations for inline images
-LITERAL_INLINE_DEVICE_GRAY = LIT("G")
-LITERAL_INLINE_DEVICE_RGB = LIT("RGB")
-LITERAL_INLINE_DEVICE_CMYK = LIT("CMYK")
-
-
-class PDFColorSpace:
- def __init__(self, name: str, ncomponents: int) -> None:
- self.name = name
- self.ncomponents = ncomponents
-
- def __repr__(self) -> str:
- return "" % (self.name, self.ncomponents)
-
-
-PREDEFINED_COLORSPACE: Dict[str, PDFColorSpace] = collections.OrderedDict()
-
-for name, n in [
- ("DeviceGray", 1), # default value first
- ("CalRGB", 3),
- ("CalGray", 1),
- ("Lab", 3),
- ("DeviceRGB", 3),
- ("DeviceCMYK", 4),
- ("Separation", 1),
- ("Indexed", 1),
- ("Pattern", 1),
-]:
- PREDEFINED_COLORSPACE[name] = PDFColorSpace(name, n)
diff --git a/pdf2zh/pdfdevice.py b/pdf2zh/pdfdevice.py
deleted file mode 100644
index edbbe99..0000000
--- a/pdf2zh/pdfdevice.py
+++ /dev/null
@@ -1,316 +0,0 @@
-from typing import (
- TYPE_CHECKING,
- BinaryIO,
- Iterable,
- List,
- Optional,
- Sequence,
- Union,
- cast,
-)
-
-from pdf2zh import utils
-from pdf2zh.pdfcolor import PDFColorSpace
-from pdf2zh.pdffont import PDFFont, PDFUnicodeNotDefined
-from pdf2zh.pdfpage import PDFPage
-from pdf2zh.pdftypes import PDFStream
-from pdf2zh.psparser import PSLiteral
-from pdf2zh.utils import Matrix, PathSegment, Point, Rect
-
-if TYPE_CHECKING:
- from pdf2zh.pdfinterp import (
- PDFGraphicState,
- PDFResourceManager,
- PDFStackT,
- PDFTextState,
- )
-
-
-PDFTextSeq = Iterable[Union[int, float, bytes]]
-
-
-class PDFDevice:
- """Translate the output of PDFPageInterpreter to the output that is needed"""
-
- def __init__(self, rsrcmgr: "PDFResourceManager") -> None:
- self.rsrcmgr = rsrcmgr
- self.ctm: Optional[Matrix] = None
-
- def __repr__(self) -> str:
- return ""
-
- def __enter__(self) -> "PDFDevice":
- return self
-
- def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:
- self.close()
-
- def close(self) -> None:
- pass
-
- def set_ctm(self, ctm: Matrix) -> None:
- self.ctm = ctm
-
- def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
- pass
-
- def end_tag(self) -> None:
- pass
-
- def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
- pass
-
- def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
- pass
-
- def end_page(self, page: PDFPage) -> None:
- pass
-
- def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
- pass
-
- def end_figure(self, name: str) -> None:
- pass
-
- def paint_path(
- self,
- graphicstate: "PDFGraphicState",
- stroke: bool,
- fill: bool,
- evenodd: bool,
- path: Sequence[PathSegment],
- ) -> None:
- pass
-
- def render_image(self, name: str, stream: PDFStream) -> None:
- pass
-
- def render_string(
- self,
- textstate: "PDFTextState",
- seq: PDFTextSeq,
- ncs: PDFColorSpace,
- graphicstate: "PDFGraphicState",
- ) -> None:
- pass
-
-
-class PDFTextDevice(PDFDevice):
- def render_string(
- self,
- textstate: "PDFTextState",
- seq: PDFTextSeq,
- ncs: PDFColorSpace,
- graphicstate: "PDFGraphicState",
- ) -> None:
- assert self.ctm is not None
- matrix = utils.mult_matrix(textstate.matrix, self.ctm)
- font = textstate.font
- fontsize = textstate.fontsize
- scaling = textstate.scaling * 0.01
- charspace = textstate.charspace * scaling
- wordspace = textstate.wordspace * scaling
- rise = textstate.rise
- assert font is not None
- if font.is_multibyte():
- wordspace = 0
- dxscale = 0.001 * fontsize * scaling
- if font.is_vertical():
- textstate.linematrix = self.render_string_vertical(
- seq,
- matrix,
- textstate.linematrix,
- font,
- fontsize,
- scaling,
- charspace,
- wordspace,
- rise,
- dxscale,
- ncs,
- graphicstate,
- )
- else:
- textstate.linematrix = self.render_string_horizontal(
- seq,
- matrix,
- textstate.linematrix,
- font,
- fontsize,
- scaling,
- charspace,
- wordspace,
- rise,
- dxscale,
- ncs,
- graphicstate,
- )
-
- def render_string_horizontal(
- self,
- seq: PDFTextSeq,
- matrix: Matrix,
- pos: Point,
- font: PDFFont,
- fontsize: float,
- scaling: float,
- charspace: float,
- wordspace: float,
- rise: float,
- dxscale: float,
- ncs: PDFColorSpace,
- graphicstate: "PDFGraphicState",
- ) -> Point:
- (x, y) = pos
- needcharspace = False
- for obj in seq:
- if isinstance(obj, (int, float)):
- x -= obj * dxscale
- needcharspace = True
- else:
- for cid in font.decode(obj):
- if needcharspace:
- x += charspace
- x += self.render_char(
- utils.translate_matrix(matrix, (x, y)),
- font,
- fontsize,
- scaling,
- rise,
- cid,
- ncs,
- graphicstate,
- )
- if cid == 32 and wordspace:
- x += wordspace
- needcharspace = True
- return (x, y)
-
- def render_string_vertical(
- self,
- seq: PDFTextSeq,
- matrix: Matrix,
- pos: Point,
- font: PDFFont,
- fontsize: float,
- scaling: float,
- charspace: float,
- wordspace: float,
- rise: float,
- dxscale: float,
- ncs: PDFColorSpace,
- graphicstate: "PDFGraphicState",
- ) -> Point:
- (x, y) = pos
- needcharspace = False
- for obj in seq:
- if isinstance(obj, (int, float)):
- y -= obj * dxscale
- needcharspace = True
- else:
- for cid in font.decode(obj):
- if needcharspace:
- y += charspace
- y += self.render_char(
- utils.translate_matrix(matrix, (x, y)),
- font,
- fontsize,
- scaling,
- rise,
- cid,
- ncs,
- graphicstate,
- )
- if cid == 32 and wordspace:
- y += wordspace
- needcharspace = True
- return (x, y)
-
- def render_char(
- self,
- matrix: Matrix,
- font: PDFFont,
- fontsize: float,
- scaling: float,
- rise: float,
- cid: int,
- ncs: PDFColorSpace,
- graphicstate: "PDFGraphicState",
- ) -> float:
- return 0
-
-
-class TagExtractor(PDFDevice):
- def __init__(
- self,
- rsrcmgr: "PDFResourceManager",
- outfp: BinaryIO,
- codec: str = "utf-8",
- ) -> None:
- PDFDevice.__init__(self, rsrcmgr)
- self.outfp = outfp
- self.codec = codec
- self.pageno = 0
- self._stack: List[PSLiteral] = []
-
- def render_string(
- self,
- textstate: "PDFTextState",
- seq: PDFTextSeq,
- ncs: PDFColorSpace,
- graphicstate: "PDFGraphicState",
- ) -> None:
- font = textstate.font
- assert font is not None
- text = ""
- for obj in seq:
- if isinstance(obj, str):
- obj = utils.make_compat_bytes(obj)
- if not isinstance(obj, bytes):
- continue
- chars = font.decode(obj)
- for cid in chars:
- try:
- char = font.to_unichr(cid)
- text += char
- except PDFUnicodeNotDefined:
- pass
- self._write(utils.enc(text))
-
- def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
- output = '' % (
- self.pageno,
- utils.bbox2str(page.mediabox),
- page.rotate,
- )
- self._write(output)
-
- def end_page(self, page: PDFPage) -> None:
- self._write("\n")
- self.pageno += 1
-
- def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
- s = ""
- if isinstance(props, dict):
- s = "".join(
- [
- f' {utils.enc(k)}="{utils.make_compat_str(v)}"'
- for (k, v) in sorted(props.items())
- ],
- )
- out_s = f"<{utils.enc(cast(str, tag.name))}{s}>"
- self._write(out_s)
- self._stack.append(tag)
-
- def end_tag(self) -> None:
- assert self._stack, str(self.pageno)
- tag = self._stack.pop(-1)
- out_s = "%s>" % utils.enc(cast(str, tag.name))
- self._write(out_s)
-
- def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
- self.begin_tag(tag, props)
- self._stack.pop(-1)
-
- def _write(self, s: str) -> None:
- self.outfp.write(s.encode(self.codec))
diff --git a/pdf2zh/pdfdocument.py b/pdf2zh/pdfdocument.py
deleted file mode 100644
index 535459e..0000000
--- a/pdf2zh/pdfdocument.py
+++ /dev/null
@@ -1,1069 +0,0 @@
-import itertools
-import logging
-import re
-import struct
-from hashlib import md5, sha256, sha384, sha512
-from typing import (
- Any,
- Callable,
- Dict,
- Iterable,
- Iterator,
- KeysView,
- List,
- Optional,
- Sequence,
- Tuple,
- Type,
- Union,
- cast,
-)
-
-from cryptography.hazmat.backends import default_backend
-from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
-
-from pdf2zh import settings
-from pdf2zh.arcfour import Arcfour
-from pdf2zh.data_structures import NumberTree
-from pdf2zh.pdfexceptions import (
- PDFException,
- PDFKeyError,
- PDFObjectNotFound,
- PDFTypeError,
-)
-from pdf2zh.pdfparser import PDFParser, PDFStreamParser, PDFSyntaxError
-from pdf2zh.pdftypes import (
- DecipherCallable,
- PDFStream,
- decipher_all,
- dict_value,
- int_value,
- list_value,
- str_value,
- stream_value,
- uint_value,
-)
-from pdf2zh.psexceptions import PSEOF
-from pdf2zh.psparser import KWD, LIT, literal_name
-from pdf2zh.utils import (
- choplist,
- decode_text,
- format_int_alpha,
- format_int_roman,
- nunpack,
-)
-
-log = logging.getLogger(__name__)
-
-
-class PDFNoValidXRef(PDFSyntaxError):
- pass
-
-
-class PDFNoValidXRefWarning(SyntaxWarning):
- """Legacy warning for missing xref.
-
- Not used anymore because warnings.warn is replaced by logger.Logger.warn.
- """
-
-
-class PDFNoOutlines(PDFException):
- pass
-
-
-class PDFNoPageLabels(PDFException):
- pass
-
-
-class PDFDestinationNotFound(PDFException):
- pass
-
-
-class PDFEncryptionError(PDFException):
- pass
-
-
-class PDFPasswordIncorrect(PDFEncryptionError):
- pass
-
-
-class PDFEncryptionWarning(UserWarning):
- """Legacy warning for failed decryption.
-
- Not used anymore because warnings.warn is replaced by logger.Logger.warn.
- """
-
-
-class PDFTextExtractionNotAllowedWarning(UserWarning):
- """Legacy warning for PDF that does not allow extraction.
-
- Not used anymore because warnings.warn is replaced by logger.Logger.warn.
- """
-
-
-class PDFTextExtractionNotAllowed(PDFEncryptionError):
- pass
-
-
-# some predefined literals and keywords.
-LITERAL_OBJSTM = LIT("ObjStm")
-LITERAL_XREF = LIT("XRef")
-LITERAL_CATALOG = LIT("Catalog")
-
-
-class PDFBaseXRef:
- def get_trailer(self) -> Dict[str, Any]:
- raise NotImplementedError
-
- def get_objids(self) -> Iterable[int]:
- return []
-
- # Must return
- # (strmid, index, genno)
- # or (None, pos, genno)
- def get_pos(self, objid: int) -> Tuple[Optional[int], int, int]:
- raise PDFKeyError(objid)
-
- def load(self, parser: PDFParser) -> None:
- raise NotImplementedError
-
-
-class PDFXRef(PDFBaseXRef):
- def __init__(self) -> None:
- self.offsets: Dict[int, Tuple[Optional[int], int, int]] = {}
- self.trailer: Dict[str, Any] = {}
-
- def __repr__(self) -> str:
- return "" % (self.offsets.keys())
-
- def load(self, parser: PDFParser) -> None:
- while True:
- try:
- (pos, line) = parser.nextline()
- line = line.strip()
- if not line:
- continue
- except PSEOF:
- raise PDFNoValidXRef("Unexpected EOF - file corrupted?")
- if line.startswith(b"trailer"):
- parser.seek(pos)
- break
- f = line.split(b" ")
- if len(f) != 2:
- error_msg = f"Trailer not found: {parser!r}: line={line!r}"
- raise PDFNoValidXRef(error_msg)
- try:
- (start, nobjs) = map(int, f)
- except ValueError:
- error_msg = f"Invalid line: {parser!r}: line={line!r}"
- raise PDFNoValidXRef(error_msg)
- for objid in range(start, start + nobjs):
- try:
- (_, line) = parser.nextline()
- line = line.strip()
- except PSEOF:
- raise PDFNoValidXRef("Unexpected EOF - file corrupted?")
- f = line.split(b" ")
- if len(f) != 3:
- error_msg = f"Invalid XRef format: {parser!r}, line={line!r}"
- raise PDFNoValidXRef(error_msg)
- (pos_b, genno_b, use_b) = f
- if use_b != b"n":
- continue
- self.offsets[objid] = (None, int(pos_b), int(genno_b))
- # log.debug("xref objects: %r", self.offsets)
- self.load_trailer(parser)
-
- def load_trailer(self, parser: PDFParser) -> None:
- try:
- (_, kwd) = parser.nexttoken()
- assert kwd is KWD(b"trailer"), str(kwd)
- _, (_, dic) = parser.nextobject()
- except PSEOF:
- x = parser.pop(1)
- if not x:
- raise PDFNoValidXRef("Unexpected EOF - file corrupted")
- (_, dic) = x[0]
- self.trailer.update(dict_value(dic))
- # log.debug("trailer=%r", self.trailer)
-
- def get_trailer(self) -> Dict[str, Any]:
- return self.trailer
-
- def get_objids(self) -> KeysView[int]:
- return self.offsets.keys()
-
- def get_pos(self, objid: int) -> Tuple[Optional[int], int, int]:
- return self.offsets[objid]
-
-
-class PDFXRefFallback(PDFXRef):
- def __repr__(self) -> str:
- return "" % (self.offsets.keys())
-
- PDFOBJ_CUE = re.compile(r"^(\d+)\s+(\d+)\s+obj\b")
-
- def load(self, parser: PDFParser) -> None:
- parser.seek(0)
- while 1:
- try:
- (pos, line_bytes) = parser.nextline()
- except PSEOF:
- break
- if line_bytes.startswith(b"trailer"):
- parser.seek(pos)
- self.load_trailer(parser)
- # log.debug("trailer: %r", self.trailer)
- break
- line = line_bytes.decode("latin-1") # default pdf encoding
- m = self.PDFOBJ_CUE.match(line)
- if not m:
- continue
- (objid_s, genno_s) = m.groups()
- objid = int(objid_s)
- genno = int(genno_s)
- self.offsets[objid] = (None, pos, genno)
- # expand ObjStm.
- parser.seek(pos)
- _, (_, obj) = parser.nextobject()
- if isinstance(obj, PDFStream) and obj.get("Type") is LITERAL_OBJSTM:
- stream = stream_value(obj)
- try:
- n = stream["N"]
- except KeyError:
- if settings.STRICT:
- raise PDFSyntaxError("N is not defined: %r" % stream)
- n = 0
- parser1 = PDFStreamParser(stream.get_data())
- objs: List[int] = []
- try:
- while 1:
- _, (_, obj) = parser1.nextobject()
- objs.append(cast(int, obj))
- except PSEOF:
- pass
- n = min(n, len(objs) // 2)
- for index in range(n):
- objid1 = objs[index * 2]
- self.offsets[objid1] = (objid, index, 0)
-
-
-class PDFXRefStream(PDFBaseXRef):
- def __init__(self) -> None:
- self.data: Optional[bytes] = None
- self.entlen: Optional[int] = None
- self.fl1: Optional[int] = None
- self.fl2: Optional[int] = None
- self.fl3: Optional[int] = None
- self.ranges: List[Tuple[int, int]] = []
-
- def __repr__(self) -> str:
- return "" % (self.ranges)
-
- def load(self, parser: PDFParser) -> None:
- (_, objid) = parser.nexttoken() # ignored
- (_, genno) = parser.nexttoken() # ignored
- (_, kwd) = parser.nexttoken()
- _, (_, stream) = parser.nextobject()
- if not isinstance(stream, PDFStream) or stream.get("Type") is not LITERAL_XREF:
- raise PDFNoValidXRef("Invalid PDF stream spec.")
- size = stream["Size"]
- index_array = stream.get("Index", (0, size))
- if len(index_array) % 2 != 0:
- raise PDFSyntaxError("Invalid index number")
- self.ranges.extend(cast(Iterator[Tuple[int, int]], choplist(2, index_array)))
- (self.fl1, self.fl2, self.fl3) = stream["W"]
- assert self.fl1 is not None and self.fl2 is not None and self.fl3 is not None
- self.data = stream.get_data()
- self.entlen = self.fl1 + self.fl2 + self.fl3
- self.trailer = stream.attrs
- # log.debug(
- # "xref stream: objid=%s, fields=%d,%d,%d",
- # ", ".join(map(repr, self.ranges)),
- # self.fl1,
- # self.fl2,
- # self.fl3,
- # )
-
- def get_trailer(self) -> Dict[str, Any]:
- return self.trailer
-
- def get_objids(self) -> Iterator[int]:
- for start, nobjs in self.ranges:
- for i in range(nobjs):
- assert self.entlen is not None
- assert self.data is not None
- offset = self.entlen * i
- ent = self.data[offset : offset + self.entlen]
- f1 = nunpack(ent[: self.fl1], 1)
- if f1 == 1 or f1 == 2:
- yield start + i
-
- def get_pos(self, objid: int) -> Tuple[Optional[int], int, int]:
- index = 0
- for start, nobjs in self.ranges:
- if start <= objid and objid < start + nobjs:
- index += objid - start
- break
- else:
- index += nobjs
- else:
- raise PDFKeyError(objid)
- assert self.entlen is not None
- assert self.data is not None
- assert self.fl1 is not None and self.fl2 is not None and self.fl3 is not None
- offset = self.entlen * index
- ent = self.data[offset : offset + self.entlen]
- f1 = nunpack(ent[: self.fl1], 1)
- f2 = nunpack(ent[self.fl1 : self.fl1 + self.fl2])
- f3 = nunpack(ent[self.fl1 + self.fl2 :])
- if f1 == 1:
- return (None, f2, f3)
- elif f1 == 2:
- return (f2, f3, 0)
- else:
- # this is a free object
- raise PDFKeyError(objid)
-
-
-class PDFStandardSecurityHandler:
- PASSWORD_PADDING = (
- b"(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08"
- b"..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz"
- )
- supported_revisions: Tuple[int, ...] = (2, 3)
-
- def __init__(
- self,
- docid: Sequence[bytes],
- param: Dict[str, Any],
- password: str = "",
- ) -> None:
- self.docid = docid
- self.param = param
- self.password = password
- self.init()
-
- def init(self) -> None:
- self.init_params()
- if self.r not in self.supported_revisions:
- error_msg = "Unsupported revision: param=%r" % self.param
- raise PDFEncryptionError(error_msg)
- self.init_key()
-
- def init_params(self) -> None:
- self.v = int_value(self.param.get("V", 0))
- self.r = int_value(self.param["R"])
- self.p = uint_value(self.param["P"], 32)
- self.o = str_value(self.param["O"])
- self.u = str_value(self.param["U"])
- self.length = int_value(self.param.get("Length", 40))
-
- def init_key(self) -> None:
- self.key = self.authenticate(self.password)
- if self.key is None:
- raise PDFPasswordIncorrect
-
- def is_printable(self) -> bool:
- return bool(self.p & 4)
-
- def is_modifiable(self) -> bool:
- return bool(self.p & 8)
-
- def is_extractable(self) -> bool:
- return bool(self.p & 16)
-
- def compute_u(self, key: bytes) -> bytes:
- if self.r == 2:
- # Algorithm 3.4
- return Arcfour(key).encrypt(self.PASSWORD_PADDING) # 2
- else:
- # Algorithm 3.5
- hash = md5(self.PASSWORD_PADDING) # 2
- hash.update(self.docid[0]) # 3
- result = Arcfour(key).encrypt(hash.digest()) # 4
- for i in range(1, 20): # 5
- k = b"".join(bytes((c ^ i,)) for c in iter(key))
- result = Arcfour(k).encrypt(result)
- result += result # 6
- return result
-
- def compute_encryption_key(self, password: bytes) -> bytes:
- # Algorithm 3.2
- password = (password + self.PASSWORD_PADDING)[:32] # 1
- hash = md5(password) # 2
- hash.update(self.o) # 3
- # See https://github.com/pdf2zh/pdf2zh.six/issues/186
- hash.update(struct.pack("= 4:
- if not cast(PDFStandardSecurityHandlerV4, self).encrypt_metadata:
- hash.update(b"\xff\xff\xff\xff")
- result = hash.digest()
- n = 5
- if self.r >= 3:
- n = self.length // 8
- for _ in range(50):
- result = md5(result[:n]).digest()
- return result[:n]
-
- def authenticate(self, password: str) -> Optional[bytes]:
- password_bytes = password.encode("latin1")
- key = self.authenticate_user_password(password_bytes)
- if key is None:
- key = self.authenticate_owner_password(password_bytes)
- return key
-
- def authenticate_user_password(self, password: bytes) -> Optional[bytes]:
- key = self.compute_encryption_key(password)
- if self.verify_encryption_key(key):
- return key
- else:
- return None
-
- def verify_encryption_key(self, key: bytes) -> bool:
- # Algorithm 3.6
- u = self.compute_u(key)
- if self.r == 2:
- return u == self.u
- return u[:16] == self.u[:16]
-
- def authenticate_owner_password(self, password: bytes) -> Optional[bytes]:
- # Algorithm 3.7
- password = (password + self.PASSWORD_PADDING)[:32]
- hash = md5(password)
- if self.r >= 3:
- for _ in range(50):
- hash = md5(hash.digest())
- n = 5
- if self.r >= 3:
- n = self.length // 8
- key = hash.digest()[:n]
- if self.r == 2:
- user_password = Arcfour(key).decrypt(self.o)
- else:
- user_password = self.o
- for i in range(19, -1, -1):
- k = b"".join(bytes((c ^ i,)) for c in iter(key))
- user_password = Arcfour(k).decrypt(user_password)
- return self.authenticate_user_password(user_password)
-
- def decrypt(
- self,
- objid: int,
- genno: int,
- data: bytes,
- attrs: Optional[Dict[str, Any]] = None,
- ) -> bytes:
- return self.decrypt_rc4(objid, genno, data)
-
- def decrypt_rc4(self, objid: int, genno: int, data: bytes) -> bytes:
- assert self.key is not None
- key = self.key + struct.pack(" None:
- super().init_params()
- self.length = 128
- self.cf = dict_value(self.param.get("CF"))
- self.stmf = literal_name(self.param["StmF"])
- self.strf = literal_name(self.param["StrF"])
- self.encrypt_metadata = bool(self.param.get("EncryptMetadata", True))
- if self.stmf != self.strf:
- error_msg = "Unsupported crypt filter: param=%r" % self.param
- raise PDFEncryptionError(error_msg)
- self.cfm = {}
- for k, v in self.cf.items():
- f = self.get_cfm(literal_name(v["CFM"]))
- if f is None:
- error_msg = "Unknown crypt filter method: param=%r" % self.param
- raise PDFEncryptionError(error_msg)
- self.cfm[k] = f
- self.cfm["Identity"] = self.decrypt_identity
- if self.strf not in self.cfm:
- error_msg = "Undefined crypt filter: param=%r" % self.param
- raise PDFEncryptionError(error_msg)
-
- def get_cfm(self, name: str) -> Optional[Callable[[int, int, bytes], bytes]]:
- if name == "V2":
- return self.decrypt_rc4
- elif name == "AESV2":
- return self.decrypt_aes128
- else:
- return None
-
- def decrypt(
- self,
- objid: int,
- genno: int,
- data: bytes,
- attrs: Optional[Dict[str, Any]] = None,
- name: Optional[str] = None,
- ) -> bytes:
- if not self.encrypt_metadata and attrs is not None:
- t = attrs.get("Type")
- if t is not None and literal_name(t) == "Metadata":
- return data
- if name is None:
- name = self.strf
- return self.cfm[name](objid, genno, data)
-
- def decrypt_identity(self, objid: int, genno: int, data: bytes) -> bytes:
- return data
-
- def decrypt_aes128(self, objid: int, genno: int, data: bytes) -> bytes:
- assert self.key is not None
- key = (
- self.key
- + struct.pack(" None:
- super().init_params()
- self.length = 256
- self.oe = str_value(self.param["OE"])
- self.ue = str_value(self.param["UE"])
- self.o_hash = self.o[:32]
- self.o_validation_salt = self.o[32:40]
- self.o_key_salt = self.o[40:]
- self.u_hash = self.u[:32]
- self.u_validation_salt = self.u[32:40]
- self.u_key_salt = self.u[40:]
-
- def get_cfm(self, name: str) -> Optional[Callable[[int, int, bytes], bytes]]:
- if name == "AESV3":
- return self.decrypt_aes256
- else:
- return None
-
- def authenticate(self, password: str) -> Optional[bytes]:
- password_b = self._normalize_password(password)
- hash = self._password_hash(password_b, self.o_validation_salt, self.u)
- if hash == self.o_hash:
- hash = self._password_hash(password_b, self.o_key_salt, self.u)
- cipher = Cipher(
- algorithms.AES(hash),
- modes.CBC(b"\0" * 16),
- backend=default_backend(),
- ) # type: ignore
- return cipher.decryptor().update(self.oe) # type: ignore
- hash = self._password_hash(password_b, self.u_validation_salt)
- if hash == self.u_hash:
- hash = self._password_hash(password_b, self.u_key_salt)
- cipher = Cipher(
- algorithms.AES(hash),
- modes.CBC(b"\0" * 16),
- backend=default_backend(),
- ) # type: ignore
- return cipher.decryptor().update(self.ue) # type: ignore
- return None
-
- def _normalize_password(self, password: str) -> bytes:
- if self.r == 6:
- # saslprep expects non-empty strings, apparently
- if not password:
- return b""
- from pdf2zh._saslprep import saslprep
-
- password = saslprep(password)
- return password.encode("utf-8")[:127]
-
- def _password_hash(
- self,
- password: bytes,
- salt: bytes,
- vector: Optional[bytes] = None,
- ) -> bytes:
- """Compute password hash depending on revision number"""
- if self.r == 5:
- return self._r5_password(password, salt, vector)
- return self._r6_password(password, salt[0:8], vector)
-
- def _r5_password(
- self,
- password: bytes,
- salt: bytes,
- vector: Optional[bytes] = None,
- ) -> bytes:
- """Compute the password for revision 5"""
- hash = sha256(password)
- hash.update(salt)
- if vector is not None:
- hash.update(vector)
- return hash.digest()
-
- def _r6_password(
- self,
- password: bytes,
- salt: bytes,
- vector: Optional[bytes] = None,
- ) -> bytes:
- """Compute the password for revision 6"""
- initial_hash = sha256(password)
- initial_hash.update(salt)
- if vector is not None:
- initial_hash.update(vector)
- k = initial_hash.digest()
- hashes = (sha256, sha384, sha512)
- round_no = last_byte_val = 0
- while round_no < 64 or last_byte_val > round_no - 32:
- k1 = (password + k + (vector or b"")) * 64
- e = self._aes_cbc_encrypt(key=k[:16], iv=k[16:32], data=k1)
- # compute the first 16 bytes of e,
- # interpreted as an unsigned integer mod 3
- next_hash = hashes[self._bytes_mod_3(e[:16])]
- k = next_hash(e).digest()
- last_byte_val = e[len(e) - 1]
- round_no += 1
- return k[:32]
-
- @staticmethod
- def _bytes_mod_3(input_bytes: bytes) -> int:
- # 256 is 1 mod 3, so we can just sum 'em
- return sum(b % 3 for b in input_bytes) % 3
-
- def _aes_cbc_encrypt(self, key: bytes, iv: bytes, data: bytes) -> bytes:
- cipher = Cipher(algorithms.AES(key), modes.CBC(iv))
- encryptor = cipher.encryptor() # type: ignore
- return encryptor.update(data) + encryptor.finalize() # type: ignore
-
- def decrypt_aes256(self, objid: int, genno: int, data: bytes) -> bytes:
- initialization_vector = data[:16]
- ciphertext = data[16:]
- assert self.key is not None
- cipher = Cipher(
- algorithms.AES(self.key),
- modes.CBC(initialization_vector),
- backend=default_backend(),
- ) # type: ignore
- return cipher.decryptor().update(ciphertext) # type: ignore
-
-
-class PDFDocument:
- """PDFDocument object represents a PDF document.
-
- Since a PDF file can be very big, normally it is not loaded at
- once. So PDF document has to cooperate with a PDF parser in order to
- dynamically import the data as processing goes.
-
- Typical usage:
- doc = PDFDocument(parser, password)
- obj = doc.getobj(objid)
-
- """
-
- security_handler_registry: Dict[int, Type[PDFStandardSecurityHandler]] = {
- 1: PDFStandardSecurityHandler,
- 2: PDFStandardSecurityHandler,
- 4: PDFStandardSecurityHandlerV4,
- 5: PDFStandardSecurityHandlerV5,
- }
-
- def __init__(
- self,
- parser: PDFParser,
- password: str = "",
- caching: bool = True,
- fallback: bool = True,
- ) -> None:
- """Set the document to use a given PDFParser object."""
- self.caching = caching
- self.xrefs: List[PDFBaseXRef] = []
- self.info = []
- self.catalog: Dict[str, Any] = {}
- self.encryption: Optional[Tuple[Any, Any]] = None
- self.decipher: Optional[DecipherCallable] = None
- self._parser = None
- self._cached_objs: Dict[int, Tuple[object, int]] = {}
- self._parsed_objs: Dict[int, Tuple[List[object], int]] = {}
- self._parser = parser
- self._parser.set_document(self)
- self.is_printable = self.is_modifiable = self.is_extractable = True
- # Retrieve the information of each header that was appended
- # (maybe multiple times) at the end of the document.
- try:
- # print('FIND XREF')
- pos = self.find_xref(parser)
- self.pos = pos
- self.read_xref_from(parser, pos, self.xrefs)
- except PDFNoValidXRef:
- if fallback:
- parser.fallback = True
- newxref = PDFXRefFallback()
- newxref.load(parser)
- self.xrefs.append(newxref)
- # print(f'XREF {self.xrefs}')
- for xref in self.xrefs:
- trailer = xref.get_trailer()
- if not trailer:
- continue
- # If there's an encryption info, remember it.
- if "Encrypt" in trailer:
- if "ID" in trailer:
- id_value = list_value(trailer["ID"])
- else:
- # Some documents may not have a /ID, use two empty
- # byte strings instead. Solves
- # https://github.com/pdf2zh/pdf2zh.six/issues/594
- id_value = (b"", b"")
- self.encryption = (id_value, dict_value(trailer["Encrypt"]))
- self._initialize_password(password)
- if "Info" in trailer:
- self.info.append(dict_value(trailer["Info"]))
- if "Root" in trailer:
- # Every PDF file must have exactly one /Root dictionary.
- self.catalog = dict_value(trailer["Root"])
- break
- else:
- raise PDFSyntaxError("No /Root object! - Is this really a PDF?")
- if self.catalog.get("Type") is not LITERAL_CATALOG:
- if settings.STRICT:
- raise PDFSyntaxError("Catalog not found!")
-
- KEYWORD_OBJ = KWD(b"obj")
-
- # _initialize_password(password=b'')
- # Perform the initialization with a given password.
- def _initialize_password(self, password: str = "") -> None:
- assert self.encryption is not None
- (docid, param) = self.encryption
- if literal_name(param.get("Filter")) != "Standard":
- raise PDFEncryptionError("Unknown filter: param=%r" % param)
- v = int_value(param.get("V", 0))
- factory = self.security_handler_registry.get(v)
- if factory is None:
- raise PDFEncryptionError("Unknown algorithm: param=%r" % param)
- handler = factory(docid, param, password)
- self.decipher = handler.decrypt
- self.is_printable = handler.is_printable()
- self.is_modifiable = handler.is_modifiable()
- self.is_extractable = handler.is_extractable()
- assert self._parser is not None
- self._parser.fallback = False # need to read streams with exact length
-
- def _getobj_objstm(self, stream: PDFStream, index: int, objid: int) -> object:
- if stream.objid in self._parsed_objs:
- (objs, n) = self._parsed_objs[stream.objid]
- else:
- (objs, n) = self._get_objects(stream)
- if self.caching:
- assert stream.objid is not None
- self._parsed_objs[stream.objid] = (objs, n)
- i = n * 2 + index
- try:
- obj = objs[i]
- except IndexError:
- raise PDFSyntaxError("index too big: %r" % index)
- return obj
-
- def _get_objects(self, stream: PDFStream) -> Tuple[List[object], int]:
- if stream.get("Type") is not LITERAL_OBJSTM:
- if settings.STRICT:
- raise PDFSyntaxError("Not a stream object: %r" % stream)
- try:
- n = cast(int, stream["N"])
- except KeyError:
- if settings.STRICT:
- raise PDFSyntaxError("N is not defined: %r" % stream)
- n = 0
- parser = PDFStreamParser(stream.get_data())
- parser.set_document(self)
- objs: List[object] = []
- try:
- while 1:
- _, (_, obj) = parser.nextobject()
- objs.append(obj)
- except PSEOF:
- pass
- return (objs, n)
-
- def _getobj_parse(self, pos: int, objid: int) -> object:
- assert self._parser is not None
- self._parser.seek(pos)
- (_, objid1) = self._parser.nexttoken() # objid
- (_, genno) = self._parser.nexttoken() # genno
- (_, kwd) = self._parser.nexttoken()
- # hack around malformed pdf files
- # copied from https://github.com/jaepil/pdf2zh3k/blob/master/
- # pdf2zh/pdfparser.py#L399
- # to solve https://github.com/pdf2zh/pdf2zh.six/issues/56
- # assert objid1 == objid, str((objid1, objid))
- if objid1 != objid:
- x = []
- while kwd is not self.KEYWORD_OBJ:
- (_, kwd) = self._parser.nexttoken()
- x.append(kwd)
- if len(x) >= 2:
- objid1 = x[-2]
- # #### end hack around malformed pdf files
- if objid1 != objid:
- raise PDFSyntaxError(f"objid mismatch: {objid1!r}={objid!r}")
-
- if kwd != KWD(b"obj"):
- raise PDFSyntaxError("Invalid object spec: offset=%r" % pos)
- end, (_, obj) = self._parser.nextobject()
- return end, obj
-
- # can raise PDFObjectNotFound
- def getobj(self, objid: int) -> object:
- """Get object from PDF
-
- :raises PDFException if PDFDocument is not initialized
- :raises PDFObjectNotFound if objid does not exist in PDF
- """
- if not self.xrefs:
- raise PDFException("PDFDocument is not initialized")
- # log.debug("getobj: objid=%r", objid)
- if objid in self._cached_objs:
- (obj, genno) = self._cached_objs[objid]
- else:
- for xref in self.xrefs:
- try:
- (strmid, index, genno) = xref.get_pos(objid)
- except KeyError:
- continue
- try:
- if strmid is not None:
- stream = stream_value(self.getobj(strmid))
- obj = self._getobj_objstm(stream, index, objid)
- else:
- end, obj = self._getobj_parse(index, objid)
- if self.decipher:
- obj = decipher_all(self.decipher, objid, genno, obj)
-
- if isinstance(obj, PDFStream):
- obj.set_objid(objid, genno)
- break
- except (PSEOF, PDFSyntaxError):
- continue
- else:
- raise PDFObjectNotFound(objid)
- # log.debug("register: objid=%r: %r", objid, obj)
- if self.caching:
- self._cached_objs[objid] = (obj, genno)
- return obj
-
- OutlineType = Tuple[Any, Any, Any, Any, Any]
-
- def get_outlines(self) -> Iterator[OutlineType]:
- if "Outlines" not in self.catalog:
- raise PDFNoOutlines
-
- def search(entry: object, level: int) -> Iterator[PDFDocument.OutlineType]:
- entry = dict_value(entry)
- if "Title" in entry:
- if "A" in entry or "Dest" in entry:
- title = decode_text(str_value(entry["Title"]))
- dest = entry.get("Dest")
- action = entry.get("A")
- se = entry.get("SE")
- yield (level, title, dest, action, se)
- if "First" in entry and "Last" in entry:
- yield from search(entry["First"], level + 1)
- if "Next" in entry:
- yield from search(entry["Next"], level)
-
- return search(self.catalog["Outlines"], 0)
-
- def get_page_labels(self) -> Iterator[str]:
- """Generate page label strings for the PDF document.
-
- If the document includes page labels, generates strings, one per page.
- If not, raises PDFNoPageLabels.
-
- The resulting iteration is unbounded.
- """
- assert self.catalog is not None
-
- try:
- page_labels = PageLabels(self.catalog["PageLabels"])
- except (PDFTypeError, KeyError):
- raise PDFNoPageLabels
-
- return page_labels.labels
-
- def lookup_name(self, cat: str, key: Union[str, bytes]) -> Any:
- try:
- names = dict_value(self.catalog["Names"])
- except (PDFTypeError, KeyError):
- raise PDFKeyError((cat, key))
- # may raise KeyError
- d0 = dict_value(names[cat])
-
- def lookup(d: Dict[str, Any]) -> Any:
- if "Limits" in d:
- (k1, k2) = list_value(d["Limits"])
- if key < k1 or k2 < key:
- return None
- if "Names" in d:
- objs = list_value(d["Names"])
- names = dict(
- cast(Iterator[Tuple[Union[str, bytes], Any]], choplist(2, objs)),
- )
- return names[key]
- if "Kids" in d:
- for c in list_value(d["Kids"]):
- v = lookup(dict_value(c))
- if v:
- return v
- raise PDFKeyError((cat, key))
-
- return lookup(d0)
-
- def get_dest(self, name: Union[str, bytes]) -> Any:
- try:
- # PDF-1.2 or later
- obj = self.lookup_name("Dests", name)
- except KeyError:
- # PDF-1.1 or prior
- if "Dests" not in self.catalog:
- raise PDFDestinationNotFound(name)
- d0 = dict_value(self.catalog["Dests"])
- if name not in d0:
- raise PDFDestinationNotFound(name)
- obj = d0[name]
- return obj
-
- # find_xref
- def find_xref(self, parser: PDFParser) -> int:
- """Internal function used to locate the first XRef."""
- # search the last xref table by scanning the file backwards.
- prev = b""
- for line in parser.revreadlines():
- line = line.strip()
- # log.debug("find_xref: %r", line)
-
- if line == b"startxref":
- # log.debug("xref found: pos=%r", prev)
-
- if not prev.isdigit():
- raise PDFNoValidXRef(f"Invalid xref position: {prev!r}")
-
- start = int(prev)
-
- if not start >= 0:
- raise PDFNoValidXRef(f"Invalid negative xref position: {start}")
-
- return start
-
- if line:
- prev = line
-
- raise PDFNoValidXRef("Unexpected EOF")
-
- # read xref table
- def read_xref_from(
- self,
- parser: PDFParser,
- start: int,
- xrefs: List[PDFBaseXRef],
- ) -> None:
- """Reads XRefs from the given location."""
- parser.seek(start)
- parser.reset()
- try:
- (pos, token) = parser.nexttoken()
- except PSEOF:
- raise PDFNoValidXRef("Unexpected EOF")
- # log.debug("read_xref_from: start=%d, token=%r", start, token)
- if isinstance(token, int):
- # XRefStream: PDF-1.5
- parser.seek(pos)
- parser.reset()
- xref: PDFBaseXRef = PDFXRefStream()
- xref.load(parser)
- else:
- if token is parser.KEYWORD_XREF:
- parser.nextline()
- xref = PDFXRef()
- xref.load(parser)
- xrefs.append(xref)
- trailer = xref.get_trailer()
- # log.debug("trailer: %r", trailer)
- if "XRefStm" in trailer:
- pos = int_value(trailer["XRefStm"])
- self.read_xref_from(parser, pos, xrefs)
- if "Prev" in trailer:
- # find previous xref
- pos = int_value(trailer["Prev"])
- self.read_xref_from(parser, pos, xrefs)
-
-
-class PageLabels(NumberTree):
- """PageLabels from the document catalog.
-
- See Section 8.3.1 in the PDF Reference.
- """
-
- @property
- def labels(self) -> Iterator[str]:
- ranges = self.values
-
- # The tree must begin with page index 0
- if len(ranges) == 0 or ranges[0][0] != 0:
- if settings.STRICT:
- raise PDFSyntaxError("PageLabels is missing page index 0")
- else:
- # Try to cope, by assuming empty labels for the initial pages
- ranges.insert(0, (0, {}))
-
- for next, (start, label_dict_unchecked) in enumerate(ranges, 1):
- label_dict = dict_value(label_dict_unchecked)
- style = label_dict.get("S")
- prefix = decode_text(str_value(label_dict.get("P", b"")))
- first_value = int_value(label_dict.get("St", 1))
-
- if next == len(ranges):
- # This is the last specified range. It continues until the end
- # of the document.
- values: Iterable[int] = itertools.count(first_value)
- else:
- end, _ = ranges[next]
- range_length = end - start
- values = range(first_value, first_value + range_length)
-
- for value in values:
- label = self._format_page_label(value, style)
- yield prefix + label
-
- @staticmethod
- def _format_page_label(value: int, style: Any) -> str:
- """Format page label value in a specific style"""
- if style is None:
- label = ""
- elif style is LIT("D"): # Decimal arabic numerals
- label = str(value)
- elif style is LIT("R"): # Uppercase roman numerals
- label = format_int_roman(value).upper()
- elif style is LIT("r"): # Lowercase roman numerals
- label = format_int_roman(value)
- elif style is LIT("A"): # Uppercase letters A-Z, AA-ZZ...
- label = format_int_alpha(value).upper()
- elif style is LIT("a"): # Lowercase letters a-z, aa-zz...
- label = format_int_alpha(value)
- else:
- log.warning("Unknown page label style: %r", style)
- label = ""
- return label
diff --git a/pdf2zh/pdfexceptions.py b/pdf2zh/pdfexceptions.py
deleted file mode 100644
index e1a82ac..0000000
--- a/pdf2zh/pdfexceptions.py
+++ /dev/null
@@ -1,33 +0,0 @@
-from pdf2zh.psexceptions import PSException
-
-
-class PDFException(PSException):
- pass
-
-
-class PDFTypeError(PDFException, TypeError):
- pass
-
-
-class PDFValueError(PDFException, ValueError):
- pass
-
-
-class PDFObjectNotFound(PDFException):
- pass
-
-
-class PDFNotImplementedError(PDFException, NotImplementedError):
- pass
-
-
-class PDFKeyError(PDFException, KeyError):
- pass
-
-
-class PDFEOFError(PDFException, EOFError):
- pass
-
-
-class PDFIOError(PDFException, IOError):
- pass
diff --git a/pdf2zh/pdffont.py b/pdf2zh/pdffont.py
deleted file mode 100644
index 5591e1e..0000000
--- a/pdf2zh/pdffont.py
+++ /dev/null
@@ -1,1190 +0,0 @@
-import logging
-import struct
-from io import BytesIO
-from typing import (
- TYPE_CHECKING,
- Any,
- BinaryIO,
- Dict,
- Iterable,
- Iterator,
- List,
- Mapping,
- Optional,
- Tuple,
- Union,
- cast,
-)
-
-from pdf2zh import settings
-from pdf2zh.cmapdb import (
- CMap,
- CMapBase,
- CMapDB,
- CMapParser,
- FileUnicodeMap,
- IdentityUnicodeMap,
- UnicodeMap,
-)
-from pdf2zh.encodingdb import EncodingDB, name2unicode
-from pdf2zh.fontmetrics import FONT_METRICS
-from pdf2zh.pdfexceptions import PDFException, PDFKeyError, PDFValueError
-from pdf2zh.pdftypes import (
- PDFStream,
- dict_value,
- int_value,
- list_value,
- num_value,
- resolve1,
- resolve_all,
- stream_value,
-)
-from pdf2zh.psexceptions import PSEOF
-from pdf2zh.psparser import (
- KWD,
- LIT,
- PSKeyword,
- PSLiteral,
- PSStackParser,
- literal_name,
-)
-from pdf2zh.utils import Matrix, Point, Rect, apply_matrix_norm, choplist, nunpack
-
-if TYPE_CHECKING:
- from pdf2zh.pdfinterp import PDFResourceManager
-
-log = logging.getLogger(__name__)
-
-
-def get_widths(seq: Iterable[object]) -> Dict[int, float]:
- """Build a mapping of character widths for horizontal writing."""
- widths: Dict[int, float] = {}
- r: List[float] = []
- for v in seq:
- if isinstance(v, list):
- if r:
- char1 = r[-1]
- for i, w in enumerate(v):
- widths[cast(int, char1) + i] = w
- r = []
- elif isinstance(v, (int, float)): # == utils.isnumber(v)
- r.append(v)
- if len(r) == 3:
- (char1, char2, w) = r
- for i in range(cast(int, char1), cast(int, char2) + 1):
- widths[i] = w
- r = []
- return widths
-
-
-def get_widths2(seq: Iterable[object]) -> Dict[int, Tuple[float, Point]]:
- """Build a mapping of character widths for vertical writing."""
- widths: Dict[int, Tuple[float, Point]] = {}
- r: List[float] = []
- for v in seq:
- if isinstance(v, list):
- if r:
- char1 = r[-1]
- for i, (w, vx, vy) in enumerate(choplist(3, v)):
- widths[cast(int, char1) + i] = (w, (vx, vy))
- r = []
- elif isinstance(v, (int, float)): # == utils.isnumber(v)
- r.append(v)
- if len(r) == 5:
- (char1, char2, w, vx, vy) = r
- for i in range(cast(int, char1), cast(int, char2) + 1):
- widths[i] = (w, (vx, vy))
- r = []
- return widths
-
-
-class FontMetricsDB:
- @classmethod
- def get_metrics(cls, fontname: str) -> Tuple[Dict[str, object], Dict[str, int]]:
- return FONT_METRICS[fontname]
-
-
-# int here means that we're not extending PSStackParser with additional types.
-class Type1FontHeaderParser(PSStackParser[int]):
- KEYWORD_BEGIN = KWD(b"begin")
- KEYWORD_END = KWD(b"end")
- KEYWORD_DEF = KWD(b"def")
- KEYWORD_PUT = KWD(b"put")
- KEYWORD_DICT = KWD(b"dict")
- KEYWORD_ARRAY = KWD(b"array")
- KEYWORD_READONLY = KWD(b"readonly")
- KEYWORD_FOR = KWD(b"for")
-
- def __init__(self, data: BinaryIO) -> None:
- PSStackParser.__init__(self, data)
- self._cid2unicode: Dict[int, str] = {}
-
- def get_encoding(self) -> Dict[int, str]:
- """Parse the font encoding.
-
- The Type1 font encoding maps character codes to character names. These
- character names could either be standard Adobe glyph names, or
- character names associated with custom CharStrings for this font. A
- CharString is a sequence of operations that describe how the character
- should be drawn. Currently, this function returns '' (empty string)
- for character names that are associated with a CharStrings.
-
- Reference: Adobe Systems Incorporated, Adobe Type 1 Font Format
-
- :returns mapping of character identifiers (cid's) to unicode characters
- """
- while 1:
- try:
- _, (cid, name) = self.nextobject()
- except PSEOF:
- break
- try:
- self._cid2unicode[cid] = name2unicode(cast(str, name))
- except KeyError:
- # log.debug(str(e))
- pass
- return self._cid2unicode
-
- def do_keyword(self, pos: int, token: PSKeyword) -> None:
- if token is self.KEYWORD_PUT:
- ((_, key), (_, value)) = self.pop(2)
- if isinstance(key, int) and isinstance(value, PSLiteral):
- self.add_results((key, literal_name(value)))
-
-
-NIBBLES = ("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ".", "e", "e-", None, "-")
-
-# Mapping of cmap names. Original cmap name is kept if not in the mapping.
-# (missing reference for why DLIdent is mapped to Identity)
-IDENTITY_ENCODER = {
- "DLIdent-H": "Identity-H",
- "DLIdent-V": "Identity-V",
-}
-
-
-def getdict(data: bytes) -> Dict[int, List[Union[float, int]]]:
- d: Dict[int, List[Union[float, int]]] = {}
- fp = BytesIO(data)
- stack: List[Union[float, int]] = []
- while 1:
- c = fp.read(1)
- if not c:
- break
- b0 = ord(c)
- if b0 <= 21:
- d[b0] = stack
- stack = []
- continue
- if b0 == 30:
- s = ""
- loop = True
- while loop:
- b = ord(fp.read(1))
- for n in (b >> 4, b & 15):
- if n == 15:
- loop = False
- else:
- nibble = NIBBLES[n]
- assert nibble is not None
- s += nibble
- value = float(s)
- elif b0 >= 32 and b0 <= 246:
- value = b0 - 139
- else:
- b1 = ord(fp.read(1))
- if b0 >= 247 and b0 <= 250:
- value = ((b0 - 247) << 8) + b1 + 108
- elif b0 >= 251 and b0 <= 254:
- value = -((b0 - 251) << 8) - b1 - 108
- else:
- b2 = ord(fp.read(1))
- if b1 >= 128:
- b1 -= 256
- if b0 == 28:
- value = b1 << 8 | b2
- else:
- value = b1 << 24 | b2 << 16 | struct.unpack(">H", fp.read(2))[0]
- stack.append(value)
- return d
-
-
-class CFFFont:
- STANDARD_STRINGS = (
- ".notdef",
- "space",
- "exclam",
- "quotedbl",
- "numbersign",
- "dollar",
- "percent",
- "ampersand",
- "quoteright",
- "parenleft",
- "parenright",
- "asterisk",
- "plus",
- "comma",
- "hyphen",
- "period",
- "slash",
- "zero",
- "one",
- "two",
- "three",
- "four",
- "five",
- "six",
- "seven",
- "eight",
- "nine",
- "colon",
- "semicolon",
- "less",
- "equal",
- "greater",
- "question",
- "at",
- "A",
- "B",
- "C",
- "D",
- "E",
- "F",
- "G",
- "H",
- "I",
- "J",
- "K",
- "L",
- "M",
- "N",
- "O",
- "P",
- "Q",
- "R",
- "S",
- "T",
- "U",
- "V",
- "W",
- "X",
- "Y",
- "Z",
- "bracketleft",
- "backslash",
- "bracketright",
- "asciicircum",
- "underscore",
- "quoteleft",
- "a",
- "b",
- "c",
- "d",
- "e",
- "f",
- "g",
- "h",
- "i",
- "j",
- "k",
- "l",
- "m",
- "n",
- "o",
- "p",
- "q",
- "r",
- "s",
- "t",
- "u",
- "v",
- "w",
- "x",
- "y",
- "z",
- "braceleft",
- "bar",
- "braceright",
- "asciitilde",
- "exclamdown",
- "cent",
- "sterling",
- "fraction",
- "yen",
- "florin",
- "section",
- "currency",
- "quotesingle",
- "quotedblleft",
- "guillemotleft",
- "guilsinglleft",
- "guilsinglright",
- "fi",
- "fl",
- "endash",
- "dagger",
- "daggerdbl",
- "periodcentered",
- "paragraph",
- "bullet",
- "quotesinglbase",
- "quotedblbase",
- "quotedblright",
- "guillemotright",
- "ellipsis",
- "perthousand",
- "questiondown",
- "grave",
- "acute",
- "circumflex",
- "tilde",
- "macron",
- "breve",
- "dotaccent",
- "dieresis",
- "ring",
- "cedilla",
- "hungarumlaut",
- "ogonek",
- "caron",
- "emdash",
- "AE",
- "ordfeminine",
- "Lslash",
- "Oslash",
- "OE",
- "ordmasculine",
- "ae",
- "dotlessi",
- "lslash",
- "oslash",
- "oe",
- "germandbls",
- "onesuperior",
- "logicalnot",
- "mu",
- "trademark",
- "Eth",
- "onehalf",
- "plusminus",
- "Thorn",
- "onequarter",
- "divide",
- "brokenbar",
- "degree",
- "thorn",
- "threequarters",
- "twosuperior",
- "registered",
- "minus",
- "eth",
- "multiply",
- "threesuperior",
- "copyright",
- "Aacute",
- "Acircumflex",
- "Adieresis",
- "Agrave",
- "Aring",
- "Atilde",
- "Ccedilla",
- "Eacute",
- "Ecircumflex",
- "Edieresis",
- "Egrave",
- "Iacute",
- "Icircumflex",
- "Idieresis",
- "Igrave",
- "Ntilde",
- "Oacute",
- "Ocircumflex",
- "Odieresis",
- "Ograve",
- "Otilde",
- "Scaron",
- "Uacute",
- "Ucircumflex",
- "Udieresis",
- "Ugrave",
- "Yacute",
- "Ydieresis",
- "Zcaron",
- "aacute",
- "acircumflex",
- "adieresis",
- "agrave",
- "aring",
- "atilde",
- "ccedilla",
- "eacute",
- "ecircumflex",
- "edieresis",
- "egrave",
- "iacute",
- "icircumflex",
- "idieresis",
- "igrave",
- "ntilde",
- "oacute",
- "ocircumflex",
- "odieresis",
- "ograve",
- "otilde",
- "scaron",
- "uacute",
- "ucircumflex",
- "udieresis",
- "ugrave",
- "yacute",
- "ydieresis",
- "zcaron",
- "exclamsmall",
- "Hungarumlautsmall",
- "dollaroldstyle",
- "dollarsuperior",
- "ampersandsmall",
- "Acutesmall",
- "parenleftsuperior",
- "parenrightsuperior",
- "twodotenleader",
- "onedotenleader",
- "zerooldstyle",
- "oneoldstyle",
- "twooldstyle",
- "threeoldstyle",
- "fouroldstyle",
- "fiveoldstyle",
- "sixoldstyle",
- "sevenoldstyle",
- "eightoldstyle",
- "nineoldstyle",
- "commasuperior",
- "threequartersemdash",
- "periodsuperior",
- "questionsmall",
- "asuperior",
- "bsuperior",
- "centsuperior",
- "dsuperior",
- "esuperior",
- "isuperior",
- "lsuperior",
- "msuperior",
- "nsuperior",
- "osuperior",
- "rsuperior",
- "ssuperior",
- "tsuperior",
- "ff",
- "ffi",
- "ffl",
- "parenleftinferior",
- "parenrightinferior",
- "Circumflexsmall",
- "hyphensuperior",
- "Gravesmall",
- "Asmall",
- "Bsmall",
- "Csmall",
- "Dsmall",
- "Esmall",
- "Fsmall",
- "Gsmall",
- "Hsmall",
- "Ismall",
- "Jsmall",
- "Ksmall",
- "Lsmall",
- "Msmall",
- "Nsmall",
- "Osmall",
- "Psmall",
- "Qsmall",
- "Rsmall",
- "Ssmall",
- "Tsmall",
- "Usmall",
- "Vsmall",
- "Wsmall",
- "Xsmall",
- "Ysmall",
- "Zsmall",
- "colonmonetary",
- "onefitted",
- "rupiah",
- "Tildesmall",
- "exclamdownsmall",
- "centoldstyle",
- "Lslashsmall",
- "Scaronsmall",
- "Zcaronsmall",
- "Dieresissmall",
- "Brevesmall",
- "Caronsmall",
- "Dotaccentsmall",
- "Macronsmall",
- "figuredash",
- "hypheninferior",
- "Ogoneksmall",
- "Ringsmall",
- "Cedillasmall",
- "questiondownsmall",
- "oneeighth",
- "threeeighths",
- "fiveeighths",
- "seveneighths",
- "onethird",
- "twothirds",
- "zerosuperior",
- "foursuperior",
- "fivesuperior",
- "sixsuperior",
- "sevensuperior",
- "eightsuperior",
- "ninesuperior",
- "zeroinferior",
- "oneinferior",
- "twoinferior",
- "threeinferior",
- "fourinferior",
- "fiveinferior",
- "sixinferior",
- "seveninferior",
- "eightinferior",
- "nineinferior",
- "centinferior",
- "dollarinferior",
- "periodinferior",
- "commainferior",
- "Agravesmall",
- "Aacutesmall",
- "Acircumflexsmall",
- "Atildesmall",
- "Adieresissmall",
- "Aringsmall",
- "AEsmall",
- "Ccedillasmall",
- "Egravesmall",
- "Eacutesmall",
- "Ecircumflexsmall",
- "Edieresissmall",
- "Igravesmall",
- "Iacutesmall",
- "Icircumflexsmall",
- "Idieresissmall",
- "Ethsmall",
- "Ntildesmall",
- "Ogravesmall",
- "Oacutesmall",
- "Ocircumflexsmall",
- "Otildesmall",
- "Odieresissmall",
- "OEsmall",
- "Oslashsmall",
- "Ugravesmall",
- "Uacutesmall",
- "Ucircumflexsmall",
- "Udieresissmall",
- "Yacutesmall",
- "Thornsmall",
- "Ydieresissmall",
- "001.000",
- "001.001",
- "001.002",
- "001.003",
- "Black",
- "Bold",
- "Book",
- "Light",
- "Medium",
- "Regular",
- "Roman",
- "Semibold",
- )
-
- class INDEX:
- def __init__(self, fp: BinaryIO) -> None:
- self.fp = fp
- self.offsets: List[int] = []
- (count, offsize) = struct.unpack(">HB", self.fp.read(3))
- for i in range(count + 1):
- self.offsets.append(nunpack(self.fp.read(offsize)))
- self.base = self.fp.tell() - 1
- self.fp.seek(self.base + self.offsets[-1])
-
- def __repr__(self) -> str:
- return "" % len(self)
-
- def __len__(self) -> int:
- return len(self.offsets) - 1
-
- def __getitem__(self, i: int) -> bytes:
- self.fp.seek(self.base + self.offsets[i])
- return self.fp.read(self.offsets[i + 1] - self.offsets[i])
-
- def __iter__(self) -> Iterator[bytes]:
- return iter(self[i] for i in range(len(self)))
-
- def __init__(self, name: str, fp: BinaryIO) -> None:
- self.name = name
- self.fp = fp
- # Header
- (_major, _minor, hdrsize, offsize) = struct.unpack("BBBB", self.fp.read(4))
- self.fp.read(hdrsize - 4)
- # Name INDEX
- self.name_index = self.INDEX(self.fp)
- # Top DICT INDEX
- self.dict_index = self.INDEX(self.fp)
- # String INDEX
- self.string_index = self.INDEX(self.fp)
- # Global Subr INDEX
- self.subr_index = self.INDEX(self.fp)
- # Top DICT DATA
- self.top_dict = getdict(self.dict_index[0])
- (charset_pos,) = self.top_dict.get(15, [0])
- (encoding_pos,) = self.top_dict.get(16, [0])
- (charstring_pos,) = self.top_dict.get(17, [0])
- # CharStrings
- self.fp.seek(cast(int, charstring_pos))
- self.charstring = self.INDEX(self.fp)
- self.nglyphs = len(self.charstring)
- # Encodings
- self.code2gid = {}
- self.gid2code = {}
- self.fp.seek(cast(int, encoding_pos))
- format = self.fp.read(1)
- if format == b"\x00":
- # Format 0
- (n,) = struct.unpack("B", self.fp.read(1))
- for code, gid in enumerate(struct.unpack("B" * n, self.fp.read(n))):
- self.code2gid[code] = gid
- self.gid2code[gid] = code
- elif format == b"\x01":
- # Format 1
- (n,) = struct.unpack("B", self.fp.read(1))
- code = 0
- for i in range(n):
- (first, nleft) = struct.unpack("BB", self.fp.read(2))
- for gid in range(first, first + nleft + 1):
- self.code2gid[code] = gid
- self.gid2code[gid] = code
- code += 1
- else:
- raise PDFValueError("unsupported encoding format: %r" % format)
- # Charsets
- self.name2gid = {}
- self.gid2name = {}
- self.fp.seek(cast(int, charset_pos))
- format = self.fp.read(1)
- if format == b"\x00":
- # Format 0
- n = self.nglyphs - 1
- for gid, sid in enumerate(
- cast(
- Tuple[int, ...], struct.unpack(">" + "H" * n, self.fp.read(2 * n))
- ),
- ):
- gid += 1
- sidname = self.getstr(sid)
- self.name2gid[sidname] = gid
- self.gid2name[gid] = sidname
- elif format == b"\x01":
- # Format 1
- (n,) = struct.unpack("B", self.fp.read(1))
- sid = 0
- for i in range(n):
- (first, nleft) = struct.unpack("BB", self.fp.read(2))
- for gid in range(first, first + nleft + 1):
- sidname = self.getstr(sid)
- self.name2gid[sidname] = gid
- self.gid2name[gid] = sidname
- sid += 1
- elif format == b"\x02":
- # Format 2
- assert False, str(("Unhandled", format))
- else:
- raise PDFValueError("unsupported charset format: %r" % format)
-
- def getstr(self, sid: int) -> Union[str, bytes]:
- # This returns str for one of the STANDARD_STRINGS but bytes otherwise,
- # and appears to be a needless source of type complexity.
- if sid < len(self.STANDARD_STRINGS):
- return self.STANDARD_STRINGS[sid]
- return self.string_index[sid - len(self.STANDARD_STRINGS)]
-
-
-class TrueTypeFont:
- class CMapNotFound(PDFException):
- pass
-
- def __init__(self, name: str, fp: BinaryIO) -> None:
- self.name = name
- self.fp = fp
- self.tables: Dict[bytes, Tuple[int, int]] = {}
- self.fonttype = fp.read(4)
- try:
- (ntables, _1, _2, _3) = cast(
- Tuple[int, int, int, int],
- struct.unpack(">HHHH", fp.read(8)),
- )
- for _ in range(ntables):
- (name_bytes, tsum, offset, length) = cast(
- Tuple[bytes, int, int, int],
- struct.unpack(">4sLLL", fp.read(16)),
- )
- self.tables[name_bytes] = (offset, length)
- except struct.error:
- # Do not fail if there are not enough bytes to read. Even for
- # corrupted PDFs we would like to get as much information as
- # possible, so continue.
- pass
-
- def create_unicode_map(self) -> FileUnicodeMap:
- if b"cmap" not in self.tables:
- raise TrueTypeFont.CMapNotFound
- (base_offset, length) = self.tables[b"cmap"]
- fp = self.fp
- fp.seek(base_offset)
- (version, nsubtables) = cast(Tuple[int, int], struct.unpack(">HH", fp.read(4)))
- subtables: List[Tuple[int, int, int]] = []
- for i in range(nsubtables):
- subtables.append(
- cast(Tuple[int, int, int], struct.unpack(">HHL", fp.read(8))),
- )
- char2gid: Dict[int, int] = {}
- # Only supports subtable type 0, 2 and 4.
- for platform_id, encoding_id, st_offset in subtables:
- # Skip non-Unicode cmaps.
- # https://docs.microsoft.com/en-us/typography/opentype/spec/cmap
- if not (platform_id == 0 or (platform_id == 3 and encoding_id in [1, 10])):
- continue
- fp.seek(base_offset + st_offset)
- (fmttype, fmtlen, fmtlang) = cast(
- Tuple[int, int, int],
- struct.unpack(">HHH", fp.read(6)),
- )
- if fmttype == 0:
- char2gid.update(
- enumerate(
- cast(Tuple[int, ...], struct.unpack(">256B", fp.read(256))),
- ),
- )
- elif fmttype == 2:
- subheaderkeys = cast(
- Tuple[int, ...],
- struct.unpack(">256H", fp.read(512)),
- )
- firstbytes = [0] * 8192
- for i, k in enumerate(subheaderkeys):
- firstbytes[k // 8] = i
- nhdrs = max(subheaderkeys) // 8 + 1
- hdrs: List[Tuple[int, int, int, int, int]] = []
- for i in range(nhdrs):
- (firstcode, entcount, delta, offset) = cast(
- Tuple[int, int, int, int],
- struct.unpack(">HHhH", fp.read(8)),
- )
- hdrs.append((i, firstcode, entcount, delta, fp.tell() - 2 + offset))
- for i, firstcode, entcount, delta, pos in hdrs:
- if not entcount:
- continue
- first = firstcode + (firstbytes[i] << 8)
- fp.seek(pos)
- for c in range(entcount):
- gid = cast(Tuple[int], struct.unpack(">H", fp.read(2)))[0]
- if gid:
- gid += delta
- char2gid[first + c] = gid
- elif fmttype == 4:
- (segcount, _1, _2, _3) = cast(
- Tuple[int, int, int, int],
- struct.unpack(">HHHH", fp.read(8)),
- )
- segcount //= 2
- ecs = cast(
- Tuple[int, ...],
- struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
- )
- fp.read(2)
- scs = cast(
- Tuple[int, ...],
- struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
- )
- idds = cast(
- Tuple[int, ...],
- struct.unpack(">%dh" % segcount, fp.read(2 * segcount)),
- )
- pos = fp.tell()
- idrs = cast(
- Tuple[int, ...],
- struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
- )
- for ec, sc, idd, idr in zip(ecs, scs, idds, idrs):
- if idr:
- fp.seek(pos + idr)
- for c in range(sc, ec + 1):
- b = cast(Tuple[int], struct.unpack(">H", fp.read(2)))[0]
- char2gid[c] = (b + idd) & 0xFFFF
- else:
- for c in range(sc, ec + 1):
- char2gid[c] = (c + idd) & 0xFFFF
- else:
- assert False, str(("Unhandled", fmttype))
- if not char2gid:
- raise TrueTypeFont.CMapNotFound
- # create unicode map
- unicode_map = FileUnicodeMap()
- for char, gid in char2gid.items():
- unicode_map.add_cid2unichr(gid, char)
- return unicode_map
-
-
-class PDFFontError(PDFException):
- pass
-
-
-class PDFUnicodeNotDefined(PDFFontError):
- pass
-
-
-LITERAL_STANDARD_ENCODING = LIT("StandardEncoding")
-LITERAL_TYPE1C = LIT("Type1C")
-
-# Font widths are maintained in a dict type that maps from *either* unicode
-# chars or integer character IDs.
-FontWidthDict = Union[Dict[int, float], Dict[str, float]]
-
-
-class PDFFont:
- def __init__(
- self,
- descriptor: Mapping[str, Any],
- widths: FontWidthDict,
- default_width: Optional[float] = None,
- ) -> None:
- self.descriptor = descriptor
- self.widths: FontWidthDict = resolve_all(widths)
- self.fontname = resolve1(descriptor.get("FontName", "unknown"))
- if isinstance(self.fontname, PSLiteral):
- self.fontname = literal_name(self.fontname)
- self.flags = int_value(descriptor.get("Flags", 0))
- self.ascent = num_value(descriptor.get("Ascent", 0))
- self.descent = num_value(descriptor.get("Descent", 0))
- self.italic_angle = num_value(descriptor.get("ItalicAngle", 0))
- if default_width is None:
- self.default_width = num_value(descriptor.get("MissingWidth", 0))
- else:
- self.default_width = default_width
- self.default_width = resolve1(self.default_width)
- self.leading = num_value(descriptor.get("Leading", 0))
- self.bbox = cast(
- Rect,
- list_value(resolve_all(descriptor.get("FontBBox", (0, 0, 0, 0)))),
- )
- self.hscale = self.vscale = 0.001
-
- # PDF RM 9.8.1 specifies /Descent should always be a negative number.
- # PScript5.dll seems to produce Descent with a positive number, but
- # text analysis will be wrong if this is taken as correct. So force
- # descent to negative.
- if self.descent > 0:
- self.descent = -self.descent
-
- def __repr__(self) -> str:
- return ""
-
- def is_vertical(self) -> bool:
- return False
-
- def is_multibyte(self) -> bool:
- return False
-
- def decode(self, bytes: bytes) -> Iterable[int]:
- return bytearray(bytes) # map(ord, bytes)
-
- def get_ascent(self) -> float:
- """Ascent above the baseline, in text space units"""
- return self.ascent * self.vscale
-
- def get_descent(self) -> float:
- """Descent below the baseline, in text space units; always negative"""
- return self.descent * self.vscale
-
- def get_width(self) -> float:
- w = self.bbox[2] - self.bbox[0]
- if w == 0:
- w = -self.default_width
- return w * self.hscale
-
- def get_height(self) -> float:
- h = self.bbox[3] - self.bbox[1]
- if h == 0:
- h = self.ascent - self.descent
- return h * self.vscale
-
- def char_width(self, cid: int) -> float:
- # Because character widths may be mapping either IDs or strings,
- # we try to lookup the character ID first, then its str equivalent.
- try:
- return cast(Dict[int, float], self.widths)[cid] * self.hscale
- except KeyError:
- str_widths = cast(Dict[str, float], self.widths)
- try:
- return str_widths[self.to_unichr(cid)] * self.hscale
- except (KeyError, PDFUnicodeNotDefined):
- return self.default_width * self.hscale
-
- def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]:
- """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
- return 0
-
- def string_width(self, s: bytes) -> float:
- return sum(self.char_width(cid) for cid in self.decode(s))
-
- def to_unichr(self, cid: int) -> str:
- raise NotImplementedError
-
-
-class PDFSimpleFont(PDFFont):
- def __init__(
- self,
- descriptor: Mapping[str, Any],
- widths: FontWidthDict,
- spec: Mapping[str, Any],
- ) -> None:
- # Font encoding is specified either by a name of
- # built-in encoding or a dictionary that describes
- # the differences.
- if "Encoding" in spec:
- encoding = resolve1(spec["Encoding"])
- else:
- encoding = LITERAL_STANDARD_ENCODING
- if isinstance(encoding, dict):
- name = literal_name(encoding.get("BaseEncoding", LITERAL_STANDARD_ENCODING))
- diff = list_value(encoding.get("Differences", []))
- self.cid2unicode = EncodingDB.get_encoding(name, diff)
- else:
- self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))
- self.unicode_map: Optional[UnicodeMap] = None
- if "ToUnicode" in spec:
- strm = stream_value(spec["ToUnicode"])
- self.unicode_map = FileUnicodeMap()
- CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
- PDFFont.__init__(self, descriptor, widths)
-
- def to_unichr(self, cid: int) -> str:
- if self.unicode_map:
- try:
- return self.unicode_map.get_unichr(cid)
- except KeyError:
- pass
- try:
- return self.cid2unicode[cid]
- except KeyError:
- raise PDFUnicodeNotDefined(None, cid)
-
-
-class PDFType1Font(PDFSimpleFont):
- def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
- try:
- self.basefont = literal_name(spec["BaseFont"])
- except KeyError:
- if settings.STRICT:
- raise PDFFontError("BaseFont is missing")
- self.basefont = "unknown"
-
- widths: FontWidthDict
- try:
- (descriptor, int_widths) = FontMetricsDB.get_metrics(self.basefont)
- widths = cast(Dict[str, float], int_widths) # implicit int->float
- except KeyError:
- descriptor = dict_value(spec.get("FontDescriptor", {}))
- firstchar = int_value(spec.get("FirstChar", 0))
- # lastchar = int_value(spec.get('LastChar', 255))
- width_list = list_value(spec.get("Widths", [0] * 256))
- widths = {i + firstchar: resolve1(w) for (i, w) in enumerate(width_list)}
- PDFSimpleFont.__init__(self, descriptor, widths, spec)
- if "Encoding" not in spec and "FontFile" in descriptor:
- # try to recover the missing encoding info from the font file.
- self.fontfile = stream_value(descriptor.get("FontFile"))
- length1 = int_value(self.fontfile["Length1"])
- data = self.fontfile.get_data()[:length1]
- parser = Type1FontHeaderParser(BytesIO(data))
- self.cid2unicode = parser.get_encoding()
-
- def __repr__(self) -> str:
- return "" % self.basefont
-
-
-class PDFTrueTypeFont(PDFType1Font):
- def __repr__(self) -> str:
- return "" % self.basefont
-
-
-class PDFType3Font(PDFSimpleFont):
- def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
- firstchar = int_value(spec.get("FirstChar", 0))
- # lastchar = int_value(spec.get('LastChar', 0))
- width_list = list_value(spec.get("Widths", [0] * 256))
- widths = {i + firstchar: w for (i, w) in enumerate(width_list)}
- if "FontDescriptor" in spec:
- descriptor = dict_value(spec["FontDescriptor"])
- else:
- descriptor = {"Ascent": 0, "Descent": 0, "FontBBox": spec["FontBBox"]}
- PDFSimpleFont.__init__(self, descriptor, widths, spec)
- self.matrix = cast(Matrix, tuple(list_value(spec.get("FontMatrix"))))
- (_, self.descent, _, self.ascent) = self.bbox
- (self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))
-
- def __repr__(self) -> str:
- return ""
-
-
-class PDFCIDFont(PDFFont):
- default_disp: Union[float, Tuple[Optional[float], float]]
-
- def __init__(
- self,
- rsrcmgr: "PDFResourceManager",
- spec: Mapping[str, Any],
- strict: bool = settings.STRICT,
- ) -> None:
- try:
- self.basefont = literal_name(spec["BaseFont"])
- except KeyError:
- if strict:
- raise PDFFontError("BaseFont is missing")
- self.basefont = "unknown"
- self.cidsysteminfo = dict_value(spec.get("CIDSystemInfo", {}))
- cid_registry = resolve1(self.cidsysteminfo.get("Registry", b"unknown")).decode(
- "latin1",
- )
- cid_ordering = resolve1(self.cidsysteminfo.get("Ordering", b"unknown")).decode(
- "latin1",
- )
- self.cidcoding = f"{cid_registry.strip()}-{cid_ordering.strip()}"
- self.cmap: CMapBase = self.get_cmap_from_spec(spec, strict)
-
- try:
- descriptor = dict_value(spec["FontDescriptor"])
- except KeyError:
- if strict:
- raise PDFFontError("FontDescriptor is missing")
- descriptor = {}
- ttf = None
- if "FontFile2" in descriptor:
- self.fontfile = stream_value(descriptor.get("FontFile2"))
- ttf = TrueTypeFont(self.basefont, BytesIO(self.fontfile.get_data()))
- self.unicode_map: Optional[UnicodeMap] = None
- if "ToUnicode" in spec:
- if isinstance(spec["ToUnicode"], PDFStream):
- strm = stream_value(spec["ToUnicode"])
- self.unicode_map = FileUnicodeMap()
- CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
- else:
- cmap_name = literal_name(spec["ToUnicode"])
- encoding = literal_name(spec["Encoding"])
- if (
- "Identity" in cid_ordering
- or "Identity" in cmap_name
- or "Identity" in encoding
- ):
- self.unicode_map = IdentityUnicodeMap()
- elif self.cidcoding in ("Adobe-Identity", "Adobe-UCS"):
- if ttf:
- try:
- self.unicode_map = ttf.create_unicode_map()
- except TrueTypeFont.CMapNotFound:
- pass
- else:
- try:
- self.unicode_map = CMapDB.get_unicode_map(
- self.cidcoding,
- self.cmap.is_vertical(),
- )
- except CMapDB.CMapNotFound:
- pass
-
- self.vertical = self.cmap.is_vertical()
- if self.vertical:
- # writing mode: vertical
- widths2 = get_widths2(list_value(spec.get("W2", [])))
- self.disps = {cid: (vx, vy) for (cid, (_, (vx, vy))) in widths2.items()}
- (vy, w) = resolve1(spec.get("DW2", [880, -1000]))
- self.default_disp = (None, vy)
- widths = {cid: w for (cid, (w, _)) in widths2.items()}
- default_width = w
- else:
- # writing mode: horizontal
- self.disps = {}
- self.default_disp = 0
- widths = get_widths(list_value(spec.get("W", [])))
- default_width = spec.get("DW", 1000)
- PDFFont.__init__(self, descriptor, widths, default_width=default_width)
-
- def get_cmap_from_spec(self, spec: Mapping[str, Any], strict: bool) -> CMapBase:
- """Get cmap from font specification
-
- For certain PDFs, Encoding Type isn't mentioned as an attribute of
- Encoding but as an attribute of CMapName, where CMapName is an
- attribute of spec['Encoding'].
- The horizontal/vertical modes are mentioned with different name
- such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'.
- """
- cmap_name = self._get_cmap_name(spec, strict)
-
- try:
- return CMapDB.get_cmap(cmap_name)
- except CMapDB.CMapNotFound as e:
- if strict:
- raise PDFFontError(e)
- return CMap()
-
- @staticmethod
- def _get_cmap_name(spec: Mapping[str, Any], strict: bool) -> str:
- """Get cmap name from font specification"""
- cmap_name = "unknown" # default value
-
- try:
- spec_encoding = spec["Encoding"]
- if hasattr(spec_encoding, "name"):
- cmap_name = literal_name(spec["Encoding"])
- else:
- cmap_name = literal_name(spec_encoding["CMapName"])
- except KeyError:
- if strict:
- raise PDFFontError("Encoding is unspecified")
-
- if type(cmap_name) is PDFStream: # type: ignore[comparison-overlap]
- cmap_name_stream: PDFStream = cast(PDFStream, cmap_name)
- if "CMapName" in cmap_name_stream:
- cmap_name = cmap_name_stream.get("CMapName").name
- elif strict:
- raise PDFFontError("CMapName unspecified for encoding")
-
- return IDENTITY_ENCODER.get(cmap_name, cmap_name)
-
- def __repr__(self) -> str:
- return f""
-
- def is_vertical(self) -> bool:
- return self.vertical
-
- def is_multibyte(self) -> bool:
- return True
-
- def decode(self, bytes: bytes) -> Iterable[int]:
- return self.cmap.decode(bytes)
-
- def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]:
- """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
- return self.disps.get(cid, self.default_disp)
-
- def to_unichr(self, cid: int) -> str:
- try:
- if not self.unicode_map:
- raise PDFKeyError(cid)
- return self.unicode_map.get_unichr(cid)
- except KeyError:
- raise PDFUnicodeNotDefined(self.cidcoding, cid)
diff --git a/pdf2zh/pdfinterp.py b/pdf2zh/pdfinterp.py
index b9d2338..9ea16b6 100644
--- a/pdf2zh/pdfinterp.py
+++ b/pdf2zh/pdfinterp.py
@@ -1,51 +1,39 @@
import logging
-import re
-from io import BytesIO
-from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast
+from typing import Any, Dict, Optional, Sequence, Tuple, cast
import numpy as np
-from pdf2zh import settings
-from pdf2zh.casting import safe_float
-from pdf2zh.cmapdb import CMap, CMapBase, CMapDB
-from pdf2zh.pdfcolor import PREDEFINED_COLORSPACE, PDFColorSpace
-from pdf2zh.pdfdevice import PDFDevice, PDFTextSeq
-from pdf2zh.pdfexceptions import PDFException, PDFValueError
-from pdf2zh.pdffont import (
- PDFCIDFont,
- PDFFont,
- PDFFontError,
- PDFTrueTypeFont,
- PDFType1Font,
- PDFType3Font,
+from pdfminer import settings
+from pdfminer.pdfcolor import PREDEFINED_COLORSPACE, PDFColorSpace
+from pdfminer.pdfdevice import PDFDevice
+from pdfminer.pdfinterp import (
+ PDFPageInterpreter,
+ PDFResourceManager,
+ PDFContentParser,
+ PDFInterpreterError,
+ Color,
+ PDFStackT,
+ LITERAL_FORM,
+ LITERAL_IMAGE,
)
-from pdf2zh.pdfpage import PDFPage
-from pdf2zh.pdftypes import (
- LITERALS_ASCII85_DECODE,
+from pdfminer.pdffont import PDFFont
+from pdfminer.pdfpage import PDFPage
+from pdfminer.pdftypes import (
PDFObjRef,
- PDFStream,
dict_value,
list_value,
resolve1,
stream_value,
)
-from pdf2zh.psexceptions import PSEOF, PSTypeError
-from pdf2zh.psparser import (
- KWD,
- LIT,
+from pdfminer.psexceptions import PSEOF
+from pdfminer.psparser import (
PSKeyword,
- PSLiteral,
- PSStackParser,
- PSStackType,
keyword_name,
literal_name,
)
-from pdf2zh.utils import (
+from pdfminer.utils import (
MATRIX_IDENTITY,
Matrix,
- PathSegment,
- Point,
Rect,
- choplist,
mult_matrix,
apply_matrix_pt,
)
@@ -53,316 +41,14 @@
log = logging.getLogger(__name__)
-class PDFResourceError(PDFException):
- pass
+def safe_float(o: Any) -> Optional[float]:
+ try:
+ return float(o)
+ except (TypeError, ValueError):
+ return None
-class PDFInterpreterError(PDFException):
- pass
-
-
-LITERAL_PDF = LIT("PDF")
-LITERAL_TEXT = LIT("Text")
-LITERAL_FONT = LIT("Font")
-LITERAL_FORM = LIT("Form")
-LITERAL_IMAGE = LIT("Image")
-
-
-class PDFTextState:
- matrix: Matrix
- linematrix: Point
-
- def __init__(self) -> None:
- self.font: Optional[PDFFont] = None
- self.fontsize: float = 0
- self.charspace: float = 0
- self.wordspace: float = 0
- self.scaling: float = 100
- self.leading: float = 0
- self.render: int = 0
- self.rise: float = 0
- self.reset()
- # self.matrix is set
- # self.linematrix is set
-
- def __repr__(self) -> str:
- return (
- ""
- % (
- self.font,
- self.fontsize,
- self.charspace,
- self.wordspace,
- self.scaling,
- self.leading,
- self.render,
- self.rise,
- self.matrix,
- self.linematrix,
- )
- )
-
- def copy(self) -> "PDFTextState":
- obj = PDFTextState()
- obj.font = self.font
- obj.fontsize = self.fontsize
- obj.charspace = self.charspace
- obj.wordspace = self.wordspace
- obj.scaling = self.scaling
- obj.leading = self.leading
- obj.render = self.render
- obj.rise = self.rise
- obj.matrix = self.matrix
- obj.linematrix = self.linematrix
- return obj
-
- def reset(self) -> None:
- self.matrix = MATRIX_IDENTITY
- self.linematrix = (0, 0)
-
-
-Color = Union[
- float, # Greyscale
- Tuple[float, float, float], # R, G, B
- Tuple[float, float, float, float], # C, M, Y, K
-]
-
-
-class PDFGraphicState:
- def __init__(self) -> None:
- self.linewidth: float = 0
- self.linecap: Optional[object] = None
- self.linejoin: Optional[object] = None
- self.miterlimit: Optional[object] = None
- self.dash: Optional[Tuple[object, object]] = None
- self.intent: Optional[object] = None
- self.flatness: Optional[object] = None
-
- # stroking color
- self.scolor: Optional[Color] = None
-
- # non stroking color
- self.ncolor: Optional[Color] = None
-
- def copy(self) -> "PDFGraphicState":
- obj = PDFGraphicState()
- obj.linewidth = self.linewidth
- obj.linecap = self.linecap
- obj.linejoin = self.linejoin
- obj.miterlimit = self.miterlimit
- obj.dash = self.dash
- obj.intent = self.intent
- obj.flatness = self.flatness
- obj.scolor = self.scolor
- obj.ncolor = self.ncolor
- return obj
-
- def __repr__(self) -> str:
- return (
- ""
- % (
- self.linewidth,
- self.linecap,
- self.linejoin,
- self.miterlimit,
- self.dash,
- self.intent,
- self.flatness,
- self.scolor,
- self.ncolor,
- )
- )
-
-
-class PDFResourceManager:
- """Repository of shared resources.
-
- ResourceManager facilitates reuse of shared resources
- such as fonts and images so that large objects are not
- allocated multiple times.
- """
-
- def __init__(self, caching: bool = True) -> None:
- self.caching = caching
- self._cached_fonts: Dict[object, PDFFont] = {}
-
- def get_procset(self, procs: Sequence[object]) -> None:
- for proc in procs:
- if proc is LITERAL_PDF or proc is LITERAL_TEXT:
- pass
- else:
- pass
-
- def get_cmap(self, cmapname: str, strict: bool = False) -> CMapBase:
- try:
- return CMapDB.get_cmap(cmapname)
- except CMapDB.CMapNotFound:
- if strict:
- raise
- return CMap()
-
- def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont:
- if objid and objid in self._cached_fonts:
- font = self._cached_fonts[objid]
- else:
- # log.debug("get_font: create: objid=%r, spec=%r", objid, spec)
- if settings.STRICT:
- if spec["Type"] is not LITERAL_FONT:
- raise PDFFontError("Type is not /Font")
- # Create a Font object.
- if "Subtype" in spec:
- subtype = literal_name(spec["Subtype"])
- else:
- if settings.STRICT:
- raise PDFFontError("Font Subtype is not specified.")
- subtype = "Type1"
- if subtype in ("Type1", "MMType1"):
- # Type1 Font
- font = PDFType1Font(self, spec)
- elif subtype == "TrueType":
- # TrueType Font
- font = PDFTrueTypeFont(self, spec)
- elif subtype == "Type3":
- # Type3 Font
- font = PDFType3Font(self, spec)
- elif subtype in ("CIDFontType0", "CIDFontType2"):
- # CID Font
- font = PDFCIDFont(self, spec)
- elif subtype == "Type0":
- # Type0 Font
- dfonts = list_value(spec["DescendantFonts"])
- assert dfonts
- subspec = dict_value(dfonts[0]).copy()
- for k in ("Encoding", "ToUnicode"):
- if k in spec:
- subspec[k] = resolve1(spec[k])
- font = self.get_font(None, subspec)
- else:
- if settings.STRICT:
- raise PDFFontError("Invalid Font spec: %r" % spec)
- font = PDFType1Font(self, spec) # this is so wrong!
- if objid and self.caching:
- self._cached_fonts[objid] = font
- return font
-
-
-class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
- def __init__(self, streams: Sequence[object]) -> None:
- self.streams = streams
- self.istream = 0
- # PSStackParser.__init__(fp=None) is safe only because we've overloaded
- # all the methods that would attempt to access self.fp without first
- # calling self.fillfp().
- PSStackParser.__init__(self, None) # type: ignore[arg-type]
-
- def fillfp(self) -> None:
- if not self.fp:
- if self.istream < len(self.streams):
- strm = stream_value(self.streams[self.istream])
- self.istream += 1
- else:
- raise PSEOF("Unexpected EOF, file truncated?")
- self.fp = BytesIO(strm.get_data())
- # if log.isEnabledFor(logging.DEBUG):
- # log.debug(f'STREAM DATA {strm.get_data()}')
-
- def seek(self, pos: int) -> None:
- self.fillfp()
- PSStackParser.seek(self, pos)
-
- def fillbuf(self) -> None:
- if self.charpos < len(self.buf):
- return
- while 1:
- self.fillfp()
- self.bufpos = self.fp.tell()
- self.buf = self.fp.read(self.BUFSIZ)
- if self.buf:
- break
- self.fp = None # type: ignore[assignment]
- self.charpos = 0
-
- def get_inline_data(self, pos: int, target: bytes = b"EI") -> Tuple[int, bytes]:
- self.seek(pos)
- i = 0
- data = b""
- while i <= len(target):
- self.fillbuf()
- if i:
- ci = self.buf[self.charpos]
- c = bytes((ci,))
- data += c
- self.charpos += 1
- if (
- len(target) <= i
- and c.isspace()
- or i < len(target)
- and c == (bytes((target[i],)))
- ):
- i += 1
- else:
- i = 0
- else:
- try:
- j = self.buf.index(target[0], self.charpos)
- data += self.buf[self.charpos : j + 1]
- self.charpos = j + 1
- i = 1
- except ValueError:
- data += self.buf[self.charpos :]
- self.charpos = len(self.buf)
- data = data[: -(len(target) + 1)] # strip the last part
- data = re.sub(rb"(\x0d\x0a|[\x0d\x0a])$", b"", data)
- return (pos, data)
-
- def flush(self) -> None:
- self.add_results(*self.popall())
-
- KEYWORD_BI = KWD(b"BI")
- KEYWORD_ID = KWD(b"ID")
- KEYWORD_EI = KWD(b"EI")
-
- def do_keyword(self, pos: int, token: PSKeyword) -> None:
- if token is self.KEYWORD_BI:
- # inline image within a content stream
- self.start_type(pos, "inline")
- elif token is self.KEYWORD_ID:
- try:
- (_, objs) = self.end_type("inline")
- if len(objs) % 2 != 0:
- error_msg = f"Invalid dictionary construct: {objs!r}"
- raise PSTypeError(error_msg)
- d = {literal_name(k): resolve1(v) for (k, v) in choplist(2, objs)}
- eos = b"EI"
- filter = d.get("F", None)
- if filter is not None:
- if isinstance(filter, PSLiteral):
- filter = [filter]
- if filter[0] in LITERALS_ASCII85_DECODE:
- eos = b"~>"
- (pos, data) = self.get_inline_data(pos + len(b"ID "), target=eos)
- if eos != b"EI": # it may be necessary for decoding
- data += eos
- obj = PDFStream(d, data)
- self.push((pos, obj))
- if eos == b"EI": # otherwise it is still in the stream
- self.push((pos, self.KEYWORD_EI))
- except PSTypeError:
- if settings.STRICT:
- raise
- else:
- self.push((pos, token))
-
-
-PDFStackT = PSStackType[PDFStream]
-"""Types that may appear on the PDF argument stack."""
-
-
-class PDFPageInterpreter:
+class PDFPageInterpreterEx(PDFPageInterpreter):
"""Processor for the content of a PDF page
Reference: PDF Reference, Appendix A, Operator Summary
@@ -375,7 +61,7 @@ def __init__(
self.device = device
self.obj_patch = obj_patch
- def dup(self) -> "PDFPageInterpreter":
+ def dup(self) -> "PDFPageInterpreterEx":
return self.__class__(self.rsrcmgr, self.device, self.obj_patch)
def init_resources(self, resources: Dict[object, object]) -> None:
@@ -409,6 +95,7 @@ def get_colorspace(spec: object) -> Optional[PDFColorSpace]:
objid = spec.objid
spec = dict_value(spec)
self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
+ self.fontmap[fontid].descent = 0 # hack fix descent
self.fontid[self.fontmap[fontid]] = fontid
elif k == "ColorSpace":
for csid, spec in dict_value(v).items():
@@ -421,155 +108,6 @@ def get_colorspace(spec: object) -> Optional[PDFColorSpace]:
for xobjid, xobjstrm in dict_value(v).items():
self.xobjmap[xobjid] = xobjstrm
- def init_state(self, ctm: Matrix) -> None:
- """Initialize the text and graphic states for rendering a page."""
- # gstack: stack for graphical states.
- self.gstack: List[Tuple[Matrix, PDFTextState, PDFGraphicState]] = []
- self.ctm = ctm
- self.device.set_ctm(self.ctm)
- self.textstate = PDFTextState()
- self.graphicstate = PDFGraphicState()
- self.curpath: List[PathSegment] = []
- # argstack: stack for command arguments.
- self.argstack: List[PDFStackT] = []
- # set some global states.
- self.scs: Optional[PDFColorSpace] = None
- self.ncs: Optional[PDFColorSpace] = None
- if self.csmap:
- self.scs = self.ncs = next(iter(self.csmap.values()))
-
- def push(self, obj: PDFStackT) -> None:
- self.argstack.append(obj)
-
- def pop(self, n: int) -> List[PDFStackT]:
- if n == 0:
- return []
- x = self.argstack[-n:]
- self.argstack = self.argstack[:-n]
- return x
-
- def get_current_state(self) -> Tuple[Matrix, PDFTextState, PDFGraphicState]:
- return (self.ctm, self.textstate.copy(), self.graphicstate.copy())
-
- def set_current_state(
- self,
- state: Tuple[Matrix, PDFTextState, PDFGraphicState],
- ) -> None:
- (self.ctm, self.textstate, self.graphicstate) = state
- self.device.set_ctm(self.ctm)
-
- def do_q(self) -> None:
- """Save graphics state"""
- self.gstack.append(self.get_current_state())
-
- def do_Q(self) -> None:
- """Restore graphics state"""
- if self.gstack:
- self.set_current_state(self.gstack.pop())
-
- def do_cm(
- self,
- a1: PDFStackT,
- b1: PDFStackT,
- c1: PDFStackT,
- d1: PDFStackT,
- e1: PDFStackT,
- f1: PDFStackT,
- ) -> None:
- """Concatenate matrix to current transformation matrix"""
- self.ctm = mult_matrix(cast(Matrix, (a1, b1, c1, d1, e1, f1)), self.ctm)
- self.device.set_ctm(self.ctm)
-
- def do_w(self, linewidth: PDFStackT) -> None:
- """Set line width"""
- self.graphicstate.linewidth = cast(float, linewidth)
-
- def do_J(self, linecap: PDFStackT) -> None:
- """Set line cap style"""
- self.graphicstate.linecap = linecap
-
- def do_j(self, linejoin: PDFStackT) -> None:
- """Set line join style"""
- self.graphicstate.linejoin = linejoin
-
- def do_M(self, miterlimit: PDFStackT) -> None:
- """Set miter limit"""
- self.graphicstate.miterlimit = miterlimit
-
- def do_d(self, dash: PDFStackT, phase: PDFStackT) -> None:
- """Set line dash pattern"""
- self.graphicstate.dash = (dash, phase)
-
- def do_ri(self, intent: PDFStackT) -> None:
- """Set color rendering intent"""
- self.graphicstate.intent = intent
-
- def do_i(self, flatness: PDFStackT) -> None:
- """Set flatness tolerance"""
- self.graphicstate.flatness = flatness
-
- def do_gs(self, name: PDFStackT) -> None:
- """Set parameters from graphics state parameter dictionary"""
- # TODO
-
- def do_m(self, x: PDFStackT, y: PDFStackT) -> None:
- """Begin new subpath"""
- self.curpath.append(("m", cast(float, x), cast(float, y)))
-
- def do_l(self, x: PDFStackT, y: PDFStackT) -> None:
- """Append straight line segment to path"""
- self.curpath.append(("l", cast(float, x), cast(float, y)))
-
- def do_c(
- self,
- x1: PDFStackT,
- y1: PDFStackT,
- x2: PDFStackT,
- y2: PDFStackT,
- x3: PDFStackT,
- y3: PDFStackT,
- ) -> None:
- """Append curved segment to path (three control points)"""
- self.curpath.append(
- (
- "c",
- cast(float, x1),
- cast(float, y1),
- cast(float, x2),
- cast(float, y2),
- cast(float, x3),
- cast(float, y3),
- ),
- )
-
- def do_v(self, x2: PDFStackT, y2: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None:
- """Append curved segment to path (initial point replicated)"""
- self.curpath.append(
- ("v", cast(float, x2), cast(float, y2), cast(float, x3), cast(float, y3)),
- )
-
- def do_y(self, x1: PDFStackT, y1: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None:
- """Append curved segment to path (final point replicated)"""
- self.curpath.append(
- ("y", cast(float, x1), cast(float, y1), cast(float, x3), cast(float, y3)),
- )
-
- def do_h(self) -> None:
- """Close subpath"""
- self.curpath.append(("h",))
-
- def do_re(self, x: PDFStackT, y: PDFStackT, w: PDFStackT, h: PDFStackT) -> None:
- """Append rectangle to path"""
- x = cast(float, x)
- y = cast(float, y)
- w = cast(float, w)
- h = cast(float, h)
- self.curpath.append(("m", x, y))
- self.curpath.append(("l", x + w, y))
- self.curpath.append(("l", x + w, y + h))
- self.curpath.append(("l", x, y + h))
- self.curpath.append(("h",))
-
def do_S(self) -> None:
"""Stroke path"""
@@ -594,11 +132,6 @@ def is_black(color: Color) -> bool:
else:
self.curpath = []
- def do_s(self) -> None:
- """Close and stroke path"""
- self.do_h()
- self.do_S()
-
def do_f(self) -> None:
"""Fill path using nonzero winding number rule"""
# self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
@@ -622,85 +155,6 @@ def do_B_a(self) -> None:
# self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
self.curpath = []
- def do_b(self) -> None:
- """Close, fill, and stroke path using nonzero winding number rule"""
- self.do_h()
- self.do_B()
-
- def do_b_a(self) -> None:
- """Close, fill, and stroke path using even-odd rule"""
- self.do_h()
- self.do_B_a()
-
- def do_n(self) -> None:
- """End path without filling or stroking"""
- self.curpath = []
-
- def do_W(self) -> None:
- """Set clipping path using nonzero winding number rule"""
-
- def do_W_a(self) -> None:
- """Set clipping path using even-odd rule"""
-
- def do_CS(self, name: PDFStackT) -> None:
- """Set color space for stroking operations
-
- Introduced in PDF 1.1
- """
- try:
- self.scs = self.csmap[literal_name(name)]
- except KeyError:
- if settings.STRICT:
- raise PDFInterpreterError("Undefined ColorSpace: %r" % name)
-
- def do_cs(self, name: PDFStackT) -> None:
- """Set color space for nonstroking operations"""
- try:
- self.ncs = self.csmap[literal_name(name)]
- except KeyError:
- if settings.STRICT:
- raise PDFInterpreterError("Undefined ColorSpace: %r" % name)
-
- def do_G(self, gray: PDFStackT) -> None:
- """Set gray level for stroking operations"""
- self.graphicstate.scolor = cast(float, gray)
- self.scs = self.csmap["DeviceGray"]
-
- def do_g(self, gray: PDFStackT) -> None:
- """Set gray level for nonstroking operations"""
- self.graphicstate.ncolor = cast(float, gray)
- self.ncs = self.csmap["DeviceGray"]
-
- def do_RG(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None:
- """Set RGB color for stroking operations"""
- self.graphicstate.scolor = (cast(float, r), cast(float, g), cast(float, b))
- self.scs = self.csmap["DeviceRGB"]
-
- def do_rg(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None:
- """Set RGB color for nonstroking operations"""
- self.graphicstate.ncolor = (cast(float, r), cast(float, g), cast(float, b))
- self.ncs = self.csmap["DeviceRGB"]
-
- def do_K(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT) -> None:
- """Set CMYK color for stroking operations"""
- self.graphicstate.scolor = (
- cast(float, c),
- cast(float, m),
- cast(float, y),
- cast(float, k),
- )
- self.scs = self.csmap["DeviceCMYK"]
-
- def do_k(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT) -> None:
- """Set CMYK color for nonstroking operations"""
- self.graphicstate.ncolor = (
- cast(float, c),
- cast(float, m),
- cast(float, y),
- cast(float, k),
- )
- self.ncs = self.csmap["DeviceCMYK"]
-
def do_SCN(self) -> None:
"""Set color for stroking operations."""
if self.scs:
@@ -733,223 +187,6 @@ def do_sc(self) -> None:
"""Set color for nonstroking operations"""
return self.do_scn()
- def do_sh(self, name: object) -> None:
- """Paint area defined by shading pattern"""
-
- def do_BT(self) -> None:
- """Begin text object
-
- Initializing the text matrix, Tm, and the text line matrix, Tlm, to
- the identity matrix. Text objects cannot be nested; a second BT cannot
- appear before an ET.
- """
- self.textstate.reset()
-
- def do_ET(self) -> None:
- """End a text object"""
-
- def do_BX(self) -> None:
- """Begin compatibility section"""
-
- def do_EX(self) -> None:
- """End compatibility section"""
-
- def do_MP(self, tag: PDFStackT) -> None:
- """Define marked-content point"""
- self.device.do_tag(cast(PSLiteral, tag))
-
- def do_DP(self, tag: PDFStackT, props: PDFStackT) -> None:
- """Define marked-content point with property list"""
- self.device.do_tag(cast(PSLiteral, tag), props)
-
- def do_BMC(self, tag: PDFStackT) -> None:
- """Begin marked-content sequence"""
- self.device.begin_tag(cast(PSLiteral, tag))
-
- def do_BDC(self, tag: PDFStackT, props: PDFStackT) -> None:
- """Begin marked-content sequence with property list"""
- self.device.begin_tag(cast(PSLiteral, tag), props)
-
- def do_EMC(self) -> None:
- """End marked-content sequence"""
- self.device.end_tag()
-
- def do_Tc(self, space: PDFStackT) -> None:
- """Set character spacing.
-
- Character spacing is used by the Tj, TJ, and ' operators.
-
- :param space: a number expressed in unscaled text space units.
- """
- self.textstate.charspace = cast(float, space)
-
- def do_Tw(self, space: PDFStackT) -> None:
- """Set the word spacing.
-
- Word spacing is used by the Tj, TJ, and ' operators.
-
- :param space: a number expressed in unscaled text space units
- """
- self.textstate.wordspace = cast(float, space)
-
- def do_Tz(self, scale: PDFStackT) -> None:
- """Set the horizontal scaling.
-
- :param scale: is a number specifying the percentage of the normal width
- """
- self.textstate.scaling = cast(float, scale)
-
- def do_TL(self, leading: PDFStackT) -> None:
- """Set the text leading.
-
- Text leading is used only by the T*, ', and " operators.
-
- :param leading: a number expressed in unscaled text space units
- """
- self.textstate.leading = -cast(float, leading)
-
- def do_Tf(self, fontid: PDFStackT, fontsize: PDFStackT) -> None:
- """Set the text font
-
- :param fontid: the name of a font resource in the Font subdictionary
- of the current resource dictionary
- :param fontsize: size is a number representing a scale factor.
- """
- try:
- self.textstate.font = self.fontmap[literal_name(fontid)]
- except KeyError:
- if settings.STRICT:
- raise PDFInterpreterError("Undefined Font id: %r" % fontid)
- self.textstate.font = self.rsrcmgr.get_font(None, {})
- self.textstate.fontsize = cast(float, fontsize)
-
- def do_Tr(self, render: PDFStackT) -> None:
- """Set the text rendering mode"""
- self.textstate.render = cast(int, render)
-
- def do_Ts(self, rise: PDFStackT) -> None:
- """Set the text rise
-
- :param rise: a number expressed in unscaled text space units
- """
- self.textstate.rise = cast(float, rise)
-
- def do_Td(self, tx: PDFStackT, ty: PDFStackT) -> None:
- """Move to the start of the next line
-
- Offset from the start of the current line by (tx , ty).
- """
- tx_ = safe_float(tx)
- ty_ = safe_float(ty)
- if tx_ is not None and ty_ is not None:
- (a, b, c, d, e, f) = self.textstate.matrix
- e_new = tx_ * a + ty_ * c + e
- f_new = tx_ * b + ty_ * d + f
- self.textstate.matrix = (a, b, c, d, e_new, f_new)
-
- elif settings.STRICT:
- raise PDFValueError(f"Invalid offset ({tx!r}, {ty!r}) for Td")
-
- self.textstate.linematrix = (0, 0)
-
- def do_TD(self, tx: PDFStackT, ty: PDFStackT) -> None:
- """Move to the start of the next line.
-
- offset from the start of the current line by (tx , ty). As a side effect, this
- operator sets the leading parameter in the text state.
- """
- tx_ = safe_float(tx)
- ty_ = safe_float(ty)
-
- if tx_ is not None and ty_ is not None:
- (a, b, c, d, e, f) = self.textstate.matrix
- e_new = tx_ * a + ty_ * c + e
- f_new = tx_ * b + ty_ * d + f
- self.textstate.matrix = (a, b, c, d, e_new, f_new)
-
- elif settings.STRICT:
- raise PDFValueError("Invalid offset ({tx}, {ty}) for TD")
-
- if ty_ is not None:
- self.textstate.leading = ty_
-
- self.textstate.linematrix = (0, 0)
-
- def do_Tm(
- self,
- a: PDFStackT,
- b: PDFStackT,
- c: PDFStackT,
- d: PDFStackT,
- e: PDFStackT,
- f: PDFStackT,
- ) -> None:
- """Set text matrix and text line matrix"""
- self.textstate.matrix = cast(Matrix, (a, b, c, d, e, f))
- self.textstate.linematrix = (0, 0)
-
- def do_T_a(self) -> None:
- """Move to start of next text line"""
- (a, b, c, d, e, f) = self.textstate.matrix
- self.textstate.matrix = (
- a,
- b,
- c,
- d,
- self.textstate.leading * c + e,
- self.textstate.leading * d + f,
- )
- self.textstate.linematrix = (0, 0)
-
- def do_TJ(self, seq: PDFStackT) -> None:
- """Show text, allowing individual glyph positioning"""
- if self.textstate.font is None:
- if settings.STRICT:
- raise PDFInterpreterError("No font specified!")
- return
- assert self.ncs is not None
- self.device.render_string(
- self.textstate,
- cast(PDFTextSeq, seq),
- self.ncs,
- self.graphicstate.copy(),
- )
-
- def do_Tj(self, s: PDFStackT) -> None:
- """Show text"""
- self.do_TJ([s])
-
- def do__q(self, s: PDFStackT) -> None:
- """Move to next line and show text
-
- The ' (single quote) operator.
- """
- self.do_T_a()
- self.do_TJ([s])
-
- def do__w(self, aw: PDFStackT, ac: PDFStackT, s: PDFStackT) -> None:
- """Set word and character spacing, move to next line, and show text
-
- The " (double quote) operator.
- """
- self.do_Tw(aw)
- self.do_Tc(ac)
- self.do_TJ([s])
-
- def do_BI(self) -> None:
- """Begin inline image object"""
-
- def do_ID(self) -> None:
- """Begin inline image data"""
-
- def do_EI(self, obj: PDFStackT) -> None:
- """End inline image object"""
- if isinstance(obj, PDFStream) and "W" in obj and "H" in obj:
- iobjid = str(id(obj))
- self.device.begin_figure(iobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
- self.device.render_image(iobjid, obj)
- self.device.end_figure(iobjid)
-
def do_Do(self, xobjid_arg: PDFStackT) -> None:
"""Invoke named XObject"""
xobjid = literal_name(xobjid_arg)
@@ -1055,7 +292,7 @@ def execute(self, streams: Sequence[object]) -> None:
return
while True:
try:
- _, (_, obj) = parser.nextobject()
+ (_, obj) = parser.nextobject()
except PSEOF:
break
if isinstance(obj, PSKeyword):
diff --git a/pdf2zh/pdfpage.py b/pdf2zh/pdfpage.py
deleted file mode 100644
index e6ac705..0000000
--- a/pdf2zh/pdfpage.py
+++ /dev/null
@@ -1,196 +0,0 @@
-import itertools
-import logging
-from typing import Any, BinaryIO, Container, Dict, Iterator, List, Optional, Set, Tuple
-
-from pdf2zh import settings
-from pdf2zh.pdfdocument import (
- PDFDocument,
- PDFNoPageLabels,
- PDFTextExtractionNotAllowed,
-)
-from pdf2zh.pdfexceptions import PDFObjectNotFound, PDFValueError
-from pdf2zh.pdfparser import PDFParser
-from pdf2zh.pdftypes import dict_value, int_value, list_value, resolve1
-from pdf2zh.psparser import LIT
-from pdf2zh.utils import parse_rect
-
-log = logging.getLogger(__name__)
-
-# some predefined literals and keywords.
-LITERAL_PAGE = LIT("Page")
-LITERAL_PAGES = LIT("Pages")
-
-
-class PDFPage:
- """An object that holds the information about a page.
-
- A PDFPage object is merely a convenience class that has a set
- of keys and values, which describe the properties of a page
- and point to its contents.
-
- Attributes
- ----------
- doc: a PDFDocument object.
- pageid: any Python object that can uniquely identify the page.
- attrs: a dictionary of page attributes.
- contents: a list of PDFStream objects that represents the page content.
- lastmod: the last modified time of the page.
- resources: a dictionary of resources used by the page.
- mediabox: the physical size of the page.
- cropbox: the crop rectangle of the page.
- rotate: the page rotation (in degree).
- annots: the page annotations.
- beads: a chain that represents natural reading order.
- label: the page's label (typically, the logical page number).
-
- """
-
- def __init__(
- self,
- doc: PDFDocument,
- pageid: object,
- attrs: object,
- label: Optional[str],
- ) -> None:
- """Initialize a page object.
-
- doc: a PDFDocument object.
- pageid: any Python object that can uniquely identify the page.
- attrs: a dictionary of page attributes.
- label: page label string.
- """
- self.doc = doc
- self.pageid = pageid
- self.pageno = 0
- self.attrs = dict_value(attrs)
- self.label = label
- self.lastmod = resolve1(self.attrs.get("LastModified"))
- self.resources: Dict[object, object] = resolve1(
- self.attrs.get("Resources", dict()),
- )
- mediabox_params: List[Any] = [
- resolve1(mediabox_param) for mediabox_param in self.attrs["MediaBox"]
- ]
- self.mediabox = parse_rect(resolve1(mediabox_params))
- self.cropbox = self.mediabox
- if "CropBox" in self.attrs:
- try:
- self.cropbox = parse_rect(resolve1(self.attrs["CropBox"]))
- except PDFValueError:
- pass
-
- self.rotate = (int_value(self.attrs.get("Rotate", 0)) + 360) % 360
- self.annots = self.attrs.get("Annots")
- self.beads = self.attrs.get("B")
- if "Contents" in self.attrs:
- contents = resolve1(self.attrs["Contents"])
- else:
- contents = []
- if not isinstance(contents, list):
- contents = [contents]
- self.contents: List[object] = contents
-
- def __repr__(self) -> str:
- return f""
-
- INHERITABLE_ATTRS = {"Resources", "MediaBox", "CropBox", "Rotate"}
-
- @classmethod
- def create_pages(cls, document: PDFDocument) -> Iterator["PDFPage"]:
- def depth_first_search(
- obj: Any,
- parent: Dict[str, Any],
- visited: Optional[Set[Any]] = None,
- ) -> Iterator[Tuple[int, Dict[Any, Dict[Any, Any]]]]:
- if isinstance(obj, int):
- object_id = obj
- object_properties = dict_value(document.getobj(object_id)).copy()
- else:
- # This looks broken. obj.objid means obj could be either
- # PDFObjRef or PDFStream, but neither is valid for dict_value.
- object_id = obj.objid # type: ignore[attr-defined]
- object_properties = dict_value(obj).copy()
-
- # Avoid recursion errors by keeping track of visited nodes
- if visited is None:
- visited = set()
- if object_id in visited:
- return
- visited.add(object_id)
-
- for k, v in parent.items():
- if k in cls.INHERITABLE_ATTRS and k not in object_properties:
- object_properties[k] = v
-
- object_type = object_properties.get("Type")
- if object_type is None and not settings.STRICT: # See #64
- object_type = object_properties.get("type")
-
- if object_type is LITERAL_PAGES and "Kids" in object_properties:
- # log.debug("Pages: Kids=%r", object_properties["Kids"])
- for child in list_value(object_properties["Kids"]):
- yield from depth_first_search(child, object_properties, visited)
-
- elif object_type is LITERAL_PAGE:
- # log.debug("Page: %r", object_properties)
- yield (object_id, object_properties)
-
- try:
- page_labels: Iterator[Optional[str]] = document.get_page_labels()
- except PDFNoPageLabels:
- page_labels = itertools.repeat(None)
-
- pages = False
- if "Pages" in document.catalog:
- objects = depth_first_search(document.catalog["Pages"], document.catalog)
- for objid, tree in objects:
- yield cls(document, objid, tree, next(page_labels))
- pages = True
- if not pages:
- # fallback when /Pages is missing.
- for xref in document.xrefs:
- for objid in xref.get_objids():
- try:
- obj = document.getobj(objid)
- if isinstance(obj, dict) and obj.get("Type") is LITERAL_PAGE:
- yield cls(document, objid, obj, next(page_labels))
- except PDFObjectNotFound:
- pass
-
- @classmethod
- def get_pages(
- cls,
- fp: BinaryIO,
- pagenos: Optional[Container[int]] = None,
- maxpages: int = 0,
- password: str = "",
- caching: bool = True,
- check_extractable: bool = False,
- ) -> Iterator["PDFPage"]:
- # Create a PDF parser object associated with the file object.
- parser = PDFParser(fp)
- # Create a PDF document object that stores the document structure.
- doc = PDFDocument(parser, password=password, caching=caching)
- # Check if the document allows text extraction.
- # If not, warn the user and proceed.
- if not doc.is_extractable:
- if check_extractable:
- error_msg = "Text extraction is not allowed: %r" % fp
- raise PDFTextExtractionNotAllowed(error_msg)
- else:
- warning_msg = (
- "The PDF %r contains a metadata field "
- "indicating that it should not allow "
- "text extraction. Ignoring this field "
- "and proceeding. Use the check_extractable "
- "if you want to raise an error in this case" % fp
- )
- log.warning(warning_msg)
- # Process each page contained in the document.
- for pageno, page in enumerate(cls.create_pages(doc)):
- page.pageno = pageno
- if pagenos and (pageno not in pagenos):
- continue
- yield page
- if maxpages and maxpages <= pageno + 1:
- break
diff --git a/pdf2zh/pdfparser.py b/pdf2zh/pdfparser.py
deleted file mode 100644
index 5b02150..0000000
--- a/pdf2zh/pdfparser.py
+++ /dev/null
@@ -1,166 +0,0 @@
-import logging
-from io import BytesIO
-from typing import TYPE_CHECKING, BinaryIO, Optional, Union
-
-from pdf2zh import settings
-from pdf2zh.casting import safe_int
-from pdf2zh.pdfexceptions import PDFException
-from pdf2zh.pdftypes import PDFObjRef, PDFStream, dict_value, int_value
-from pdf2zh.psexceptions import PSEOF
-from pdf2zh.psparser import KWD, PSKeyword, PSStackParser
-
-if TYPE_CHECKING:
- from pdf2zh.pdfdocument import PDFDocument
-
-log = logging.getLogger(__name__)
-
-
-class PDFSyntaxError(PDFException):
- pass
-
-
-# PDFParser stack holds all the base types plus PDFStream, PDFObjRef, and None
-class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
- """PDFParser fetch PDF objects from a file stream.
- It can handle indirect references by referring to
- a PDF document set by set_document method.
- It also reads XRefs at the end of every PDF file.
-
- Typical usage:
- parser = PDFParser(fp)
- parser.read_xref()
- parser.read_xref(fallback=True) # optional
- parser.set_document(doc)
- parser.seek(offset)
- parser.nextobject()
-
- """
-
- def __init__(self, fp: BinaryIO) -> None:
- PSStackParser.__init__(self, fp)
- self.doc: Optional[PDFDocument] = None
- self.fallback = False
-
- def set_document(self, doc: "PDFDocument") -> None:
- """Associates the parser with a PDFDocument object."""
- self.doc = doc
-
- KEYWORD_R = KWD(b"R")
- KEYWORD_NULL = KWD(b"null")
- KEYWORD_ENDOBJ = KWD(b"endobj")
- KEYWORD_STREAM = KWD(b"stream")
- KEYWORD_XREF = KWD(b"xref")
- KEYWORD_STARTXREF = KWD(b"startxref")
-
- def do_keyword(self, pos: int, token: PSKeyword) -> None:
- """Handles PDF-related keywords."""
- if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
- self.add_results(*self.pop(1))
-
- elif token is self.KEYWORD_ENDOBJ:
- self.add_results(*self.pop(4))
-
- elif token is self.KEYWORD_NULL:
- # null object
- self.push((pos, None))
-
- elif token is self.KEYWORD_R:
- # reference to indirect object
- if len(self.curstack) >= 2:
- (_, _object_id), _ = self.pop(2)
- object_id = safe_int(_object_id)
- if object_id is not None:
- obj = PDFObjRef(self.doc, object_id)
- self.push((pos, obj))
-
- elif token is self.KEYWORD_STREAM:
- # stream object
- ((_, dic),) = self.pop(1)
- dic = dict_value(dic)
- objlen = 0
- if not self.fallback:
- try:
- objlen = int_value(dic["Length"])
- except KeyError:
- if settings.STRICT:
- raise PDFSyntaxError("/Length is undefined: %r" % dic)
- self.seek(pos)
- try:
- (_, line) = self.nextline() # 'stream'
- except PSEOF:
- if settings.STRICT:
- raise PDFSyntaxError("Unexpected EOF")
- return
- pos += len(line)
- self.fp.seek(pos)
- data = bytearray(self.fp.read(objlen))
- self.seek(pos + objlen)
- while 1:
- try:
- (linepos, line) = self.nextline()
- except PSEOF:
- if settings.STRICT:
- raise PDFSyntaxError("Unexpected EOF")
- break
- if b"endstream" in line:
- i = line.index(b"endstream")
- objlen += i
- if self.fallback:
- data += line[:i]
- break
- objlen += len(line)
- if self.fallback:
- data += line
- self.seek(pos + objlen)
- # XXX limit objlen not to exceed object boundary
- # log.debug(
- # "Stream: pos=%d, objlen=%d, dic=%r, data=%r...",
- # pos,
- # objlen,
- # dic,
- # data[:10],
- # )
- assert self.doc is not None
- stream = PDFStream(dic, bytes(data), self.doc.decipher)
- self.push((pos, stream))
-
- else:
- # others
- self.push((pos, token))
-
-
-class PDFStreamParser(PDFParser):
- """PDFStreamParser is used to parse PDF content streams
- that is contained in each page and has instructions
- for rendering the page. A reference to a PDF document is
- needed because a PDF content stream can also have
- indirect references to other objects in the same document.
- """
-
- def __init__(self, data: bytes) -> None:
- PDFParser.__init__(self, BytesIO(data))
-
- def flush(self) -> None:
- self.add_results(*self.popall())
-
- KEYWORD_OBJ = KWD(b"obj")
-
- def do_keyword(self, pos: int, token: PSKeyword) -> None:
- if token is self.KEYWORD_R:
- # reference to indirect object
- (_, _object_id), _ = self.pop(2)
- object_id = safe_int(_object_id)
- if object_id is not None:
- obj = PDFObjRef(self.doc, object_id)
- self.push((pos, obj))
- return
-
- elif token in (self.KEYWORD_OBJ, self.KEYWORD_ENDOBJ):
- if settings.STRICT:
- # See PDF Spec 3.4.6: Only the object values are stored in the
- # stream; the obj and endobj keywords are not used.
- raise PDFSyntaxError("Keyword endobj found in stream")
- return
-
- # others
- self.push((pos, token))
diff --git a/pdf2zh/pdftypes.py b/pdf2zh/pdftypes.py
deleted file mode 100644
index 2563fef..0000000
--- a/pdf2zh/pdftypes.py
+++ /dev/null
@@ -1,397 +0,0 @@
-import io
-import logging
-import zlib
-from typing import (
- TYPE_CHECKING,
- Any,
- Dict,
- Iterable,
- List,
- Optional,
- Protocol,
- Tuple,
- Union,
- cast,
-)
-from warnings import warn
-
-from pdf2zh import pdfexceptions, settings
-from pdf2zh.ascii85 import ascii85decode, asciihexdecode
-from pdf2zh.ccitt import ccittfaxdecode
-from pdf2zh.lzw import lzwdecode
-from pdf2zh.psparser import LIT, PSObject
-from pdf2zh.runlength import rldecode
-from pdf2zh.utils import apply_png_predictor
-
-if TYPE_CHECKING:
- from pdf2zh.pdfdocument import PDFDocument
-
-logger = logging.getLogger(__name__)
-
-LITERAL_CRYPT = LIT("Crypt")
-
-# Abbreviation of Filter names in PDF 4.8.6. "Inline Images"
-LITERALS_FLATE_DECODE = (LIT("FlateDecode"), LIT("Fl"))
-LITERALS_LZW_DECODE = (LIT("LZWDecode"), LIT("LZW"))
-LITERALS_ASCII85_DECODE = (LIT("ASCII85Decode"), LIT("A85"))
-LITERALS_ASCIIHEX_DECODE = (LIT("ASCIIHexDecode"), LIT("AHx"))
-LITERALS_RUNLENGTH_DECODE = (LIT("RunLengthDecode"), LIT("RL"))
-LITERALS_CCITTFAX_DECODE = (LIT("CCITTFaxDecode"), LIT("CCF"))
-LITERALS_DCT_DECODE = (LIT("DCTDecode"), LIT("DCT"))
-LITERALS_JBIG2_DECODE = (LIT("JBIG2Decode"),)
-LITERALS_JPX_DECODE = (LIT("JPXDecode"),)
-
-
-class DecipherCallable(Protocol):
- """Fully typed a decipher callback, with optional parameter."""
-
- def __call__(
- self,
- objid: int,
- genno: int,
- data: bytes,
- attrs: Optional[Dict[str, Any]] = None,
- ) -> bytes:
- raise NotImplementedError
-
-
-class PDFObject(PSObject):
- pass
-
-
-# Adding aliases for these exceptions for backwards compatibility
-PDFException = pdfexceptions.PDFException
-PDFTypeError = pdfexceptions.PDFTypeError
-PDFValueError = pdfexceptions.PDFValueError
-PDFObjectNotFound = pdfexceptions.PDFObjectNotFound
-PDFNotImplementedError = pdfexceptions.PDFNotImplementedError
-
-_DEFAULT = object()
-
-
-class PDFObjRef(PDFObject):
- def __init__(
- self,
- doc: Optional["PDFDocument"],
- objid: int,
- _: Any = _DEFAULT,
- ) -> None:
- """Reference to a PDF object.
-
- :param doc: The PDF document.
- :param objid: The object number.
- :param _: Unused argument for backwards compatibility.
- """
- if _ is not _DEFAULT:
- warn(
- "The third argument of PDFObjRef is unused and will be removed after "
- "2024",
- DeprecationWarning,
- )
-
- if objid == 0:
- if settings.STRICT:
- raise PDFValueError("PDF object id cannot be 0.")
-
- self.doc = doc
- self.objid = objid
-
- def __repr__(self) -> str:
- return "" % (self.objid)
-
- def resolve(self, default: object = None) -> Any:
- assert self.doc is not None
- try:
- return self.doc.getobj(self.objid)
- except PDFObjectNotFound:
- return default
-
-
-def resolve1(x: object, default: object = None) -> Any:
- """Resolves an object.
-
- If this is an array or dictionary, it may still contains
- some indirect objects inside.
- """
- while isinstance(x, PDFObjRef):
- x = x.resolve(default=default)
- return x
-
-
-def resolve_all(x: object, default: object = None) -> Any:
- """Recursively resolves the given object and all the internals.
-
- Make sure there is no indirect reference within the nested object.
- This procedure might be slow.
- """
- while isinstance(x, PDFObjRef):
- x = x.resolve(default=default)
- if isinstance(x, list):
- x = [resolve_all(v, default=default) for v in x]
- elif isinstance(x, dict):
- for k, v in x.items():
- x[k] = resolve_all(v, default=default)
- return x
-
-
-def decipher_all(decipher: DecipherCallable, objid: int, genno: int, x: object) -> Any:
- """Recursively deciphers the given object."""
- if isinstance(x, bytes):
- if len(x) == 0:
- return x
- return decipher(objid, genno, x)
- if isinstance(x, list):
- x = [decipher_all(decipher, objid, genno, v) for v in x]
- elif isinstance(x, dict):
- for k, v in x.items():
- x[k] = decipher_all(decipher, objid, genno, v)
- return x
-
-
-def int_value(x: object) -> int:
- x = resolve1(x)
- if not isinstance(x, int):
- if settings.STRICT:
- raise PDFTypeError("Integer required: %r" % x)
- return 0
- return x
-
-
-def float_value(x: object) -> float:
- x = resolve1(x)
- if not isinstance(x, float):
- if settings.STRICT:
- raise PDFTypeError("Float required: %r" % x)
- return 0.0
- return x
-
-
-def num_value(x: object) -> float:
- x = resolve1(x)
- if not isinstance(x, (int, float)): # == utils.isnumber(x)
- if settings.STRICT:
- raise PDFTypeError("Int or Float required: %r" % x)
- return 0
- return x
-
-
-def uint_value(x: object, n_bits: int) -> int:
- """Resolve number and interpret it as a two's-complement unsigned number"""
- xi = int_value(x)
- if xi > 0:
- return xi
- else:
- return xi + cast(int, 2**n_bits)
-
-
-def str_value(x: object) -> bytes:
- x = resolve1(x)
- if not isinstance(x, bytes):
- if settings.STRICT:
- raise PDFTypeError("String required: %r" % x)
- return b""
- return x
-
-
-def list_value(x: object) -> Union[List[Any], Tuple[Any, ...]]:
- x = resolve1(x)
- if not isinstance(x, (list, tuple)):
- if settings.STRICT:
- raise PDFTypeError("List required: %r" % x)
- return []
- return x
-
-
-def dict_value(x: object) -> Dict[Any, Any]:
- x = resolve1(x)
- if not isinstance(x, dict):
- if settings.STRICT:
- logger.error("PDFTypeError : Dict required: %r", x)
- raise PDFTypeError("Dict required: %r" % x)
- return {}
- return x
-
-
-def stream_value(x: object) -> "PDFStream":
- x = resolve1(x)
- if not isinstance(x, PDFStream):
- if settings.STRICT:
- raise PDFTypeError("PDFStream required: %r" % x)
- return PDFStream({}, b"")
- return x
-
-
-def decompress_corrupted(data: bytes) -> bytes:
- """Called on some data that can't be properly decoded because of CRC checksum
- error. Attempt to decode it skipping the CRC.
- """
- d = zlib.decompressobj()
- f = io.BytesIO(data)
- result_str = b""
- buffer = f.read(1)
- i = 0
- try:
- while buffer:
- result_str += d.decompress(buffer)
- buffer = f.read(1)
- i += 1
- except zlib.error:
- # Let the error propagates if we're not yet in the CRC checksum
- if i < len(data) - 3:
- logger.warning("Data-loss while decompressing corrupted data")
- return result_str
-
-
-class PDFStream(PDFObject):
- def __init__(
- self,
- attrs: Dict[str, Any],
- rawdata: bytes,
- decipher: Optional[DecipherCallable] = None,
- ) -> None:
- assert isinstance(attrs, dict), str(type(attrs))
- self.attrs = attrs
- self.rawdata: Optional[bytes] = rawdata
- self.decipher = decipher
- self.data: Optional[bytes] = None
- self.objid: Optional[int] = None
- self.genno: Optional[int] = None
-
- def set_objid(self, objid: int, genno: int) -> None:
- self.objid = objid
- self.genno = genno
-
- def __repr__(self) -> str:
- if self.data is None:
- assert self.rawdata is not None
- return "" % (
- self.objid,
- len(self.rawdata),
- self.attrs,
- )
- else:
- assert self.data is not None
- return "" % (
- self.objid,
- len(self.data),
- self.attrs,
- )
-
- def __contains__(self, name: object) -> bool:
- return name in self.attrs
-
- def __getitem__(self, name: str) -> Any:
- return self.attrs[name]
-
- def get(self, name: str, default: object = None) -> Any:
- return self.attrs.get(name, default)
-
- def get_any(self, names: Iterable[str], default: object = None) -> Any:
- for name in names:
- if name in self.attrs:
- return self.attrs[name]
- return default
-
- def get_filters(self) -> List[Tuple[Any, Any]]:
- filters = self.get_any(("F", "Filter"))
- params = self.get_any(("DP", "DecodeParms", "FDecodeParms"), {})
- if not filters:
- return []
- if not isinstance(filters, list):
- filters = [filters]
- if not isinstance(params, list):
- # Make sure the parameters list is the same as filters.
- params = [params] * len(filters)
- if settings.STRICT and len(params) != len(filters):
- raise PDFException("Parameters len filter mismatch")
-
- resolved_filters = [resolve1(f) for f in filters]
- resolved_params = [resolve1(param) for param in params]
- return list(zip(resolved_filters, resolved_params))
-
- def decode(self) -> None:
- assert self.data is None and self.rawdata is not None, str(
- (self.data, self.rawdata),
- )
- data = self.rawdata
- if self.decipher:
- # Handle encryption
- assert self.objid is not None
- assert self.genno is not None
- data = self.decipher(self.objid, self.genno, data, self.attrs)
- filters = self.get_filters()
- if not filters:
- self.data = data
- self.rawdata = None
- return
- for f, params in filters:
- if f in LITERALS_FLATE_DECODE:
- # will get errors if the document is encrypted.
- try:
- data = zlib.decompress(data)
-
- except zlib.error as e:
- if settings.STRICT:
- error_msg = f"Invalid zlib bytes: {e!r}, {data!r}"
- raise PDFException(error_msg)
-
- try:
- data = decompress_corrupted(data)
- except zlib.error:
- data = b""
-
- elif f in LITERALS_LZW_DECODE:
- data = lzwdecode(data)
- elif f in LITERALS_ASCII85_DECODE:
- data = ascii85decode(data)
- elif f in LITERALS_ASCIIHEX_DECODE:
- data = asciihexdecode(data)
- elif f in LITERALS_RUNLENGTH_DECODE:
- data = rldecode(data)
- elif f in LITERALS_CCITTFAX_DECODE:
- data = ccittfaxdecode(data, params)
- elif f in LITERALS_DCT_DECODE:
- # This is probably a JPG stream
- # it does not need to be decoded twice.
- # Just return the stream to the user.
- pass
- elif f in LITERALS_JBIG2_DECODE or f in LITERALS_JPX_DECODE:
- pass
- elif f == LITERAL_CRYPT:
- # not yet..
- raise PDFNotImplementedError("/Crypt filter is unsupported")
- else:
- raise PDFNotImplementedError("Unsupported filter: %r" % f)
- # apply predictors
- if params and "Predictor" in params:
- pred = int_value(params["Predictor"])
- if pred == 1:
- # no predictor
- pass
- elif pred >= 10:
- # PNG predictor
- colors = int_value(params.get("Colors", 1))
- columns = int_value(params.get("Columns", 1))
- raw_bits_per_component = params.get("BitsPerComponent", 8)
- bitspercomponent = int_value(raw_bits_per_component)
- data = apply_png_predictor(
- pred,
- colors,
- columns,
- bitspercomponent,
- data,
- )
- else:
- error_msg = "Unsupported predictor: %r" % pred
- raise PDFNotImplementedError(error_msg)
- self.data = data
- self.rawdata = None
-
- def get_data(self) -> bytes:
- if self.data is None:
- self.decode()
- assert self.data is not None
- return self.data
-
- def get_rawdata(self) -> Optional[bytes]:
- return self.rawdata
diff --git a/pdf2zh/psexceptions.py b/pdf2zh/psexceptions.py
deleted file mode 100644
index b8291dc..0000000
--- a/pdf2zh/psexceptions.py
+++ /dev/null
@@ -1,18 +0,0 @@
-class PSException(Exception):
- pass
-
-
-class PSEOF(PSException):
- pass
-
-
-class PSSyntaxError(PSException):
- pass
-
-
-class PSTypeError(PSException):
- pass
-
-
-class PSValueError(PSException):
- pass
diff --git a/pdf2zh/psparser.py b/pdf2zh/psparser.py
deleted file mode 100644
index 1249153..0000000
--- a/pdf2zh/psparser.py
+++ /dev/null
@@ -1,656 +0,0 @@
-#!/usr/bin/env python3
-import io
-import logging
-import re
-from typing import (
- Any,
- BinaryIO,
- Dict,
- Generic,
- Iterator,
- List,
- Optional,
- Tuple,
- Type,
- TypeVar,
- Union,
-)
-
-from pdf2zh import psexceptions, settings
-from pdf2zh.utils import choplist
-
-log = logging.getLogger(__name__)
-
-
-# Adding aliases for these exceptions for backwards compatibility
-PSException = psexceptions.PSException
-PSEOF = psexceptions.PSEOF
-PSSyntaxError = psexceptions.PSSyntaxError
-PSTypeError = psexceptions.PSTypeError
-PSValueError = psexceptions.PSValueError
-
-
-class PSObject:
- """Base class for all PS or PDF-related data types."""
-
-
-class PSLiteral(PSObject):
- """A class that represents a PostScript literal.
-
- Postscript literals are used as identifiers, such as
- variable names, property names and dictionary keys.
- Literals are case sensitive and denoted by a preceding
- slash sign (e.g. "/Name")
-
- Note: Do not create an instance of PSLiteral directly.
- Always use PSLiteralTable.intern().
- """
-
- NameType = Union[str, bytes]
-
- def __init__(self, name: NameType) -> None:
- self.name = name
-
- def __repr__(self) -> str:
- name = self.name
- return "/%r" % name
-
-
-class PSKeyword(PSObject):
- """A class that represents a PostScript keyword.
-
- PostScript keywords are a dozen of predefined words.
- Commands and directives in PostScript are expressed by keywords.
- They are also used to denote the content boundaries.
-
- Note: Do not create an instance of PSKeyword directly.
- Always use PSKeywordTable.intern().
- """
-
- def __init__(self, name: bytes) -> None:
- self.name = name
-
- def __repr__(self) -> str:
- name = self.name
- return "/%r" % name
-
-
-_SymbolT = TypeVar("_SymbolT", PSLiteral, PSKeyword)
-
-
-class PSSymbolTable(Generic[_SymbolT]):
- """A utility class for storing PSLiteral/PSKeyword objects.
-
- Interned objects can be checked its identity with "is" operator.
- """
-
- def __init__(self, klass: Type[_SymbolT]) -> None:
- self.dict: Dict[PSLiteral.NameType, _SymbolT] = {}
- self.klass: Type[_SymbolT] = klass
-
- def intern(self, name: PSLiteral.NameType) -> _SymbolT:
- if name in self.dict:
- lit = self.dict[name]
- else:
- # Type confusion issue: PSKeyword always takes bytes as name
- # PSLiteral uses either str or bytes
- lit = self.klass(name) # type: ignore[arg-type]
- self.dict[name] = lit
- return lit
-
-
-PSLiteralTable = PSSymbolTable(PSLiteral)
-PSKeywordTable = PSSymbolTable(PSKeyword)
-LIT = PSLiteralTable.intern
-KWD = PSKeywordTable.intern
-KEYWORD_PROC_BEGIN = KWD(b"{")
-KEYWORD_PROC_END = KWD(b"}")
-KEYWORD_ARRAY_BEGIN = KWD(b"[")
-KEYWORD_ARRAY_END = KWD(b"]")
-KEYWORD_DICT_BEGIN = KWD(b"<<")
-KEYWORD_DICT_END = KWD(b">>")
-
-
-def literal_name(x: Any) -> str:
- if isinstance(x, PSLiteral):
- if isinstance(x.name, str):
- return x.name
- try:
- return str(x.name, "utf-8")
- except UnicodeDecodeError:
- return str(x.name)
- else:
- if settings.STRICT:
- raise PSTypeError(f"Literal required: {x!r}")
- return str(x)
-
-
-def keyword_name(x: Any) -> Any:
- if not isinstance(x, PSKeyword):
- if settings.STRICT:
- raise PSTypeError("Keyword required: %r" % x)
- else:
- name = x
- else:
- name = str(x.name, "utf-8", "ignore")
- return name
-
-
-EOL = re.compile(rb"[\r\n]")
-SPC = re.compile(rb"\s")
-NONSPC = re.compile(rb"\S")
-HEX = re.compile(rb"[0-9a-fA-F]")
-END_LITERAL = re.compile(rb"[#/%\[\]()<>{}\s]")
-END_HEX_STRING = re.compile(rb"[^\s0-9a-fA-F]")
-HEX_PAIR = re.compile(rb"[0-9a-fA-F]{2}|.")
-END_NUMBER = re.compile(rb"[^0-9]")
-END_KEYWORD = re.compile(rb"[#/%\[\]()<>{}\s]")
-END_STRING = re.compile(rb"[()\134]")
-OCT_STRING = re.compile(rb"[0-7]")
-ESC_STRING = {
- b"b": 8,
- b"t": 9,
- b"n": 10,
- b"f": 12,
- b"r": 13,
- b"(": 40,
- b")": 41,
- b"\\": 92,
-}
-
-
-PSBaseParserToken = Union[float, bool, PSLiteral, PSKeyword, bytes]
-
-
-class PSBaseParser:
- """Most basic PostScript parser that performs only tokenization."""
-
- BUFSIZ = 4096
-
- def __init__(self, fp: BinaryIO) -> None:
- self.fp = fp
- self.seek(0)
-
- def __repr__(self) -> str:
- return "<%s: %r, bufpos=%d>" % (self.__class__.__name__, self.fp, self.bufpos)
-
- def flush(self) -> None:
- pass
-
- def close(self) -> None:
- self.flush()
-
- def tell(self) -> int:
- return self.bufpos + self.charpos
-
- def poll(self, pos: Optional[int] = None, n: int = 80) -> None:
- pos0 = self.fp.tell()
- if not pos:
- pos = self.bufpos + self.charpos
- self.fp.seek(pos)
- # log.debug("poll(%d): %r", pos, self.fp.read(n))
- self.fp.seek(pos0)
-
- def seek(self, pos: int) -> None:
- """Seeks the parser to the given position."""
- # log.debug("seek: %r", pos)
- self.fp.seek(pos)
- # reset the status for nextline()
- self.bufpos = pos
- self.buf = b""
- self.charpos = 0
- # reset the status for nexttoken()
- self._parse1 = self._parse_main
- self._curtoken = b""
- self._curtokenpos = 0
- self._tokens: List[Tuple[int, PSBaseParserToken]] = []
-
- def fillbuf(self) -> None:
- if self.charpos < len(self.buf):
- return
- # fetch next chunk.
- self.bufpos = self.fp.tell()
- self.buf = self.fp.read(self.BUFSIZ)
- if not self.buf:
- raise PSEOF("Unexpected EOF")
- self.charpos = 0
-
- def nextline(self) -> Tuple[int, bytes]:
- """Fetches a next line that ends either with \\r or \\n."""
- linebuf = b""
- linepos = self.bufpos + self.charpos
- eol = False
- while 1:
- self.fillbuf()
- if eol:
- c = self.buf[self.charpos : self.charpos + 1]
- # handle b'\r\n'
- if c == b"\n":
- linebuf += c
- self.charpos += 1
- break
- m = EOL.search(self.buf, self.charpos)
- if m:
- linebuf += self.buf[self.charpos : m.end(0)]
- self.charpos = m.end(0)
- if linebuf[-1:] == b"\r":
- eol = True
- else:
- break
- else:
- linebuf += self.buf[self.charpos :]
- self.charpos = len(self.buf)
- # log.debug("nextline: %r, %r", linepos, linebuf)
-
- return (linepos, linebuf)
-
- def revreadlines(self) -> Iterator[bytes]:
- """Fetches a next line backword.
-
- This is used to locate the trailers at the end of a file.
- """
- self.fp.seek(0, io.SEEK_END)
- pos = self.fp.tell()
- buf = b""
- while pos > 0:
- prevpos = pos
- pos = max(0, pos - self.BUFSIZ)
- self.fp.seek(pos)
- s = self.fp.read(prevpos - pos)
- if not s:
- break
- while 1:
- n = max(s.rfind(b"\r"), s.rfind(b"\n"))
- if n == -1:
- buf = s + buf
- break
- yield s[n:] + buf
- s = s[:n]
- buf = b""
-
- def _parse_main(self, s: bytes, i: int) -> int:
- m = NONSPC.search(s, i)
- if not m:
- return len(s)
- j = m.start(0)
- c = s[j : j + 1]
- self._curtokenpos = self.bufpos + j
- if c == b"%":
- self._curtoken = b"%"
- self._parse1 = self._parse_comment
- return j + 1
- elif c == b"/":
- self._curtoken = b""
- self._parse1 = self._parse_literal
- return j + 1
- elif c in b"-+" or c.isdigit():
- self._curtoken = c
- self._parse1 = self._parse_number
- return j + 1
- elif c == b".":
- self._curtoken = c
- self._parse1 = self._parse_float
- return j + 1
- elif c.isalpha():
- self._curtoken = c
- self._parse1 = self._parse_keyword
- return j + 1
- elif c == b"(":
- self._curtoken = b""
- self.paren = 1
- self._parse1 = self._parse_string
- return j + 1
- elif c == b"<":
- self._curtoken = b""
- self._parse1 = self._parse_wopen
- return j + 1
- elif c == b">":
- self._curtoken = b""
- self._parse1 = self._parse_wclose
- return j + 1
- elif c == b"\x00":
- return j + 1
- else:
- self._add_token(KWD(c))
- return j + 1
-
- def _add_token(self, obj: PSBaseParserToken) -> None:
- self._tokens.append((self._curtokenpos, obj))
-
- def _parse_comment(self, s: bytes, i: int) -> int:
- m = EOL.search(s, i)
- if not m:
- self._curtoken += s[i:]
- return len(s)
- j = m.start(0)
- self._curtoken += s[i:j]
- self._parse1 = self._parse_main
- # We ignore comments.
- # self._tokens.append(self._curtoken)
- return j
-
- def _parse_literal(self, s: bytes, i: int) -> int:
- m = END_LITERAL.search(s, i)
- if not m:
- self._curtoken += s[i:]
- return len(s)
- j = m.start(0)
- self._curtoken += s[i:j]
- c = s[j : j + 1]
- if c == b"#":
- self.hex = b""
- self._parse1 = self._parse_literal_hex
- return j + 1
- try:
- name: Union[str, bytes] = str(self._curtoken, "utf-8")
- except Exception:
- name = self._curtoken
- self._add_token(LIT(name))
- self._parse1 = self._parse_main
- return j
-
- def _parse_literal_hex(self, s: bytes, i: int) -> int:
- c = s[i : i + 1]
- if HEX.match(c) and len(self.hex) < 2:
- self.hex += c
- return i + 1
- if self.hex:
- self._curtoken += bytes((int(self.hex, 16),))
- self._parse1 = self._parse_literal
- return i
-
- def _parse_number(self, s: bytes, i: int) -> int:
- m = END_NUMBER.search(s, i)
- if not m:
- self._curtoken += s[i:]
- return len(s)
- j = m.start(0)
- self._curtoken += s[i:j]
- c = s[j : j + 1]
- if c == b".":
- self._curtoken += c
- self._parse1 = self._parse_float
- return j + 1
- try:
- self._add_token(int(self._curtoken))
- except ValueError:
- pass
- self._parse1 = self._parse_main
- return j
-
- def _parse_float(self, s: bytes, i: int) -> int:
- m = END_NUMBER.search(s, i)
- if not m:
- self._curtoken += s[i:]
- return len(s)
- j = m.start(0)
- self._curtoken += s[i:j]
- try:
- self._add_token(float(self._curtoken))
- except ValueError:
- pass
- self._parse1 = self._parse_main
- return j
-
- def _parse_keyword(self, s: bytes, i: int) -> int:
- m = END_KEYWORD.search(s, i)
- if m:
- j = m.start(0)
- self._curtoken += s[i:j]
- else:
- # Use the rest of the stream if no non-keyword character is found. This
- # can happen if the keyword is the final bytes of the stream
- # (https://github.com/pdf2zh/pdf2zh.six/issues/884).
- j = len(s)
- self._curtoken += s[i:]
- if self._curtoken == b"true":
- token: Union[bool, PSKeyword] = True
- elif self._curtoken == b"false":
- token = False
- else:
- token = KWD(self._curtoken)
- self._add_token(token)
- self._parse1 = self._parse_main
- return j
-
- def _parse_string(self, s: bytes, i: int) -> int:
- m = END_STRING.search(s, i)
- if not m:
- self._curtoken += s[i:]
- return len(s)
- j = m.start(0)
- self._curtoken += s[i:j]
- c = s[j : j + 1]
- if c == b"\\":
- self.oct = b""
- self._parse1 = self._parse_string_1
- return j + 1
- if c == b"(":
- self.paren += 1
- self._curtoken += c
- return j + 1
- if c == b")":
- self.paren -= 1
- if self.paren:
- # WTF, they said balanced parens need no special treatment.
- self._curtoken += c
- return j + 1
- self._add_token(self._curtoken)
- self._parse1 = self._parse_main
- return j + 1
-
- def _parse_string_1(self, s: bytes, i: int) -> int:
- """Parse literal strings
-
- PDF Reference 3.2.3
- """
- c = s[i : i + 1]
- if OCT_STRING.match(c) and len(self.oct) < 3:
- self.oct += c
- return i + 1
-
- elif self.oct:
- chrcode = int(self.oct, 8)
- assert chrcode < 256, "Invalid octal %s (%d)" % (repr(self.oct), chrcode)
- self._curtoken += bytes((chrcode,))
- self._parse1 = self._parse_string
- return i
-
- elif c in ESC_STRING:
- self._curtoken += bytes((ESC_STRING[c],))
-
- elif c == b"\r" and len(s) > i + 1 and s[i + 1 : i + 2] == b"\n":
- # If current and next character is \r\n skip both because enters
- # after a \ are ignored
- i += 1
-
- # default action
- self._parse1 = self._parse_string
- return i + 1
-
- def _parse_wopen(self, s: bytes, i: int) -> int:
- c = s[i : i + 1]
- if c == b"<":
- self._add_token(KEYWORD_DICT_BEGIN)
- self._parse1 = self._parse_main
- i += 1
- else:
- self._parse1 = self._parse_hexstring
- return i
-
- def _parse_wclose(self, s: bytes, i: int) -> int:
- c = s[i : i + 1]
- if c == b">":
- self._add_token(KEYWORD_DICT_END)
- i += 1
- self._parse1 = self._parse_main
- return i
-
- def _parse_hexstring(self, s: bytes, i: int) -> int:
- m = END_HEX_STRING.search(s, i)
- if not m:
- self._curtoken += s[i:]
- return len(s)
- j = m.start(0)
- self._curtoken += s[i:j]
- token = HEX_PAIR.sub(
- lambda m: bytes((int(m.group(0), 16),)),
- SPC.sub(b"", self._curtoken),
- )
- self._add_token(token)
- self._parse1 = self._parse_main
- return j
-
- def nexttoken(self) -> Tuple[int, PSBaseParserToken]:
- while not self._tokens:
- self.fillbuf()
- self.charpos = self._parse1(self.buf, self.charpos)
- token = self._tokens.pop(0)
- # log.debug("nexttoken: %r", token)
- return token
-
-
-# Stack slots may by occupied by any of:
-# * the name of a literal
-# * the PSBaseParserToken types
-# * list (via KEYWORD_ARRAY)
-# * dict (via KEYWORD_DICT)
-# * subclass-specific extensions (e.g. PDFStream, PDFObjRef) via ExtraT
-ExtraT = TypeVar("ExtraT")
-PSStackType = Union[str, float, bool, PSLiteral, bytes, List, Dict, ExtraT]
-PSStackEntry = Tuple[int, PSStackType[ExtraT]]
-
-
-class PSStackParser(PSBaseParser, Generic[ExtraT]):
- def __init__(self, fp: BinaryIO) -> None:
- PSBaseParser.__init__(self, fp)
- self.reset()
-
- def reset(self) -> None:
- self.context: List[Tuple[int, Optional[str], List[PSStackEntry[ExtraT]]]] = []
- self.curtype: Optional[str] = None
- self.curstack: List[PSStackEntry[ExtraT]] = []
- self.results: List[PSStackEntry[ExtraT]] = []
-
- def seek(self, pos: int) -> None:
- PSBaseParser.seek(self, pos)
- self.reset()
-
- def push(self, *objs: PSStackEntry[ExtraT]) -> None:
- self.curstack.extend(objs)
-
- def pop(self, n: int) -> List[PSStackEntry[ExtraT]]:
- objs = self.curstack[-n:]
- self.curstack[-n:] = []
- return objs
-
- def popall(self) -> List[PSStackEntry[ExtraT]]:
- objs = self.curstack
- self.curstack = []
- return objs
-
- def add_results(self, *objs: PSStackEntry[ExtraT]) -> None:
- # try:
- # log.debug("add_results: %r", objs)
- # except Exception:
- # log.debug("add_results: (unprintable object)")
- self.results.extend(objs)
-
- def start_type(self, pos: int, type: str) -> None:
- self.context.append((pos, self.curtype, self.curstack))
- (self.curtype, self.curstack) = (type, [])
- # log.debug("start_type: pos=%r, type=%r", pos, type)
-
- def end_type(self, type: str) -> Tuple[int, List[PSStackType[ExtraT]]]:
- if self.curtype != type:
- raise PSTypeError(f"Type mismatch: {self.curtype!r} != {type!r}")
- objs = [obj for (_, obj) in self.curstack]
- (pos, self.curtype, self.curstack) = self.context.pop()
- # log.debug("end_type: pos=%r, type=%r, objs=%r", pos, type, objs)
- return (pos, objs)
-
- def do_keyword(self, pos: int, token: PSKeyword) -> None:
- pass
-
- def nextobject(self) -> PSStackEntry[ExtraT]:
- """Yields a list of objects.
-
- Arrays and dictionaries are represented as Python lists and
- dictionaries.
-
- :return: keywords, literals, strings, numbers, arrays and dictionaries.
- """
- end = None
- while not self.results:
- (pos, token) = self.nexttoken()
- if isinstance(token, (int, float, bool, str, bytes, PSLiteral)):
- # normal token
- self.push((pos, token))
- elif token == KEYWORD_ARRAY_BEGIN:
- # begin array
- self.start_type(pos, "a")
- elif token == KEYWORD_ARRAY_END:
- # end array
- try:
- self.push(self.end_type("a"))
- except PSTypeError:
- if settings.STRICT:
- raise
- elif token == KEYWORD_DICT_BEGIN:
- # begin dictionary
- self.start_type(pos, "d")
- elif token == KEYWORD_DICT_END:
- # end dictionary
- try:
- (pos, objs) = self.end_type("d")
- if len(objs) % 2 != 0:
- error_msg = "Invalid dictionary construct: %r" % objs
- raise PSSyntaxError(error_msg)
- d = {
- literal_name(k): v
- for (k, v) in choplist(2, objs)
- if v is not None
- }
- self.push((pos, d))
- except PSTypeError:
- if settings.STRICT:
- raise
- elif token == KEYWORD_PROC_BEGIN:
- # begin proc
- self.start_type(pos, "p")
- elif token == KEYWORD_PROC_END:
- # end proc
- try:
- self.push(self.end_type("p"))
- except PSTypeError:
- if settings.STRICT:
- raise
- elif isinstance(token, PSKeyword):
- # log.debug(
- # "do_keyword: pos=%r, token=%r, stack=%r",
- # pos,
- # token,
- # self.curstack,
- # )
- if token.name == b"endobj":
- end = pos + 7
- self.do_keyword(pos, token)
- else:
- log.error(
- "unknown token: pos=%r, token=%r, stack=%r",
- pos,
- token,
- self.curstack,
- )
- self.do_keyword(pos, token)
- raise PSException
- if self.context:
- continue
- else:
- self.flush()
- obj = self.results.pop(0)
- # try:
- # log.debug("nextobject: %r", obj)
- # except Exception:
- # log.debug("nextobject: (unprintable object)")
- return end, obj
diff --git a/pdf2zh/py.typed b/pdf2zh/py.typed
deleted file mode 100644
index e69de29..0000000
diff --git a/pdf2zh/runlength.py b/pdf2zh/runlength.py
deleted file mode 100644
index 2774e2a..0000000
--- a/pdf2zh/runlength.py
+++ /dev/null
@@ -1,39 +0,0 @@
-#
-# RunLength decoder (Adobe version) implementation based on PDF Reference
-# version 1.4 section 3.3.4.
-#
-# * public domain *
-#
-
-
-def rldecode(data: bytes) -> bytes:
- """RunLength decoder (Adobe version) implementation based on PDF Reference
- version 1.4 section 3.3.4:
- The RunLengthDecode filter decodes data that has been encoded in a
- simple byte-oriented format based on run length. The encoded data
- is a sequence of runs, where each run consists of a length byte
- followed by 1 to 128 bytes of data. If the length byte is in the
- range 0 to 127, the following length + 1 (1 to 128) bytes are
- copied literally during decompression. If length is in the range
- 129 to 255, the following single byte is to be copied 257 - length
- (2 to 128) times during decompression. A length value of 128
- denotes EOD.
- """
- decoded = b""
- i = 0
- while i < len(data):
- length = data[i]
- if length == 128:
- break
-
- if length >= 0 and length < 128:
- for j in range(i + 1, (i + 1) + (length + 1)):
- decoded += bytes((data[j],))
- i = (i + 1) + (length + 1)
-
- if length > 128:
- run = bytes((data[i + 1],)) * (257 - length)
- decoded += run
- i = (i + 1) + 1
-
- return decoded
diff --git a/pdf2zh/settings.py b/pdf2zh/settings.py
deleted file mode 100644
index 810077a..0000000
--- a/pdf2zh/settings.py
+++ /dev/null
@@ -1 +0,0 @@
-STRICT = False
diff --git a/pdf2zh/translator.py b/pdf2zh/translator.py
index 047b95d..ae272b6 100644
--- a/pdf2zh/translator.py
+++ b/pdf2zh/translator.py
@@ -7,6 +7,7 @@
import time
from datetime import UTC, datetime
from json import dumps, loads
+import unicodedata
import deepl
import ollama
@@ -16,6 +17,10 @@
from azure.core.credentials import AzureKeyCredential
+def remove_control_characters(s):
+ return "".join(ch for ch in s if unicodedata.category(ch)[0] != "C")
+
+
class BaseTranslator:
def __init__(self, service, lang_out, lang_in, model):
self.service = service
@@ -56,7 +61,7 @@ def translate(self, text):
raise ValueError("Empty translation result")
else:
result = html.unescape(re_result[0])
- return result
+ return remove_control_characters(result)
class TencentTranslator(BaseTranslator):
diff --git a/pdf2zh/utils.py b/pdf2zh/utils.py
deleted file mode 100644
index ad5643b..0000000
--- a/pdf2zh/utils.py
+++ /dev/null
@@ -1,834 +0,0 @@
-"""Miscellaneous Routines."""
-
-import io
-import pathlib
-import string
-import struct
-from html import escape
-from typing import (
- TYPE_CHECKING,
- Any,
- BinaryIO,
- Callable,
- Dict,
- Generic,
- Iterable,
- Iterator,
- List,
- Optional,
- Set,
- TextIO,
- Tuple,
- TypeVar,
- Union,
- cast,
-)
-
-from pdf2zh.pdfexceptions import PDFTypeError, PDFValueError
-
-if TYPE_CHECKING:
- from pdf2zh.layout import LTComponent
-
-import charset_normalizer # For str encoding detection
-
-# from sys import maxint as INF doesn't work anymore under Python3, but PDF
-# still uses 32 bits ints
-INF = (1 << 31) - 1
-
-
-FileOrName = Union[pathlib.PurePath, str, io.IOBase]
-AnyIO = Union[TextIO, BinaryIO]
-
-
-class open_filename:
- """Context manager that allows opening a filename
- (str or pathlib.PurePath type is supported) and closes it on exit,
- (just like `open`), but does nothing for file-like objects.
- """
-
- def __init__(self, filename: FileOrName, *args: Any, **kwargs: Any) -> None:
- if isinstance(filename, pathlib.PurePath):
- filename = str(filename)
- if isinstance(filename, str):
- self.file_handler: AnyIO = open(filename, *args, **kwargs)
- self.closing = True
- elif isinstance(filename, io.IOBase):
- self.file_handler = cast(AnyIO, filename)
- self.closing = False
- else:
- raise PDFTypeError("Unsupported input type: %s" % type(filename))
-
- def __enter__(self) -> AnyIO:
- return self.file_handler
-
- def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:
- if self.closing:
- self.file_handler.close()
-
-
-def make_compat_bytes(in_str: str) -> bytes:
- """Converts to bytes, encoding to unicode."""
- assert isinstance(in_str, str), str(type(in_str))
- return in_str.encode()
-
-
-def make_compat_str(o: object) -> str:
- """Converts everything to string, if bytes guessing the encoding."""
- if isinstance(o, bytes):
- enc = charset_normalizer.detect(o)
- try:
- return o.decode(enc["encoding"])
- except UnicodeDecodeError:
- return str(o)
- else:
- return str(o)
-
-
-def shorten_str(s: str, size: int) -> str:
- if size < 7:
- return s[:size]
- if len(s) > size:
- length = (size - 5) // 2
- return f"{s[:length]} ... {s[-length:]}"
- else:
- return s
-
-
-def compatible_encode_method(
- bytesorstring: Union[bytes, str],
- encoding: str = "utf-8",
- erraction: str = "ignore",
-) -> str:
- """When Py2 str.encode is called, it often means bytes.encode in Py3.
-
- This does either.
- """
- if isinstance(bytesorstring, str):
- return bytesorstring
- assert isinstance(bytesorstring, bytes), str(type(bytesorstring))
- return bytesorstring.decode(encoding, erraction)
-
-
-def paeth_predictor(left: int, above: int, upper_left: int) -> int:
- # From http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html
- # Initial estimate
- p = left + above - upper_left
- # Distances to a,b,c
- pa = abs(p - left)
- pb = abs(p - above)
- pc = abs(p - upper_left)
-
- # Return nearest of a,b,c breaking ties in order a,b,c
- if pa <= pb and pa <= pc:
- return left
- elif pb <= pc:
- return above
- else:
- return upper_left
-
-
-def apply_png_predictor(
- pred: int,
- colors: int,
- columns: int,
- bitspercomponent: int,
- data: bytes,
-) -> bytes:
- """Reverse the effect of the PNG predictor
-
- Documentation: http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html
- """
- if bitspercomponent not in [8, 1]:
- msg = "Unsupported `bitspercomponent': %d" % bitspercomponent
- raise PDFValueError(msg)
-
- nbytes = colors * columns * bitspercomponent // 8
- bpp = colors * bitspercomponent // 8 # number of bytes per complete pixel
- buf = []
- line_above = list(b"\x00" * columns)
- for scanline_i in range(0, len(data), nbytes + 1):
- filter_type = data[scanline_i]
- line_encoded = data[scanline_i + 1 : scanline_i + 1 + nbytes]
- raw = []
-
- if filter_type == 0:
- # Filter type 0: None
- raw = list(line_encoded)
-
- elif filter_type == 1:
- # Filter type 1: Sub
- # To reverse the effect of the Sub() filter after decompression,
- # output the following value:
- # Raw(x) = Sub(x) + Raw(x - bpp)
- # (computed mod 256), where Raw() refers to the bytes already
- # decoded.
- for j, sub_x in enumerate(line_encoded):
- if j - bpp < 0:
- raw_x_bpp = 0
- else:
- raw_x_bpp = int(raw[j - bpp])
- raw_x = (sub_x + raw_x_bpp) & 255
- raw.append(raw_x)
-
- elif filter_type == 2:
- # Filter type 2: Up
- # To reverse the effect of the Up() filter after decompression,
- # output the following value:
- # Raw(x) = Up(x) + Prior(x)
- # (computed mod 256), where Prior() refers to the decoded bytes of
- # the prior scanline.
- for up_x, prior_x in zip(line_encoded, line_above):
- raw_x = (up_x + prior_x) & 255
- raw.append(raw_x)
-
- elif filter_type == 3:
- # Filter type 3: Average
- # To reverse the effect of the Average() filter after
- # decompression, output the following value:
- # Raw(x) = Average(x) + floor((Raw(x-bpp)+Prior(x))/2)
- # where the result is computed mod 256, but the prediction is
- # calculated in the same way as for encoding. Raw() refers to the
- # bytes already decoded, and Prior() refers to the decoded bytes of
- # the prior scanline.
- for j, average_x in enumerate(line_encoded):
- if j - bpp < 0:
- raw_x_bpp = 0
- else:
- raw_x_bpp = int(raw[j - bpp])
- prior_x = int(line_above[j])
- raw_x = (average_x + (raw_x_bpp + prior_x) // 2) & 255
- raw.append(raw_x)
-
- elif filter_type == 4:
- # Filter type 4: Paeth
- # To reverse the effect of the Paeth() filter after decompression,
- # output the following value:
- # Raw(x) = Paeth(x)
- # + PaethPredictor(Raw(x-bpp), Prior(x), Prior(x-bpp))
- # (computed mod 256), where Raw() and Prior() refer to bytes
- # already decoded. Exactly the same PaethPredictor() function is
- # used by both encoder and decoder.
- for j, paeth_x in enumerate(line_encoded):
- if j - bpp < 0:
- raw_x_bpp = 0
- prior_x_bpp = 0
- else:
- raw_x_bpp = int(raw[j - bpp])
- prior_x_bpp = int(line_above[j - bpp])
- prior_x = int(line_above[j])
- paeth = paeth_predictor(raw_x_bpp, prior_x, prior_x_bpp)
- raw_x = (paeth_x + paeth) & 255
- raw.append(raw_x)
-
- else:
- raise PDFValueError("Unsupported predictor value: %d" % filter_type)
-
- buf.extend(raw)
- line_above = raw
- return bytes(buf)
-
-
-Point = Tuple[float, float]
-Rect = Tuple[float, float, float, float]
-Matrix = Tuple[float, float, float, float, float, float]
-PathSegment = Union[
- Tuple[str], # Literal['h']
- Tuple[str, float, float], # Literal['m', 'l']
- Tuple[str, float, float, float, float], # Literal['v', 'y']
- Tuple[str, float, float, float, float, float, float],
-] # Literal['c']
-
-# Matrix operations
-MATRIX_IDENTITY: Matrix = (1, 0, 0, 1, 0, 0)
-
-
-def parse_rect(o: Any) -> Rect:
- try:
- (x0, y0, x1, y1) = o
- return float(x0), float(y0), float(x1), float(y1)
- except ValueError:
- raise PDFValueError("Could not parse rectangle")
-
-
-def mult_matrix(m1: Matrix, m0: Matrix) -> Matrix:
- (a1, b1, c1, d1, e1, f1) = m1
- (a0, b0, c0, d0, e0, f0) = m0
- """Returns the multiplication of two matrices."""
- return (
- a0 * a1 + c0 * b1,
- b0 * a1 + d0 * b1,
- a0 * c1 + c0 * d1,
- b0 * c1 + d0 * d1,
- a0 * e1 + c0 * f1 + e0,
- b0 * e1 + d0 * f1 + f0,
- )
-
-
-def translate_matrix(m: Matrix, v: Point) -> Matrix:
- """Translates a matrix by (x, y)."""
- (a, b, c, d, e, f) = m
- (x, y) = v
- return a, b, c, d, x * a + y * c + e, x * b + y * d + f
-
-
-def apply_matrix_pt(m: Matrix, v: Point) -> Point:
- (a, b, c, d, e, f) = m
- (x, y) = v
- """Applies a matrix to a point."""
- return a * x + c * y + e, b * x + d * y + f
-
-
-def apply_matrix_norm(m: Matrix, v: Point) -> Point:
- """Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))"""
- (a, b, c, d, e, f) = m
- (p, q) = v
- return a * p + c * q, b * p + d * q
-
-
-def matrix_scale(m: Matrix) -> float:
- (a, b, c, d, e, f) = m
- return (a**2 + c**2) ** 0.5
-
-
-# Utility functions
-
-
-def isnumber(x: object) -> bool:
- return isinstance(x, (int, float))
-
-
-_T = TypeVar("_T")
-
-
-def uniq(objs: Iterable[_T]) -> Iterator[_T]:
- """Eliminates duplicated elements."""
- done = set()
- for obj in objs:
- if obj in done:
- continue
- done.add(obj)
- yield obj
-
-
-def fsplit(pred: Callable[[_T], bool], objs: Iterable[_T]) -> Tuple[List[_T], List[_T]]:
- """Split a list into two classes according to the predicate."""
- t = []
- f = []
- for obj in objs:
- if pred(obj):
- t.append(obj)
- else:
- f.append(obj)
- return t, f
-
-
-def drange(v0: float, v1: float, d: int) -> range:
- """Returns a discrete range."""
- return range(int(v0) // d, int(v1 + d) // d)
-
-
-def get_bound(pts: Iterable[Point]) -> Rect:
- """Compute a minimal rectangle that covers all the points."""
- limit: Rect = (INF, INF, -INF, -INF)
- (x0, y0, x1, y1) = limit
- for x, y in pts:
- x0 = min(x0, x)
- y0 = min(y0, y)
- x1 = max(x1, x)
- y1 = max(y1, y)
- return x0, y0, x1, y1
-
-
-def pick(
- seq: Iterable[_T],
- func: Callable[[_T], float],
- maxobj: Optional[_T] = None,
-) -> Optional[_T]:
- """Picks the object obj where func(obj) has the highest value."""
- maxscore = None
- for obj in seq:
- score = func(obj)
- if maxscore is None or maxscore < score:
- (maxscore, maxobj) = (score, obj)
- return maxobj
-
-
-def choplist(n: int, seq: Iterable[_T]) -> Iterator[Tuple[_T, ...]]:
- """Groups every n elements of the list."""
- r = []
- for x in seq:
- r.append(x)
- if len(r) == n:
- yield tuple(r)
- r = []
-
-
-def nunpack(s: bytes, default: int = 0) -> int:
- """Unpacks 1 to 4 or 8 byte integers (big endian)."""
- length = len(s)
- if not length:
- return default
- elif length == 1:
- return ord(s)
- elif length == 2:
- return cast(int, struct.unpack(">H", s)[0])
- elif length == 3:
- return cast(int, struct.unpack(">L", b"\x00" + s)[0])
- elif length == 4:
- return cast(int, struct.unpack(">L", s)[0])
- elif length == 8:
- return cast(int, struct.unpack(">Q", s)[0])
- else:
- raise PDFTypeError("invalid length: %d" % length)
-
-
-PDFDocEncoding = "".join(
- chr(x)
- for x in (
- 0x0000,
- 0x0001,
- 0x0002,
- 0x0003,
- 0x0004,
- 0x0005,
- 0x0006,
- 0x0007,
- 0x0008,
- 0x0009,
- 0x000A,
- 0x000B,
- 0x000C,
- 0x000D,
- 0x000E,
- 0x000F,
- 0x0010,
- 0x0011,
- 0x0012,
- 0x0013,
- 0x0014,
- 0x0015,
- 0x0017,
- 0x0017,
- 0x02D8,
- 0x02C7,
- 0x02C6,
- 0x02D9,
- 0x02DD,
- 0x02DB,
- 0x02DA,
- 0x02DC,
- 0x0020,
- 0x0021,
- 0x0022,
- 0x0023,
- 0x0024,
- 0x0025,
- 0x0026,
- 0x0027,
- 0x0028,
- 0x0029,
- 0x002A,
- 0x002B,
- 0x002C,
- 0x002D,
- 0x002E,
- 0x002F,
- 0x0030,
- 0x0031,
- 0x0032,
- 0x0033,
- 0x0034,
- 0x0035,
- 0x0036,
- 0x0037,
- 0x0038,
- 0x0039,
- 0x003A,
- 0x003B,
- 0x003C,
- 0x003D,
- 0x003E,
- 0x003F,
- 0x0040,
- 0x0041,
- 0x0042,
- 0x0043,
- 0x0044,
- 0x0045,
- 0x0046,
- 0x0047,
- 0x0048,
- 0x0049,
- 0x004A,
- 0x004B,
- 0x004C,
- 0x004D,
- 0x004E,
- 0x004F,
- 0x0050,
- 0x0051,
- 0x0052,
- 0x0053,
- 0x0054,
- 0x0055,
- 0x0056,
- 0x0057,
- 0x0058,
- 0x0059,
- 0x005A,
- 0x005B,
- 0x005C,
- 0x005D,
- 0x005E,
- 0x005F,
- 0x0060,
- 0x0061,
- 0x0062,
- 0x0063,
- 0x0064,
- 0x0065,
- 0x0066,
- 0x0067,
- 0x0068,
- 0x0069,
- 0x006A,
- 0x006B,
- 0x006C,
- 0x006D,
- 0x006E,
- 0x006F,
- 0x0070,
- 0x0071,
- 0x0072,
- 0x0073,
- 0x0074,
- 0x0075,
- 0x0076,
- 0x0077,
- 0x0078,
- 0x0079,
- 0x007A,
- 0x007B,
- 0x007C,
- 0x007D,
- 0x007E,
- 0x0000,
- 0x2022,
- 0x2020,
- 0x2021,
- 0x2026,
- 0x2014,
- 0x2013,
- 0x0192,
- 0x2044,
- 0x2039,
- 0x203A,
- 0x2212,
- 0x2030,
- 0x201E,
- 0x201C,
- 0x201D,
- 0x2018,
- 0x2019,
- 0x201A,
- 0x2122,
- 0xFB01,
- 0xFB02,
- 0x0141,
- 0x0152,
- 0x0160,
- 0x0178,
- 0x017D,
- 0x0131,
- 0x0142,
- 0x0153,
- 0x0161,
- 0x017E,
- 0x0000,
- 0x20AC,
- 0x00A1,
- 0x00A2,
- 0x00A3,
- 0x00A4,
- 0x00A5,
- 0x00A6,
- 0x00A7,
- 0x00A8,
- 0x00A9,
- 0x00AA,
- 0x00AB,
- 0x00AC,
- 0x0000,
- 0x00AE,
- 0x00AF,
- 0x00B0,
- 0x00B1,
- 0x00B2,
- 0x00B3,
- 0x00B4,
- 0x00B5,
- 0x00B6,
- 0x00B7,
- 0x00B8,
- 0x00B9,
- 0x00BA,
- 0x00BB,
- 0x00BC,
- 0x00BD,
- 0x00BE,
- 0x00BF,
- 0x00C0,
- 0x00C1,
- 0x00C2,
- 0x00C3,
- 0x00C4,
- 0x00C5,
- 0x00C6,
- 0x00C7,
- 0x00C8,
- 0x00C9,
- 0x00CA,
- 0x00CB,
- 0x00CC,
- 0x00CD,
- 0x00CE,
- 0x00CF,
- 0x00D0,
- 0x00D1,
- 0x00D2,
- 0x00D3,
- 0x00D4,
- 0x00D5,
- 0x00D6,
- 0x00D7,
- 0x00D8,
- 0x00D9,
- 0x00DA,
- 0x00DB,
- 0x00DC,
- 0x00DD,
- 0x00DE,
- 0x00DF,
- 0x00E0,
- 0x00E1,
- 0x00E2,
- 0x00E3,
- 0x00E4,
- 0x00E5,
- 0x00E6,
- 0x00E7,
- 0x00E8,
- 0x00E9,
- 0x00EA,
- 0x00EB,
- 0x00EC,
- 0x00ED,
- 0x00EE,
- 0x00EF,
- 0x00F0,
- 0x00F1,
- 0x00F2,
- 0x00F3,
- 0x00F4,
- 0x00F5,
- 0x00F6,
- 0x00F7,
- 0x00F8,
- 0x00F9,
- 0x00FA,
- 0x00FB,
- 0x00FC,
- 0x00FD,
- 0x00FE,
- 0x00FF,
- )
-)
-
-
-def decode_text(s: bytes) -> str:
- """Decodes a PDFDocEncoding string to Unicode."""
- if s.startswith(b"\xfe\xff"):
- return str(s[2:], "utf-16be", "ignore")
- else:
- return "".join(PDFDocEncoding[c] for c in s)
-
-
-def enc(x: str) -> str:
- """Encodes a string for SGML/XML/HTML"""
- if isinstance(x, bytes):
- return ""
- return escape(x)
-
-
-def bbox2str(bbox: Rect) -> str:
- (x0, y0, x1, y1) = bbox
- return f"{x0:.3f},{y0:.3f},{x1:.3f},{y1:.3f}"
-
-
-def matrix2str(m: Matrix) -> str:
- (a, b, c, d, e, f) = m
- return f"[{a:.2f},{b:.2f},{c:.2f},{d:.2f}, ({e:.2f},{f:.2f})]"
-
-
-def vecBetweenBoxes(obj1: "LTComponent", obj2: "LTComponent") -> Point:
- """A distance function between two TextBoxes.
-
- Consider the bounding rectangle for obj1 and obj2.
- Return vector between 2 boxes boundaries if they don't overlap, otherwise
- returns vector betweeen boxes centers
-
- +------+..........+ (x1, y1)
- | obj1 | :
- +------+www+------+
- : | obj2 |
- (x0, y0) +..........+------+
- """
- (x0, y0) = (min(obj1.x0, obj2.x0), min(obj1.y0, obj2.y0))
- (x1, y1) = (max(obj1.x1, obj2.x1), max(obj1.y1, obj2.y1))
- (ow, oh) = (x1 - x0, y1 - y0)
- (iw, ih) = (ow - obj1.width - obj2.width, oh - obj1.height - obj2.height)
- if iw < 0 and ih < 0:
- # if one is inside another we compute euclidean distance
- (xc1, yc1) = ((obj1.x0 + obj1.x1) / 2, (obj1.y0 + obj1.y1) / 2)
- (xc2, yc2) = ((obj2.x0 + obj2.x1) / 2, (obj2.y0 + obj2.y1) / 2)
- return xc1 - xc2, yc1 - yc2
- else:
- return max(0, iw), max(0, ih)
-
-
-LTComponentT = TypeVar("LTComponentT", bound="LTComponent")
-
-
-class Plane(Generic[LTComponentT]):
- """A set-like data structure for objects placed on a plane.
-
- Can efficiently find objects in a certain rectangular area.
- It maintains two parallel lists of objects, each of
- which is sorted by its x or y coordinate.
- """
-
- def __init__(self, bbox: Rect, gridsize: int = 50) -> None:
- self._seq: List[LTComponentT] = [] # preserve the object order.
- self._objs: Set[LTComponentT] = set()
- self._grid: Dict[Point, List[LTComponentT]] = {}
- self.gridsize = gridsize
- (self.x0, self.y0, self.x1, self.y1) = bbox
-
- def __repr__(self) -> str:
- return "" % list(self)
-
- def __iter__(self) -> Iterator[LTComponentT]:
- return (obj for obj in self._seq if obj in self._objs)
-
- def __len__(self) -> int:
- return len(self._objs)
-
- def __contains__(self, obj: object) -> bool:
- return obj in self._objs
-
- def _getrange(self, bbox: Rect) -> Iterator[Point]:
- (x0, y0, x1, y1) = bbox
- if x1 <= self.x0 or self.x1 <= x0 or y1 <= self.y0 or self.y1 <= y0:
- return
- x0 = max(self.x0, x0)
- y0 = max(self.y0, y0)
- x1 = min(self.x1, x1)
- y1 = min(self.y1, y1)
- for grid_y in drange(y0, y1, self.gridsize):
- for grid_x in drange(x0, x1, self.gridsize):
- yield (grid_x, grid_y)
-
- def extend(self, objs: Iterable[LTComponentT]) -> None:
- for obj in objs:
- self.add(obj)
-
- def add(self, obj: LTComponentT) -> None:
- """Place an object."""
- for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
- if k not in self._grid:
- r: List[LTComponentT] = []
- self._grid[k] = r
- else:
- r = self._grid[k]
- r.append(obj)
- self._seq.append(obj)
- self._objs.add(obj)
-
- def remove(self, obj: LTComponentT) -> None:
- """Displace an object."""
- for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
- try:
- self._grid[k].remove(obj)
- except (KeyError, ValueError):
- pass
- self._objs.remove(obj)
-
- def find(self, bbox: Rect) -> Iterator[LTComponentT]:
- """Finds objects that are in a certain area."""
- (x0, y0, x1, y1) = bbox
- done = set()
- for k in self._getrange(bbox):
- if k not in self._grid:
- continue
- for obj in self._grid[k]:
- if obj in done:
- continue
- done.add(obj)
- if obj.x1 <= x0 or x1 <= obj.x0 or obj.y1 <= y0 or y1 <= obj.y0:
- continue
- yield obj
-
-
-ROMAN_ONES = ["i", "x", "c", "m"]
-ROMAN_FIVES = ["v", "l", "d"]
-
-
-def format_int_roman(value: int) -> str:
- """Format a number as lowercase Roman numerals."""
- assert 0 < value < 4000
- result: List[str] = []
- index = 0
-
- while value != 0:
- value, remainder = divmod(value, 10)
- if remainder == 9:
- result.insert(0, ROMAN_ONES[index])
- result.insert(1, ROMAN_ONES[index + 1])
- elif remainder == 4:
- result.insert(0, ROMAN_ONES[index])
- result.insert(1, ROMAN_FIVES[index])
- else:
- over_five = remainder >= 5
- if over_five:
- result.insert(0, ROMAN_FIVES[index])
- remainder -= 5
- result.insert(1 if over_five else 0, ROMAN_ONES[index] * remainder)
- index += 1
-
- return "".join(result)
-
-
-def format_int_alpha(value: int) -> str:
- """Format a number as lowercase letters a-z, aa-zz, etc."""
- assert value > 0
- result: List[str] = []
-
- while value != 0:
- value, remainder = divmod(value - 1, len(string.ascii_lowercase))
- result.append(string.ascii_lowercase[remainder])
-
- result.reverse()
- return "".join(result)
-
-
-def get_device():
- """Get the device to use for computation."""
- try:
- import torch
-
- if torch.cuda.is_available():
- return "cuda:0"
- except ImportError:
- pass
-
- return "cpu"
diff --git a/pyproject.toml b/pyproject.toml
index e95c3f2..b61d955 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,7 @@ dependencies = [
"onnx",
"onnxruntime",
"opencv-python-headless",
+ "pdfminer.six>=20240706",
]
[project.optional-dependencies]
diff --git a/setup.cfg b/setup.cfg
index 053bd42..d4304f1 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,4 +1,4 @@
[flake8]
max-line-length = 120
-ignore = E203,W503,E261
+ignore = E203,E261,E501,W503,E741
exclude = .git,build,dist,docs
\ No newline at end of file