| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127 |
- import logging
- import re
- from typing import Dict, Iterable, Optional, cast
- from pdf2zh.glyphlist import glyphname2unicode
- from pdf2zh.latin_enc import ENCODING
- from pdf2zh.pdfexceptions import PDFKeyError
- from pdf2zh.psparser import PSLiteral
# Matches a run of one or more hexadecimal digits (upper or lower case).
HEXADECIMAL = re.compile(r"[0-9a-fA-F]+")

# Module-level logger named after this module.
log = logging.getLogger(__name__)
def name2unicode(name: str) -> str:
    """Convert an Adobe glyph name to the Unicode string it denotes.

    In contrast to the specification, this raises a key error instead of
    returning an empty string when the name is unknown, so the caller must
    decide explicitly what to do when there is no match.

    Resolution order, after dropping everything from the first "." on:

    1. a name containing "_" is split and every component is converted
       recursively, the results being concatenated;
    2. a name present in ``glyphname2unicode`` is looked up directly;
    3. ``uni`` followed by groups of four hex digits yields one character
       per group;
    4. ``u`` followed by four to six hex digits yields a single character.

    Reference:
    https://github.com/adobe-type-tools/agl-specification#2-the-mapping

    :param name: glyph name, e.g. ``"uni20AC"`` or ``"u1F600"``
    :returns: the unicode character(s) the name stands for
    :raises PDFKeyError: if ``name`` is not a ``str``, matches none of the
        rules above, or encodes a surrogate or out-of-range code point
    """
    if not isinstance(name, str):
        raise PDFKeyError(
            'Could not convert unicode name "%s" to character because '
            "it should be of type str but is of type %s" % (name, type(name)),
        )

    # Only the part before the first "." identifies the glyph.
    name = name.split(".")[0]
    components = name.split("_")

    if len(components) > 1:
        return "".join(map(name2unicode, components))

    if name in glyphname2unicode:
        return glyphname2unicode[name]

    if name.startswith("uni"):
        # Slice the prefix off. str.strip("uni") would remove any of the
        # characters u/n/i from BOTH ends and accept malformed names.
        hex_digits = name[len("uni"):]
        # fullmatch: every character must be a hex digit, otherwise int()
        # below would raise ValueError instead of the documented key error.
        if HEXADECIMAL.fullmatch(hex_digits) and len(hex_digits) % 4 == 0:
            code_points = [
                int(hex_digits[i : i + 4], base=16)
                for i in range(0, len(hex_digits), 4)
            ]
            for code_point in code_points:
                raise_key_error_for_invalid_unicode(code_point)
            return "".join(map(chr, code_points))
    elif name.startswith("u"):
        hex_digits = name[len("u"):]
        if HEXADECIMAL.fullmatch(hex_digits) and 4 <= len(hex_digits) <= 6:
            code_point = int(hex_digits, base=16)
            raise_key_error_for_invalid_unicode(code_point)
            # Six hex digits can exceed the Unicode range; chr() would
            # raise ValueError, so report it as a key error instead.
            if code_point > 0x10FFFF:
                raise PDFKeyError(
                    "Unicode digit %d is invalid because "
                    "it is larger than 10FFFF" % code_point,
                )
            return chr(code_point)

    raise PDFKeyError(
        'Could not convert unicode name "%s" to character because '
        "it does not match specification" % name,
    )
def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None:
    """Reject code points that fall inside the UTF-16 surrogate block.

    The values D800 through DFFF are reserved for surrogate pairs in UTF-16
    and are therefore not accepted as characters.

    :param unicode_digit: code point to validate
    :raises PDFKeyError: if the code point lies in D800 through DFFF
    """
    in_surrogate_block = 0xD7FF < unicode_digit < 0xE000
    if not in_surrogate_block:
        return

    message = (
        "Unicode digit %d is invalid because "
        "it is in the range D800 through DFFF" % unicode_digit
    )
    raise PDFKeyError(message)
class EncodingDB:
    """Lookup tables from character codes to unicode strings.

    One table per predefined encoding is filled from ``ENCODING`` when the
    class body runs. ``get_encoding`` picks a table by name and can layer a
    differences list on top of it.
    """

    std2unicode: Dict[int, str] = {}
    mac2unicode: Dict[int, str] = {}
    win2unicode: Dict[int, str] = {}
    pdf2unicode: Dict[int, str] = {}

    # Each ENCODING row is a glyph name followed by its code in the std,
    # mac, win and pdf columns; a falsy code is not recorded for that column.
    for name, std, mac, win, pdf in ENCODING:
        c = name2unicode(name)
        if std:
            std2unicode[std] = c
        if mac:
            mac2unicode[mac] = c
        if win:
            win2unicode[win] = c
        if pdf:
            pdf2unicode[pdf] = c

    # Encoding name -> table built above.
    encodings = dict(
        StandardEncoding=std2unicode,
        MacRomanEncoding=mac2unicode,
        WinAnsiEncoding=win2unicode,
        PDFDocEncoding=pdf2unicode,
    )

    @classmethod
    def get_encoding(
        cls,
        name: str,
        diff: Optional[Iterable[object]] = None,
    ) -> Dict[int, str]:
        """Return the code-to-unicode table for the encoding called ``name``.

        An unknown ``name`` falls back to the standard table. Without a
        truthy ``diff`` the shared class-level table itself is returned.

        With ``diff``, a copy of the table is patched while walking the
        items in order: an ``int`` sets the current code; a ``PSLiteral`` is
        converted with ``name2unicode`` and stored at the current code, which
        then advances by one. Items of any other type are ignored.

        :param name: encoding name, e.g. ``"WinAnsiEncoding"``
        :param diff: optional sequence of codes and glyph-name literals
        :returns: mapping from character code to unicode string
        """
        base_table = cls.encodings.get(name, cls.std2unicode)
        if not diff:
            return base_table

        patched = base_table.copy()
        code = 0
        for item in diff:
            if isinstance(item, int):
                code = item
                continue
            if not isinstance(item, PSLiteral):
                continue
            try:
                patched[code] = name2unicode(cast(str, item.name))
            except (KeyError, ValueError):
                # Best effort: a glyph name that cannot be converted leaves
                # its slot untouched, but the code still advances below.
                pass
            code += 1
        return patched
|