encodingdb.py 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127
  1. import logging
  2. import re
  3. from typing import Dict, Iterable, Optional, cast
  4. from pdf2zh.glyphlist import glyphname2unicode
  5. from pdf2zh.latin_enc import ENCODING
  6. from pdf2zh.pdfexceptions import PDFKeyError
  7. from pdf2zh.psparser import PSLiteral
  8. HEXADECIMAL = re.compile(r"[0-9a-fA-F]+")
  9. log = logging.getLogger(__name__)
  10. def name2unicode(name: str) -> str:
  11. """Converts Adobe glyph names to Unicode numbers.
  12. In contrast to the specification, this raises a KeyError instead of return
  13. an empty string when the key is unknown.
  14. This way the caller must explicitly define what to do
  15. when there is not a match.
  16. Reference:
  17. https://github.com/adobe-type-tools/agl-specification#2-the-mapping
  18. :returns unicode character if name resembles something,
  19. otherwise a KeyError
  20. """
  21. if not isinstance(name, str):
  22. raise PDFKeyError(
  23. 'Could not convert unicode name "%s" to character because '
  24. "it should be of type str but is of type %s" % (name, type(name)),
  25. )
  26. name = name.split(".")[0]
  27. components = name.split("_")
  28. if len(components) > 1:
  29. return "".join(map(name2unicode, components))
  30. elif name in glyphname2unicode:
  31. return glyphname2unicode[name]
  32. elif name.startswith("uni"):
  33. name_without_uni = name.strip("uni")
  34. if HEXADECIMAL.match(name_without_uni) and len(name_without_uni) % 4 == 0:
  35. unicode_digits = [
  36. int(name_without_uni[i : i + 4], base=16)
  37. for i in range(0, len(name_without_uni), 4)
  38. ]
  39. for digit in unicode_digits:
  40. raise_key_error_for_invalid_unicode(digit)
  41. characters = map(chr, unicode_digits)
  42. return "".join(characters)
  43. elif name.startswith("u"):
  44. name_without_u = name.strip("u")
  45. if HEXADECIMAL.match(name_without_u) and 4 <= len(name_without_u) <= 6:
  46. unicode_digit = int(name_without_u, base=16)
  47. raise_key_error_for_invalid_unicode(unicode_digit)
  48. return chr(unicode_digit)
  49. raise PDFKeyError(
  50. 'Could not convert unicode name "%s" to character because '
  51. "it does not match specification" % name,
  52. )
  53. def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None:
  54. """Unicode values should not be in the range D800 through DFFF because
  55. that is used for surrogate pairs in UTF-16
  56. :raises KeyError if unicode digit is invalid
  57. """
  58. if 55295 < unicode_digit < 57344:
  59. raise PDFKeyError(
  60. "Unicode digit %d is invalid because "
  61. "it is in the range D800 through DFFF" % unicode_digit,
  62. )
  63. class EncodingDB:
  64. std2unicode: Dict[int, str] = {}
  65. mac2unicode: Dict[int, str] = {}
  66. win2unicode: Dict[int, str] = {}
  67. pdf2unicode: Dict[int, str] = {}
  68. for name, std, mac, win, pdf in ENCODING:
  69. c = name2unicode(name)
  70. if std:
  71. std2unicode[std] = c
  72. if mac:
  73. mac2unicode[mac] = c
  74. if win:
  75. win2unicode[win] = c
  76. if pdf:
  77. pdf2unicode[pdf] = c
  78. encodings = {
  79. "StandardEncoding": std2unicode,
  80. "MacRomanEncoding": mac2unicode,
  81. "WinAnsiEncoding": win2unicode,
  82. "PDFDocEncoding": pdf2unicode,
  83. }
  84. @classmethod
  85. def get_encoding(
  86. cls,
  87. name: str,
  88. diff: Optional[Iterable[object]] = None,
  89. ) -> Dict[int, str]:
  90. cid2unicode = cls.encodings.get(name, cls.std2unicode)
  91. if diff:
  92. cid2unicode = cid2unicode.copy()
  93. cid = 0
  94. for x in diff:
  95. if isinstance(x, int):
  96. cid = x
  97. elif isinstance(x, PSLiteral):
  98. try:
  99. cid2unicode[cid] = name2unicode(cast(str, x.name))
  100. except (KeyError, ValueError):
  101. # log.debug(str(e))
  102. pass
  103. cid += 1
  104. return cid2unicode