pdfdocument.py 36 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069
  1. import itertools
  2. import logging
  3. import re
  4. import struct
  5. from hashlib import md5, sha256, sha384, sha512
  6. from typing import (
  7. Any,
  8. Callable,
  9. Dict,
  10. Iterable,
  11. Iterator,
  12. KeysView,
  13. List,
  14. Optional,
  15. Sequence,
  16. Tuple,
  17. Type,
  18. Union,
  19. cast,
  20. )
  21. from cryptography.hazmat.backends import default_backend
  22. from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
  23. from pdf2zh import settings
  24. from pdf2zh.arcfour import Arcfour
  25. from pdf2zh.data_structures import NumberTree
  26. from pdf2zh.pdfexceptions import (
  27. PDFException,
  28. PDFKeyError,
  29. PDFObjectNotFound,
  30. PDFTypeError,
  31. )
  32. from pdf2zh.pdfparser import PDFParser, PDFStreamParser, PDFSyntaxError
  33. from pdf2zh.pdftypes import (
  34. DecipherCallable,
  35. PDFStream,
  36. decipher_all,
  37. dict_value,
  38. int_value,
  39. list_value,
  40. str_value,
  41. stream_value,
  42. uint_value,
  43. )
  44. from pdf2zh.psexceptions import PSEOF
  45. from pdf2zh.psparser import KWD, LIT, literal_name
  46. from pdf2zh.utils import (
  47. choplist,
  48. decode_text,
  49. format_int_alpha,
  50. format_int_roman,
  51. nunpack,
  52. )
  53. log = logging.getLogger(__name__)
  54. class PDFNoValidXRef(PDFSyntaxError):
  55. pass
  56. class PDFNoValidXRefWarning(SyntaxWarning):
  57. """Legacy warning for missing xref.
  58. Not used anymore because warnings.warn is replaced by logger.Logger.warn.
  59. """
  60. class PDFNoOutlines(PDFException):
  61. pass
  62. class PDFNoPageLabels(PDFException):
  63. pass
  64. class PDFDestinationNotFound(PDFException):
  65. pass
  66. class PDFEncryptionError(PDFException):
  67. pass
  68. class PDFPasswordIncorrect(PDFEncryptionError):
  69. pass
  70. class PDFEncryptionWarning(UserWarning):
  71. """Legacy warning for failed decryption.
  72. Not used anymore because warnings.warn is replaced by logger.Logger.warn.
  73. """
  74. class PDFTextExtractionNotAllowedWarning(UserWarning):
  75. """Legacy warning for PDF that does not allow extraction.
  76. Not used anymore because warnings.warn is replaced by logger.Logger.warn.
  77. """
  78. class PDFTextExtractionNotAllowed(PDFEncryptionError):
  79. pass
  80. # some predefined literals and keywords.
  81. LITERAL_OBJSTM = LIT("ObjStm")
  82. LITERAL_XREF = LIT("XRef")
  83. LITERAL_CATALOG = LIT("Catalog")
  84. class PDFBaseXRef:
  85. def get_trailer(self) -> Dict[str, Any]:
  86. raise NotImplementedError
  87. def get_objids(self) -> Iterable[int]:
  88. return []
  89. # Must return
  90. # (strmid, index, genno)
  91. # or (None, pos, genno)
  92. def get_pos(self, objid: int) -> Tuple[Optional[int], int, int]:
  93. raise PDFKeyError(objid)
  94. def load(self, parser: PDFParser) -> None:
  95. raise NotImplementedError
  96. class PDFXRef(PDFBaseXRef):
  97. def __init__(self) -> None:
  98. self.offsets: Dict[int, Tuple[Optional[int], int, int]] = {}
  99. self.trailer: Dict[str, Any] = {}
  100. def __repr__(self) -> str:
  101. return "<PDFXRef: offsets=%r>" % (self.offsets.keys())
  102. def load(self, parser: PDFParser) -> None:
  103. while True:
  104. try:
  105. (pos, line) = parser.nextline()
  106. line = line.strip()
  107. if not line:
  108. continue
  109. except PSEOF:
  110. raise PDFNoValidXRef("Unexpected EOF - file corrupted?")
  111. if line.startswith(b"trailer"):
  112. parser.seek(pos)
  113. break
  114. f = line.split(b" ")
  115. if len(f) != 2:
  116. error_msg = f"Trailer not found: {parser!r}: line={line!r}"
  117. raise PDFNoValidXRef(error_msg)
  118. try:
  119. (start, nobjs) = map(int, f)
  120. except ValueError:
  121. error_msg = f"Invalid line: {parser!r}: line={line!r}"
  122. raise PDFNoValidXRef(error_msg)
  123. for objid in range(start, start + nobjs):
  124. try:
  125. (_, line) = parser.nextline()
  126. line = line.strip()
  127. except PSEOF:
  128. raise PDFNoValidXRef("Unexpected EOF - file corrupted?")
  129. f = line.split(b" ")
  130. if len(f) != 3:
  131. error_msg = f"Invalid XRef format: {parser!r}, line={line!r}"
  132. raise PDFNoValidXRef(error_msg)
  133. (pos_b, genno_b, use_b) = f
  134. if use_b != b"n":
  135. continue
  136. self.offsets[objid] = (None, int(pos_b), int(genno_b))
  137. # log.debug("xref objects: %r", self.offsets)
  138. self.load_trailer(parser)
  139. def load_trailer(self, parser: PDFParser) -> None:
  140. try:
  141. (_, kwd) = parser.nexttoken()
  142. assert kwd is KWD(b"trailer"), str(kwd)
  143. _, (_, dic) = parser.nextobject()
  144. except PSEOF:
  145. x = parser.pop(1)
  146. if not x:
  147. raise PDFNoValidXRef("Unexpected EOF - file corrupted")
  148. (_, dic) = x[0]
  149. self.trailer.update(dict_value(dic))
  150. # log.debug("trailer=%r", self.trailer)
  151. def get_trailer(self) -> Dict[str, Any]:
  152. return self.trailer
  153. def get_objids(self) -> KeysView[int]:
  154. return self.offsets.keys()
  155. def get_pos(self, objid: int) -> Tuple[Optional[int], int, int]:
  156. return self.offsets[objid]
  157. class PDFXRefFallback(PDFXRef):
  158. def __repr__(self) -> str:
  159. return "<PDFXRefFallback: offsets=%r>" % (self.offsets.keys())
  160. PDFOBJ_CUE = re.compile(r"^(\d+)\s+(\d+)\s+obj\b")
  161. def load(self, parser: PDFParser) -> None:
  162. parser.seek(0)
  163. while 1:
  164. try:
  165. (pos, line_bytes) = parser.nextline()
  166. except PSEOF:
  167. break
  168. if line_bytes.startswith(b"trailer"):
  169. parser.seek(pos)
  170. self.load_trailer(parser)
  171. # log.debug("trailer: %r", self.trailer)
  172. break
  173. line = line_bytes.decode("latin-1") # default pdf encoding
  174. m = self.PDFOBJ_CUE.match(line)
  175. if not m:
  176. continue
  177. (objid_s, genno_s) = m.groups()
  178. objid = int(objid_s)
  179. genno = int(genno_s)
  180. self.offsets[objid] = (None, pos, genno)
  181. # expand ObjStm.
  182. parser.seek(pos)
  183. _, (_, obj) = parser.nextobject()
  184. if isinstance(obj, PDFStream) and obj.get("Type") is LITERAL_OBJSTM:
  185. stream = stream_value(obj)
  186. try:
  187. n = stream["N"]
  188. except KeyError:
  189. if settings.STRICT:
  190. raise PDFSyntaxError("N is not defined: %r" % stream)
  191. n = 0
  192. parser1 = PDFStreamParser(stream.get_data())
  193. objs: List[int] = []
  194. try:
  195. while 1:
  196. _, (_, obj) = parser1.nextobject()
  197. objs.append(cast(int, obj))
  198. except PSEOF:
  199. pass
  200. n = min(n, len(objs) // 2)
  201. for index in range(n):
  202. objid1 = objs[index * 2]
  203. self.offsets[objid1] = (objid, index, 0)
  204. class PDFXRefStream(PDFBaseXRef):
  205. def __init__(self) -> None:
  206. self.data: Optional[bytes] = None
  207. self.entlen: Optional[int] = None
  208. self.fl1: Optional[int] = None
  209. self.fl2: Optional[int] = None
  210. self.fl3: Optional[int] = None
  211. self.ranges: List[Tuple[int, int]] = []
  212. def __repr__(self) -> str:
  213. return "<PDFXRefStream: ranges=%r>" % (self.ranges)
  214. def load(self, parser: PDFParser) -> None:
  215. (_, objid) = parser.nexttoken() # ignored
  216. (_, genno) = parser.nexttoken() # ignored
  217. (_, kwd) = parser.nexttoken()
  218. _, (_, stream) = parser.nextobject()
  219. if not isinstance(stream, PDFStream) or stream.get("Type") is not LITERAL_XREF:
  220. raise PDFNoValidXRef("Invalid PDF stream spec.")
  221. size = stream["Size"]
  222. index_array = stream.get("Index", (0, size))
  223. if len(index_array) % 2 != 0:
  224. raise PDFSyntaxError("Invalid index number")
  225. self.ranges.extend(cast(Iterator[Tuple[int, int]], choplist(2, index_array)))
  226. (self.fl1, self.fl2, self.fl3) = stream["W"]
  227. assert self.fl1 is not None and self.fl2 is not None and self.fl3 is not None
  228. self.data = stream.get_data()
  229. self.entlen = self.fl1 + self.fl2 + self.fl3
  230. self.trailer = stream.attrs
  231. # log.debug(
  232. # "xref stream: objid=%s, fields=%d,%d,%d",
  233. # ", ".join(map(repr, self.ranges)),
  234. # self.fl1,
  235. # self.fl2,
  236. # self.fl3,
  237. # )
  238. def get_trailer(self) -> Dict[str, Any]:
  239. return self.trailer
  240. def get_objids(self) -> Iterator[int]:
  241. for start, nobjs in self.ranges:
  242. for i in range(nobjs):
  243. assert self.entlen is not None
  244. assert self.data is not None
  245. offset = self.entlen * i
  246. ent = self.data[offset : offset + self.entlen]
  247. f1 = nunpack(ent[: self.fl1], 1)
  248. if f1 == 1 or f1 == 2:
  249. yield start + i
  250. def get_pos(self, objid: int) -> Tuple[Optional[int], int, int]:
  251. index = 0
  252. for start, nobjs in self.ranges:
  253. if start <= objid and objid < start + nobjs:
  254. index += objid - start
  255. break
  256. else:
  257. index += nobjs
  258. else:
  259. raise PDFKeyError(objid)
  260. assert self.entlen is not None
  261. assert self.data is not None
  262. assert self.fl1 is not None and self.fl2 is not None and self.fl3 is not None
  263. offset = self.entlen * index
  264. ent = self.data[offset : offset + self.entlen]
  265. f1 = nunpack(ent[: self.fl1], 1)
  266. f2 = nunpack(ent[self.fl1 : self.fl1 + self.fl2])
  267. f3 = nunpack(ent[self.fl1 + self.fl2 :])
  268. if f1 == 1:
  269. return (None, f2, f3)
  270. elif f1 == 2:
  271. return (f2, f3, 0)
  272. else:
  273. # this is a free object
  274. raise PDFKeyError(objid)
  275. class PDFStandardSecurityHandler:
  276. PASSWORD_PADDING = (
  277. b"(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08"
  278. b"..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz"
  279. )
  280. supported_revisions: Tuple[int, ...] = (2, 3)
  281. def __init__(
  282. self,
  283. docid: Sequence[bytes],
  284. param: Dict[str, Any],
  285. password: str = "",
  286. ) -> None:
  287. self.docid = docid
  288. self.param = param
  289. self.password = password
  290. self.init()
  291. def init(self) -> None:
  292. self.init_params()
  293. if self.r not in self.supported_revisions:
  294. error_msg = "Unsupported revision: param=%r" % self.param
  295. raise PDFEncryptionError(error_msg)
  296. self.init_key()
  297. def init_params(self) -> None:
  298. self.v = int_value(self.param.get("V", 0))
  299. self.r = int_value(self.param["R"])
  300. self.p = uint_value(self.param["P"], 32)
  301. self.o = str_value(self.param["O"])
  302. self.u = str_value(self.param["U"])
  303. self.length = int_value(self.param.get("Length", 40))
  304. def init_key(self) -> None:
  305. self.key = self.authenticate(self.password)
  306. if self.key is None:
  307. raise PDFPasswordIncorrect
  308. def is_printable(self) -> bool:
  309. return bool(self.p & 4)
  310. def is_modifiable(self) -> bool:
  311. return bool(self.p & 8)
  312. def is_extractable(self) -> bool:
  313. return bool(self.p & 16)
  314. def compute_u(self, key: bytes) -> bytes:
  315. if self.r == 2:
  316. # Algorithm 3.4
  317. return Arcfour(key).encrypt(self.PASSWORD_PADDING) # 2
  318. else:
  319. # Algorithm 3.5
  320. hash = md5(self.PASSWORD_PADDING) # 2
  321. hash.update(self.docid[0]) # 3
  322. result = Arcfour(key).encrypt(hash.digest()) # 4
  323. for i in range(1, 20): # 5
  324. k = b"".join(bytes((c ^ i,)) for c in iter(key))
  325. result = Arcfour(k).encrypt(result)
  326. result += result # 6
  327. return result
  328. def compute_encryption_key(self, password: bytes) -> bytes:
  329. # Algorithm 3.2
  330. password = (password + self.PASSWORD_PADDING)[:32] # 1
  331. hash = md5(password) # 2
  332. hash.update(self.o) # 3
  333. # See https://github.com/pdf2zh/pdf2zh.six/issues/186
  334. hash.update(struct.pack("<L", self.p)) # 4
  335. hash.update(self.docid[0]) # 5
  336. if self.r >= 4:
  337. if not cast(PDFStandardSecurityHandlerV4, self).encrypt_metadata:
  338. hash.update(b"\xff\xff\xff\xff")
  339. result = hash.digest()
  340. n = 5
  341. if self.r >= 3:
  342. n = self.length // 8
  343. for _ in range(50):
  344. result = md5(result[:n]).digest()
  345. return result[:n]
  346. def authenticate(self, password: str) -> Optional[bytes]:
  347. password_bytes = password.encode("latin1")
  348. key = self.authenticate_user_password(password_bytes)
  349. if key is None:
  350. key = self.authenticate_owner_password(password_bytes)
  351. return key
  352. def authenticate_user_password(self, password: bytes) -> Optional[bytes]:
  353. key = self.compute_encryption_key(password)
  354. if self.verify_encryption_key(key):
  355. return key
  356. else:
  357. return None
  358. def verify_encryption_key(self, key: bytes) -> bool:
  359. # Algorithm 3.6
  360. u = self.compute_u(key)
  361. if self.r == 2:
  362. return u == self.u
  363. return u[:16] == self.u[:16]
  364. def authenticate_owner_password(self, password: bytes) -> Optional[bytes]:
  365. # Algorithm 3.7
  366. password = (password + self.PASSWORD_PADDING)[:32]
  367. hash = md5(password)
  368. if self.r >= 3:
  369. for _ in range(50):
  370. hash = md5(hash.digest())
  371. n = 5
  372. if self.r >= 3:
  373. n = self.length // 8
  374. key = hash.digest()[:n]
  375. if self.r == 2:
  376. user_password = Arcfour(key).decrypt(self.o)
  377. else:
  378. user_password = self.o
  379. for i in range(19, -1, -1):
  380. k = b"".join(bytes((c ^ i,)) for c in iter(key))
  381. user_password = Arcfour(k).decrypt(user_password)
  382. return self.authenticate_user_password(user_password)
  383. def decrypt(
  384. self,
  385. objid: int,
  386. genno: int,
  387. data: bytes,
  388. attrs: Optional[Dict[str, Any]] = None,
  389. ) -> bytes:
  390. return self.decrypt_rc4(objid, genno, data)
  391. def decrypt_rc4(self, objid: int, genno: int, data: bytes) -> bytes:
  392. assert self.key is not None
  393. key = self.key + struct.pack("<L", objid)[:3] + struct.pack("<L", genno)[:2]
  394. hash = md5(key)
  395. key = hash.digest()[: min(len(key), 16)]
  396. return Arcfour(key).decrypt(data)
  397. class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
  398. supported_revisions: Tuple[int, ...] = (4,)
  399. def init_params(self) -> None:
  400. super().init_params()
  401. self.length = 128
  402. self.cf = dict_value(self.param.get("CF"))
  403. self.stmf = literal_name(self.param["StmF"])
  404. self.strf = literal_name(self.param["StrF"])
  405. self.encrypt_metadata = bool(self.param.get("EncryptMetadata", True))
  406. if self.stmf != self.strf:
  407. error_msg = "Unsupported crypt filter: param=%r" % self.param
  408. raise PDFEncryptionError(error_msg)
  409. self.cfm = {}
  410. for k, v in self.cf.items():
  411. f = self.get_cfm(literal_name(v["CFM"]))
  412. if f is None:
  413. error_msg = "Unknown crypt filter method: param=%r" % self.param
  414. raise PDFEncryptionError(error_msg)
  415. self.cfm[k] = f
  416. self.cfm["Identity"] = self.decrypt_identity
  417. if self.strf not in self.cfm:
  418. error_msg = "Undefined crypt filter: param=%r" % self.param
  419. raise PDFEncryptionError(error_msg)
  420. def get_cfm(self, name: str) -> Optional[Callable[[int, int, bytes], bytes]]:
  421. if name == "V2":
  422. return self.decrypt_rc4
  423. elif name == "AESV2":
  424. return self.decrypt_aes128
  425. else:
  426. return None
  427. def decrypt(
  428. self,
  429. objid: int,
  430. genno: int,
  431. data: bytes,
  432. attrs: Optional[Dict[str, Any]] = None,
  433. name: Optional[str] = None,
  434. ) -> bytes:
  435. if not self.encrypt_metadata and attrs is not None:
  436. t = attrs.get("Type")
  437. if t is not None and literal_name(t) == "Metadata":
  438. return data
  439. if name is None:
  440. name = self.strf
  441. return self.cfm[name](objid, genno, data)
  442. def decrypt_identity(self, objid: int, genno: int, data: bytes) -> bytes:
  443. return data
  444. def decrypt_aes128(self, objid: int, genno: int, data: bytes) -> bytes:
  445. assert self.key is not None
  446. key = (
  447. self.key
  448. + struct.pack("<L", objid)[:3]
  449. + struct.pack("<L", genno)[:2]
  450. + b"sAlT"
  451. )
  452. hash = md5(key)
  453. key = hash.digest()[: min(len(key), 16)]
  454. initialization_vector = data[:16]
  455. ciphertext = data[16:]
  456. cipher = Cipher(
  457. algorithms.AES(key),
  458. modes.CBC(initialization_vector),
  459. backend=default_backend(),
  460. ) # type: ignore
  461. return cipher.decryptor().update(ciphertext) # type: ignore
  462. class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
  463. supported_revisions = (5, 6)
  464. def init_params(self) -> None:
  465. super().init_params()
  466. self.length = 256
  467. self.oe = str_value(self.param["OE"])
  468. self.ue = str_value(self.param["UE"])
  469. self.o_hash = self.o[:32]
  470. self.o_validation_salt = self.o[32:40]
  471. self.o_key_salt = self.o[40:]
  472. self.u_hash = self.u[:32]
  473. self.u_validation_salt = self.u[32:40]
  474. self.u_key_salt = self.u[40:]
  475. def get_cfm(self, name: str) -> Optional[Callable[[int, int, bytes], bytes]]:
  476. if name == "AESV3":
  477. return self.decrypt_aes256
  478. else:
  479. return None
  480. def authenticate(self, password: str) -> Optional[bytes]:
  481. password_b = self._normalize_password(password)
  482. hash = self._password_hash(password_b, self.o_validation_salt, self.u)
  483. if hash == self.o_hash:
  484. hash = self._password_hash(password_b, self.o_key_salt, self.u)
  485. cipher = Cipher(
  486. algorithms.AES(hash),
  487. modes.CBC(b"\0" * 16),
  488. backend=default_backend(),
  489. ) # type: ignore
  490. return cipher.decryptor().update(self.oe) # type: ignore
  491. hash = self._password_hash(password_b, self.u_validation_salt)
  492. if hash == self.u_hash:
  493. hash = self._password_hash(password_b, self.u_key_salt)
  494. cipher = Cipher(
  495. algorithms.AES(hash),
  496. modes.CBC(b"\0" * 16),
  497. backend=default_backend(),
  498. ) # type: ignore
  499. return cipher.decryptor().update(self.ue) # type: ignore
  500. return None
  501. def _normalize_password(self, password: str) -> bytes:
  502. if self.r == 6:
  503. # saslprep expects non-empty strings, apparently
  504. if not password:
  505. return b""
  506. from pdf2zh._saslprep import saslprep
  507. password = saslprep(password)
  508. return password.encode("utf-8")[:127]
  509. def _password_hash(
  510. self,
  511. password: bytes,
  512. salt: bytes,
  513. vector: Optional[bytes] = None,
  514. ) -> bytes:
  515. """Compute password hash depending on revision number"""
  516. if self.r == 5:
  517. return self._r5_password(password, salt, vector)
  518. return self._r6_password(password, salt[0:8], vector)
  519. def _r5_password(
  520. self,
  521. password: bytes,
  522. salt: bytes,
  523. vector: Optional[bytes] = None,
  524. ) -> bytes:
  525. """Compute the password for revision 5"""
  526. hash = sha256(password)
  527. hash.update(salt)
  528. if vector is not None:
  529. hash.update(vector)
  530. return hash.digest()
  531. def _r6_password(
  532. self,
  533. password: bytes,
  534. salt: bytes,
  535. vector: Optional[bytes] = None,
  536. ) -> bytes:
  537. """Compute the password for revision 6"""
  538. initial_hash = sha256(password)
  539. initial_hash.update(salt)
  540. if vector is not None:
  541. initial_hash.update(vector)
  542. k = initial_hash.digest()
  543. hashes = (sha256, sha384, sha512)
  544. round_no = last_byte_val = 0
  545. while round_no < 64 or last_byte_val > round_no - 32:
  546. k1 = (password + k + (vector or b"")) * 64
  547. e = self._aes_cbc_encrypt(key=k[:16], iv=k[16:32], data=k1)
  548. # compute the first 16 bytes of e,
  549. # interpreted as an unsigned integer mod 3
  550. next_hash = hashes[self._bytes_mod_3(e[:16])]
  551. k = next_hash(e).digest()
  552. last_byte_val = e[len(e) - 1]
  553. round_no += 1
  554. return k[:32]
  555. @staticmethod
  556. def _bytes_mod_3(input_bytes: bytes) -> int:
  557. # 256 is 1 mod 3, so we can just sum 'em
  558. return sum(b % 3 for b in input_bytes) % 3
  559. def _aes_cbc_encrypt(self, key: bytes, iv: bytes, data: bytes) -> bytes:
  560. cipher = Cipher(algorithms.AES(key), modes.CBC(iv))
  561. encryptor = cipher.encryptor() # type: ignore
  562. return encryptor.update(data) + encryptor.finalize() # type: ignore
  563. def decrypt_aes256(self, objid: int, genno: int, data: bytes) -> bytes:
  564. initialization_vector = data[:16]
  565. ciphertext = data[16:]
  566. assert self.key is not None
  567. cipher = Cipher(
  568. algorithms.AES(self.key),
  569. modes.CBC(initialization_vector),
  570. backend=default_backend(),
  571. ) # type: ignore
  572. return cipher.decryptor().update(ciphertext) # type: ignore
  573. class PDFDocument:
  574. """PDFDocument object represents a PDF document.
  575. Since a PDF file can be very big, normally it is not loaded at
  576. once. So PDF document has to cooperate with a PDF parser in order to
  577. dynamically import the data as processing goes.
  578. Typical usage:
  579. doc = PDFDocument(parser, password)
  580. obj = doc.getobj(objid)
  581. """
  582. security_handler_registry: Dict[int, Type[PDFStandardSecurityHandler]] = {
  583. 1: PDFStandardSecurityHandler,
  584. 2: PDFStandardSecurityHandler,
  585. 4: PDFStandardSecurityHandlerV4,
  586. 5: PDFStandardSecurityHandlerV5,
  587. }
  588. def __init__(
  589. self,
  590. parser: PDFParser,
  591. password: str = "",
  592. caching: bool = True,
  593. fallback: bool = True,
  594. ) -> None:
  595. """Set the document to use a given PDFParser object."""
  596. self.caching = caching
  597. self.xrefs: List[PDFBaseXRef] = []
  598. self.info = []
  599. self.catalog: Dict[str, Any] = {}
  600. self.encryption: Optional[Tuple[Any, Any]] = None
  601. self.decipher: Optional[DecipherCallable] = None
  602. self._parser = None
  603. self._cached_objs: Dict[int, Tuple[object, int]] = {}
  604. self._parsed_objs: Dict[int, Tuple[List[object], int]] = {}
  605. self._parser = parser
  606. self._parser.set_document(self)
  607. self.is_printable = self.is_modifiable = self.is_extractable = True
  608. # Retrieve the information of each header that was appended
  609. # (maybe multiple times) at the end of the document.
  610. try:
  611. # print('FIND XREF')
  612. pos = self.find_xref(parser)
  613. self.pos=pos
  614. self.read_xref_from(parser, pos, self.xrefs)
  615. except PDFNoValidXRef:
  616. if fallback:
  617. parser.fallback = True
  618. newxref = PDFXRefFallback()
  619. newxref.load(parser)
  620. self.xrefs.append(newxref)
  621. # print(f'XREF {self.xrefs}')
  622. for xref in self.xrefs:
  623. trailer = xref.get_trailer()
  624. if not trailer:
  625. continue
  626. # If there's an encryption info, remember it.
  627. if "Encrypt" in trailer:
  628. if "ID" in trailer:
  629. id_value = list_value(trailer["ID"])
  630. else:
  631. # Some documents may not have a /ID, use two empty
  632. # byte strings instead. Solves
  633. # https://github.com/pdf2zh/pdf2zh.six/issues/594
  634. id_value = (b"", b"")
  635. self.encryption = (id_value, dict_value(trailer["Encrypt"]))
  636. self._initialize_password(password)
  637. if "Info" in trailer:
  638. self.info.append(dict_value(trailer["Info"]))
  639. if "Root" in trailer:
  640. # Every PDF file must have exactly one /Root dictionary.
  641. self.catalog = dict_value(trailer["Root"])
  642. break
  643. else:
  644. raise PDFSyntaxError("No /Root object! - Is this really a PDF?")
  645. if self.catalog.get("Type") is not LITERAL_CATALOG:
  646. if settings.STRICT:
  647. raise PDFSyntaxError("Catalog not found!")
  648. KEYWORD_OBJ = KWD(b"obj")
  649. # _initialize_password(password=b'')
  650. # Perform the initialization with a given password.
  651. def _initialize_password(self, password: str = "") -> None:
  652. assert self.encryption is not None
  653. (docid, param) = self.encryption
  654. if literal_name(param.get("Filter")) != "Standard":
  655. raise PDFEncryptionError("Unknown filter: param=%r" % param)
  656. v = int_value(param.get("V", 0))
  657. factory = self.security_handler_registry.get(v)
  658. if factory is None:
  659. raise PDFEncryptionError("Unknown algorithm: param=%r" % param)
  660. handler = factory(docid, param, password)
  661. self.decipher = handler.decrypt
  662. self.is_printable = handler.is_printable()
  663. self.is_modifiable = handler.is_modifiable()
  664. self.is_extractable = handler.is_extractable()
  665. assert self._parser is not None
  666. self._parser.fallback = False # need to read streams with exact length
  667. def _getobj_objstm(self, stream: PDFStream, index: int, objid: int) -> object:
  668. if stream.objid in self._parsed_objs:
  669. (objs, n) = self._parsed_objs[stream.objid]
  670. else:
  671. (objs, n) = self._get_objects(stream)
  672. if self.caching:
  673. assert stream.objid is not None
  674. self._parsed_objs[stream.objid] = (objs, n)
  675. i = n * 2 + index
  676. try:
  677. obj = objs[i]
  678. except IndexError:
  679. raise PDFSyntaxError("index too big: %r" % index)
  680. return obj
  681. def _get_objects(self, stream: PDFStream) -> Tuple[List[object], int]:
  682. if stream.get("Type") is not LITERAL_OBJSTM:
  683. if settings.STRICT:
  684. raise PDFSyntaxError("Not a stream object: %r" % stream)
  685. try:
  686. n = cast(int, stream["N"])
  687. except KeyError:
  688. if settings.STRICT:
  689. raise PDFSyntaxError("N is not defined: %r" % stream)
  690. n = 0
  691. parser = PDFStreamParser(stream.get_data())
  692. parser.set_document(self)
  693. objs: List[object] = []
  694. try:
  695. while 1:
  696. _, (_, obj) = parser.nextobject()
  697. objs.append(obj)
  698. except PSEOF:
  699. pass
  700. return (objs, n)
  701. def _getobj_parse(self, pos: int, objid: int) -> object:
  702. assert self._parser is not None
  703. self._parser.seek(pos)
  704. (_, objid1) = self._parser.nexttoken() # objid
  705. (_, genno) = self._parser.nexttoken() # genno
  706. (_, kwd) = self._parser.nexttoken()
  707. # hack around malformed pdf files
  708. # copied from https://github.com/jaepil/pdf2zh3k/blob/master/
  709. # pdf2zh/pdfparser.py#L399
  710. # to solve https://github.com/pdf2zh/pdf2zh.six/issues/56
  711. # assert objid1 == objid, str((objid1, objid))
  712. if objid1 != objid:
  713. x = []
  714. while kwd is not self.KEYWORD_OBJ:
  715. (_, kwd) = self._parser.nexttoken()
  716. x.append(kwd)
  717. if len(x) >= 2:
  718. objid1 = x[-2]
  719. # #### end hack around malformed pdf files
  720. if objid1 != objid:
  721. raise PDFSyntaxError(f"objid mismatch: {objid1!r}={objid!r}")
  722. if kwd != KWD(b"obj"):
  723. raise PDFSyntaxError("Invalid object spec: offset=%r" % pos)
  724. end, (_, obj) = self._parser.nextobject()
  725. return end, obj
  726. # can raise PDFObjectNotFound
  727. def getobj(self, objid: int) -> object:
  728. """Get object from PDF
  729. :raises PDFException if PDFDocument is not initialized
  730. :raises PDFObjectNotFound if objid does not exist in PDF
  731. """
  732. if not self.xrefs:
  733. raise PDFException("PDFDocument is not initialized")
  734. # log.debug("getobj: objid=%r", objid)
  735. if objid in self._cached_objs:
  736. (obj, genno) = self._cached_objs[objid]
  737. else:
  738. for xref in self.xrefs:
  739. try:
  740. (strmid, index, genno) = xref.get_pos(objid)
  741. except KeyError:
  742. continue
  743. try:
  744. if strmid is not None:
  745. stream = stream_value(self.getobj(strmid))
  746. obj = self._getobj_objstm(stream, index, objid)
  747. else:
  748. end, obj = self._getobj_parse(index, objid)
  749. if self.decipher:
  750. obj = decipher_all(self.decipher, objid, genno, obj)
  751. if isinstance(obj, PDFStream):
  752. obj.set_objid(objid, genno)
  753. break
  754. except (PSEOF, PDFSyntaxError):
  755. continue
  756. else:
  757. raise PDFObjectNotFound(objid)
  758. # log.debug("register: objid=%r: %r", objid, obj)
  759. if self.caching:
  760. self._cached_objs[objid] = (obj, genno)
  761. return obj
  762. OutlineType = Tuple[Any, Any, Any, Any, Any]
  763. def get_outlines(self) -> Iterator[OutlineType]:
  764. if "Outlines" not in self.catalog:
  765. raise PDFNoOutlines
  766. def search(entry: object, level: int) -> Iterator[PDFDocument.OutlineType]:
  767. entry = dict_value(entry)
  768. if "Title" in entry:
  769. if "A" in entry or "Dest" in entry:
  770. title = decode_text(str_value(entry["Title"]))
  771. dest = entry.get("Dest")
  772. action = entry.get("A")
  773. se = entry.get("SE")
  774. yield (level, title, dest, action, se)
  775. if "First" in entry and "Last" in entry:
  776. yield from search(entry["First"], level + 1)
  777. if "Next" in entry:
  778. yield from search(entry["Next"], level)
  779. return search(self.catalog["Outlines"], 0)
  780. def get_page_labels(self) -> Iterator[str]:
  781. """Generate page label strings for the PDF document.
  782. If the document includes page labels, generates strings, one per page.
  783. If not, raises PDFNoPageLabels.
  784. The resulting iteration is unbounded.
  785. """
  786. assert self.catalog is not None
  787. try:
  788. page_labels = PageLabels(self.catalog["PageLabels"])
  789. except (PDFTypeError, KeyError):
  790. raise PDFNoPageLabels
  791. return page_labels.labels
  792. def lookup_name(self, cat: str, key: Union[str, bytes]) -> Any:
  793. try:
  794. names = dict_value(self.catalog["Names"])
  795. except (PDFTypeError, KeyError):
  796. raise PDFKeyError((cat, key))
  797. # may raise KeyError
  798. d0 = dict_value(names[cat])
  799. def lookup(d: Dict[str, Any]) -> Any:
  800. if "Limits" in d:
  801. (k1, k2) = list_value(d["Limits"])
  802. if key < k1 or k2 < key:
  803. return None
  804. if "Names" in d:
  805. objs = list_value(d["Names"])
  806. names = dict(
  807. cast(Iterator[Tuple[Union[str, bytes], Any]], choplist(2, objs)),
  808. )
  809. return names[key]
  810. if "Kids" in d:
  811. for c in list_value(d["Kids"]):
  812. v = lookup(dict_value(c))
  813. if v:
  814. return v
  815. raise PDFKeyError((cat, key))
  816. return lookup(d0)
  817. def get_dest(self, name: Union[str, bytes]) -> Any:
  818. try:
  819. # PDF-1.2 or later
  820. obj = self.lookup_name("Dests", name)
  821. except KeyError:
  822. # PDF-1.1 or prior
  823. if "Dests" not in self.catalog:
  824. raise PDFDestinationNotFound(name)
  825. d0 = dict_value(self.catalog["Dests"])
  826. if name not in d0:
  827. raise PDFDestinationNotFound(name)
  828. obj = d0[name]
  829. return obj
  830. # find_xref
  831. def find_xref(self, parser: PDFParser) -> int:
  832. """Internal function used to locate the first XRef."""
  833. # search the last xref table by scanning the file backwards.
  834. prev = b""
  835. for line in parser.revreadlines():
  836. line = line.strip()
  837. # log.debug("find_xref: %r", line)
  838. if line == b"startxref":
  839. # log.debug("xref found: pos=%r", prev)
  840. if not prev.isdigit():
  841. raise PDFNoValidXRef(f"Invalid xref position: {prev!r}")
  842. start = int(prev)
  843. if not start >= 0:
  844. raise PDFNoValidXRef(f"Invalid negative xref position: {start}")
  845. return start
  846. if line:
  847. prev = line
  848. raise PDFNoValidXRef("Unexpected EOF")
  849. # read xref table
  850. def read_xref_from(
  851. self,
  852. parser: PDFParser,
  853. start: int,
  854. xrefs: List[PDFBaseXRef],
  855. ) -> None:
  856. """Reads XRefs from the given location."""
  857. parser.seek(start)
  858. parser.reset()
  859. try:
  860. (pos, token) = parser.nexttoken()
  861. except PSEOF:
  862. raise PDFNoValidXRef("Unexpected EOF")
  863. # log.debug("read_xref_from: start=%d, token=%r", start, token)
  864. if isinstance(token, int):
  865. # XRefStream: PDF-1.5
  866. parser.seek(pos)
  867. parser.reset()
  868. xref: PDFBaseXRef = PDFXRefStream()
  869. xref.load(parser)
  870. else:
  871. if token is parser.KEYWORD_XREF:
  872. parser.nextline()
  873. xref = PDFXRef()
  874. xref.load(parser)
  875. xrefs.append(xref)
  876. trailer = xref.get_trailer()
  877. # log.debug("trailer: %r", trailer)
  878. if "XRefStm" in trailer:
  879. pos = int_value(trailer["XRefStm"])
  880. self.read_xref_from(parser, pos, xrefs)
  881. if "Prev" in trailer:
  882. # find previous xref
  883. pos = int_value(trailer["Prev"])
  884. self.read_xref_from(parser, pos, xrefs)
  885. class PageLabels(NumberTree):
  886. """PageLabels from the document catalog.
  887. See Section 8.3.1 in the PDF Reference.
  888. """
  889. @property
  890. def labels(self) -> Iterator[str]:
  891. ranges = self.values
  892. # The tree must begin with page index 0
  893. if len(ranges) == 0 or ranges[0][0] != 0:
  894. if settings.STRICT:
  895. raise PDFSyntaxError("PageLabels is missing page index 0")
  896. else:
  897. # Try to cope, by assuming empty labels for the initial pages
  898. ranges.insert(0, (0, {}))
  899. for next, (start, label_dict_unchecked) in enumerate(ranges, 1):
  900. label_dict = dict_value(label_dict_unchecked)
  901. style = label_dict.get("S")
  902. prefix = decode_text(str_value(label_dict.get("P", b"")))
  903. first_value = int_value(label_dict.get("St", 1))
  904. if next == len(ranges):
  905. # This is the last specified range. It continues until the end
  906. # of the document.
  907. values: Iterable[int] = itertools.count(first_value)
  908. else:
  909. end, _ = ranges[next]
  910. range_length = end - start
  911. values = range(first_value, first_value + range_length)
  912. for value in values:
  913. label = self._format_page_label(value, style)
  914. yield prefix + label
  915. @staticmethod
  916. def _format_page_label(value: int, style: Any) -> str:
  917. """Format page label value in a specific style"""
  918. if style is None:
  919. label = ""
  920. elif style is LIT("D"): # Decimal arabic numerals
  921. label = str(value)
  922. elif style is LIT("R"): # Uppercase roman numerals
  923. label = format_int_roman(value).upper()
  924. elif style is LIT("r"): # Lowercase roman numerals
  925. label = format_int_roman(value)
  926. elif style is LIT("A"): # Uppercase letters A-Z, AA-ZZ...
  927. label = format_int_alpha(value).upper()
  928. elif style is LIT("a"): # Lowercase letters a-z, aa-zz...
  929. label = format_int_alpha(value)
  930. else:
  931. log.warning("Unknown page label style: %r", style)
  932. label = ""
  933. return label