pdf2zh.py 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290
  1. #!/usr/bin/env python3
  2. """A command line tool for extracting text and images from PDF and
  3. output it to plain text, html, xml or tags.
  4. """
  5. from __future__ import annotations
  6. import argparse
  7. import logging
  8. import os
  9. import sys
  10. from typing import TYPE_CHECKING, Any, Container, Iterable, List, Optional
  11. import pymupdf
  12. from huggingface_hub import hf_hub_download
  13. from pathlib import Path
  14. from pdf2zh import __version__
  15. from pdf2zh.pdfexceptions import PDFValueError
  16. if TYPE_CHECKING:
  17. from pdf2zh.layout import LAParams
  18. from pdf2zh.utils import AnyIO
  19. OUTPUT_TYPES = ((".htm", "html"), (".html", "html"), (".xml", "xml"), (".tag", "tag"))
  20. def setup_log() -> None:
  21. import doclayout_yolo
  22. logging.basicConfig()
  23. doclayout_yolo.utils.LOGGER.setLevel(logging.WARNING)
  24. def check_files(files: List[str]) -> List[str]:
  25. missing_files = [file for file in files if not os.path.exists(file)]
  26. return missing_files
  27. def float_or_disabled(x: str) -> Optional[float]:
  28. if x.lower().strip() == "disabled":
  29. return None
  30. try:
  31. return float(x)
  32. except ValueError:
  33. raise argparse.ArgumentTypeError(f"invalid float value: {x}")
  34. def extract_text(
  35. files: Iterable[str] = [],
  36. outfile: str = "-",
  37. laparams: Optional[LAParams] = None,
  38. output_type: str = "text",
  39. codec: str = "utf-8",
  40. strip_control: bool = False,
  41. maxpages: int = 0,
  42. pages: Optional[Container[int]] = None,
  43. password: str = "",
  44. scale: float = 1.0,
  45. rotation: int = 0,
  46. layoutmode: str = "normal",
  47. output_dir: Optional[str] = None,
  48. debug: bool = False,
  49. disable_caching: bool = False,
  50. vfont: str = "",
  51. vchar: str = "",
  52. thread: int = 0,
  53. lang_in: str = "",
  54. lang_out: str = "",
  55. service: str = "",
  56. callback: object = None,
  57. output: str = "",
  58. **kwargs: Any,
  59. ) -> AnyIO:
  60. import doclayout_yolo
  61. import pdf2zh.high_level
  62. if not files:
  63. raise PDFValueError("Must provide files to work upon!")
  64. if output_type == "text" and outfile != "-":
  65. for override, alttype in OUTPUT_TYPES:
  66. if outfile.endswith(override):
  67. output_type = alttype
  68. outfp: AnyIO = sys.stdout
  69. # pth = os.path.join(tempfile.gettempdir(), 'doclayout_yolo_docstructbench_imgsz1024.pt')
  70. # if not os.path.exists(pth):
  71. # print('Downloading...')
  72. # urllib.request.urlretrieve("http://huggingface.co/juliozhao/DocLayout-YOLO-DocStructBench/resolve/main/doclayout_yolo_docstructbench_imgsz1024.pt",pth)
  73. pth = hf_hub_download(
  74. repo_id="juliozhao/DocLayout-YOLO-DocStructBench",
  75. filename="doclayout_yolo_docstructbench_imgsz1024.pt",
  76. )
  77. model = doclayout_yolo.YOLOv10(pth)
  78. for file in files:
  79. filename = os.path.splitext(os.path.basename(file))[0]
  80. doc_en = pymupdf.open(file)
  81. page_count = doc_en.page_count
  82. font_list = ["china-ss", "tiro"]
  83. font_id = {}
  84. for page in doc_en:
  85. for font in font_list:
  86. font_id[font] = page.insert_font(font)
  87. xreflen = doc_en.xref_length()
  88. for xref in range(1, xreflen):
  89. for label in ["Resources/", ""]: # 可能是基于 xobj 的 res
  90. try: # xref 读写可能出错
  91. font_res = doc_en.xref_get_key(xref, f"{label}Font")
  92. if font_res[0] == "dict":
  93. for font in font_list:
  94. font_exist = doc_en.xref_get_key(
  95. xref, f"{label}Font/{font}"
  96. )
  97. if font_exist[0] == "null":
  98. doc_en.xref_set_key(
  99. xref, f"{label}Font/{font}", f"{font_id[font]} 0 R"
  100. )
  101. except Exception:
  102. pass
  103. doc_en.save(Path(output) / f"{filename}-en.pdf")
  104. with open(Path(output) / f"{filename}-en.pdf", "rb") as fp:
  105. obj_patch: dict = pdf2zh.high_level.extract_text_to_fp(fp, **locals())
  106. for obj_id, ops_new in obj_patch.items():
  107. # ops_old=doc_en.xref_stream(obj_id)
  108. # print(obj_id)
  109. # print(ops_old)
  110. # print(ops_new.encode())
  111. doc_en.update_stream(obj_id, ops_new.encode())
  112. doc_zh = doc_en
  113. doc_dual = pymupdf.open(Path(output) / f"{filename}-en.pdf")
  114. doc_dual.insert_file(doc_zh)
  115. for id in range(page_count):
  116. doc_dual.move_page(page_count + id, id * 2 + 1)
  117. doc_zh.save(Path(output) / f"{filename}-zh.pdf", deflate=1)
  118. doc_dual.save(Path(output) / f"{filename}-dual.pdf", deflate=1)
  119. doc_zh.close()
  120. doc_dual.close()
  121. os.remove(Path(output) / f"{filename}-en.pdf")
  122. return
  123. def create_parser() -> argparse.ArgumentParser:
  124. parser = argparse.ArgumentParser(description=__doc__, add_help=True)
  125. parser.add_argument(
  126. "files",
  127. type=str,
  128. default=None,
  129. nargs="*",
  130. help="One or more paths to PDF files.",
  131. )
  132. parser.add_argument(
  133. "--version",
  134. "-v",
  135. action="version",
  136. version=f"pdf2zh v{__version__}",
  137. )
  138. parser.add_argument(
  139. "--debug",
  140. "-d",
  141. default=False,
  142. action="store_true",
  143. help="Use debug logging level.",
  144. )
  145. parse_params = parser.add_argument_group(
  146. "Parser",
  147. description="Used during PDF parsing",
  148. )
  149. parse_params.add_argument(
  150. "--pages",
  151. "-p",
  152. type=str,
  153. help="The list of page numbers to parse.",
  154. )
  155. parse_params.add_argument(
  156. "--password",
  157. "-P",
  158. type=str,
  159. default="",
  160. help="The password to use for decrypting PDF file.",
  161. )
  162. parse_params.add_argument(
  163. "--vfont",
  164. "-f",
  165. type=str,
  166. default="",
  167. help="The regex to math font name of formula.",
  168. )
  169. parse_params.add_argument(
  170. "--vchar",
  171. "-c",
  172. type=str,
  173. default="",
  174. help="The regex to math character of formula.",
  175. )
  176. parse_params.add_argument(
  177. "--lang-in",
  178. "-li",
  179. type=str,
  180. default="auto",
  181. help="The code of source language.",
  182. )
  183. parse_params.add_argument(
  184. "--lang-out",
  185. "-lo",
  186. type=str,
  187. default="auto",
  188. help="The code of target language.",
  189. )
  190. parse_params.add_argument(
  191. "--service",
  192. "-s",
  193. type=str,
  194. default="google",
  195. help="The service to use for translation.",
  196. )
  197. parse_params.add_argument(
  198. "--output",
  199. "-o",
  200. type=str,
  201. default="",
  202. help="Output directory for files.",
  203. )
  204. parse_params.add_argument(
  205. "--thread",
  206. "-t",
  207. type=int,
  208. default=4,
  209. help="The number of threads to execute translation.",
  210. )
  211. parse_params.add_argument(
  212. "--interactive",
  213. "-i",
  214. action="store_true",
  215. help="Interact with GUI.",
  216. )
  217. parse_params.add_argument(
  218. "--share",
  219. action="store_true",
  220. help="Enable Gradio Share",
  221. )
  222. return parser
  223. def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
  224. parsed_args = create_parser().parse_args(args=args)
  225. if parsed_args.pages:
  226. pages = []
  227. for p in parsed_args.pages.split(","):
  228. if "-" in p:
  229. start, end = p.split("-")
  230. pages.extend(range(int(start) - 1, int(end)))
  231. else:
  232. pages.append(int(p) - 1)
  233. parsed_args.pages = pages
  234. return parsed_args
  235. def main(args: Optional[List[str]] = None) -> int:
  236. parsed_args = parse_args(args)
  237. missing_files = check_files(parsed_args.files)
  238. if missing_files:
  239. print("The following files do not exist:", file=sys.stderr)
  240. for file in missing_files:
  241. print(f" {file}", file=sys.stderr)
  242. return -1
  243. if parsed_args.interactive:
  244. from pdf2zh.gui import setup_gui
  245. setup_gui(parsed_args.share)
  246. return 0
  247. setup_log()
  248. extract_text(**vars(parsed_args))
  249. return 0
  250. if __name__ == "__main__":
  251. sys.exit(main())