pdf2zh.py 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323
  1. #!/usr/bin/env python3
  2. """A command line tool for extracting text and images from PDF and
  3. output it to plain text, html, xml or tags.
  4. """
  5. from __future__ import annotations
  6. import argparse
  7. import logging
  8. import os
  9. import subprocess
  10. import sys
  11. from pathlib import Path
  12. from typing import TYPE_CHECKING, Any, Container, Iterable, List, Optional
  13. import pymupdf
  14. from pdf2zh import __version__
  15. from pdf2zh.pdfexceptions import PDFValueError
  16. if TYPE_CHECKING:
  17. from pdf2zh.layout import LAParams
  18. from pdf2zh.utils import AnyIO
# Pairs of (output-file suffix, output type). extract_text walks this table
# only when output_type is "text" and outfile is not "-", and switches
# output_type to the entry whose suffix outfile ends with.
OUTPUT_TYPES = ((".htm", "html"), (".html", "html"), (".xml", "xml"), (".tag", "tag"))
  20. def setup_log() -> None:
  21. logging.basicConfig()
  22. try:
  23. import doclayout_yolo
  24. doclayout_yolo.utils.LOGGER.setLevel(logging.WARNING)
  25. except ImportError:
  26. pass
  27. def check_files(files: List[str]) -> List[str]:
  28. missing_files = [file for file in files if not os.path.exists(file)]
  29. return missing_files
  30. def float_or_disabled(x: str) -> Optional[float]:
  31. if x.lower().strip() == "disabled":
  32. return None
  33. try:
  34. return float(x)
  35. except ValueError:
  36. raise argparse.ArgumentTypeError(f"invalid float value: {x}")
  37. def extract_text(
  38. files: Iterable[str] = [],
  39. outfile: str = "-",
  40. laparams: Optional[LAParams] = None,
  41. output_type: str = "text",
  42. codec: str = "utf-8",
  43. strip_control: bool = False,
  44. maxpages: int = 0,
  45. pages: Optional[Container[int]] = None,
  46. password: str = "",
  47. scale: float = 1.0,
  48. rotation: int = 0,
  49. layoutmode: str = "normal",
  50. output_dir: Optional[str] = None,
  51. debug: bool = False,
  52. disable_caching: bool = False,
  53. vfont: str = "",
  54. vchar: str = "",
  55. thread: int = 0,
  56. lang_in: str = "",
  57. lang_out: str = "",
  58. service: str = "",
  59. callback: object = None,
  60. output: str = "",
  61. **kwargs: Any,
  62. ) -> AnyIO:
  63. from pdf2zh.doclayout import DocLayoutModel
  64. import pdf2zh.high_level
  65. if not files:
  66. raise PDFValueError("Must provide files to work upon!")
  67. if output_type == "text" and outfile != "-":
  68. for override, alttype in OUTPUT_TYPES:
  69. if outfile.endswith(override):
  70. output_type = alttype
  71. outfp: AnyIO = sys.stdout
  72. model = DocLayoutModel.load_available()
  73. for file in files:
  74. filename = os.path.splitext(os.path.basename(file))[0]
  75. def convert_to_pdfa(input_pdf_path, output_pdfa_path):
  76. """
  77. Converts a PDF to PDF/A format using Ghostscript.
  78. Args:
  79. input_pdf_path (str): Path to the input PDF file.
  80. output_pdfa_path (str): Path where the PDF/A file will be saved.
  81. """
  82. try:
  83. # Ghostscript command for conversion
  84. command = [
  85. "gs",
  86. "-dPDFA",
  87. "-dBATCH",
  88. "-dNOPAUSE",
  89. "-dNOOUTERSAVE",
  90. "-sDEVICE=pdfwrite",
  91. "-sOutputFile=" + output_pdfa_path,
  92. "-dPDFACompatibilityPolicy=1",
  93. input_pdf_path,
  94. ]
  95. # Run the command
  96. subprocess.run(command, check=True)
  97. print(
  98. f"Successfully converted {input_pdf_path} to PDF/A at {output_pdfa_path}"
  99. )
  100. except subprocess.CalledProcessError as e:
  101. print(f"Error during conversion: {e}")
  102. except FileNotFoundError:
  103. print("Ghostscript is not installed or not found in the PATH.")
  104. try:
  105. file_pdfa = f"{str(file)}-pdfa.pdf"
  106. convert_to_pdfa(file, file_pdfa)
  107. doc_en = pymupdf.open(file_pdfa)
  108. except Exception as e:
  109. print(f"Error converting PDF: {e}")
  110. doc_en = pymupdf.open(file)
  111. page_count = doc_en.page_count
  112. font_list = ["china-ss", "tiro"]
  113. font_id = {}
  114. for page in doc_en:
  115. for font in font_list:
  116. font_id[font] = page.insert_font(font)
  117. xreflen = doc_en.xref_length()
  118. for xref in range(1, xreflen):
  119. for label in ["Resources/", ""]: # 可能是基于 xobj 的 res
  120. try: # xref 读写可能出错
  121. font_res = doc_en.xref_get_key(xref, f"{label}Font")
  122. if font_res[0] == "dict":
  123. for font in font_list:
  124. font_exist = doc_en.xref_get_key(
  125. xref, f"{label}Font/{font}"
  126. )
  127. if font_exist[0] == "null":
  128. doc_en.xref_set_key(
  129. xref, f"{label}Font/{font}", f"{font_id[font]} 0 R"
  130. )
  131. except Exception:
  132. pass
  133. doc_en.save(Path(output) / f"{filename}-en.pdf")
  134. with open(Path(output) / f"{filename}-en.pdf", "rb") as fp:
  135. obj_patch: dict = pdf2zh.high_level.extract_text_to_fp(fp, **locals())
  136. for obj_id, ops_new in obj_patch.items():
  137. # ops_old=doc_en.xref_stream(obj_id)
  138. # print(obj_id)
  139. # print(ops_old)
  140. # print(ops_new.encode())
  141. doc_en.update_stream(obj_id, ops_new.encode())
  142. doc_zh = doc_en
  143. doc_dual = pymupdf.open(Path(output) / f"{filename}-en.pdf")
  144. doc_dual.insert_file(doc_zh)
  145. for id in range(page_count):
  146. doc_dual.move_page(page_count + id, id * 2 + 1)
  147. doc_zh.save(Path(output) / f"{filename}-zh.pdf", deflate=1)
  148. doc_dual.save(Path(output) / f"{filename}-dual.pdf", deflate=1)
  149. doc_zh.close()
  150. doc_dual.close()
  151. os.remove(Path(output) / f"{filename}-en.pdf")
  152. return
  153. def create_parser() -> argparse.ArgumentParser:
  154. parser = argparse.ArgumentParser(description=__doc__, add_help=True)
  155. parser.add_argument(
  156. "files",
  157. type=str,
  158. default=None,
  159. nargs="*",
  160. help="One or more paths to PDF files.",
  161. )
  162. parser.add_argument(
  163. "--version",
  164. "-v",
  165. action="version",
  166. version=f"pdf2zh v{__version__}",
  167. )
  168. parser.add_argument(
  169. "--debug",
  170. "-d",
  171. default=False,
  172. action="store_true",
  173. help="Use debug logging level.",
  174. )
  175. parse_params = parser.add_argument_group(
  176. "Parser",
  177. description="Used during PDF parsing",
  178. )
  179. parse_params.add_argument(
  180. "--pages",
  181. "-p",
  182. type=str,
  183. help="The list of page numbers to parse.",
  184. )
  185. parse_params.add_argument(
  186. "--password",
  187. "-P",
  188. type=str,
  189. default="",
  190. help="The password to use for decrypting PDF file.",
  191. )
  192. parse_params.add_argument(
  193. "--vfont",
  194. "-f",
  195. type=str,
  196. default="",
  197. help="The regex to math font name of formula.",
  198. )
  199. parse_params.add_argument(
  200. "--vchar",
  201. "-c",
  202. type=str,
  203. default="",
  204. help="The regex to math character of formula.",
  205. )
  206. parse_params.add_argument(
  207. "--lang-in",
  208. "-li",
  209. type=str,
  210. default="auto",
  211. help="The code of source language.",
  212. )
  213. parse_params.add_argument(
  214. "--lang-out",
  215. "-lo",
  216. type=str,
  217. default="auto",
  218. help="The code of target language.",
  219. )
  220. parse_params.add_argument(
  221. "--service",
  222. "-s",
  223. type=str,
  224. default="google",
  225. help="The service to use for translation.",
  226. )
  227. parse_params.add_argument(
  228. "--output",
  229. "-o",
  230. type=str,
  231. default="",
  232. help="Output directory for files.",
  233. )
  234. parse_params.add_argument(
  235. "--thread",
  236. "-t",
  237. type=int,
  238. default=4,
  239. help="The number of threads to execute translation.",
  240. )
  241. parse_params.add_argument(
  242. "--interactive",
  243. "-i",
  244. action="store_true",
  245. help="Interact with GUI.",
  246. )
  247. parse_params.add_argument(
  248. "--share",
  249. action="store_true",
  250. help="Enable Gradio Share",
  251. )
  252. return parser
  253. def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
  254. parsed_args = create_parser().parse_args(args=args)
  255. if parsed_args.pages:
  256. pages = []
  257. for p in parsed_args.pages.split(","):
  258. if "-" in p:
  259. start, end = p.split("-")
  260. pages.extend(range(int(start) - 1, int(end)))
  261. else:
  262. pages.append(int(p) - 1)
  263. parsed_args.pages = pages
  264. return parsed_args
  265. def main(args: Optional[List[str]] = None) -> int:
  266. parsed_args = parse_args(args)
  267. missing_files = check_files(parsed_args.files)
  268. if missing_files:
  269. print("The following files do not exist:", file=sys.stderr)
  270. for file in missing_files:
  271. print(f" {file}", file=sys.stderr)
  272. return -1
  273. if parsed_args.interactive:
  274. from pdf2zh.gui import setup_gui
  275. setup_gui(parsed_args.share)
  276. return 0
  277. setup_log()
  278. extract_text(**vars(parsed_args))
  279. return 0
  280. if __name__ == "__main__":
  281. sys.exit(main())