high_level.py 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304
  1. """Functions that can be used for the most common use-cases for pdf2zh.six"""
  2. import asyncio
  3. from asyncio import CancelledError
  4. from typing import BinaryIO
  5. import numpy as np
  6. import tqdm
  7. import sys
  8. from pymupdf import Font, Document
  9. from pdfminer.pdfpage import PDFPage
  10. from pdfminer.pdfinterp import PDFResourceManager
  11. from pdfminer.pdfdocument import PDFDocument
  12. from pdfminer.pdfparser import PDFParser
  13. from pdfminer.pdfexceptions import PDFValueError
  14. from pdf2zh.converter import TranslateConverter
  15. from pdf2zh.pdfinterp import PDFPageInterpreterEx
  16. from pdf2zh.doclayout import DocLayoutModel
  17. from pathlib import Path
  18. from typing import Any, List, Optional
  19. import urllib.request
  20. import requests
  21. import tempfile
  22. import os
  23. import io
# Layout-detection model, loaded once when this module is imported.
# translate_patch() calls model.predict() on every rendered page image.
model = DocLayoutModel.load_available()
# Maps a lower-cased output-language code to the font name that
# translate_stream() inserts into every page for that language.
# NOTE(review): the values look like PyMuPDF built-in CJK font names -- confirm.
resfont_map = {
    "zh-cn": "china-ss",
    "zh-tw": "china-ts",
    "zh-hans": "china-ss",
    "zh-hant": "china-ts",
    "zh": "china-ss",
    "ja": "japan-s",
    "ko": "korea-s",
}
# Output-language codes for which translate_stream() uses the downloaded
# "GoNotoKurrent-Regular.ttf" font. It is consulted only when the language is
# not already a key of resfont_map. The commented-out codes below ("ja", "ko",
# "zh-cn", "zh-tw") are keys of resfont_map and are therefore handled there.
noto_list = [
    "am",  # Amharic
    "ar",  # Arabic
    "bn",  # Bengali
    "bg",  # Bulgarian
    "chr",  # Cherokee
    "el",  # Greek
    "gu",  # Gujarati
    "iw",  # Hebrew
    "hi",  # Hindi
    # "ja", # Japanese
    "kn",  # Kannada
    # "ko", # Korean
    "ml",  # Malayalam
    "mr",  # Marathi
    "ru",  # Russian
    "sr",  # Serbian
    # "zh-cn",# SC
    "ta",  # Tamil
    "te",  # Telugu
    "th",  # Thai
    # "zh-tw",# TC
    "ur",  # Urdu
    "uk",  # Ukrainian
]
  59. def check_files(files: List[str]) -> List[str]:
  60. files = [
  61. f for f in files if not f.startswith("http://")
  62. ] # exclude online files, http
  63. files = [
  64. f for f in files if not f.startswith("https://")
  65. ] # exclude online files, https
  66. missing_files = [file for file in files if not os.path.exists(file)]
  67. return missing_files
  68. def translate_patch(
  69. inf: BinaryIO,
  70. pages: Optional[list[int]] = None,
  71. vfont: str = "",
  72. vchar: str = "",
  73. thread: int = 0,
  74. doc_zh: Document = None,
  75. lang_in: str = "",
  76. lang_out: str = "",
  77. service: str = "",
  78. resfont: str = "",
  79. noto: Font = None,
  80. callback: object = None,
  81. cancellation_event: asyncio.Event = None,
  82. **kwarg: Any,
  83. ) -> None:
  84. rsrcmgr = PDFResourceManager()
  85. layout = {}
  86. device = TranslateConverter(
  87. rsrcmgr,
  88. vfont,
  89. vchar,
  90. thread,
  91. layout,
  92. lang_in,
  93. lang_out,
  94. service,
  95. resfont,
  96. noto,
  97. kwarg.get("envs", {}),
  98. )
  99. assert device is not None
  100. obj_patch = {}
  101. interpreter = PDFPageInterpreterEx(rsrcmgr, device, obj_patch)
  102. if pages:
  103. total_pages = len(pages)
  104. else:
  105. total_pages = doc_zh.page_count
  106. parser = PDFParser(inf)
  107. doc = PDFDocument(parser)
  108. with tqdm.tqdm(total=total_pages) as progress:
  109. for pageno, page in enumerate(PDFPage.create_pages(doc)):
  110. if cancellation_event and cancellation_event.is_set():
  111. raise CancelledError("task cancelled")
  112. if pages and (pageno not in pages):
  113. continue
  114. progress.update()
  115. if callback:
  116. callback(progress)
  117. page.pageno = pageno
  118. pix = doc_zh[page.pageno].get_pixmap()
  119. image = np.fromstring(pix.samples, np.uint8).reshape(
  120. pix.height, pix.width, 3
  121. )[:, :, ::-1]
  122. page_layout = model.predict(image, imgsz=int(pix.height / 32) * 32)[0]
  123. # kdtree 是不可能 kdtree 的,不如直接渲染成图片,用空间换时间
  124. box = np.ones((pix.height, pix.width))
  125. h, w = box.shape
  126. vcls = ["abandon", "figure", "table", "isolate_formula", "formula_caption"]
  127. for i, d in enumerate(page_layout.boxes):
  128. if not page_layout.names[int(d.cls)] in vcls:
  129. x0, y0, x1, y1 = d.xyxy.squeeze()
  130. x0, y0, x1, y1 = (
  131. np.clip(int(x0 - 1), 0, w - 1),
  132. np.clip(int(h - y1 - 1), 0, h - 1),
  133. np.clip(int(x1 + 1), 0, w - 1),
  134. np.clip(int(h - y0 + 1), 0, h - 1),
  135. )
  136. box[y0:y1, x0:x1] = i + 2
  137. for i, d in enumerate(page_layout.boxes):
  138. if page_layout.names[int(d.cls)] in vcls:
  139. x0, y0, x1, y1 = d.xyxy.squeeze()
  140. x0, y0, x1, y1 = (
  141. np.clip(int(x0 - 1), 0, w - 1),
  142. np.clip(int(h - y1 - 1), 0, h - 1),
  143. np.clip(int(x1 + 1), 0, w - 1),
  144. np.clip(int(h - y0 + 1), 0, h - 1),
  145. )
  146. box[y0:y1, x0:x1] = 0
  147. layout[page.pageno] = box
  148. # 新建一个 xref 存放新指令流
  149. page.page_xref = doc_zh.get_new_xref() # hack 插入页面的新 xref
  150. doc_zh.update_object(page.page_xref, "<<>>")
  151. doc_zh.update_stream(page.page_xref, b"")
  152. doc_zh[page.pageno].set_contents(page.page_xref)
  153. interpreter.process_page(page)
  154. device.close()
  155. return obj_patch
  156. def translate_stream(
  157. stream: bytes,
  158. pages: Optional[list[int]] = None,
  159. lang_in: str = "",
  160. lang_out: str = "",
  161. service: str = "",
  162. thread: int = 0,
  163. vfont: str = "",
  164. vchar: str = "",
  165. callback: object = None,
  166. cancellation_event: asyncio.Event = None,
  167. **kwarg: Any,
  168. ):
  169. font_list = [("tiro", None)]
  170. noto = None
  171. if lang_out.lower() in resfont_map: # CJK
  172. resfont = resfont_map[lang_out.lower()]
  173. font_list.append((resfont, None))
  174. elif lang_out.lower() in noto_list: # noto
  175. resfont = "noto"
  176. ttf_path = os.path.join(tempfile.gettempdir(), "GoNotoKurrent-Regular.ttf")
  177. if not os.path.exists(ttf_path):
  178. print("Downloading Noto font...")
  179. urllib.request.urlretrieve(
  180. "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf",
  181. ttf_path,
  182. )
  183. font_list.append(("noto", ttf_path))
  184. noto = Font("noto", ttf_path)
  185. else: # fallback
  186. resfont = "china-ss"
  187. font_list.append(("china-ss", None))
  188. doc_en = Document(stream=stream)
  189. doc_zh = Document(stream=stream)
  190. page_count = doc_zh.page_count
  191. # font_list = [("china-ss", None), ("tiro", None)]
  192. font_id = {}
  193. for page in doc_zh:
  194. for font in font_list:
  195. font_id[font[0]] = page.insert_font(font[0], font[1])
  196. xreflen = doc_zh.xref_length()
  197. for xref in range(1, xreflen):
  198. for label in ["Resources/", ""]: # 可能是基于 xobj 的 res
  199. try: # xref 读写可能出错
  200. font_res = doc_zh.xref_get_key(xref, f"{label}Font")
  201. if font_res[0] == "dict":
  202. for font in font_list:
  203. font_exist = doc_zh.xref_get_key(xref, f"{label}Font/{font[0]}")
  204. if font_exist[0] == "null":
  205. doc_zh.xref_set_key(
  206. xref,
  207. f"{label}Font/{font[0]}",
  208. f"{font_id[font[0]]} 0 R",
  209. )
  210. except Exception:
  211. pass
  212. fp = io.BytesIO()
  213. doc_zh.save(fp)
  214. obj_patch: dict = translate_patch(fp, envs=kwarg["envs"], **locals())
  215. for obj_id, ops_new in obj_patch.items():
  216. # ops_old=doc_en.xref_stream(obj_id)
  217. # print(obj_id)
  218. # print(ops_old)
  219. # print(ops_new.encode())
  220. doc_zh.update_stream(obj_id, ops_new.encode())
  221. doc_en.insert_file(doc_zh)
  222. for id in range(page_count):
  223. doc_en.move_page(page_count + id, id * 2 + 1)
  224. return doc_zh.write(deflate=1), doc_en.write(deflate=1)
  225. def translate(
  226. files: list[str],
  227. output: str = "",
  228. pages: Optional[list[int]] = None,
  229. lang_in: str = "",
  230. lang_out: str = "",
  231. service: str = "",
  232. thread: int = 0,
  233. vfont: str = "",
  234. vchar: str = "",
  235. callback: object = None,
  236. cancellation_event: asyncio.Event = None,
  237. **kwarg: Any,
  238. ):
  239. if not files:
  240. raise PDFValueError("No files to process.")
  241. missing_files = check_files(files)
  242. if missing_files:
  243. print("The following files do not exist:", file=sys.stderr)
  244. for file in missing_files:
  245. print(f" {file}", file=sys.stderr)
  246. raise PDFValueError("Some files do not exist.")
  247. result_files = []
  248. for file in files:
  249. if file is str and (file.startswith("http://") or file.startswith("https://")):
  250. print("Online files detected, downloading...")
  251. try:
  252. r = requests.get(file, allow_redirects=True)
  253. if r.status_code == 200:
  254. if not os.path.exists("./pdf2zh_files"):
  255. print("Making a temporary dir for downloading PDF files...")
  256. os.mkdir(os.path.dirname("./pdf2zh_files"))
  257. with open("./pdf2zh_files/tmp_download.pdf", "wb") as f:
  258. print(f"Writing the file: {file}...")
  259. f.write(r.content)
  260. file = "./pdf2zh_files/tmp_download.pdf"
  261. else:
  262. r.raise_for_status()
  263. except Exception as e:
  264. raise PDFValueError(
  265. f"Errors occur in downloading the PDF file. Please check the link(s).\nError:\n{e}"
  266. )
  267. filename = os.path.splitext(os.path.basename(file))[0]
  268. doc_raw = open(file, "rb")
  269. s_raw = doc_raw.read()
  270. s_mono, s_dual = translate_stream(s_raw, envs=kwarg.get('envs'), **locals())
  271. file_mono = Path(output) / f"{filename}-mono.pdf"
  272. file_dual = Path(output) / f"{filename}-dual.pdf"
  273. doc_mono = open(file_mono, "wb")
  274. doc_dual = open(file_dual, "wb")
  275. doc_mono.write(s_mono)
  276. doc_dual.write(s_dual)
  277. result_files.append((str(file_mono), str(file_dual)))
  278. return result_files