11 luni în urmă · 73b02166c2
--- a/pdf2zh/converter.py
+++ b/pdf2zh/converter.py
@@ -45,6 +45,10 @@ from pymupdf import Font
 
				 log = logging.getLogger(__name__)
			
 
				 
			
 
				 
			
 
				+shs_name = "shs"
			
 
				+noto_name = "noto"
			
 
				+
			
 
				+
			
 
				 class PDFConverterEx(PDFConverter):
			
 
				     def __init__(
			
 
				         self,
			
@@ -138,6 +142,7 @@ class TranslateConverter(PDFConverterEx):
 
				         lang_out: str = "",
			
 
				         service: str = "",
			
 
				         resfont: str = "",
			
 
				+        shs: Font = None,
			
 
				         noto: Font = None,
			
 
				         envs: Dict = None,
			
 
				         prompt: List = None,
			
@@ -148,6 +153,7 @@ class TranslateConverter(PDFConverterEx):
 
				         self.thread = thread
			
 
				         self.layout = layout
			
 
				         self.resfont = resfont
			
 
				+        self.shs = shs
			
 
				         self.noto = noto
			
 
				         self.translator: BaseTranslator = None
			
 
				         param = service.split(":", 1)
			
@@ -358,8 +364,10 @@ class TranslateConverter(PDFConverterEx):
 
				         ############################################################
			
 
				         # C. 新文档排版
			
 
				         def raw_string(fcur: str, cstk: str):  # 编码字符串
			
 
				-            if fcur == 'noto':
			
 
				+            if fcur == noto_name:
			
 
				                 return "".join(["%04x" % self.noto.has_glyph(ord(c)) for c in cstk])
			
 
				+            elif fcur == shs_name:
			
 
				+                return "".join(["%04x" % self.shs.has_glyph(ord(c)) for c in cstk])
			
 
				             elif isinstance(self.fontmap[fcur], PDFCIDFont):  # 判断编码长度
			
 
				                 return "".join(["%04x" % ord(c) for c in cstk])
			
 
				             else:
			
@@ -403,8 +411,10 @@ class TranslateConverter(PDFConverterEx):
 
				                         pass
			
 
				                     if fcur_ is None:
			
 
				                         fcur_ = self.resfont  # 默认非拉丁字体
			
 
				-                    if fcur_ == 'noto':
			
 
				+                    if fcur_ == noto_name: # FIXME: change to CONST
			
 
				                         adv = self.noto.char_lengths(ch, size)[0]
			
 
				+                    elif fcur_ == shs_name: # FIXME: change to CONST
			
 
				+                        adv = self.shs.char_lengths(ch, size)[0]
			
 
				                     else:
			
 
				                         adv = self.fontmap[fcur_].char_width(ord(ch)) * size
			
 
				                     ptr += 1
			
--- a/pdf2zh/high_level.py
+++ b/pdf2zh/high_level.py
@@ -4,6 +4,7 @@ import asyncio
 
				 import io
			
 
				 import os
			
 
				 import sys
			
 
				+from tabnanny import verbose
			
 
				 import tempfile
			
 
				 import urllib.request
			
 
				 from asyncio import CancelledError
			
@@ -20,10 +21,13 @@ from pdfminer.pdfpage import PDFPage
 
				 from pdfminer.pdfparser import PDFParser
			
 
				 from pymupdf import Document, Font
			
 
				 
			
 
				-from pdf2zh.converter import TranslateConverter
			
 
				+from pdf2zh.converter import TranslateConverter, shs_name, noto_name
			
 
				 from pdf2zh.doclayout import OnnxModel
			
 
				 from pdf2zh.pdfinterp import PDFPageInterpreterEx
			
 
				 
			
 
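				+# FIXME: hard-coded switch; while True, CJK targets use the Source Han Serif font and the resfont_map branch is never taken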
			
 
				+USE_SHS_FONT = True
			
 
				+
			
 
				 resfont_map = {
			
 
				     "zh-cn": "china-ss",
			
 
				     "zh-tw": "china-ts",
			
@@ -83,6 +87,7 @@ def translate_patch(
 
				     lang_out: str = "",
			
 
				     service: str = "",
			
 
				     resfont: str = "",
			
 
				+    shs: Font = None,
			
 
				     noto: Font = None,
			
 
				     callback: object = None,
			
 
				     cancellation_event: asyncio.Event = None,
			
@@ -103,6 +108,7 @@ def translate_patch(
 
				         lang_out,
			
 
				         service,
			
 
				         resfont,
			
 
				+        shs,
			
 
				         noto,
			
 
				         envs,
			
 
				         prompt,
			
@@ -187,11 +193,29 @@ def translate_stream(
 
				 ):
			
 
				     font_list = [("tiro", None)]
			
 
				     noto = None
			
 
				+    shs = None
			
 
				     if lang_out.lower() in resfont_map:  # CJK
			
 
				-        resfont = resfont_map[lang_out.lower()]
			
 
				-        font_list.append((resfont, None))
			
 
				+        if not USE_SHS_FONT:
			
 
				+            resfont = resfont_map[lang_out.lower()]
			
 
				+            font_list.append((resfont, None))
			
 
				+        else:
			
 
				+            resfont = shs_name
			
 
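				+            # docker: default path is under /app (override via SHS_FONT_PATH); else use a temp-dir copy, downloading it if absent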
			
 
				+            ttf_path = os.environ.get("SHS_FONT_PATH", "/app/SourceHanSerif-Medium.ttc")
			
 
				+            if not os.path.exists(ttf_path):
			
 
				+                ttf_path = os.path.join(
			
 
				+                    tempfile.gettempdir(), "SourceHanSerif-Medium.ttc"
			
 
				+                )
			
 
				+            if not os.path.exists(ttf_path):
			
 
				+                print("Downloading SourceHanSerif font...")
			
 
				+                urllib.request.urlretrieve(
			
 
				+                    "https://github.com/timelic/source-han-serif/releases/download/main/SourceHanSerif-Medium.ttc",
			
 
				+                    ttf_path,
			
 
				+                )
			
 
				+            font_list.append((shs_name, ttf_path))
			
 
				+            shs = Font(shs_name, ttf_path)
			
 
				     elif lang_out.lower() in noto_list:  # noto
			
 
				-        resfont = "noto"
			
 
				+        resfont = noto_name
			
 
				         # docker
			
 
				         ttf_path = os.environ.get("NOTO_FONT_PATH", "/app/GoNotoKurrent-Regular.ttf")
			
 
				 
			
@@ -203,8 +227,8 @@ def translate_stream(
 
				                 "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf",
			
 
				                 ttf_path,
			
 
				             )
			
 
				-        font_list.append(("noto", ttf_path))
			
 
				-        noto = Font("noto", ttf_path)
			
 
				+        font_list.append((noto_name, ttf_path))
			
 
				+        noto = Font(noto_name, ttf_path)
			
 
				     else:  # fallback
			
 
				         resfont = "china-ss"
			
 
				         font_list.append(("china-ss", None))
			
@@ -237,6 +261,7 @@ def translate_stream(
 
				                 pass
			
 
				 
			
 
				     fp = io.BytesIO()
			
 
				+
			
 
				     doc_zh.save(fp)
			
 
				     obj_patch: dict = translate_patch(fp, **locals())
			
 
				 
			
@@ -251,7 +276,12 @@ def translate_stream(
 
				     for id in range(page_count):
			
 
				         doc_en.move_page(page_count + id, id * 2 + 1)
			
 
				 
			
 
				-    return doc_zh.write(deflate=1), doc_en.write(deflate=1)
			
 
				+    doc_zh.subset_fonts(fallback=True)
			
 
				+    doc_en.subset_fonts(fallback=True)
			
 
				+    return (
			
 
				+        doc_zh.write(deflate=True, garbage=3, use_objstms=1),
			
 
				+        doc_en.write(deflate=True, garbage=3, use_objstms=1),
			
 
				+    )
			
 
				 
			
 
				 
			
 
				 def convert_to_pdfa(input_path, output_path):