Răsfoiți Sursa

basic logic

timel 11 luni în urmă
părinte
comite
73b02166c2
2 fișiere modificate, cu 49 de adăugiri și 9 ștergeri
  1. 12 2
      pdf2zh/converter.py
  2. 37 7
      pdf2zh/high_level.py

+ 12 - 2
pdf2zh/converter.py

@@ -45,6 +45,10 @@ from pymupdf import Font
 log = logging.getLogger(__name__)
 
 
+shs_name = "shs"
+noto_name = "noto"
+
+
 class PDFConverterEx(PDFConverter):
     def __init__(
         self,
@@ -138,6 +142,7 @@ class TranslateConverter(PDFConverterEx):
         lang_out: str = "",
         service: str = "",
         resfont: str = "",
+        shs: Font = None,
         noto: Font = None,
         envs: Dict = None,
         prompt: List = None,
@@ -148,6 +153,7 @@ class TranslateConverter(PDFConverterEx):
         self.thread = thread
         self.layout = layout
         self.resfont = resfont
+        self.shs = shs
         self.noto = noto
         self.translator: BaseTranslator = None
         param = service.split(":", 1)
@@ -358,8 +364,10 @@ class TranslateConverter(PDFConverterEx):
         ############################################################
         # C. 新文档排版
         def raw_string(fcur: str, cstk: str):  # 编码字符串
-            if fcur == 'noto':
+            if fcur == noto_name:
                 return "".join(["%04x" % self.noto.has_glyph(ord(c)) for c in cstk])
+            elif fcur == shs_name:
+                return "".join(["%04x" % self.shs.has_glyph(ord(c)) for c in cstk])
             elif isinstance(self.fontmap[fcur], PDFCIDFont):  # 判断编码长度
                 return "".join(["%04x" % ord(c) for c in cstk])
             else:
@@ -403,8 +411,10 @@ class TranslateConverter(PDFConverterEx):
                         pass
                     if fcur_ is None:
                         fcur_ = self.resfont  # 默认非拉丁字体
-                    if fcur_ == 'noto':
+                    if fcur_ == noto_name: # FIXME: change to CONST
                         adv = self.noto.char_lengths(ch, size)[0]
+                    elif fcur_ == shs_name: # FIXME: change to CONST
+                        adv = self.shs.char_lengths(ch, size)[0]
                     else:
                         adv = self.fontmap[fcur_].char_width(ord(ch)) * size
                     ptr += 1

+ 37 - 7
pdf2zh/high_level.py

@@ -4,6 +4,7 @@ import asyncio
 import io
 import os
 import sys
+from tabnanny import verbose
 import tempfile
 import urllib.request
 from asyncio import CancelledError
@@ -20,10 +21,13 @@ from pdfminer.pdfpage import PDFPage
 from pdfminer.pdfparser import PDFParser
 from pymupdf import Document, Font
 
-from pdf2zh.converter import TranslateConverter
+from pdf2zh.converter import TranslateConverter, shs_name, noto_name
 from pdf2zh.doclayout import OnnxModel
 from pdf2zh.pdfinterp import PDFPageInterpreterEx
 
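+# FIXME: hard-coded toggle. When True, target languages found in resfont_map use the Source Han Serif font file ("shs") instead of the resfont_map entry; presumably this should become configurable.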
+USE_SHS_FONT = True
+
 resfont_map = {
     "zh-cn": "china-ss",
     "zh-tw": "china-ts",
@@ -83,6 +87,7 @@ def translate_patch(
     lang_out: str = "",
     service: str = "",
     resfont: str = "",
+    shs: Font = None,
     noto: Font = None,
     callback: object = None,
     cancellation_event: asyncio.Event = None,
@@ -103,6 +108,7 @@ def translate_patch(
         lang_out,
         service,
         resfont,
+        shs,
         noto,
         envs,
         prompt,
@@ -187,11 +193,29 @@ def translate_stream(
 ):
     font_list = [("tiro", None)]
     noto = None
+    shs = None
     if lang_out.lower() in resfont_map:  # CJK
-        resfont = resfont_map[lang_out.lower()]
-        font_list.append((resfont, None))
+        if not USE_SHS_FONT:
+            resfont = resfont_map[lang_out.lower()]
+            font_list.append((resfont, None))
+        else:
+            resfont = shs_name
+            # docker
+            ttf_path = os.environ.get("SHS_FONT_PATH", "/app/SourceHanSerif-Medium.ttc")
+            if not os.path.exists(ttf_path):
+                ttf_path = os.path.join(
+                    tempfile.gettempdir(), "SourceHanSerif-Medium.ttc"
+                )
+            if not os.path.exists(ttf_path):
+                print("Downloading SourceHanSerif font...")
+                urllib.request.urlretrieve(
+                    "https://github.com/timelic/source-han-serif/releases/download/main/SourceHanSerif-Medium.ttc",
+                    ttf_path,
+                )
+            font_list.append((shs_name, ttf_path))
+            shs = Font(shs_name, ttf_path)
     elif lang_out.lower() in noto_list:  # noto
-        resfont = "noto"
+        resfont = noto_name
         # docker
         ttf_path = os.environ.get("NOTO_FONT_PATH", "/app/GoNotoKurrent-Regular.ttf")
 
@@ -203,8 +227,8 @@ def translate_stream(
                 "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf",
                 ttf_path,
             )
-        font_list.append(("noto", ttf_path))
-        noto = Font("noto", ttf_path)
+        font_list.append((noto_name, ttf_path))
+        noto = Font(noto_name, ttf_path)
     else:  # fallback
         resfont = "china-ss"
         font_list.append(("china-ss", None))
@@ -237,6 +261,7 @@ def translate_stream(
                 pass
 
     fp = io.BytesIO()
+
     doc_zh.save(fp)
     obj_patch: dict = translate_patch(fp, **locals())
 
@@ -251,7 +276,12 @@ def translate_stream(
     for id in range(page_count):
         doc_en.move_page(page_count + id, id * 2 + 1)
 
-    return doc_zh.write(deflate=1), doc_en.write(deflate=1)
+    doc_zh.subset_fonts(fallback=True)
+    doc_en.subset_fonts(fallback=True)
+    return (
+        doc_zh.write(deflate=True, garbage=3, use_objstms=1),
+        doc_en.write(deflate=True, garbage=3, use_objstms=1),
+    )
 
 
 def convert_to_pdfa(input_path, output_path):