Ver Fonte

optimize font logic

timel há 1 ano atrás
pai
commit
a5c376cc26
3 ficheiros alterados com 43 adições e 69 exclusões
  1. 8 2
      Dockerfile
  2. 0 8
      pdf2zh/converter.py
  3. 35 59
      pdf2zh/high_level.py

+ 8 - 2
Dockerfile

@@ -6,8 +6,14 @@ WORKDIR /app
 EXPOSE 7860
 
 ENV PYTHONUNBUFFERED=1
-ADD "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf" /app
-ADD "https://github.com/timelic/source-han-serif/releases/download/main/SourceHanSerif-Medium.ttc" /app
+
+# Download all required fonts
+ADD "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf" /app/
+ADD "https://github.com/timelic/source-han-serif/releases/download/main/SourceHanSerifCN-Regular.ttf" /app/
+ADD "https://github.com/timelic/source-han-serif/releases/download/main/SourceHanSerifTW-Regular.ttf" /app/
+ADD "https://github.com/timelic/source-han-serif/releases/download/main/SourceHanSerifJP-Regular.ttf" /app/
+ADD "https://github.com/timelic/source-han-serif/releases/download/main/SourceHanSerifKR-Regular.ttf" /app/
+
 RUN apt-get update && \
      apt-get install --no-install-recommends -y libgl1 && \
      rm -rf /var/lib/apt/lists/* && uv pip install --system --no-cache huggingface-hub && \

+ 0 - 8
pdf2zh/converter.py

@@ -44,8 +44,6 @@ from pymupdf import Font
 
 log = logging.getLogger(__name__)
 
-
-shs_name = "shs"
 noto_name = "noto"
 
 
@@ -142,7 +140,6 @@ class TranslateConverter(PDFConverterEx):
         lang_out: str = "",
         service: str = "",
         resfont: str = "",
-        shs: Font = None,
         noto: Font = None,
         envs: Dict = None,
         prompt: List = None,
@@ -153,7 +150,6 @@ class TranslateConverter(PDFConverterEx):
         self.thread = thread
         self.layout = layout
         self.resfont = resfont
-        self.shs = shs
         self.noto = noto
         self.translator: BaseTranslator = None
         param = service.split(":", 1)
@@ -366,8 +362,6 @@ class TranslateConverter(PDFConverterEx):
         def raw_string(fcur: str, cstk: str):  # 编码字符串
             if fcur == noto_name:
                 return "".join(["%04x" % self.noto.has_glyph(ord(c)) for c in cstk])
-            elif fcur == shs_name:
-                return "".join(["%04x" % self.shs.has_glyph(ord(c)) for c in cstk])
             elif isinstance(self.fontmap[fcur], PDFCIDFont):  # 判断编码长度
                 return "".join(["%04x" % ord(c) for c in cstk])
             else:
@@ -413,8 +407,6 @@ class TranslateConverter(PDFConverterEx):
                         fcur_ = self.resfont  # 默认非拉丁字体
                     if fcur_ == noto_name: # FIXME: change to CONST
                         adv = self.noto.char_lengths(ch, size)[0]
-                    elif fcur_ == shs_name: # FIXME: change to CONST
-                        adv = self.shs.char_lengths(ch, size)[0]
                     else:
                         adv = self.fontmap[fcur_].char_width(ord(ch)) * size
                     ptr += 1

+ 35 - 59
pdf2zh/high_level.py

@@ -21,23 +21,10 @@ from pdfminer.pdfpage import PDFPage
 from pdfminer.pdfparser import PDFParser
 from pymupdf import Document, Font
 
-from pdf2zh.converter import TranslateConverter, shs_name, noto_name
+from pdf2zh.converter import TranslateConverter, noto_name
 from pdf2zh.doclayout import OnnxModel
 from pdf2zh.pdfinterp import PDFPageInterpreterEx
 
-# FIXME
-USE_SHS_FONT = True
-
-resfont_map = {
-    "zh-cn": "china-ss",
-    "zh-tw": "china-ts",
-    "zh-hans": "china-ss",
-    "zh-hant": "china-ts",
-    "zh": "china-ss",
-    "ja": "japan-s",
-    "ko": "korea-s",
-}
-
 noto_list = [
     "am",  # Amharic
     "ar",  # Arabic
@@ -48,18 +35,14 @@ noto_list = [
     "gu",  # Gujarati
     "iw",  # Hebrew
     "hi",  # Hindi
-    # "ja",  # Japanese
     "kn",  # Kannada
-    # "ko",  # Korean
     "ml",  # Malayalam
     "mr",  # Marathi
     "ru",  # Russian
     "sr",  # Serbian
-    # "zh-cn",# SC
     "ta",  # Tamil
     "te",  # Telugu
     "th",  # Thai
-    # "zh-tw",# TC
     "ur",  # Urdu
     "uk",  # Ukrainian
 ]
@@ -87,7 +70,6 @@ def translate_patch(
     lang_out: str = "",
     service: str = "",
     resfont: str = "",
-    shs: Font = None,
     noto: Font = None,
     callback: object = None,
     cancellation_event: asyncio.Event = None,
@@ -108,7 +90,6 @@ def translate_patch(
         lang_out,
         service,
         resfont,
-        shs,
         noto,
         envs,
         prompt,
@@ -192,51 +173,18 @@ def translate_stream(
     **kwarg: Any,
 ):
     font_list = [("tiro", None)]
-    noto = None
-    shs = None
-    ttf_path = None
-    if lang_out.lower() in resfont_map:  # CJK
-        if not USE_SHS_FONT:
-            resfont = resfont_map[lang_out.lower()]
-        else:
-            resfont = shs_name
-            # docker
-            ttf_path = os.environ.get("SHS_FONT_PATH", "/app/SourceHanSerif-Medium.ttc")
-            if not os.path.exists(ttf_path):
-                ttf_path = os.path.join(
-                    tempfile.gettempdir(), "SourceHanSerif-Medium.ttc"
-                )
-            if not os.path.exists(ttf_path):
-                print("Downloading SourceHanSerif font...")
-                urllib.request.urlretrieve(
-                    "https://github.com/timelic/source-han-serif/releases/download/main/SourceHanSerif-Medium.ttc",
-                    ttf_path,
-                )
-            shs = Font(shs_name, ttf_path)
-    elif lang_out.lower() in noto_list:  # noto
-        resfont = noto_name
-        # docker
-        ttf_path = os.environ.get("NOTO_FONT_PATH", "/app/GoNotoKurrent-Regular.ttf")
-
-        if not os.path.exists(ttf_path):
-            ttf_path = os.path.join(tempfile.gettempdir(), "GoNotoKurrent-Regular.ttf")
-        if not os.path.exists(ttf_path):
-            print("Downloading Noto font...")
-            urllib.request.urlretrieve(
-                "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf",
-                ttf_path,
-            )
-        noto = Font(noto_name, ttf_path)
-    else:  # fallback
-        resfont = "china-ss"
-    font_list.append((resfont, ttf_path))
+
+    font_path = download_remote_fonts(lang_out.lower())
+    resfont = noto_name
+    noto = Font(noto_name, font_path)
+    font_list.append((resfont, font_path))
 
     doc_en = Document(stream=stream)
     stream = io.BytesIO()
     doc_en.save(stream)
     doc_zh = Document(stream=stream)
     page_count = doc_zh.page_count
-    # font_list = [("china-ss", None), ("tiro", None)]
+    # font_list = [("GoNotoKurrent-Regular.ttf", font_path), ("tiro", None)]
     font_id = {}
     for page in doc_zh:
         for font in font_list:
@@ -414,3 +362,31 @@ def translate(
         result_files.append((str(file_mono), str(file_dual)))
 
     return result_files
+
+
+def download_remote_fonts(lang: str):
+    """Return a local path to the font file used for output language *lang*.
+
+    The font file name is picked from the language code: every code in
+    ``noto_list`` and every unrecognized code map to
+    ``GoNotoKurrent-Regular.ttf``; the Chinese, Japanese and Korean codes
+    map to the matching regional ``SourceHanSerif*-Regular.ttf``.
+
+    The file itself is looked up in this order:
+
+    1. ``$NOTO_FONT_PATH`` if that variable is set, otherwise
+       ``/app/<font_name>`` (the docker location).
+    2. ``<system temp dir>/<font_name>``.
+    3. Downloaded from ``URL_PREFIX`` into the temp-dir location.
+
+    Args:
+        lang: Output language code. It is compared as given against the map
+            keys, which are all lower-case.
+
+    Returns:
+        The font path as a string.
+
+    Raises:
+        OSError: If the download or the final rename fails
+            (``urllib.error.URLError`` is an ``OSError`` subclass).
+    """
+    # NOTE(review): every font, GoNotoKurrent included, is fetched from this
+    # one release -- confirm that it really hosts GoNotoKurrent-Regular.ttf.
+    URL_PREFIX = "https://github.com/timelic/source-han-serif/releases/download/main/"
+    DEFAULT_FONT = "GoNotoKurrent-Regular.ttf"
+    LANG_NAME_MAP = {
+        **{la: DEFAULT_FONT for la in noto_list},
+        **{
+            la: f"SourceHanSerif{region}-Regular.ttf"
+            for region, langs in {
+                "CN": ["zh-cn", "zh-hans", "zh"],
+                "TW": ["zh-tw", "zh-hant"],
+                "JP": ["ja"],
+                "KR": ["ko"],
+            }.items()
+            for la in langs
+        },
+    }
+    font_name = LANG_NAME_MAP.get(lang, DEFAULT_FONT)
+
+    # docker
+    # NOTE(review): NOTO_FONT_PATH, when set and present on disk, is returned
+    # for every language, including the SourceHanSerif ones -- confirm that
+    # this is intended.
+    font_path = os.environ.get("NOTO_FONT_PATH", Path("/app", font_name).as_posix())
+    if Path(font_path).exists():
+        return font_path
+
+    # Fall back to a per-font cache file in the system temp directory.
+    font_path = Path(tempfile.gettempdir(), font_name).as_posix()
+    if Path(font_path).exists():
+        return font_path
+
+    print(f"Downloading {font_name}...")
+    # Download to a sibling ".part" file and rename it into place afterwards:
+    # an interrupted or failed transfer must never leave a truncated file at
+    # font_path, because later calls treat any existing file there as a valid
+    # cached font. The PID suffix keeps concurrent processes from writing to
+    # the same partial file.
+    partial_path = f"{font_path}.{os.getpid()}.part"
+    try:
+        urllib.request.urlretrieve(f"{URL_PREFIX}{font_name}", partial_path)
+        os.replace(partial_path, font_path)
+    finally:
+        # After a successful rename the partial file is gone and this is a
+        # no-op; after a failure it removes the leftover partial download.
+        if os.path.exists(partial_path):
+            os.remove(partial_path)
+
+    return font_path