Sfoglia il codice sorgente

Merge pull request #337 from timelic/dev/SourceHanSerif

Byaidu 1 anno fa
parent
commit
d22bbc6cf4
4 file modificati con 59 aggiunte e 46 eliminazioni
  1. 9 2
      Dockerfile
  2. 5 5
      pdf2zh/converter.py
  3. 44 39
      pdf2zh/high_level.py
  4. 1 0
      pyproject.toml

+ 9 - 2
Dockerfile

@@ -6,7 +6,14 @@ WORKDIR /app
 EXPOSE 7860
 
 ENV PYTHONUNBUFFERED=1
-ADD "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf" /app
+
+# Download all required fonts
+ADD "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf" /app/
+ADD "https://github.com/timelic/source-han-serif/releases/download/main/SourceHanSerifCN-Regular.ttf" /app/
+ADD "https://github.com/timelic/source-han-serif/releases/download/main/SourceHanSerifTW-Regular.ttf" /app/
+ADD "https://github.com/timelic/source-han-serif/releases/download/main/SourceHanSerifJP-Regular.ttf" /app/
+ADD "https://github.com/timelic/source-han-serif/releases/download/main/SourceHanSerifKR-Regular.ttf" /app/
+
 RUN apt-get update && \
      apt-get install --no-install-recommends -y libgl1 && \
      rm -rf /var/lib/apt/lists/* && uv pip install --system --no-cache huggingface-hub && \
@@ -16,4 +23,4 @@ COPY . .
 
 RUN uv pip install --system --no-cache .
 
-CMD ["pdf2zh", "-i"]
+CMD ["pdf2zh", "-i"]

+ 5 - 5
pdf2zh/converter.py

@@ -138,7 +138,7 @@ class TranslateConverter(PDFConverterEx):
         lang_in: str = "",
         lang_out: str = "",
         service: str = "",
-        resfont: str = "",
+        noto_name: str = "",
         noto: Font = None,
         envs: Dict = None,
         prompt: List = None,
@@ -148,7 +148,7 @@ class TranslateConverter(PDFConverterEx):
         self.vchar = vchar
         self.thread = thread
         self.layout = layout
-        self.resfont = resfont
+        self.noto_name = noto_name
         self.noto = noto
         self.translator: BaseTranslator = None
         param = service.split(":", 1)
@@ -359,7 +359,7 @@ class TranslateConverter(PDFConverterEx):
         ############################################################
         # C. 新文档排版
         def raw_string(fcur: str, cstk: str):  # 编码字符串
-            if fcur == 'noto':
+            if fcur == self.noto_name:
                 return "".join(["%04x" % self.noto.has_glyph(ord(c)) for c in cstk])
             elif isinstance(self.fontmap[fcur], PDFCIDFont):  # 判断编码长度
                 return "".join(["%04x" % ord(c) for c in cstk])
@@ -403,8 +403,8 @@ class TranslateConverter(PDFConverterEx):
                     except Exception:
                         pass
                     if fcur_ is None:
-                        fcur_ = self.resfont  # 默认非拉丁字体
-                    if fcur_ == 'noto':
+                        fcur_ = self.noto_name  # 默认非拉丁字体
+                    if fcur_ == self.noto_name: # FIXME: change to CONST
                         adv = self.noto.char_lengths(ch, size)[0]
                     else:
                         adv = self.fontmap[fcur_].char_width(ord(ch)) * size

+ 44 - 39
pdf2zh/high_level.py

@@ -24,15 +24,7 @@ from pdf2zh.converter import TranslateConverter
 from pdf2zh.doclayout import OnnxModel
 from pdf2zh.pdfinterp import PDFPageInterpreterEx
 
-resfont_map = {
-    "zh-cn": "china-ss",
-    "zh-tw": "china-ts",
-    "zh-hans": "china-ss",
-    "zh-hant": "china-ts",
-    "zh": "china-ss",
-    "ja": "japan-s",
-    "ko": "korea-s",
-}
+NOTO_NAME = "noto"
 
 noto_list = [
     "am",  # Amharic
@@ -44,18 +36,14 @@ noto_list = [
     "gu",  # Gujarati
     "iw",  # Hebrew
     "hi",  # Hindi
-    # "ja",  # Japanese
     "kn",  # Kannada
-    # "ko",  # Korean
     "ml",  # Malayalam
     "mr",  # Marathi
     "ru",  # Russian
     "sr",  # Serbian
-    # "zh-cn",# SC
     "ta",  # Tamil
     "te",  # Telugu
     "th",  # Thai
-    # "zh-tw",# TC
     "ur",  # Urdu
     "uk",  # Ukrainian
 ]
@@ -82,7 +70,7 @@ def translate_patch(
     lang_in: str = "",
     lang_out: str = "",
     service: str = "",
-    resfont: str = "",
+    noto_name: str = "",
     noto: Font = None,
     callback: object = None,
     cancellation_event: asyncio.Event = None,
@@ -102,7 +90,7 @@ def translate_patch(
         lang_in,
         lang_out,
         service,
-        resfont,
+        noto_name,
         noto,
         envs,
         prompt,
@@ -186,35 +174,18 @@ def translate_stream(
     **kwarg: Any,
 ):
     font_list = [("tiro", None)]
-    noto = None
-    if lang_out.lower() in resfont_map:  # CJK
-        resfont = resfont_map[lang_out.lower()]
-        font_list.append((resfont, None))
-    elif lang_out.lower() in noto_list:  # noto
-        resfont = "noto"
-        # docker
-        ttf_path = os.environ.get("NOTO_FONT_PATH", "/app/GoNotoKurrent-Regular.ttf")
-
-        if not os.path.exists(ttf_path):
-            ttf_path = os.path.join(tempfile.gettempdir(), "GoNotoKurrent-Regular.ttf")
-        if not os.path.exists(ttf_path):
-            print("Downloading Noto font...")
-            urllib.request.urlretrieve(
-                "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf",
-                ttf_path,
-            )
-        font_list.append(("noto", ttf_path))
-        noto = Font("noto", ttf_path)
-    else:  # fallback
-        resfont = "china-ss"
-        font_list.append(("china-ss", None))
+
+    font_path = download_remote_fonts(lang_out.lower())
+    noto_name = NOTO_NAME
+    noto = Font(noto_name, font_path)
+    font_list.append((noto_name, font_path))
 
     doc_en = Document(stream=stream)
     stream = io.BytesIO()
     doc_en.save(stream)
     doc_zh = Document(stream=stream)
     page_count = doc_zh.page_count
-    # font_list = [("china-ss", None), ("tiro", None)]
+    # font_list = [("GoNotoKurrent-Regular.ttf", font_path), ("tiro", None)]
     font_id = {}
     for page in doc_zh:
         for font in font_list:
@@ -237,6 +208,7 @@ def translate_stream(
                 pass
 
     fp = io.BytesIO()
+
     doc_zh.save(fp)
     obj_patch: dict = translate_patch(fp, **locals())
 
@@ -251,7 +223,12 @@ def translate_stream(
     for id in range(page_count):
         doc_en.move_page(page_count + id, id * 2 + 1)
 
-    return doc_zh.write(deflate=1), doc_en.write(deflate=1)
+    doc_zh.subset_fonts(fallback=True)
+    doc_en.subset_fonts(fallback=True)
+    return (
+        doc_zh.write(deflate=True, garbage=3, use_objstms=1),
+        doc_en.write(deflate=True, garbage=3, use_objstms=1),
+    )
 
 
 def convert_to_pdfa(input_path, output_path):
@@ -386,3 +363,31 @@ def translate(
         result_files.append((str(file_mono), str(file_dual)))
 
     return result_files
+
+
+def download_remote_fonts(lang: str):  # Pick the font file for `lang` and return a local path to it, downloading it when no local copy exists.
+    URL_PREFIX = "https://github.com/timelic/source-han-serif/releases/download/main/"  # NOTE(review): also used to fetch GoNotoKurrent-Regular.ttf, which the Dockerfile pulls from satbyy/go-noto-universal -- confirm that file is published under this release too
+    LANG_NAME_MAP = {
+        **{la: "GoNotoKurrent-Regular.ttf" for la in noto_list},  # every language code in noto_list maps to the Go Noto font
+        **{
+            la: f"SourceHanSerif{region}-Regular.ttf"  # Chinese/Japanese/Korean codes map to the regional Source Han Serif file
+            for region, langs in {
+                "CN": ["zh-cn", "zh-hans", "zh"],
+                "TW": ["zh-tw", "zh-hant"],
+                "JP": ["ja"],
+                "KR": ["ko"],
+            }.items()
+            for la in langs
+        },
+    }
+    font_name = LANG_NAME_MAP.get(lang, "GoNotoKurrent-Regular.ttf")  # codes missing from the map fall back to Go Noto; the lookup is exact, so callers pass a lower-cased code
+
+    # Lookup order: NOTO_FONT_PATH if set (else /app/<font_name>, where the Docker image stores fonts), then the temp directory, then a download into the temp directory.
+    font_path = os.environ.get("NOTO_FONT_PATH", Path("/app", font_name).as_posix())  # the env override applies to every language, not only Noto ones
+    if not Path(font_path).exists():
+        font_path = Path(tempfile.gettempdir(), font_name).as_posix()
+    if not Path(font_path).exists():
+        print(f"Downloading {font_name}...")
+        urllib.request.urlretrieve(f"{URL_PREFIX}{font_name}", font_path)  # NOTE(review): writes straight to the final path; an interrupted download may leave a partial file that later passes the exists() check -- confirm
+
+    return font_path

+ 1 - 0
pyproject.toml

@@ -32,6 +32,7 @@ dependencies = [
     "pikepdf",
     "peewee>=3.17.8",
     "argostranslate",
+    "fontTools"
 ]
 
 [project.optional-dependencies]