Byaidu 1 год назад
Родитель
Commit
6981951ef5
3 изменённых файла: 18 добавлений и 33 удаления
  1. 2 2
      pdf2zh/converter.py
  2. 13 10
      pdf2zh/pdf2zh.py
  3. 3 21
      pdf2zh/translator.py

+ 2 - 2
pdf2zh/converter.py

@@ -414,8 +414,8 @@ class TranslateConverter(PDFConverterEx):
                         cstk = ""
                 if brk and x + adv > x1 + 0.1 * size:  # 到达右边界且原文段落存在换行
                     x = x0
-                    lang_space = {"zh-CN": 1.4, "zh-TW": 1.4, "ja": 1.1, "ko": 1.2, "en": 1.2, "ar": 1.0, "ru": 0.8, "uk": 0.8, "ta": 0.8}
-                    y -= size * lang_space.get(self.translator.lang_out, 1.1)  # 小语种大多适配 1.1
+                    lang_space = {"zh-cn": 1.4, "zh-tw": 1.4, "zh-hans": 1.4, "zh-hant": 1.4, "zh": 1.4, "ja": 1.1, "ko": 1.2, "en": 1.2, "ar": 1.0, "ru": 0.8, "uk": 0.8, "ta": 0.8}
+                    y -= size * lang_space.get(self.translator.lang_out.lower(), 1.1)  # 小语种大多适配 1.1
                 if vy_regex:  # 插入公式
                     fix = 0
                     if fcur is not None:  # 段落内公式修正纵向偏移

+ 13 - 10
pdf2zh/pdf2zh.py

@@ -27,8 +27,11 @@ logging.basicConfig()
 model = DocLayoutModel.load_available()
 
 resfont_map = {
-    "zh-CN": "china-ss",
-    "zh-TW": "china-ts",
+    "zh-cn": "china-ss",
+    "zh-tw": "china-ts",
+    "zh-hans": "china-ss",
+    "zh-hant": "china-ts",
+    "zh": "china-ss",
     "ja": "japan-s",
     "ko": "korea-s",
 }
@@ -49,11 +52,11 @@ noto_list = [
     "mr",  # Marathi
     "ru",  # Russian
     "sr",  # Serbian
-    # "zh-CN",# Chinese (PRC)
+    # "zh-cn",# SC
     "ta",  # Tamil
     "te",  # Telugu
     "th",  # Thai
-    # "zh-TW",# Chinese (Taiwan)
+    # "zh-tw",# TC
     "ur",  # Urdu
     "uk",  # Ukrainian
 ]
@@ -114,10 +117,10 @@ def extract_text(
 
         font_list = [("tiro", None)]
         noto = None
-        if lang_out in resfont_map:  # CJK
-            resfont = resfont_map[lang_out]
+        if lang_out.lower() in resfont_map:  # CJK
+            resfont = resfont_map[lang_out.lower()]
             font_list.append((resfont, None))
-        elif lang_out in noto_list:  # noto
+        elif lang_out.lower() in noto_list:  # noto
             resfont = "noto"
             ttf_path = os.path.join(tempfile.gettempdir(), "GoNotoKurrent-Regular.ttf")
             if not os.path.exists(ttf_path):
@@ -128,7 +131,7 @@ def extract_text(
                 )
             font_list.append(("noto", ttf_path))
             noto = pymupdf.Font("noto", ttf_path)
-        else:  # auto
+        else:  # fallback
             resfont = "china-ss"
             font_list.append(("china-ss", None))
 
@@ -240,14 +243,14 @@ def create_parser() -> argparse.ArgumentParser:
         "--lang-in",
         "-li",
         type=str,
-        default="auto",
+        default="en",
         help="The code of source language.",
     )
     parse_params.add_argument(
         "--lang-out",
         "-lo",
         type=str,
-        default="auto",
+        default="zh",
         help="The code of target language.",
     )
     parse_params.add_argument(

+ 3 - 21
pdf2zh/translator.py

@@ -25,11 +25,9 @@ class BaseTranslator:
     envs = {}
     lang_map = {}
 
-    def __init__(self, service, lang_out, lang_in, model):
-        lang_out = "zh" if lang_out == "auto" else lang_out
-        lang_in = "en" if lang_in == "auto" else lang_in
-        lang_out = self.lang_map.get(lang_out, lang_out)
-        lang_in = self.lang_map.get(lang_in, lang_in)
+    def __init__(self, service, lang_out: str, lang_in: str, model):
+        lang_out = self.lang_map.get(lang_out.lower(), lang_out)
+        lang_in = self.lang_map.get(lang_in.lower(), lang_in)
         self.service = service
         self.lang_out = lang_out
         self.lang_in = lang_in
@@ -59,8 +57,6 @@ class GoogleTranslator(BaseTranslator):
     lang_map = {"zh": "zh-CN"}
 
     def __init__(self, service, lang_out, lang_in, model):
-        lang_out = "zh-CN" if lang_out == "auto" else lang_out
-        lang_in = "en" if lang_in == "auto" else lang_in
         super().__init__(service, lang_out, lang_in, model)
         self.session = requests.Session()
         self.endpoint = "http://translate.google.com/m"
@@ -92,8 +88,6 @@ class BingTranslator(BaseTranslator):
     lang_map = {"zh": "zh-Hans"}
 
     def __init__(self, service, lang_out, lang_in, model):
-        lang_out = "zh-Hans" if lang_out == "auto" else lang_out
-        lang_in = "en" if lang_in == "auto" else lang_in
         super().__init__(service, lang_out, lang_in, model)
         self.session = requests.Session()
         self.endpoint = "https://www.bing.com/ttranslatev3"
@@ -136,8 +130,6 @@ class TencentTranslator(BaseTranslator):
     }
 
     def __init__(self, service, lang_out, lang_in, model):
-        lang_out = "zh" if lang_out == "auto" else lang_out
-        lang_in = "en" if lang_in == "auto" else lang_in
         super().__init__(service, lang_out, lang_in, model)
         cred = credential.DefaultCredentialProvider().get_credential()
         self.client = TmtClient(cred, "ap-beijing")
@@ -162,8 +154,6 @@ class DeepLTranslator(BaseTranslator):
     lang_map = {"zh": "zh-Hans"}
 
     def __init__(self, service, lang_out, lang_in, model):
-        lang_out = "zh" if lang_out == "auto" else lang_out
-        lang_in = "en" if lang_in == "auto" else lang_in
         super().__init__(service, lang_out, lang_in, model)
         self.session = requests.Session()
         server_url = os.getenv("DEEPL_SERVER_URL")
@@ -186,8 +176,6 @@ class DeepLXTranslator(BaseTranslator):
     lang_map = {"zh": "zh-Hans"}
 
     def __init__(self, service, lang_out, lang_in, model):
-        lang_out = "zh" if lang_out == "auto" else lang_out
-        lang_in = "en" if lang_in == "auto" else lang_in
         super().__init__(service, lang_out, lang_in, model)
         self.endpoint = os.getenv("DEEPLX_ENDPOINT")
         self.session = requests.Session()
@@ -213,8 +201,6 @@ class OllamaTranslator(BaseTranslator):
     }
 
     def __init__(self, service, lang_out, lang_in, model):
-        lang_out = "zh-CN" if lang_out == "auto" else lang_out
-        lang_in = "en" if lang_in == "auto" else lang_in
         if not model:
             model = os.getenv("OLLAMA_MODEL", self.envs["OLLAMA_MODEL"])
         super().__init__(service, lang_out, lang_in, model)
@@ -240,8 +226,6 @@ class OpenAITranslator(BaseTranslator):
     }
 
     def __init__(self, service, lang_out, lang_in, model):
-        lang_out = "zh-CN" if lang_out == "auto" else lang_out
-        lang_in = "en" if lang_in == "auto" else lang_in
         if not model:
             model = os.getenv("OPENAI_MODEL", self.envs["OPENAI_MODEL"])
         super().__init__(service, lang_out, lang_in, model)
@@ -267,8 +251,6 @@ class AzureTranslator(BaseTranslator):
     lang_map = {"zh": "zh-Hans"}
 
     def __init__(self, service, lang_out, lang_in, model):
-        lang_out = "zh-Hans" if lang_out == "auto" else lang_out
-        lang_in = "en" if lang_in == "auto" else lang_in
         super().__init__(service, lang_out, lang_in, model)
         endpoint = os.environ["AZURE_ENDPOINT"]
         api_key = os.environ["AZURE_APIKEY"]