Эх сурвалжийг харах

feat (translator, converter): add support for DeepLX

Rongxin 1 жил өмнө
parent
commit
a0d87c73aa
3 өөрчлөгдсөн 80 нэмэгдсэн , 6 устгасан
  1. 1 0
      .gitignore
  2. 16 4
      pdf2zh/converter.py
  3. 63 2
      pdf2zh/translator.py

+ 1 - 0
.gitignore

@@ -160,3 +160,4 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+.vscode

+ 16 - 4
pdf2zh/converter.py

@@ -19,7 +19,11 @@ import numpy as np
 import unicodedata
 from tenacity import retry
 from pdf2zh import cache
-from pdf2zh.translator import BaseTranslator, GoogleTranslator, OllamaTranslator
+from pdf2zh.translator import (
+    BaseTranslator,
+    GoogleTranslator,
+    DeepLXTranslator, OllamaTranslator,
+)
 def remove_control_characters(s):
     """Return ``s`` without characters whose Unicode category starts with "C".
 
     Every character is looked up with ``unicodedata.category``; those whose
     category code begins with the letter "C" are dropped, all others are
     kept in their original order.
     """
     kept = []
     for char in s:
         if not unicodedata.category(char).startswith("C"):
             kept.append(char)
     return "".join(kept)
 
@@ -359,10 +363,18 @@ class TextConverter(PDFConverter[AnyIO]):
         self.vchar = vchar
         self.thread = thread
         self.layout = layout
-        if service=='google':
-            self.translator: BaseTranslator = GoogleTranslator(service,lang_out,lang_in)
+        if service == "google":
+            self.translator: BaseTranslator = GoogleTranslator(
+                service, lang_out, lang_in
+            )
+        elif service == "deeplx":
+            self.translator: BaseTranslator = DeepLXTranslator(
+                service, lang_out, lang_in
+            )
         else:
-            self.translator: BaseTranslator = OllamaTranslator(service,lang_out,lang_in)
+            self.translator: BaseTranslator = OllamaTranslator(
+                service, lang_out, lang_in
+            )
 
     def write_text(self, text: str) -> None:
         text = utils.compatible_encode_method(text, self.codec, "ignore")

+ 63 - 2
pdf2zh/translator.py

@@ -1,7 +1,11 @@
-import re
 import html
-import requests
+import json
+import re
+from os import getenv
+
 import ollama
+import requests
+
 
 class BaseTranslator:
     def __init__(self,service,lang_out,lang_in):
@@ -35,6 +39,63 @@ class GoogleTranslator(BaseTranslator):
             result=html.unescape(re_result[0])
         return result
 
+
+class DeepLXTranslator(BaseTranslator):
+    def __init__(
+        self,
+        service,
+        lang_out,
+        lang_in,
+    ):
+        super().__init__(service, lang_out, lang_in)
+        self.session = requests.Session()
+        self.base_link = ""
+        self.model = service
+        self.headers = {
+            "Content-Type": "application/json; charset=utf-8",
+        }
+
+    def translate(self, text):
+        text = text[:5000]  # Max Length
+        if getenv("DEEPLX_TOKEN"):
+            DEEPLX_TOKEN = getenv("DEEPLX_TOKEN")
+        else:
+            DEEPLX_TOKEN = ""
+        if DEEPLX_TOKEN == "":
+            raise ValueError("No valid env `DEEPLX_TOKEN`")
+        self.base_link = f"https://api.deeplx.org/{DEEPLX_TOKEN}/translate"
+
+        response = self.session.post(
+            self.base_link,
+            json.dumps(
+                {
+                    "target_lang": "zh",
+                    # "source_lang": self.lang_in,
+                    "text": text,
+                }
+            ),
+            headers=self.headers,
+        )
+        # re_result = re.findall(
+        #     r'(?s)class="(?:t0|result-container)">(.*?)<', response.text
+        # )
+        if response.status_code == 200:
+            result = json.loads(response.text)
+        else:
+            raise ValueError("HTTP Error")
+        try:
+            result = result["data"]
+            return "Deepl" + result
+        except KeyError:
+            result = ""
+            raise ValueError("No valid key in DeepLX's response")
+        # if len(result) == 0:
+        #     raise ValueError("Empty translation result")
+        # else:
+        #     result = html.unescape(result[0])
+        # return result
+
+
 class OllamaTranslator(BaseTranslator):
     def __init__(self,service,lang_out,lang_in):
         super().__init__(service,lang_out,lang_in)