Byaidu 1 год назад
Родитель
Коммит
7a2ef0ae79
2 изменённых файла: 68 добавлений и 184 удаления
  1. 67 184
      pdf2zh/translator.py
  2. 1 0
      pyproject.toml

+ 67 - 184
pdf2zh/translator.py

@@ -1,13 +1,7 @@
-import hashlib
-import hmac
 import html
 import logging
 import os
 import re
-import time
-from datetime import timezone, datetime
-
-from json import dumps, loads
 import unicodedata
 
 import deepl
@@ -16,6 +10,10 @@ import openai
 import requests
 from azure.ai.translation.text import TextTranslationClient
 from azure.core.credentials import AzureKeyCredential
+from tencentcloud.common import credential
+from tencentcloud.tmt.v20180321.tmt_client import TmtClient
+from tencentcloud.tmt.v20180321.models import TextTranslateRequest
+from tencentcloud.tmt.v20180321.models import TextTranslateResponse
 
 
 def remove_control_characters(s):
@@ -23,13 +21,16 @@ def remove_control_characters(s):
 
 
 class BaseTranslator:
+    envs = {}
+
     def __init__(self, service, lang_out, lang_in, model):
         self.service = service
         self.lang_out = lang_out
         self.lang_in = lang_in
         self.model = model
 
-    def translate(self, text) -> str: ...  # noqa: E704
+    def translate(self, text):
+        pass
 
     def __str__(self):
         return f"{self.service} {self.lang_out} {self.lang_in}"
@@ -41,7 +42,7 @@ class GoogleTranslator(BaseTranslator):
         lang_in = "en" if lang_in == "auto" else lang_in
         super().__init__(service, lang_out, lang_in, model)
         self.session = requests.Session()
-        self.base_link = "http://translate.google.com/m"
+        self.endpoint = "http://translate.google.com/m"
         self.headers = {
             "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)"  # noqa: E501
         }
@@ -49,7 +50,7 @@ class GoogleTranslator(BaseTranslator):
     def translate(self, text):
         text = text[:5000]  # google translate max length
         response = self.session.get(
-            self.base_link,
+            self.endpoint,
             params={"tl": self.lang_out, "sl": self.lang_in, "q": text},
             headers=self.headers,
         )
@@ -58,195 +59,74 @@ class GoogleTranslator(BaseTranslator):
         )
         if response.status_code == 400:
             result = "IRREPARABLE TRANSLATION ERROR"
-        elif len(re_result) == 0:
-            raise ValueError("Empty translation result")
         else:
             result = html.unescape(re_result[0])
         return remove_control_characters(result)
 
 
 class TencentTranslator(BaseTranslator):
-    def sign(self, key, msg):
-        return hmac.new(key, msg.encode("utf-8"), hashlib.sha256).digest()
+    # https://github.com/TencentCloud/tencentcloud-sdk-python
+    envs = {
+        "TENCENTCLOUD_SECRET_ID": None,
+        "TENCENTCLOUD_SECRET_KEY": None,
+    }
 
     def __init__(self, service, lang_out, lang_in, model):
         lang_out = "zh" if lang_out == "auto" else lang_out
         lang_in = "en" if lang_in == "auto" else lang_in
         super().__init__(service, lang_out, lang_in, model)
-        try:
-            server_url = "tmt.tencentcloudapi.com"
-            self.secret_id = os.getenv("TENCENT_SECRET_ID")
-            self.secret_key = os.getenv("TENCENT_SECRET_KEY")
-
-        except KeyError as e:
-            missing_var = e.args[0]
-            raise ValueError(
-                f"The environment variable '{missing_var}' is required but not set."
-            ) from e
-
-        self.session = requests.Session()
-        self.base_link = f"{server_url}"
+        cred = credential.DefaultCredentialProvider().get_credential()
+        self.client = TmtClient(cred, "ap-beijing")
+        self.req = TextTranslateRequest()
+        self.req.Source = self.lang_in
+        self.req.Target = self.lang_out
+        self.req.ProjectId = 0
 
     def translate(self, text):
-        text = text[:5000]
-        data = {
-            "SourceText": text,
-            "Source": self.lang_in,
-            "Target": self.lang_out,
-            "ProjectId": 0,
-        }
-        payloadx = dumps(data)
-        hashed_request_payload = hashlib.sha256(payloadx.encode("utf-8")).hexdigest()
-        canonical_request = (
-            "POST"
-            + "\n"
-            + "/"
-            + "\n"
-            + ""
-            + "\n"
-            + "content-type:application/json; charset=utf-8\nhost:tmt.tencentcloudapi.com\nx-tc-action:texttranslate\n"
-            + "\n"
-            + "content-type;host;x-tc-action"
-            + "\n"
-            + hashed_request_payload
-        )
-
-        timestamp = int(time.time())
-        date = datetime.fromtimestamp(timestamp, timezone.utc).strftime("%Y-%m-%d")
-        credential_scope = date + "/tmt/tc3_request"
-        hashed_canonical_request = hashlib.sha256(
-            canonical_request.encode("utf-8")
-        ).hexdigest()
-        algorithm = "TC3-HMAC-SHA256"
-        string_to_sign = (
-            algorithm
-            + "\n"
-            + str(timestamp)
-            + "\n"
-            + credential_scope
-            + "\n"
-            + hashed_canonical_request
-        )
-        secret_date = self.sign(("TC3" + str(self.secret_key)).encode("utf-8"), date)
-        secret_service = self.sign(secret_date, "tmt")
-        secret_signing = self.sign(secret_service, "tc3_request")
-        signed_headers = "content-type;host;x-tc-action"
-        signature = hmac.new(
-            secret_signing, string_to_sign.encode("utf-8"), hashlib.sha256
-        ).hexdigest()
-        authorization = (
-            algorithm
-            + " "
-            + "Credential="
-            + str(self.secret_id)
-            + "/"
-            + credential_scope
-            + ", "
-            + "SignedHeaders="
-            + signed_headers
-            + ", "
-            + "Signature="
-            + signature
-        )
-        self.headers = {
-            "Authorization": authorization,
-            "Content-Type": "application/json; charset=utf-8",
-            "Host": "tmt.tencentcloudapi.com",
-            "X-TC-Action": "TextTranslate",
-            "X-TC-Region": "ap-beijing",
-            "X-TC-Timestamp": str(timestamp),
-            "X-TC-Version": "2018-03-21",
-        }
-
-        response = self.session.post(
-            "https://" + self.base_link,
-            json=data,
-            headers=self.headers,
-        )
-        # 1. Status code test
-        if response.status_code == 200:
-            result = loads(response.text)
-        else:
-            raise ValueError("HTTP error: " + str(response.status_code))
-        # 2. Result test
-        try:
-            result = result["Response"]["TargetText"]
-            # return result
-        except KeyError:
-            result = ""
-        #     raise ValueError("No valid key in Tencent's response")
-        # # 3. Result length check
-        # if len(result) == 0:
-        #     raise ValueError("Empty translation result")
-        return result
+        self.req.SourceText = text
+        resp: TextTranslateResponse = self.client.TextTranslate(self.req)
+        return resp.TargetText
 
 
 class DeepLXTranslator(BaseTranslator):
+    # https://deeplx.owo.network/endpoints/free.html
+    envs = {
+        "DEEPLX_ENDPOINT": "https://api.deepl.com/v2/translate",
+    }
+
     def __init__(self, service, lang_out, lang_in, model):
         lang_out = "zh" if lang_out == "auto" else lang_out
         lang_in = "en" if lang_in == "auto" else lang_in
         super().__init__(service, lang_out, lang_in, model)
-        try:
-            auth_key = os.getenv("DEEPLX_AUTH_KEY")
-            server_url = (
-                "https://api.deeplx.org"
-                if not os.getenv("DEEPLX_SERVER_URL")
-                else os.getenv("DEEPLX_SERVER_URL")
-            )
-        except KeyError as e:
-            missing_var = e.args[0]
-            raise ValueError(
-                f"The environment variable '{missing_var}' is required but not set."
-            ) from e
-
+        self.endpoint = os.getenv("DEEPLX_ENDPOINT")
         self.session = requests.Session()
-        server_url = str(server_url).rstrip("/")
-        if auth_key:
-            self.base_link = f"{server_url}/{auth_key}/translate"
-        else:
-            self.base_link = f"{server_url}/translate"
-        self.headers = {
-            "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)"  # noqa: E501
-        }
 
     def translate(self, text):
-        text = text[:5000]  # google translate max length
-        response = self.session.post(
-            self.base_link,
-            dumps(
-                {
-                    "target_lang": self.lang_out,
-                    "text": text,
-                }
-            ),
-            headers=self.headers,
+        resp = self.session.post(
+            self.endpoint,
+            json={
+                "source_lang": self.lang_in,
+                "target_lang": self.lang_out,
+                "text": text,
+            },
         )
-        # 1. Status code test
-        if response.status_code == 200:
-            result = loads(response.text)
-        else:
-            raise ValueError("HTTP error: " + str(response.status_code))
-        # 2. Result test
-        try:
-            result = result["data"]
-            return result
-        except KeyError:
-            result = ""
-            raise ValueError("No valid key in DeepLX's response")
-        # 3. Result length check
-        if len(result) == 0:
-            raise ValueError("Empty translation result")
-        return result
+        return resp.json()["data"]
 
 
 class DeepLTranslator(BaseTranslator):
+    # https://github.com/DeepLcom/deepl-python
+    envs = {
+        "DEEPL_SERVER_URL": "https://api.deepl.com",
+        "DEEPL_AUTH_KEY": None,
+    }
+
     def __init__(self, service, lang_out, lang_in, model):
-        lang_out = "ZH" if lang_out == "auto" else lang_out
-        lang_in = "EN" if lang_in == "auto" else lang_in
+        lang_out = "zh" if lang_out == "auto" else lang_out
+        lang_in = "en" if lang_in == "auto" else lang_in
         super().__init__(service, lang_out, lang_in, model)
         self.session = requests.Session()
-        auth_key = os.getenv("DEEPL_AUTH_KEY")
         server_url = os.getenv("DEEPL_SERVER_URL")
+        auth_key = os.getenv("DEEPL_AUTH_KEY")
         self.client = deepl.Translator(auth_key, server_url=server_url)
 
     def translate(self, text):
@@ -257,12 +137,16 @@ class DeepLTranslator(BaseTranslator):
 
 
 class OllamaTranslator(BaseTranslator):
+    # https://github.com/ollama/ollama-python
+    envs = {
+        "OLLAMA_HOST": "http://127.0.0.1:11434",
+    }
+
     def __init__(self, service, lang_out, lang_in, model):
         lang_out = "zh-CN" if lang_out == "auto" else lang_out
         lang_in = "en" if lang_in == "auto" else lang_in
         super().__init__(service, lang_out, lang_in, model)
         self.options = {"temperature": 0}  # 随机采样可能会打断公式标记
-        # OLLAMA_HOST
         self.client = ollama.Client()
 
     def translate(self, text):
@@ -284,13 +168,17 @@ class OllamaTranslator(BaseTranslator):
 
 
 class OpenAITranslator(BaseTranslator):
+    # https://github.com/openai/openai-python
+    envs = {
+        "OPENAI_BASE_URL": "https://api.openai.com/v1",
+        "OPENAI_API_KEY": None,
+    }
+
     def __init__(self, service, lang_out, lang_in, model):
         lang_out = "zh-CN" if lang_out == "auto" else lang_out
         lang_in = "en" if lang_in == "auto" else lang_in
         super().__init__(service, lang_out, lang_in, model)
         self.options = {"temperature": 0}  # 随机采样可能会打断公式标记
-        # OPENAI_BASE_URL
-        # OPENAI_API_KEY
         self.client = openai.OpenAI()
 
     def translate(self, text) -> str:
@@ -312,26 +200,22 @@ class OpenAITranslator(BaseTranslator):
 
 
 class AzureTranslator(BaseTranslator):
+    # https://github.com/Azure/azure-sdk-for-python
+    envs = {
+        "AZURE_ENDPOINT": "https://api.translator.azure.cn",
+        "AZURE_APIKEY": None,
+    }
+
     def __init__(self, service, lang_out, lang_in, model):
         lang_out = "zh-Hans" if lang_out == "auto" else lang_out
         lang_in = "en" if lang_in == "auto" else lang_in
         super().__init__(service, lang_out, lang_in, model)
-
-        try:
-            api_key = os.environ["AZURE_APIKEY"]
-            endpoint = os.environ["AZURE_ENDPOINT"]
-            region = os.environ["AZURE_REGION"]
-        except KeyError as e:
-            missing_var = e.args[0]
-            raise ValueError(
-                f"The environment variable '{missing_var}' is required but not set."
-            ) from e
-
+        endpoint = os.environ["AZURE_ENDPOINT"]
+        api_key = os.environ["AZURE_APIKEY"]
         credential = AzureKeyCredential(api_key)
         self.client = TextTranslationClient(
-            endpoint=endpoint, credential=credential, region=region
+            endpoint=endpoint, credential=credential, region="chinaeast2"
         )
-
         # https://github.com/Azure/azure-sdk-for-python/issues/9422
         logger = logging.getLogger("azure.core.pipeline.policies.http_logging_policy")
         logger.setLevel(logging.WARNING)
@@ -342,6 +226,5 @@ class AzureTranslator(BaseTranslator):
             from_language=self.lang_in,
             to_language=[self.lang_out],
         )
-
         translated_text = response[0].translations[0].text
         return translated_text

+ 1 - 0
pyproject.toml

@@ -25,6 +25,7 @@ dependencies = [
     "onnx",
     "onnxruntime",
     "opencv-python-headless",
+    "tencentcloud-sdk-python",
     "pdfminer.six>=20240706",
 ]