Browse Source

Merge pull request #127 from hellofinch/main

Byaidu 1 year ago
parent
commit
98cd503078
4 changed files with 128 additions and 1 deletions
  1. 11 0
      README.md
  2. 13 0
      README_zh-CN.md
  3. 5 0
      pdf2zh/converter.py
  4. 99 1
      pdf2zh/translator.py

+ 11 - 0
README.md

@@ -239,6 +239,17 @@ pdf2zh example.pdf -li en -lo ja
   ```bash
   pdf2zh example.pdf -s azure
   ```
+- **Tencent Machine Translation**
+
+  See [Tencent Machine Translation](https://www.tencentcloud.com/products/tmt?from_qcintl=122110104)
+
+  The following environment variables are required:
+  - `TENCENT_SECRET_ID`, e.g., `export TENCENT_SECRET_ID=AKIDxxx`
+  - `TENCENT_SECRET_KEY`, e.g., `export TENCENT_SECRET_KEY=xxx`
+
+  ```bash
+  pdf2zh example.pdf -s tencent
+  ```
 
 <h3 id="exceptions">Translate with exceptions</h3>
 

+ 13 - 0
README_zh-CN.md

@@ -245,6 +245,19 @@ pdf2zh example.pdf -s openai:gpt-4o
 pdf2zh example.pdf -s azure
 ```
 
+- **腾讯机器翻译**
+
+参考 [腾讯机器翻译](https://cloud.tencent.com/product/tmt)
+
+需设置以下环境变量:
+
+- `TENCENT_SECRET_ID`, e.g., `export TENCENT_SECRET_ID=AKIDxxx`
+- `TENCENT_SECRET_KEY`, e.g., `export TENCENT_SECRET_KEY=xxx`
+
+```bash
+pdf2zh example.pdf -s tencent
+```
+
 <h3 id="exceptions">指定例外规则</h3>
 
 使用正则表达式指定需保留的公式字体与字符

+ 5 - 0
pdf2zh/converter.py

@@ -69,6 +69,7 @@ from pdf2zh.translator import (
     OllamaTranslator,
     OpenAITranslator,
     AzureTranslator,
+    TencentTranslator,
 )
 
 
@@ -394,6 +395,10 @@ class TextConverter(PDFConverter[AnyIO]):
             self.translator: BaseTranslator = AzureTranslator(
                 service, lang_out, lang_in, None
             )
+        elif param[0] == "tencent":
+            self.translator: BaseTranslator = TencentTranslator(
+                service, lang_out, lang_in, None
+            )
         else:
             raise ValueError("Unsupported translation service")
 

+ 99 - 1
pdf2zh/translator.py

@@ -11,6 +11,10 @@ import requests
 from azure.ai.translation.text import TextTranslationClient
 from azure.core.credentials import AzureKeyCredential
 
+import hmac
+import hashlib
+import time
+from datetime import datetime,UTC
 
 class BaseTranslator:
     def __init__(self, service, lang_out, lang_in, model):
@@ -54,6 +58,96 @@ class GoogleTranslator(BaseTranslator):
             result = html.unescape(re_result[0])
         return result
 
+class TencentTranslator(BaseTranslator):
+    def sign(self,key, msg):
+        return hmac.new(key, msg.encode("utf-8"), hashlib.sha256).digest()
+
+    def __init__(self, service, lang_out, lang_in, model):
+        lang_out = "zh" if lang_out == "auto" else lang_out
+        lang_in = "en" if lang_in == "auto" else lang_in
+        super().__init__(service, lang_out, lang_in, model)
+        try:
+            server_url = (
+                "tmt.tencentcloudapi.com"
+            )
+            self.secret_id = os.getenv("TENCENT_SECRET_ID")
+            self.secret_key = os.getenv("TENCENT_SECRET_KEY")
+            
+        except KeyError as e:
+            missing_var = e.args[0]
+            raise ValueError(
+                f"The environment variable '{missing_var}' is required but not set."
+            ) from e
+
+        self.session = requests.Session()
+        self.base_link = f"{server_url}"
+
+    def translate(self, text):
+        text = text[:5000]
+        data={
+            "SourceText":text,
+            "Source":self.lang_in,
+            "Target":self.lang_out,
+            "ProjectId":0
+        }
+        payloadx = dumps(data)
+        hashed_request_payload = hashlib.sha256(payloadx.encode("utf-8")).hexdigest()
+        canonical_request = ("POST" + "\n" +
+                            "/" + "\n" +
+                            "" + "\n" +
+                            "content-type:application/json; charset=utf-8\nhost:tmt.tencentcloudapi.com\nx-tc-action:texttranslate\n" + "\n" +
+                            "content-type;host;x-tc-action" + "\n" +
+                            hashed_request_payload)
+
+        timestamp = int(time.time())
+        date = datetime.fromtimestamp(timestamp, UTC).strftime("%Y-%m-%d")
+        credential_scope = date + "/tmt/tc3_request"
+        hashed_canonical_request = hashlib.sha256(canonical_request.encode("utf-8")).hexdigest()
+        algorithm = "TC3-HMAC-SHA256"
+        string_to_sign = (algorithm + "\n" +
+                        str(timestamp) + "\n" +
+                        credential_scope + "\n" +
+                        hashed_canonical_request)
+        secret_date = self.sign(("TC3" + self.secret_key).encode("utf-8"), date)
+        secret_service = self.sign(secret_date, "tmt")
+        secret_signing = self.sign(secret_service, "tc3_request")
+        signed_headers = "content-type;host;x-tc-action"
+        signature = hmac.new(secret_signing, string_to_sign.encode("utf-8"), hashlib.sha256).hexdigest()
+        authorization = (algorithm + " " +
+                 "Credential=" + self.secret_id + "/" + credential_scope + ", " +
+                 "SignedHeaders=" + signed_headers + ", " +
+                 "Signature=" + signature)
+        self.headers = {
+            "Authorization": authorization,
+            "Content-Type": "application/json; charset=utf-8",
+            "Host": "tmt.tencentcloudapi.com",
+            "X-TC-Action": "TextTranslate",
+            "X-TC-Region":"ap-beijing",
+            "X-TC-Timestamp": str(timestamp),
+            "X-TC-Version": "2018-03-21"
+        }
+
+        response = self.session.post(
+            "https://"+self.base_link,
+            json=data,
+            headers=self.headers,
+        )
+        # 1. Status code test
+        if response.status_code == 200:
+            result = loads(response.text)
+        else:
+            raise ValueError("HTTP error: " + str(response.status_code))
+        # 2. Result test
+        try:
+            result = result['Response']['TargetText']
+            return result
+        except KeyError:
+            result = ""
+            raise ValueError("No valid key in Tencent's response")
+        # 3. Result length check
+        if len(result) == 0:
+            raise ValueError("Empty translation result")
+        return result
 
 class DeepLXTranslator(BaseTranslator):
     def __init__(self, service, lang_out, lang_in, model):
@@ -74,7 +168,11 @@ class DeepLXTranslator(BaseTranslator):
             ) from e
 
         self.session = requests.Session()
-        self.base_link = f"{server_url}/{auth_key}/translate"
+        server_url=server_url.rstrip('/')
+        if auth_key:
+            self.base_link = f"{server_url}/{auth_key}/translate"
+        else:
+            self.base_link = f"{server_url}/translate"
         self.headers = {
             "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)"  # noqa: E501
         }