- # Adapted from: https://github.com/Byaidu/PDFMathTranslate/blob/main/pdf2zh/translator.py
- import html
- import os
- import re
- from dotenv import load_dotenv
- import openai
- import requests
- # Load environment variables
- load_dotenv()
- class BaseTranslator:
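- """Minimal translator interface: stores the service name, language pair and model; subclasses implement translate()."""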
- def __init__(self, service, lang_out, lang_in, model):
- self.service = service
- self.lang_out = lang_out
- self.lang_in = lang_in
- self.model = model
- def translate(self, text) -> str: ... # noqa: E704
- def __str__(self):
- return f"{self.service} {self.lang_out} {self.lang_in}"
- class GoogleTranslator(BaseTranslator):
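- """Free translator that scrapes the mobile Google Translate page (translate.google.com/m)."""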
- def __init__(self, service, lang_out, lang_in, model):
- lang_out = "zh-CN" if lang_out == "auto" else lang_out
- lang_in = "en" if lang_in == "auto" else lang_in
- super().__init__(service, lang_out, lang_in, model)
- self.session = requests.Session()
- self.base_link = "http://translate.google.com/m"
- self.headers = {
- "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)" # noqa: E501
- }
- def translate(self, text):
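- # GET the mobile page and pull the translated text out of the returned HTML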
- text = text[:5000] # google translate max length
- response = self.session.get(
- self.base_link,
- params={"tl": self.lang_out, "sl": self.lang_in, "q": text},
- headers=self.headers,
- )
- re_result = re.findall(
- r'(?s)class="(?:t0|result-container)">(.*?)<', response.text
- )
- if response.status_code == 400:
- result = "IRREPARABLE TRANSLATION ERROR"
- elif len(re_result) == 0:
- raise ValueError("Empty translation result")
- else:
- result = html.unescape(re_result[0])
- return result
- class OpenAITranslator(BaseTranslator):
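- """Translator backed by an OpenAI-compatible chat completions API; reads OPENAI_API_KEY, OPENAI_API_BASE and TRANSLATE_MODEL from the environment."""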
- def __init__(self, service='openai', lang_out='zh-CN', lang_in='auto', model=os.getenv('TRANSLATE_MODEL'), max_tokens=2000):
- lang_out = "zh-CN" if lang_out == "auto" else lang_out
- lang_in = "en" if lang_in == "auto" else lang_in
- super().__init__(service, lang_out, lang_in, model)
- self.options = {"temperature": 0} # random sampling could break formula markers
- self.max_tokens = max_tokens
- # Configure OpenAI client with environment variables
- self.client = openai.OpenAI(
- api_key=os.getenv('OPENAI_API_KEY'),
- base_url=os.getenv('OPENAI_API_BASE')
- )
- def translate(self, text) -> str:
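- # A list is translated in one batched request, a plain string in a single chat call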
- if isinstance(text, list):
- return self._batch_translate(text)
- return self._single_translate(text)
- def _single_translate(self, text) -> str:
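- # One chat completion per string; the prompt asks the model to keep formula notation unchanged and to output only the translation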
- response = self.client.chat.completions.create(
- model=self.model,
- **self.options,
- messages=[
- {
- "role": "system",
- "content": "You are a professional,authentic machine translation engine.",
- },
- {
- "role": "user",
- "content": f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:",
- },
- ],
- )
- return response.choices[0].message.content.strip()
- def _batch_translate(self, texts) -> list:
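- # Batch protocol: send the texts as numbered "index: text" lines, ask the model to echo each index, then realign the reply by index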
- # Number each text as "index: text" so the model can echo the indexes back
- indexed_texts = [f"{i}: {text}" for i, text in enumerate(texts)]
- combined_text = "\n".join(indexed_texts)
-
- # Rough length check: character count as a cheap proxy for tokens
- total_length = len(combined_text)
- if total_length > self.max_tokens and len(texts) > 1:
- # Too long for a single request: split into smaller batches and recurse
- batch_size = max(1, len(texts) // (total_length // self.max_tokens + 1))
- results = []
- for i in range(0, len(texts), batch_size):
- batch = texts[i:i + batch_size]
- results.extend(self._batch_translate(batch))
- return results
- response = self.client.chat.completions.create(
- model=self.model,
- **self.options,
- messages=[
- {
- "role": "system",
- "content": "You are a professional,authentic machine translation engine.",
- },
- {
- "role": "user",
- "content": f"Translate the following list of texts to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translations in the same order with their original indexes. Each line should be in format 'index: translation'.\nSource Texts:\n{combined_text}\nTranslated Texts:",
- },
- ],
- )
-
- # Parse the returned lines and restore the original order by index
- translated_lines = response.choices[0].message.content.strip().split("\n")
- translations = [""] * len(texts)
- for line in translated_lines:
- try:
- index, translation = line.split(":", 1)
- translations[int(index)] = translation.strip()
- except (ValueError, IndexError):
- continue
-
- return translations
- if __name__ == "__main__":
- # Quick translation demo
- translator = OpenAITranslator("openai", "zh-CN", "en", "gpt-3.5-turbo")
-
- # Test plain-text translation
- test_text = "Hello world, this is a test translation."
- translated_text = translator.translate(test_text)
- print(f"Original: {test_text}")
- print(f"Translated: {translated_text}")
-
- # Test text that contains a formula
- math_text = "The equation $E=mc^2$ is famous."
- translated_math = translator.translate(math_text)
- print(f"\nOriginal with math: {math_text}")
- print(f"Translated with math: {translated_math}")
-
- # Test batch translation
- batch_texts = ["apple", "banana", "orange", "grape"]
- translated_batch = translator.translate(batch_texts)
- print("\nBatch translation results:")
- for original, translated in zip(batch_texts, translated_batch):
- print(f"{original} -> {translated}")
- '''
- translator = OpenAITranslator("openai", "zh-CN", "en", "openai/deepseek-chat")
- # Single translation
- result = translator.translate("Hello world")
- # Batch translation
- results = translator.translate(["apple", "banana", "orange"])
- '''