# Adapted from: https://github.com/Byaidu/PDFMathTranslate/blob/main/pdf2zh/translator.py
"""Translation backends: Google Translate (mobile web page) and OpenAI-compatible chat APIs.

Example usage::

    translator = OpenAITranslator("openai", "zh-CN", "en", "openai/deepseek-chat")
    # Single translation
    result = translator.translate("Hello world")
    # Batch translation
    results = translator.translate(["apple", "banana", "orange"])
"""
import html
import logging
import os
import re
from json import dumps, loads
from typing import List, Union

from dotenv import load_dotenv
import openai
import requests

# Load environment variables
load_dotenv()

logger = logging.getLogger(__name__)


class BaseTranslator:
    """Hold the settings shared by every translation backend."""

    def __init__(self, service, lang_out, lang_in, model):
        self.service = service
        self.lang_out = lang_out
        self.lang_in = lang_in
        self.model = model

    def translate(self, text) -> str:
        """Translate *text*; backends override this (the base version returns None)."""
        ...

    def __str__(self):
        return f"{self.service} {self.lang_out} {self.lang_in}"


class GoogleTranslator(BaseTranslator):
    """Translate by scraping the result out of the Google Translate mobile page."""

    # Google Translate max length (characters); longer input is truncated.
    MAX_QUERY_CHARS = 5000
    # Seconds to wait for the HTTP response; without a timeout requests may block forever.
    REQUEST_TIMEOUT = 30
    # Captures the text inside the result element of the returned HTML.
    _RESULT_PATTERN = re.compile(r'(?s)class="(?:t0|result-container)">(.*?)<')

    def __init__(self, service, lang_out, lang_in, model):
        """Create the backend; ``"auto"`` maps to zh-CN (output) / en (input)."""
        lang_out = "zh-CN" if lang_out == "auto" else lang_out
        lang_in = "en" if lang_in == "auto" else lang_in
        super().__init__(service, lang_out, lang_in, model)
        self.session = requests.Session()
        # HTTPS so the text being translated is not sent in clear text.
        self.base_link = "https://translate.google.com/m"
        self.headers = {
            "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)"  # noqa: E501
        }

    def translate(self, text):
        """Return the translation of *text* (truncated to ``MAX_QUERY_CHARS``).

        Returns the literal ``"IRREPARABLE TRANSLATION ERROR"`` on HTTP 400.

        Raises:
            ValueError: if no translation could be found in the response.
        """
        text = text[: self.MAX_QUERY_CHARS]
        response = self.session.get(
            self.base_link,
            params={"tl": self.lang_out, "sl": self.lang_in, "q": text},
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        if response.status_code == 400:
            return "IRREPARABLE TRANSLATION ERROR"
        matches = self._RESULT_PATTERN.findall(response.text)
        if not matches:
            raise ValueError("Empty translation result")
        return html.unescape(matches[0])


class OpenAITranslator(BaseTranslator):
    """Translate single strings or lists of strings through a chat-completions API."""

    _SYSTEM_PROMPT = "You are a professional,authentic machine translation engine."
    # "<index><colon><translation>"; the full-width colon is accepted as well
    # because replies in Chinese frequently use it instead of ":".
    _INDEXED_LINE = re.compile(r"\s*(\d+)\s*[:：](.*)")

    def __init__(self, service='openai', lang_out='zh-CN', lang_in='auto',
                 model=None, max_tokens=2000):
        """Create the backend and its API client.

        Args:
            service: Label used by ``__str__``.
            lang_out: Target language; ``"auto"`` maps to ``"zh-CN"``.
            lang_in: Source language; ``"auto"`` maps to ``"en"``.
            model: Model name. ``None`` falls back to the ``LLM_MODEL``
                environment variable, read when the instance is created
                (not once at import time).
            max_tokens: Size limit for one batch request. It is compared with
                the *character* length of the combined batch text, not with a
                real token count.
        """
        lang_out = "zh-CN" if lang_out == "auto" else lang_out
        lang_in = "en" if lang_in == "auto" else lang_in
        if model is None:
            model = os.getenv('LLM_MODEL')
        super().__init__(service, lang_out, lang_in, model)
        self.options = {"temperature": 0}  # random sampling may break formula markers
        self.max_tokens = max_tokens
        # Configure OpenAI client with environment variables
        self.client = openai.OpenAI(
            api_key=os.getenv('OPENAI_API_KEY'),
            base_url=os.getenv('OPENAI_API_BASE'),
        )

    def translate(self, text) -> Union[str, List[str]]:
        """Translate a string (returns ``str``) or a list of strings (returns ``list``)."""
        if isinstance(text, list):
            return self._batch_translate(text)
        return self._single_translate(text)

    def _complete(self, prompt: str) -> str:
        """Send *prompt* as the user message and return the stripped reply text."""
        response = self.client.chat.completions.create(
            model=self.model,
            **self.options,
            messages=[
                {"role": "system", "content": self._SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
        )
        # The API may return no content at all; treat that as an empty reply.
        content = response.choices[0].message.content
        return (content or "").strip()

    def _single_translate(self, text) -> str:
        """Translate one piece of markdown text."""
        prompt = (
            f"Translate the following markdown source text to {self.lang_out}. "
            "Keep the formula notation $v*$ unchanged. "
            "Output translation directly without any additional text.\n"
            f"Source Text: {text}\nTranslated Text:"
        )
        return self._complete(prompt)

    def _batch_translate(self, texts) -> list:
        """Translate a list of texts, preserving order.

        The texts are sent as ``index: text`` lines in one request. When the
        combined text is longer than ``self.max_tokens`` characters the list is
        split into smaller batches that are translated recursively.

        Returns:
            A list with one entry per input text. An entry is ``""`` when the
            reply contained no parsable line for that index.
        """
        if not texts:
            return []

        # Convert the text list into an indexed format.
        combined_text = "\n".join(f"{i}: {text}" for i, text in enumerate(texts))

        # Split into several batches when over the size limit. A single text
        # cannot be split any further, so it is sent as is even when too long.
        total_length = len(combined_text)
        if total_length > self.max_tokens and len(texts) > 1:
            # Clamp to 1: the integer division yields 0 when there are fewer
            # texts than required batches, and range() rejects a zero step.
            batch_size = max(1, len(texts) // (total_length // self.max_tokens + 1))
            results = []
            for start in range(0, len(texts), batch_size):
                results.extend(self._batch_translate(texts[start:start + batch_size]))
            return results

        prompt = (
            f"Translate the following list of texts to {self.lang_out}. "
            "Keep the formula notation $v*$ unchanged. "
            "Output translations in the same order with their original indexes. "
            "Each line should be in format 'index: translation'.\n"
            f"Source Texts:\n{combined_text}\nTranslated Texts:"
        )
        reply = self._complete(prompt)

        # Parse the reply and put every translation back at its original position.
        translations = [""] * len(texts)
        seen = set()
        for line in reply.split("\n"):
            match = self._INDEXED_LINE.match(line)
            if match is None:
                if line.strip():
                    logger.debug("Ignoring unparsable batch reply line: %r", line)
                continue
            index = int(match.group(1))
            if index >= len(texts):
                logger.warning("Ignoring out-of-range index %d in batch reply", index)
                continue
            translations[index] = match.group(2).strip()
            seen.add(index)

        missing = [i for i in range(len(texts)) if i not in seen]
        if missing:
            logger.warning("No translation returned for batch indexes %s", missing)
        return translations


if __name__ == "__main__":
    # Translation demo
    translator = OpenAITranslator("openai", "zh-CN", "en", "gpt-3.5-turbo")

    # Plain text translation
    test_text = "Hello world, this is a test translation."
    translated_text = translator.translate(test_text)
    print(f"Original: {test_text}")
    print(f"Translated: {translated_text}")

    # Text containing a formula
    math_text = "The equation $E=mc^2$ is famous."
    translated_math = translator.translate(math_text)
    print(f"\nOriginal with math: {math_text}")
    print(f"Translated with math: {translated_math}")

    # Batch translation
    batch_texts = ["apple", "banana", "orange", "grape"]
    translated_batch = translator.translate(batch_texts)
    print("\nBatch translation results:")
    for original, translated in zip(batch_texts, translated_batch):
        print(f"{original} -> {translated}")