# amazon/excel_tool
# Adapted from: https://github.com/Byaidu/PDFMathTranslate/blob/main/pdf2zh/translator.py
import html
import logging
import os
import re
from json import dumps, loads
from dotenv import load_dotenv

import openai
import requests

# Load environment variables at import time (presumably from a .env file via
# python-dotenv — confirm against the deployment setup).
# This call has to stay above the OpenAITranslator definition: that class
# evaluates os.getenv('TRANSLATE_MODEL') as a default argument when it is
# defined, so the variable must already be in the environment by then.
load_dotenv()

class BaseTranslator:
    """Common state shared by the translation backends.

    Holds the service name, the output/input language codes and the model
    name. The ``translate`` method here is only a placeholder that concrete
    translators override.
    """

    def __init__(self, service, lang_out, lang_in, model):
        """Store the constructor arguments unchanged on the instance."""
        self.service, self.lang_out = service, lang_out
        self.lang_in, self.model = lang_in, model

    def translate(self, text) -> str:
        """Placeholder: performs no translation and returns ``None``."""
        return None

    def __str__(self):
        """Return service, output language and input language, space separated."""
        return "{} {} {}".format(self.service, self.lang_out, self.lang_in)


class GoogleTranslator(BaseTranslator):
    """Translator that queries the Google Translate mobile web page.

    The translated text is extracted from the returned HTML with a regular
    expression, so this depends on the page keeping its current markup.
    """

    # Compiled once; captures the text inside the result element of the page.
    _RESULT_RE = re.compile(r'(?s)class="(?:t0|result-container)">(.*?)<')

    def __init__(self, service, lang_out, lang_in, model, timeout=30):
        """Set up the HTTP session and request parameters.

        Args:
            service: Service name stored on the instance.
            lang_out: Target language code; ``"auto"`` is mapped to ``"zh-CN"``.
            lang_in: Source language code; ``"auto"`` is mapped to ``"en"``.
            model: Stored on the instance; not used by this translator.
            timeout: Seconds to wait for the HTTP request before ``requests``
                raises a timeout error. Without a timeout a stalled
                connection would block the caller indefinitely.
        """
        lang_out = "zh-CN" if lang_out == "auto" else lang_out
        lang_in = "en" if lang_in == "auto" else lang_in
        super().__init__(service, lang_out, lang_in, model)
        self.session = requests.Session()
        # NOTE(review): plain http sends the source text unencrypted; consider
        # switching to https after confirming the endpoint behaves the same.
        self.base_link = "http://translate.google.com/m"
        self.headers = {
            "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)"  # noqa: E501
        }
        self.timeout = timeout

    def translate(self, text):
        """Translate ``text`` and return the translated string.

        Only the first 5000 characters of ``text`` are sent.

        Returns:
            The HTML-unescaped translation, or the literal string
            ``"IRREPARABLE TRANSLATION ERROR"`` when the server answers with
            HTTP 400.

        Raises:
            ValueError: If no translation could be found in the response.
            requests.exceptions.RequestException: On network failures,
                including the request timing out.
        """
        text = text[:5000]  # google translate max length
        response = self.session.get(
            self.base_link,
            params={"tl": self.lang_out, "sl": self.lang_in, "q": text},
            headers=self.headers,
            timeout=self.timeout,
        )
        # Check the status first: a 400 reply never needs to be parsed.
        if response.status_code == 400:
            return "IRREPARABLE TRANSLATION ERROR"
        matches = self._RESULT_RE.findall(response.text)
        if not matches:
            raise ValueError("Empty translation result")
        return html.unescape(matches[0])


class OpenAITranslator(BaseTranslator):
    """Translator backed by an OpenAI-compatible chat-completions endpoint.

    The API key and base URL are read from the ``OPENAI_API_KEY`` and
    ``OPENAI_API_BASE`` environment variables when an instance is created.
    """

    # System prompt shared by single and batch requests.
    _SYSTEM_PROMPT = "You are a professional,authentic machine translation engine."
    # One "index: translation" reply line. The full-width colon is accepted as
    # well, in case the model localises the separator for CJK output.
    _INDEXED_LINE = re.compile(r"\s*(\d+)\s*[:：](.*)")

    def __init__(self, service='openai', lang_out='zh-CN', lang_in='auto', model=os.getenv('TRANSLATE_MODEL'), max_tokens=2000):
        """Create the API client and store the translation settings.

        Args:
            service: Service name stored on the instance.
            lang_out: Target language; ``"auto"`` is mapped to ``"zh-CN"``.
            lang_in: Source language; ``"auto"`` is mapped to ``"en"``.
            model: Model name sent with every request. The default is read
                from ``TRANSLATE_MODEL`` once, when the class is defined.
            max_tokens: Upper bound for one batch request. It is compared
                against the character length of the combined request text,
                not against a real token count.
        """
        lang_out = "zh-CN" if lang_out == "auto" else lang_out
        lang_in = "en" if lang_in == "auto" else lang_in
        super().__init__(service, lang_out, lang_in, model)
        # Temperature 0: random sampling could break the formula markers.
        self.options = {"temperature": 0}
        self.max_tokens = max_tokens
        # Configure OpenAI client with environment variables
        self.client = openai.OpenAI(
            api_key=os.getenv('OPENAI_API_KEY'),
            base_url=os.getenv('OPENAI_API_BASE')
        )

    def translate(self, text) -> "str | list":
        """Translate a single string, or a list/tuple of strings.

        Returns:
            A string for a single input; for a list or tuple, a list of
            translations in the same order as the input.
        """
        if isinstance(text, (list, tuple)):
            return self._batch_translate(text)
        return self._single_translate(text)

    def _chat(self, prompt) -> str:
        """Send ``prompt`` as the user message and return the stripped reply."""
        response = self.client.chat.completions.create(
            model=self.model,
            **self.options,
            messages=[
                {
                    "role": "system",
                    "content": self._SYSTEM_PROMPT,
                },
                {
                    "role": "user",
                    "content": prompt,
                },
            ],
        )
        # The content field may be None; treat that as an empty reply rather
        # than failing on ``None.strip()``.
        content = response.choices[0].message.content
        return (content or "").strip()

    def _single_translate(self, text) -> str:
        """Translate one text with a dedicated request."""
        return self._chat(
            f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:"
        )

    def _batch_translate(self, texts) -> list:
        """Translate several texts, using as few requests as the limit allows.

        The texts are sent as ``index: text`` lines. When the combined text is
        longer than ``max_tokens`` characters, the input is split into smaller
        batches that are translated recursively.

        Returns:
            A list with one entry per input text. Entries the model did not
            return stay as empty strings (a warning is logged).
        """
        if not texts:
            # Nothing to translate: avoid a pointless API call.
            return []

        # Prefix every text with its index so the reply can be re-ordered.
        combined_text = "\n".join(f"{i}: {text}" for i, text in enumerate(texts))

        # Guard against a non-positive limit, which would divide by zero below.
        limit = max(1, self.max_tokens)
        if len(combined_text) > limit:
            if len(texts) == 1:
                # A single oversized text cannot be split any further; the old
                # code computed a batch size of 0 here and crashed in range().
                return [self._single_translate(texts[0])]
            chunk_count = len(combined_text) // limit + 1
            # chunk_count >= 2, so every sub-batch is strictly smaller than
            # ``texts`` and the recursion terminates.
            batch_size = max(1, len(texts) // chunk_count)
            results = []
            for start in range(0, len(texts), batch_size):
                results.extend(self._batch_translate(texts[start:start + batch_size]))
            return results

        reply = self._chat(
            f"Translate the following list of texts to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translations in the same order with their original indexes. Each line should be in format 'index: translation'.\nSource Texts:\n{combined_text}\nTranslated Texts:"
        )
        return self._parse_indexed_reply(reply, texts)

    def _parse_indexed_reply(self, reply, texts) -> list:
        """Map ``index: translation`` reply lines back onto input positions."""
        collected = {}  # index -> list of reply lines belonging to that entry
        current = None
        for line in reply.split("\n"):
            match = self._INDEXED_LINE.match(line)
            if match:
                index = int(match.group(1))
                if 0 <= index < len(texts):
                    current = index
                    collected[index] = [match.group(2)]
                else:
                    # Index the model invented; ignore it and what follows.
                    current = None
                continue
            # A line without an index is kept only when the matching source
            # text itself spans several lines; anything else is treated as
            # chatter from the model and dropped.
            if current is not None and "\n" in str(texts[current]):
                collected[current].append(line)

        translations = [""] * len(texts)
        for index, lines in collected.items():
            translations[index] = "\n".join(lines).strip()

        missing = [i for i in range(len(texts)) if i not in collected]
        if missing:
            logging.getLogger(__name__).warning(
                "Batch translation returned no result for indexes %s", missing
            )
        return translations

if __name__ == "__main__":
    # Manual smoke test: every translate() call below goes to the configured API.
    demo = OpenAITranslator("openai", "zh-CN", "en", "gpt-3.5-turbo")

    # Plain sentence.
    plain_source = "Hello world, this is a test translation."
    plain_result = demo.translate(plain_source)
    print(f"Original: {plain_source}")
    print(f"Translated: {plain_result}")

    # Sentence containing a formula.
    formula_source = "The equation $E=mc^2$ is famous."
    formula_result = demo.translate(formula_source)
    print(f"\nOriginal with math: {formula_source}")
    print(f"Translated with math: {formula_result}")

    # List input, which takes the batch translation path.
    fruit_names = ["apple", "banana", "orange", "grape"]
    fruit_results = demo.translate(fruit_names)
    print("\nBatch translation results:")
    for source_word, target_word in zip(fruit_names, fruit_results):
        print(f"{source_word} -> {target_word}")

'''
Usage example:

translator = OpenAITranslator("openai", "zh-CN", "en", "openai/deepseek-chat")
# Single translation
result = translator.translate("Hello world")
# Batch translation
results = translator.translate(["apple", "banana", "orange"])
'''