| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384 |
- # Adapted from: https://github.com/Byaidu/PDFMathTranslate/blob/main/pdf2zh/translator.py
- import html
- import logging
- import os
- import re
- from json import dumps, loads
- from dotenv import load_dotenv
- import openai
- import requests
- # Load environment variables
- load_dotenv()
class BaseTranslator:
    """Shared state for the translation backends defined in this module.

    Holds the backend label, the language pair and the model name.
    Subclasses are expected to override :meth:`translate`.
    """

    def __init__(self, service, lang_out, lang_in, model):
        """Store the service label, target/source languages and model name."""
        self.service, self.model = service, model
        self.lang_out, self.lang_in = lang_out, lang_in

    def translate(self, text) -> str:
        """Placeholder translation hook.

        The base implementation does no work and returns ``None``;
        concrete backends override it to return the translated string.
        """
        return None

    def __str__(self):
        """Return ``"<service> <lang_out> <lang_in>"``."""
        return "{} {} {}".format(self.service, self.lang_out, self.lang_in)
class GoogleTranslator(BaseTranslator):
    """Translator backed by the Google Translate mobile web page.

    The page at ``base_link`` is fetched with the text as a query
    parameter and the translation is extracted from the returned HTML.
    A language given as ``"auto"`` is replaced by ``"zh-CN"`` (target)
    or ``"en"`` (source).
    """

    # Seconds to wait for the server. Without a timeout, ``requests`` can
    # block indefinitely on a stalled connection.
    timeout = 30

    def __init__(self, service, lang_out, lang_in, model):
        """Resolve ``"auto"`` languages and prepare the HTTP session.

        Args:
            service: Backend label stored on the instance.
            lang_out: Target language code; ``"auto"`` becomes ``"zh-CN"``.
            lang_in: Source language code; ``"auto"`` becomes ``"en"``.
            model: Stored on the instance; not used by this backend.
        """
        lang_out = "zh-CN" if lang_out == "auto" else lang_out
        lang_in = "en" if lang_in == "auto" else lang_in
        super().__init__(service, lang_out, lang_in, model)
        self.session = requests.Session()
        # NOTE(review): plain HTTP endpoint kept as in the original; the
        # text to translate travels unencrypted - consider https.
        self.base_link = "http://translate.google.com/m"
        self.headers = {
            "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)"  # noqa: E501
        }

    def translate(self, text):
        """Translate *text* and return the HTML-unescaped result.

        Args:
            text: Source text; only the first 5000 characters are sent.

        Returns:
            The translated string, or the literal
            ``"IRREPARABLE TRANSLATION ERROR"`` when the server answers
            with HTTP 400.

        Raises:
            ValueError: The response contains no translation container.
            requests.RequestException: The request failed or exceeded
                ``self.timeout`` seconds.
        """
        text = text[:5000]  # google translate max length
        response = self.session.get(
            self.base_link,
            params={"tl": self.lang_out, "sl": self.lang_in, "q": text},
            headers=self.headers,
            timeout=self.timeout,
        )
        # Check the status first: a 400 page is never worth scanning.
        if response.status_code == 400:
            return "IRREPARABLE TRANSLATION ERROR"
        # Only the first container is used, so stop at the first match.
        match = re.search(
            r'(?s)class="(?:t0|result-container)">(.*?)<', response.text
        )
        if match is None:
            raise ValueError("Empty translation result")
        return html.unescape(match.group(1))
class OpenAITranslator(BaseTranslator):
    """Translator that sends the text to an OpenAI chat-completions model.

    A language given as ``"auto"`` is replaced by ``"zh-CN"`` (target)
    or ``"en"`` (source). Only the target language appears in the
    prompt; ``lang_in`` is stored but not sent.
    """

    def __init__(self, service, lang_out, lang_in, model):
        """Resolve ``"auto"`` languages and build the OpenAI client.

        Args:
            service: Backend label stored on the instance.
            lang_out: Target language code; ``"auto"`` becomes ``"zh-CN"``.
            lang_in: Source language code; ``"auto"`` becomes ``"en"``.
            model: Model name used when ``LLM_MODEL`` is not set in the
                environment.
        """
        lang_out = "zh-CN" if lang_out == "auto" else lang_out
        lang_in = "en" if lang_in == "auto" else lang_in
        super().__init__(service, lang_out, lang_in, model)
        # Random sampling may break the formula markers, so sample greedily.
        self.options = {"temperature": 0}
        # The API key is read from the OPENAI_API_KEY environment variable.
        self.client = openai.OpenAI(
            api_key=os.getenv("OPENAI_API_KEY")
        )

    def translate(self, text) -> str:
        """Translate *text* into ``self.lang_out`` and return it stripped.

        The model name is taken from the ``LLM_MODEL`` environment
        variable when set, otherwise from ``self.model``.

        Raises:
            ValueError: The response has no choices or the first choice
                carries no text content.
        """
        prompt = (
            f"Translate the following markdown source text to {self.lang_out}. "
            "Keep the formula notation $v*$ unchanged. "
            "Output translation directly without any additional text.\n"
            f"Source Text: {text}\n"
            "Translated Text:"
        )
        response = self.client.chat.completions.create(
            model=os.getenv("LLM_MODEL", self.model),  # env var wins over self.model
            **self.options,
            messages=[
                {
                    "role": "system",
                    "content": "You are a professional,authentic machine translation engine.",
                },
                {"role": "user", "content": prompt},
            ],
        )
        # ``content`` may be None and ``choices`` may be empty; fail with
        # a clear error instead of an AttributeError/IndexError.
        choices = response.choices
        content = choices[0].message.content if choices else None
        if content is None:
            raise ValueError("Empty translation result")
        return content.strip()
|