# pdfzh_translator.py
#
# Adapted from: https://github.com/Byaidu/PDFMathTranslate/blob/main/pdf2zh/translator.py
  2. import html
  3. import logging
  4. import os
  5. import re
  6. from json import dumps, loads
  7. from dotenv import load_dotenv
  8. import openai
  9. import requests
  10. # Load environment variables
  11. load_dotenv()
  12. class BaseTranslator:
  13. def __init__(self, service, lang_out, lang_in, model):
  14. self.service = service
  15. self.lang_out = lang_out
  16. self.lang_in = lang_in
  17. self.model = model
  18. def translate(self, text) -> str: ... # noqa: E704
  19. def __str__(self):
  20. return f"{self.service} {self.lang_out} {self.lang_in}"
  21. class GoogleTranslator(BaseTranslator):
  22. def __init__(self, service, lang_out, lang_in, model):
  23. lang_out = "zh-CN" if lang_out == "auto" else lang_out
  24. lang_in = "en" if lang_in == "auto" else lang_in
  25. super().__init__(service, lang_out, lang_in, model)
  26. self.session = requests.Session()
  27. self.base_link = "http://translate.google.com/m"
  28. self.headers = {
  29. "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)" # noqa: E501
  30. }
  31. def translate(self, text):
  32. text = text[:5000] # google translate max length
  33. response = self.session.get(
  34. self.base_link,
  35. params={"tl": self.lang_out, "sl": self.lang_in, "q": text},
  36. headers=self.headers,
  37. )
  38. re_result = re.findall(
  39. r'(?s)class="(?:t0|result-container)">(.*?)<', response.text
  40. )
  41. if response.status_code == 400:
  42. result = "IRREPARABLE TRANSLATION ERROR"
  43. elif len(re_result) == 0:
  44. raise ValueError("Empty translation result")
  45. else:
  46. result = html.unescape(re_result[0])
  47. return result
  48. class OpenAITranslator(BaseTranslator):
  49. def __init__(self, service, lang_out, lang_in, model):
  50. lang_out = "zh-CN" if lang_out == "auto" else lang_out
  51. lang_in = "en" if lang_in == "auto" else lang_in
  52. super().__init__(service, lang_out, lang_in, model)
  53. self.options = {"temperature": 0} # 随机采样可能会打断公式标记
  54. # Configure OpenAI client with environment variables
  55. self.client = openai.OpenAI(
  56. api_key=os.getenv('OPENAI_API_KEY')
  57. )
  58. def translate(self, text) -> str:
  59. response = self.client.chat.completions.create(
  60. model=os.getenv('LLM_MODEL', self.model), # Use env var or fallback to default
  61. **self.options,
  62. messages=[
  63. {
  64. "role": "system",
  65. "content": "You are a professional,authentic machine translation engine.",
  66. },
  67. {
  68. "role": "user",
  69. "content": f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:", # noqa: E501
  70. },
  71. ],
  72. )
  73. return response.choices[0].message.content.strip()