pdfzh_translator.py 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165
# Adapted from: https://github.com/Byaidu/PDFMathTranslate/blob/main/pdf2zh/translator.py
  2. import html
  3. import logging
  4. import os
  5. import re
  6. from json import dumps, loads
  7. from dotenv import load_dotenv
  8. import openai
  9. import requests
# Load variables from a .env file (if one is found) into the process
# environment so the os.getenv() lookups in this module can see them.
load_dotenv()
  12. class BaseTranslator:
  13. def __init__(self, service, lang_out, lang_in, model):
  14. self.service = service
  15. self.lang_out = lang_out
  16. self.lang_in = lang_in
  17. self.model = model
  18. def translate(self, text) -> str: ... # noqa: E704
  19. def __str__(self):
  20. return f"{self.service} {self.lang_out} {self.lang_in}"
  21. class GoogleTranslator(BaseTranslator):
  22. def __init__(self, service, lang_out, lang_in, model):
  23. lang_out = "zh-CN" if lang_out == "auto" else lang_out
  24. lang_in = "en" if lang_in == "auto" else lang_in
  25. super().__init__(service, lang_out, lang_in, model)
  26. self.session = requests.Session()
  27. self.base_link = "http://translate.google.com/m"
  28. self.headers = {
  29. "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)" # noqa: E501
  30. }
  31. def translate(self, text):
  32. text = text[:5000] # google translate max length
  33. response = self.session.get(
  34. self.base_link,
  35. params={"tl": self.lang_out, "sl": self.lang_in, "q": text},
  36. headers=self.headers,
  37. )
  38. re_result = re.findall(
  39. r'(?s)class="(?:t0|result-container)">(.*?)<', response.text
  40. )
  41. if response.status_code == 400:
  42. result = "IRREPARABLE TRANSLATION ERROR"
  43. elif len(re_result) == 0:
  44. raise ValueError("Empty translation result")
  45. else:
  46. result = html.unescape(re_result[0])
  47. return result
  48. class OpenAITranslator(BaseTranslator):
  49. def __init__(self, service='opeanai', lang_out='zh-CN', lang_in='auto', model=os.getenv('TRANSLATE_MODEL'), max_tokens=2000):
  50. lang_out = "zh-CN" if lang_out == "auto" else lang_out
  51. lang_in = "en" if lang_in == "auto" else lang_in
  52. super().__init__(service, lang_out, lang_in, model)
  53. self.options = {"temperature": 0} # 随机采样可能会打断公式标记
  54. self.max_tokens = max_tokens
  55. # Configure OpenAI client with environment variables
  56. self.client = openai.OpenAI(
  57. api_key=os.getenv('OPENAI_API_KEY'),
  58. base_url=os.getenv('OPENAI_API_BASE')
  59. )
  60. def translate(self, text) -> str:
  61. if isinstance(text, list):
  62. return self._batch_translate(text)
  63. return self._single_translate(text)
  64. def _single_translate(self, text) -> str:
  65. response = self.client.chat.completions.create(
  66. model=self.model,
  67. **self.options,
  68. messages=[
  69. {
  70. "role": "system",
  71. "content": "You are a professional,authentic machine translation engine.",
  72. },
  73. {
  74. "role": "user",
  75. "content": f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:",
  76. },
  77. ],
  78. )
  79. return response.choices[0].message.content.strip()
  80. def _batch_translate(self, texts) -> list:
  81. # 将文本列表转换为带索引的格式
  82. indexed_texts = [f"{i}: {text}" for i, text in enumerate(texts)]
  83. combined_text = "\n".join(indexed_texts)
  84. # 计算总token数并分块处理
  85. total_length = len(combined_text)
  86. if total_length > self.max_tokens:
  87. # 如果超过最大token数,分成多个批次处理
  88. batch_size = len(texts) // (total_length // self.max_tokens + 1)
  89. results = []
  90. for i in range(0, len(texts), batch_size):
  91. batch = texts[i:i + batch_size]
  92. results.extend(self._batch_translate(batch))
  93. return results
  94. response = self.client.chat.completions.create(
  95. model=self.model,
  96. **self.options,
  97. messages=[
  98. {
  99. "role": "system",
  100. "content": "You are a professional,authentic machine translation engine.",
  101. },
  102. {
  103. "role": "user",
  104. "content": f"Translate the following list of texts to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translations in the same order with their original indexes. Each line should be in format 'index: translation'.\nSource Texts:\n{combined_text}\nTranslated Texts:",
  105. },
  106. ],
  107. )
  108. # 解析返回结果并保持顺序
  109. translated_lines = response.choices[0].message.content.strip().split("\n")
  110. translations = [""] * len(texts)
  111. for line in translated_lines:
  112. try:
  113. index, translation = line.split(":", 1)
  114. translations[int(index)] = translation.strip()
  115. except (ValueError, IndexError):
  116. continue
  117. return translations
  118. if __name__ == "__main__":
  119. # 测试翻译示例
  120. translator = OpenAITranslator("openai", "zh-CN", "en", "gpt-3.5-turbo")
  121. # 测试简单文本翻译
  122. test_text = "Hello world, this is a test translation."
  123. translated_text = translator.translate(test_text)
  124. print(f"Original: {test_text}")
  125. print(f"Translated: {translated_text}")
  126. # 测试包含公式的文本
  127. math_text = "The equation $E=mc^2$ is famous."
  128. translated_math = translator.translate(math_text)
  129. print(f"\nOriginal with math: {math_text}")
  130. print(f"Translated with math: {translated_math}")
  131. # 测试批量翻译
  132. batch_texts = ["apple", "banana", "orange", "grape"]
  133. translated_batch = translator.translate(batch_texts)
  134. print("\nBatch translation results:")
  135. for original, translated in zip(batch_texts, translated_batch):
  136. print(f"{original} -> {translated}")
'''
translator = OpenAITranslator("openai", "zh-CN", "en", "openai/deepseek-chat")
# Translate a single string
result = translator.translate("Hello world")
# Translate a list of strings as one batch
results = translator.translate(["apple", "banana", "orange"])
'''