pdfzh_translator.py 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081
# Adapted from: https://github.com/Byaidu/PDFMathTranslate/blob/main/pdf2zh/translator.py
  2. import html
  3. import logging
  4. import os
  5. import re
  6. from json import dumps, loads
  7. import openai
  8. import requests
  9. class BaseTranslator:
  10. def __init__(self, service, lang_out, lang_in, model):
  11. self.service = service
  12. self.lang_out = lang_out
  13. self.lang_in = lang_in
  14. self.model = model
  15. def translate(self, text) -> str: ... # noqa: E704
  16. def __str__(self):
  17. return f"{self.service} {self.lang_out} {self.lang_in}"
  18. class GoogleTranslator(BaseTranslator):
  19. def __init__(self, service, lang_out, lang_in, model):
  20. lang_out = "zh-CN" if lang_out == "auto" else lang_out
  21. lang_in = "en" if lang_in == "auto" else lang_in
  22. super().__init__(service, lang_out, lang_in, model)
  23. self.session = requests.Session()
  24. self.base_link = "http://translate.google.com/m"
  25. self.headers = {
  26. "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)" # noqa: E501
  27. }
  28. def translate(self, text):
  29. text = text[:5000] # google translate max length
  30. response = self.session.get(
  31. self.base_link,
  32. params={"tl": self.lang_out, "sl": self.lang_in, "q": text},
  33. headers=self.headers,
  34. )
  35. re_result = re.findall(
  36. r'(?s)class="(?:t0|result-container)">(.*?)<', response.text
  37. )
  38. if response.status_code == 400:
  39. result = "IRREPARABLE TRANSLATION ERROR"
  40. elif len(re_result) == 0:
  41. raise ValueError("Empty translation result")
  42. else:
  43. result = html.unescape(re_result[0])
  44. return result
  45. class OpenAITranslator(BaseTranslator):
  46. def __init__(self, service, lang_out, lang_in, model):
  47. lang_out = "zh-CN" if lang_out == "auto" else lang_out
  48. lang_in = "en" if lang_in == "auto" else lang_in
  49. super().__init__(service, lang_out, lang_in, model)
  50. self.options = {"temperature": 0} # 随机采样可能会打断公式标记
  51. # OPENAI_BASE_URL
  52. # OPENAI_API_KEY
  53. self.client = openai.OpenAI()
  54. def translate(self, text) -> str:
  55. response = self.client.chat.completions.create(
  56. model=self.model,
  57. **self.options,
  58. messages=[
  59. {
  60. "role": "system",
  61. "content": "You are a professional,authentic machine translation engine.",
  62. },
  63. {
  64. "role": "user",
  65. "content": f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:", # noqa: E501
  66. },
  67. ],
  68. )
  69. return response.choices[0].message.content.strip()