translator.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460
  1. import html
  2. import logging
  3. import os
  4. import re
  5. import unicodedata
  6. import deepl
  7. import ollama
  8. import openai
  9. import requests
  10. from azure.ai.translation.text import TextTranslationClient
  11. from azure.core.credentials import AzureKeyCredential
  12. from tencentcloud.common import credential
  13. from tencentcloud.tmt.v20180321.tmt_client import TmtClient
  14. from tencentcloud.tmt.v20180321.models import TextTranslateRequest
  15. from tencentcloud.tmt.v20180321.models import TextTranslateResponse
  16. import json
  17. def remove_control_characters(s):
  18. return "".join(ch for ch in s if unicodedata.category(ch)[0] != "C")
  19. class BaseTranslator:
  20. name = "base"
  21. envs = {}
  22. lang_map = {}
  23. def __init__(self, lang_in, lang_out, model):
  24. lang_in = self.lang_map.get(lang_in.lower(), lang_in)
  25. lang_out = self.lang_map.get(lang_out.lower(), lang_out)
  26. self.lang_in = lang_in
  27. self.lang_out = lang_out
  28. self.model = model
  29. def translate(self, text):
  30. pass
  31. def prompt(self, text):
  32. return [
  33. {
  34. "role": "system",
  35. "content": "You are a professional,authentic machine translation engine.",
  36. },
  37. {
  38. "role": "user",
  39. "content": f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation {{v*}} unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:", # noqa: E501
  40. },
  41. ]
  42. def __str__(self):
  43. return f"{self.name} {self.lang_in} {self.lang_out} {self.model}"
  44. class GoogleTranslator(BaseTranslator):
  45. name = "google"
  46. lang_map = {"zh": "zh-CN"}
  47. def __init__(self, lang_in, lang_out, model):
  48. super().__init__(lang_in, lang_out, model)
  49. self.session = requests.Session()
  50. self.endpoint = "http://translate.google.com/m"
  51. self.headers = {
  52. "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)" # noqa: E501
  53. }
  54. def translate(self, text):
  55. text = text[:5000] # google translate max length
  56. response = self.session.get(
  57. self.endpoint,
  58. params={"tl": self.lang_out, "sl": self.lang_in, "q": text},
  59. headers=self.headers,
  60. )
  61. re_result = re.findall(
  62. r'(?s)class="(?:t0|result-container)">(.*?)<', response.text
  63. )
  64. if response.status_code == 400:
  65. result = "IRREPARABLE TRANSLATION ERROR"
  66. else:
  67. response.raise_for_status()
  68. result = html.unescape(re_result[0])
  69. return remove_control_characters(result)
  70. class BingTranslator(BaseTranslator):
  71. # https://github.com/immersive-translate/old-immersive-translate/blob/6df13da22664bea2f51efe5db64c63aca59c4e79/src/background/translationService.js
  72. name = "bing"
  73. lang_map = {"zh": "zh-Hans"}
  74. def __init__(self, lang_in, lang_out, model):
  75. super().__init__(lang_in, lang_out, model)
  76. self.session = requests.Session()
  77. self.endpoint = "https://www.bing.com/translator"
  78. self.headers = {
  79. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0", # noqa: E501
  80. }
  81. def findSID(self):
  82. response = self.session.get(self.endpoint)
  83. response.raise_for_status()
  84. url = response.url[:-10]
  85. ig = re.findall(r"\"ig\":\"(.*?)\"", response.text)[0]
  86. iid = re.findall(r"data-iid=\"(.*?)\"", response.text)[-1]
  87. key, token = re.findall(
  88. r"params_AbusePreventionHelper\s=\s\[(.*?),\"(.*?)\",", response.text
  89. )[0]
  90. return url, ig, iid, key, token
  91. def translate(self, text):
  92. text = text[:1000] # bing translate max length
  93. url, ig, iid, key, token = self.findSID()
  94. response = self.session.post(
  95. f"{url}ttranslatev3?IG={ig}&IID={iid}",
  96. data={
  97. "fromLang": self.lang_in,
  98. "to": self.lang_out,
  99. "text": text,
  100. "token": token,
  101. "key": key,
  102. },
  103. headers=self.headers,
  104. )
  105. response.raise_for_status()
  106. return response.json()[0]["translations"][0]["text"]
  107. class DeepLTranslator(BaseTranslator):
  108. # https://github.com/DeepLcom/deepl-python
  109. name = "deepl"
  110. envs = {
  111. "DEEPL_AUTH_KEY": None,
  112. }
  113. lang_map = {"zh": "zh-Hans"}
  114. def __init__(self, lang_in, lang_out, model):
  115. super().__init__(lang_in, lang_out, model)
  116. auth_key = os.getenv("DEEPL_AUTH_KEY")
  117. self.client = deepl.Translator(auth_key)
  118. def translate(self, text):
  119. response = self.client.translate_text(
  120. text, target_lang=self.lang_out, source_lang=self.lang_in
  121. )
  122. return response.text
  123. class DeepLXTranslator(BaseTranslator):
  124. # https://deeplx.owo.network/endpoints/free.html
  125. name = "deeplx"
  126. envs = {
  127. "DEEPLX_ENDPOINT": "https://api.deepl.com/translate",
  128. }
  129. lang_map = {"zh": "zh-Hans"}
  130. def __init__(self, lang_in, lang_out, model):
  131. super().__init__(lang_in, lang_out, model)
  132. self.endpoint = os.getenv("DEEPLX_ENDPOINT", self.envs["DEEPLX_ENDPOINT"])
  133. self.session = requests.Session()
  134. def translate(self, text):
  135. response = self.session.post(
  136. self.endpoint,
  137. json={
  138. "source_lang": self.lang_in,
  139. "target_lang": self.lang_out,
  140. "text": text,
  141. },
  142. )
  143. response.raise_for_status()
  144. return response.json()["data"]
  145. class OllamaTranslator(BaseTranslator):
  146. # https://github.com/ollama/ollama-python
  147. name = "ollama"
  148. envs = {
  149. "OLLAMA_HOST": "http://127.0.0.1:11434",
  150. "OLLAMA_MODEL": "gemma2",
  151. }
  152. def __init__(self, lang_in, lang_out, model):
  153. if not model:
  154. model = os.getenv("OLLAMA_MODEL", self.envs["OLLAMA_MODEL"])
  155. super().__init__(lang_in, lang_out, model)
  156. self.options = {"temperature": 0} # 随机采样可能会打断公式标记
  157. self.client = ollama.Client()
  158. def translate(self, text):
  159. response = self.client.chat(
  160. model=self.model,
  161. options=self.options,
  162. messages=self.prompt(text),
  163. )
  164. return response["message"]["content"].strip()
  165. class OpenAITranslator(BaseTranslator):
  166. # https://github.com/openai/openai-python
  167. name = "openai"
  168. envs = {
  169. "OPENAI_BASE_URL": "https://api.openai.com/v1",
  170. "OPENAI_API_KEY": None,
  171. "OPENAI_MODEL": "gpt-4o-mini",
  172. }
  173. def __init__(self, lang_in, lang_out, model, base_url=None, api_key=None):
  174. if not model:
  175. model = os.getenv("OPENAI_MODEL", self.envs["OPENAI_MODEL"])
  176. super().__init__(lang_in, lang_out, model)
  177. self.options = {"temperature": 0} # 随机采样可能会打断公式标记
  178. self.client = openai.OpenAI(base_url=base_url, api_key=api_key)
  179. def translate(self, text) -> str:
  180. response = self.client.chat.completions.create(
  181. model=self.model,
  182. **self.options,
  183. messages=self.prompt(text),
  184. )
  185. return response.choices[0].message.content.strip()
  186. class AzureOpenAITranslator(BaseTranslator):
  187. name = "azure-openai"
  188. envs = {
  189. "AZURE_OPENAI_BASE_URL": None, # e.g. "https://xxx.openai.azure.com"
  190. "AZURE_OPENAI_API_KEY": None,
  191. "AZURE_OPENAI_MODEL": "gpt-4o-mini",
  192. }
  193. def __init__(self, lang_in, lang_out, model, base_url=None, api_key=None):
  194. base_url = os.getenv(
  195. "AZURE_OPENAI_BASE_URL", self.envs["AZURE_OPENAI_BASE_URL"]
  196. )
  197. if not model:
  198. model = os.getenv("AZURE_OPENAI_MODEL", self.envs["AZURE_OPENAI_MODEL"])
  199. super().__init__(lang_in, lang_out, model)
  200. self.options = {"temperature": 0}
  201. self.client = openai.AzureOpenAI(
  202. azure_endpoint=base_url,
  203. azure_deployment=model,
  204. api_version="2024-06-01",
  205. api_key=api_key,
  206. )
  207. def translate(self, text) -> str:
  208. response = self.client.chat.completions.create(
  209. model=self.model,
  210. **self.options,
  211. messages=self.prompt(text),
  212. )
  213. return response.choices[0].message.content.strip()
  214. class ModelScopeTranslator(OpenAITranslator):
  215. name = "modelscope"
  216. envs = {
  217. "MODELSCOPE_BASE_URL": "https://api-inference.modelscope.cn/v1",
  218. "MODELSCOPE_API_KEY": None,
  219. "MODELSCOPE_MODEL": "Qwen/Qwen2.5-Coder-32B-Instruct",
  220. }
  221. def __init__(self, lang_in, lang_out, model, base_url=None, api_key=None):
  222. base_url = "https://api-inference.modelscope.cn/v1"
  223. api_key = os.getenv("MODELSCOPE_API_KEY")
  224. if not model:
  225. model = os.getenv("MODELSCOPE_MODEL", self.envs["MODELSCOPE_MODEL"])
  226. super().__init__(lang_in, lang_out, model, base_url=base_url, api_key=api_key)
  227. class ZhipuTranslator(OpenAITranslator):
  228. # https://bigmodel.cn/dev/api/thirdparty-frame/openai-sdk
  229. name = "zhipu"
  230. envs = {
  231. "ZHIPU_API_KEY": None,
  232. "ZHIPU_MODEL": "glm-4-flash",
  233. }
  234. def __init__(self, lang_in, lang_out, model):
  235. base_url = "https://open.bigmodel.cn/api/paas/v4"
  236. api_key = os.getenv("ZHIPU_API_KEY")
  237. if not model:
  238. model = os.getenv("ZHIPU_MODEL", self.envs["ZHIPU_MODEL"])
  239. super().__init__(lang_in, lang_out, model, base_url=base_url, api_key=api_key)
  240. def translate(self, text) -> str:
  241. try:
  242. response = self.client.chat.completions.create(
  243. model=self.model,
  244. **self.options,
  245. messages=self.prompt(text),
  246. )
  247. except openai.BadRequestError as e:
  248. if (
  249. json.loads(response.choices[0].message.content.strip())["error"]["code"]
  250. == "1301"
  251. ):
  252. return "IRREPARABLE TRANSLATION ERROR"
  253. raise e
  254. return response.choices[0].message.content.strip()
  255. class SiliconTranslator(OpenAITranslator):
  256. # https://docs.siliconflow.cn/quickstart
  257. name = "silicon"
  258. envs = {
  259. "SILICON_API_KEY": None,
  260. "SILICON_MODEL": "Qwen/Qwen2.5-7B-Instruct",
  261. }
  262. def __init__(self, lang_in, lang_out, model):
  263. base_url = "https://api.siliconflow.cn/v1"
  264. api_key = os.getenv("SILICON_API_KEY")
  265. if not model:
  266. model = os.getenv("SILICON_MODEL", self.envs["SILICON_MODEL"])
  267. super().__init__(lang_in, lang_out, model, base_url=base_url, api_key=api_key)
  268. class GeminiTranslator(OpenAITranslator):
  269. # https://ai.google.dev/gemini-api/docs/openai
  270. name = "gemini"
  271. envs = {
  272. "GEMINI_API_KEY": None,
  273. "GEMINI_MODEL": "gemini-1.5-flash",
  274. }
  275. def __init__(self, lang_in, lang_out, model):
  276. base_url = "https://generativelanguage.googleapis.com/v1beta/openai/"
  277. api_key = os.getenv("GEMINI_API_KEY")
  278. if not model:
  279. model = os.getenv("GEMINI_MODEL", self.envs["GEMINI_MODEL"])
  280. super().__init__(lang_in, lang_out, model, base_url=base_url, api_key=api_key)
  281. class AzureTranslator(BaseTranslator):
  282. # https://github.com/Azure/azure-sdk-for-python
  283. name = "azure"
  284. envs = {
  285. "AZURE_ENDPOINT": "https://api.translator.azure.cn",
  286. "AZURE_API_KEY": None,
  287. }
  288. lang_map = {"zh": "zh-Hans"}
  289. def __init__(self, lang_in, lang_out, model):
  290. super().__init__(lang_in, lang_out, model)
  291. endpoint = os.getenv("AZURE_ENDPOINT", self.envs["AZURE_ENDPOINT"])
  292. api_key = os.getenv("AZURE_API_KEY")
  293. credential = AzureKeyCredential(api_key)
  294. self.client = TextTranslationClient(
  295. endpoint=endpoint, credential=credential, region="chinaeast2"
  296. )
  297. # https://github.com/Azure/azure-sdk-for-python/issues/9422
  298. logger = logging.getLogger("azure.core.pipeline.policies.http_logging_policy")
  299. logger.setLevel(logging.WARNING)
  300. def translate(self, text) -> str:
  301. response = self.client.translate(
  302. body=[text],
  303. from_language=self.lang_in,
  304. to_language=[self.lang_out],
  305. )
  306. translated_text = response[0].translations[0].text
  307. return translated_text
  308. class TencentTranslator(BaseTranslator):
  309. # https://github.com/TencentCloud/tencentcloud-sdk-python
  310. name = "tencent"
  311. envs = {
  312. "TENCENTCLOUD_SECRET_ID": None,
  313. "TENCENTCLOUD_SECRET_KEY": None,
  314. }
  315. def __init__(self, lang_in, lang_out, model):
  316. super().__init__(lang_in, lang_out, model)
  317. cred = credential.DefaultCredentialProvider().get_credential()
  318. self.client = TmtClient(cred, "ap-beijing")
  319. self.req = TextTranslateRequest()
  320. self.req.Source = self.lang_in
  321. self.req.Target = self.lang_out
  322. self.req.ProjectId = 0
  323. def translate(self, text):
  324. self.req.SourceText = text
  325. resp: TextTranslateResponse = self.client.TextTranslate(self.req)
  326. return resp.TargetText
  327. class AnythingLLMTranslator(BaseTranslator):
  328. name = "anythingllm"
  329. envs = {
  330. "AnythingLLM_URL": None,
  331. "AnythingLLM_APIKEY": "api_key",
  332. }
  333. def __init__(self, lang_out, lang_in, model):
  334. super().__init__(lang_out, lang_in, model)
  335. self.api_url = os.getenv("AnythingLLM_URL", self.envs["AnythingLLM_URL"])
  336. self.api_key = os.getenv("AnythingLLM_APIKEY", self.envs["AnythingLLM_APIKEY"])
  337. self.headers = {
  338. "accept": "application/json",
  339. "Authorization": f"Bearer {self.api_key}",
  340. "Content-Type": "application/json",
  341. }
  342. def translate(self, text):
  343. messages = self.prompt(text)
  344. payload = {
  345. "message": messages,
  346. "mode": "chat",
  347. "sessionId": "translation_expert",
  348. }
  349. response = requests.post(
  350. self.api_url, headers=self.headers, data=json.dumps(payload)
  351. )
  352. response.raise_for_status()
  353. data = response.json()
  354. if "textResponse" in data:
  355. return data["textResponse"].strip()
  356. class DifyTranslator(BaseTranslator):
  357. name = "dify"
  358. envs = {
  359. "DIFY_API_URL": None, # 填写实际 Dify API 地址
  360. "DIFY_API_KEY": "api_key", # 替换为实际 API 密钥
  361. }
  362. def __init__(self, lang_out, lang_in, model):
  363. super().__init__(lang_out, lang_in, model)
  364. self.api_url = os.getenv("DIFY_API_URL", self.envs["DIFY_API_URL"])
  365. self.api_key = os.getenv("DIFY_API_KEY", self.envs["DIFY_API_KEY"])
  366. def translate(self, text):
  367. headers = {
  368. "Authorization": f"Bearer {self.api_key}",
  369. "Content-Type": "application/json",
  370. }
  371. payload = {
  372. "inputs": {
  373. "lang_out": self.lang_out,
  374. "lang_in": self.lang_in,
  375. "text": text,
  376. },
  377. "response_mode": "blocking",
  378. "user": "translator-service",
  379. }
  380. # 向 Dify 服务器发送请求
  381. response = requests.post(
  382. self.api_url, headers=headers, data=json.dumps(payload)
  383. )
  384. response.raise_for_status()
  385. response_data = response.json()
  386. # 解析响应
  387. return response_data.get("data", {}).get("outputs", {}).get("text", [])