Byaidu cách đây 1 năm
mục cha
commit
b8d5854bb0
5 tập tin đã thay đổi với 33 bổ sung và 3 xóa
  1. 6 0
      README.md
  2. 1 1
      pdf2zh/__init__.py
  3. 6 2
      pdf2zh/converter.py
  4. 4 0
      pdf2zh/high_level.py
  5. 16 0
      pdf2zh/pdf2zh.py

+ 6 - 0
README.md

@@ -40,6 +40,12 @@ pdf2zh example.pdf
 pdf2zh example.pdf -p 1-3,5
 ```
 
+### Translate with the specified language
+
+```bash
+pdf2zh example.pdf -li en -lo ja
+```
+
 ### Use regex to specify formula fonts and characters that need to be preserved
 
 Hint: Starting from `\ufb00` is English style ligature.

+ 1 - 1
pdf2zh/__init__.py

@@ -1,2 +1,2 @@
-__version__ = "1.3.2"
+__version__ = "1.3.3"
 __author__ = "Byaidu"

+ 6 - 2
pdf2zh/converter.py

@@ -347,6 +347,8 @@ class TextConverter(PDFConverter[AnyIO]):
         vchar: str = None,
         thread: int = 0,
         layout = {},
+        lang_in: str = "",
+        lang_out: str = "",
     ) -> None:
         super().__init__(rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
         self.showpageno = showpageno
@@ -355,6 +357,8 @@ class TextConverter(PDFConverter[AnyIO]):
         self.vchar = vchar
         self.thread = thread
         self.layout = layout
+        self.lang_in = lang_in
+        self.lang_out = lang_out
 
     def write_text(self, text: str) -> None:
         text = utils.compatible_encode_method(text, self.codec, "ignore")
@@ -520,10 +524,10 @@ class TextConverter(PDFConverter[AnyIO]):
             def worker(s): # 多线程翻译
                 try:
                     if sum(map(str.islower,s))>1: # 包含小写字母
-                        hash_key_paragraph = cache.deterministic_hash(s)
+                        hash_key_paragraph = cache.deterministic_hash((s,self.lang_in,self.lang_out))
                         new = cache.load_paragraph(hash_key, hash_key_paragraph) # 查询缓存
                         if new is None:
-                            new=translator.translate(s,'zh-CN','en')
+                            new=translator.translate(s,self.lang_out,self.lang_in)
                             new=remove_control_characters(new)
                             cache.write_paragraph(hash_key, hash_key_paragraph, new)
                     else:

+ 4 - 0
pdf2zh/high_level.py

@@ -46,6 +46,8 @@ def extract_text_to_fp(
     thread: int = 0,
     doc_en = None,
     model = None,
+    lang_in: str = "",
+    lang_out: str = "",
     **kwargs: Any,
 ) -> None:
     """Parses text from inf-file and writes to outfp file-like object.
@@ -102,6 +104,8 @@ def extract_text_to_fp(
             vchar=vchar,
             thread=thread,
             layout=layout,
+            lang_in=lang_in,
+            lang_out=lang_out,
         )
 
     elif output_type == "xml":

+ 16 - 0
pdf2zh/pdf2zh.py

@@ -51,6 +51,8 @@ def extract_text(
     vfont: str = "",
     vchar: str = "",
     thread: int = 0,
+    lang_in: str = "",
+    lang_out: str = "",
     **kwargs: Any,
 ) -> AnyIO:
     if not files:
@@ -152,6 +154,20 @@ def create_parser() -> argparse.ArgumentParser:
         default="",
         help="The regex to math character of formula.",
     )
+    parse_params.add_argument(
+        "--lang-in",
+        "-li",
+        type=str,
+        default="en",
+        help="The code of source language.",
+    )
+    parse_params.add_argument(
+        "--lang-out",
+        "-lo",
+        type=str,
+        default="zh-CN",
+        help="The code of target language.",
+    )
     parse_params.add_argument(
         "--thread",
         "-t",