Răsfoiți Sursa

refactor: translate stream

Byaidu 1 an în urmă
părinte
comite
8ebaaa9984
8 fișiere modificate, cu 136 adăugări și 132 ștergeri
  1. 1 1
      README.md
  2. 1 1
      README_zh-CN.md
  3. 2 0
      pdf2zh/__init__.py
  4. 6 6
      pdf2zh/gui.py
  5. 92 74
      pdf2zh/high_level.py
  6. 2 2
      pdf2zh/pdf2zh.py
  7. 32 22
      tools/backend.py
  8. 0 26
      tools/tasks.py

+ 1 - 1
README.md

@@ -144,7 +144,7 @@ For docker deployment on cloud service:
 
 <h2 id="usage">Advanced Options</h2>
 
-Execute the translation command in the command line to generate the translated document `example-zh.pdf` and the bilingual document `example-dual.pdf` in the current working directory. Use Google as the default translation service.
+Execute the translation command in the command line to generate the translated document `example-mono.pdf` and the bilingual document `example-dual.pdf` in the current working directory. Use Google as the default translation service.
 
 <img src="./docs/images/cmd.explained.png" width="580px"  alt="cmd"/>  
 

+ 1 - 1
README_zh-CN.md

@@ -144,7 +144,7 @@
 
 <h2 id="usage">高级选项</h2>
 
-在命令行中执行翻译命令,在当前工作目录下生成译文文档 `example-zh.pdf` 和双语对照文档 `example-dual.pdf`,默认使用 Google 翻译服务
+在命令行中执行翻译命令,在当前工作目录下生成译文文档 `example-mono.pdf` 和双语对照文档 `example-dual.pdf`,默认使用 Google 翻译服务
 
 <img src="./docs/images/cmd.explained.png" width="580px"  alt="cmd"/>  
 

+ 2 - 0
pdf2zh/__init__.py

@@ -1,6 +1,8 @@
 import logging
+from pdf2zh.high_level import translate, translate_stream
 
 log = logging.getLogger(__name__)
 
 __version__ = "1.8.7"
 __author__ = "Byaidu"
+__all__ = ["translate", "translate_stream"]

+ 6 - 6
pdf2zh/gui.py

@@ -146,8 +146,8 @@ def translate_file(
         )
 
     filename = os.path.splitext(os.path.basename(file_path))[0]
-    file_en = output / f"{filename}.pdf"
-    file_zh = output / f"{filename}-zh.pdf"
+    file_raw = output / f"{filename}.pdf"
+    file_mono = output / f"{filename}-mono.pdf"
     file_dual = output / f"{filename}-dual.pdf"
 
     translator = service_map[service]
@@ -164,7 +164,7 @@ def translate_file(
         progress(t.n / t.total, desc="Translating...")
 
     param = {
-        "files": [file_en],
+        "files": [file_raw],
         "pages": selected_page,
         "lang_in": lang_from,
         "lang_out": lang_to,
@@ -177,18 +177,18 @@ def translate_file(
     translate(**param)
     print(f"Files after translation: {os.listdir(output)}")
 
-    if not file_zh.exists() or not file_dual.exists():
+    if not file_mono.exists() or not file_dual.exists():
         raise gr.Error("No output")
 
     try:
-        translated_preview = pdf_preview(str(file_zh))
+        translated_preview = pdf_preview(str(file_mono))
     except Exception:
         raise gr.Error("No preview")
 
     progress(1.0, desc="Translation complete!")
 
     return (
-        str(file_zh),
+        str(file_mono),
         translated_preview,
         str(file_dual),
         gr.update(visible=True),

+ 92 - 74
pdf2zh/high_level.py

@@ -14,7 +14,7 @@ from pdf2zh.converter import TranslateConverter
 from pdf2zh.pdfinterp import PDFPageInterpreterEx
 from pdf2zh.doclayout import DocLayoutModel
 from pathlib import Path
-from typing import Any, Container, Iterable, List, Optional
+from typing import Any, Iterable, List
 import urllib.request
 import requests
 import tempfile
@@ -75,7 +75,6 @@ def translate_patch(
     inf: BinaryIO,
     pages=None,
     password: str = "",
-    page_count: int = 0,
     vfont: str = "",
     vchar: str = "",
     thread: int = 0,
@@ -86,7 +85,7 @@ def translate_patch(
     resfont: str = "",
     noto: Font = None,
     callback: object = None,
-    **kwarg,
+    **kwarg: Any,
 ) -> None:
     rsrcmgr = PDFResourceManager()
     layout = {}
@@ -100,7 +99,7 @@ def translate_patch(
     if pages:
         total_pages = len(pages)
     else:
-        total_pages = page_count
+        total_pages = doc_zh.page_count
 
     parser = PDFParser(inf)
     doc = PDFDocument(parser, password=password)
@@ -153,9 +152,89 @@ def translate_patch(
     return obj_patch
 
 
+def translate_stream(
+    stream,
+    pages=None,
+    password: str = "",
+    vfont: str = "",
+    vchar: str = "",
+    thread: int = 0,
+    doc_zh: Document = None,
+    lang_in: str = "",
+    lang_out: str = "",
+    service: str = "",
+    callback: object = None,
+    **kwarg: Any,
+):
+    font_list = [("tiro", None)]
+    noto = None
+    if lang_out.lower() in resfont_map:  # CJK
+        resfont = resfont_map[lang_out.lower()]
+        font_list.append((resfont, None))
+    elif lang_out.lower() in noto_list:  # noto
+        resfont = "noto"
+        ttf_path = os.path.join(tempfile.gettempdir(), "GoNotoKurrent-Regular.ttf")
+        if not os.path.exists(ttf_path):
+            print("Downloading Noto font...")
+            urllib.request.urlretrieve(
+                "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf",
+                ttf_path,
+            )
+        font_list.append(("noto", ttf_path))
+        noto = Font("noto", ttf_path)
+    else:  # fallback
+        resfont = "china-ss"
+        font_list.append(("china-ss", None))
+
+    doc_en = Document(stream=stream)
+    if doc_en.is_encrypted:
+        doc_en.authenticate(password)
+    doc_zh = Document(stream=stream)
+    page_count = doc_zh.page_count
+    # font_list = [("china-ss", None), ("tiro", None)]
+    font_id = {}
+    for page in doc_zh:
+        for font in font_list:
+            font_id[font[0]] = page.insert_font(font[0], font[1])
+    xreflen = doc_zh.xref_length()
+    for xref in range(1, xreflen):
+        for label in ["Resources/", ""]:  # 可能是基于 xobj 的 res
+            try:  # xref 读写可能出错
+                font_res = doc_zh.xref_get_key(xref, f"{label}Font")
+                if font_res[0] == "dict":
+                    for font in font_list:
+                        font_exist = doc_zh.xref_get_key(xref, f"{label}Font/{font[0]}")
+                        if font_exist[0] == "null":
+                            doc_zh.xref_set_key(
+                                xref,
+                                f"{label}Font/{font[0]}",
+                                f"{font_id[font[0]]} 0 R",
+                            )
+            except Exception:
+                pass
+
+    fp = io.BytesIO()
+    doc_zh.save(fp)
+    obj_patch: dict = translate_patch(fp, **locals())
+
+    for obj_id, ops_new in obj_patch.items():
+        # ops_old=doc_en.xref_stream(obj_id)
+        # print(obj_id)
+        # print(ops_old)
+        # print(ops_new.encode())
+        doc_zh.update_stream(obj_id, ops_new.encode())
+
+    doc_en.insert_file(doc_zh)
+    for id in range(page_count):
+        doc_en.move_page(page_count + id, id * 2 + 1)
+
+    return doc_zh.write(deflate=1), doc_en.write(deflate=1)
+
+
 def translate(
     files: Iterable[str] = [],
-    pages: Optional[Container[int]] = None,
+    output: str = "",
+    pages=None,
     password: str = "",
     vfont: str = "",
     vchar: str = "",
@@ -164,8 +243,7 @@ def translate(
     lang_out: str = "",
     service: str = "",
     callback: object = None,
-    output: str = "",
-    **kwargs: Any,
+    **kwarg: Any,
 ):
     if not files:
         raise PDFValueError("No files to process.")
@@ -199,72 +277,12 @@ def translate(
                 )
         filename = os.path.splitext(os.path.basename(file))[0]
 
-        font_list = [("tiro", None)]
-        noto = None
-        if lang_out.lower() in resfont_map:  # CJK
-            resfont = resfont_map[lang_out.lower()]
-            font_list.append((resfont, None))
-        elif lang_out.lower() in noto_list:  # noto
-            resfont = "noto"
-            ttf_path = os.path.join(tempfile.gettempdir(), "GoNotoKurrent-Regular.ttf")
-            if not os.path.exists(ttf_path):
-                print("Downloading Noto font...")
-                urllib.request.urlretrieve(
-                    "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf",
-                    ttf_path,
-                )
-            font_list.append(("noto", ttf_path))
-            noto = Font("noto", ttf_path)
-        else:  # fallback
-            resfont = "china-ss"
-            font_list.append(("china-ss", None))
-
-        doc_en = Document(file)
-        if doc_en.is_encrypted:
-            doc_en.authenticate(password)
-        doc_zh = Document(doc_en)
-        page_count = doc_zh.page_count
-        # font_list = [("china-ss", None), ("tiro", None)]
-        font_id = {}
-        for page in doc_zh:
-            for font in font_list:
-                font_id[font[0]] = page.insert_font(font[0], font[1])
-        xreflen = doc_zh.xref_length()
-        for xref in range(1, xreflen):
-            for label in ["Resources/", ""]:  # 可能是基于 xobj 的 res
-                try:  # xref 读写可能出错
-                    font_res = doc_zh.xref_get_key(xref, f"{label}Font")
-                    if font_res[0] == "dict":
-                        for font in font_list:
-                            font_exist = doc_zh.xref_get_key(
-                                xref, f"{label}Font/{font[0]}"
-                            )
-                            if font_exist[0] == "null":
-                                doc_zh.xref_set_key(
-                                    xref,
-                                    f"{label}Font/{font[0]}",
-                                    f"{font_id[font[0]]} 0 R",
-                                )
-                except Exception:
-                    pass
-
-        fp = io.BytesIO()
-        doc_zh.save(fp)
-        obj_patch: dict = translate_patch(fp, **locals())
-
-        for obj_id, ops_new in obj_patch.items():
-            # ops_old=doc_en.xref_stream(obj_id)
-            # print(obj_id)
-            # print(ops_old)
-            # print(ops_new.encode())
-            doc_zh.update_stream(obj_id, ops_new.encode())
-
-        doc_en.insert_file(doc_zh)
-        for id in range(page_count):
-            doc_en.move_page(page_count + id, id * 2 + 1)
-        doc_zh.save(Path(output) / f"{filename}-zh.pdf", deflate=1)
-        doc_en.save(Path(output) / f"{filename}-dual.pdf", deflate=1)
-        doc_zh.close()
-        doc_en.close()
+        doc_raw = open(file, "rb")
+        s_raw = doc_raw.read()
+        s_mono, s_dual = translate_stream(s_raw, **locals())
+        doc_mono = open(Path(output) / f"{filename}-mono.pdf", "wb")
+        doc_dual = open(Path(output) / f"{filename}-dual.pdf", "wb")
+        doc_mono.write(s_mono)
+        doc_dual.write(s_dual)
 
     return

+ 2 - 2
pdf2zh/pdf2zh.py

@@ -12,8 +12,6 @@ from typing import List, Optional
 from pdf2zh import __version__, log
 from pdf2zh.high_level import translate
 
-logging.basicConfig()
-
 
 def create_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(description=__doc__, add_help=True)
@@ -135,6 +133,8 @@ def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
 
 
 def main(args: Optional[List[str]] = None) -> int:
+    logging.basicConfig()
+
     parsed_args = parse_args(args)
 
     if parsed_args.debug:

+ 32 - 22
tools/backend.py

@@ -1,12 +1,8 @@
 import os
-import tempfile
-
 from flask import Flask, request, send_file
 from celery import Celery, Task
-from celery.result import AsyncResult
-from pathlib import Path
-from tasks import translate_task
-
+from pdf2zh import translate_stream
+import tqdm
 
 app = Flask("pdf2zh")
 app.config.from_mapping(
@@ -36,28 +32,42 @@ def celery_init_app(app: Flask) -> Celery:
 celery_app = celery_init_app(app)
 
 
+@app.task(bind=True)
+def translate_task(
+    stream: bytes,
+    lang_in: str = "",
+    lang_out: str = "",
+    service: str = "",
+):
+    def progress_bar(t: tqdm.tqdm):
+        self.update_state(state="PROGRESS", meta={"n": t.n, "total": t.total})  # noqa
+        print(f"Translating {t.n} / {t.total} pages")
+
+    doc_mono, doc_dual = translate_stream(
+        stream,
+        lang_in=lang_in,
+        lang_out=lang_out,
+        service=service,
+        thread=4,
+        callback=progress_bar,
+    )
+    return doc_mono, doc_dual
+
+
 @app.route("/api/translate", methods=["POST"])
 def create_translate_tasks():
-    f = request.files["source"]
-    output_dir = Path(tempfile.mkdtemp())
-    file_basename = ".".join(f.filename.split(".")[:-1])
-    if len(file_basename) == 0:
-        file_basename = "input"
-    origin_pdf = output_dir / f"{file_basename}.pdf"
-    f.save(origin_pdf)
-    lang_in = request.args.get("lang_in", "auto")
+    stream = request.files["file"]
+    lang_in = request.args.get("lang_in", "en")
     lang_out = request.args.get("lang_out", "zh")
     service = request.args.get("service", "google")
-    task = translate_task.delay(
-        str(output_dir), file_basename, lang_in, lang_out, service
-    )
-    return {"result_id": task.id}
+    task = translate_task.delay(stream, lang_in, lang_out, service)
+    return {"id": task.id}
 
 
 @app.route("/api/results/<id>", methods=["GET"])
 def check_translate_result(id: str):
-    result = AsyncResult(id)
-    return {"ready": result.ready(), "successful": result.successful()}
+    result = celery_app.AsyncResult(id)
+    return {"state": result.state, "info": result.info}
 
 
 @app.route("/api/results/<id>/<format>")
@@ -67,8 +77,8 @@ def get_translate_result(id: str, format: str):
         return {"error": "task not finished"}, 400
     if not result.successful():
         return {"error": "task failed"}, 400
-    translated_pdf, dual_pdf = result.get()
-    to_send = translated_pdf if format == "translated" else dual_pdf
+    doc_mono, doc_dual = result.get()
+    to_send = doc_mono if format == "mono" else doc_dual
     return send_file(to_send, "application/pdf")
 
 

+ 0 - 26
tools/tasks.py

@@ -1,26 +0,0 @@
-from celery import shared_task
-from pathlib import Path
-from pdf2zh.pdf2zh import translate
-
-
-@shared_task(ignore_result=False)
-def translate_task(
-    output_dir: str,
-    filename: str,
-    lang_in: str = "auto",
-    lang_out: str = "zh",
-    service: str = "google",
-):
-    output_dir = Path(output_dir)
-    origin_pdf = output_dir / f"{filename}.pdf"
-    translated_pdf = output_dir / f"{filename}-zh.pdf"
-    dual_pdf = output_dir / f"{filename}-dual.pdf"
-    translate(
-        files=[str(origin_pdf)],
-        lang_in=lang_in,
-        lang_out=lang_out,
-        service=service,
-        thread=4,
-        output=str(output_dir)
-    )
-    return str(translated_pdf), str(dual_pdf)