Эх сурвалжийг харах

refactor(main): add translate

Byaidu 1 жил өмнө
parent
commit
95e19e3ec4
3 өөрчлөгдсөн 187 нэмэгдсэн , 189 устгасан
  1. 4 4
      pdf2zh/gui.py
  2. 177 4
      pdf2zh/high_level.py
  3. 6 181
      pdf2zh/pdf2zh.py

+ 4 - 4
pdf2zh/gui.py

@@ -2,7 +2,7 @@ import os
 import shutil
 from pathlib import Path
 from pdf2zh import __version__
-from pdf2zh.pdf2zh import extract_text
+from pdf2zh.high_level import translate
 from pdf2zh.translator import (
     BaseTranslator,
     GoogleTranslator,
@@ -111,7 +111,7 @@ def download_with_limit(url, save_path, size_limit):
     return save_path / filename
 
 
-def translate(
+def translate_file(
     file_type,
     file_input,
     link_input,
@@ -174,7 +174,7 @@ def translate(
         "callback": progress_bar,
     }
     print(param)
-    extract_text(**param)
+    translate(**param)
     print(f"Files after translation: {os.listdir(output)}")
 
     if not file_zh.exists() or not file_dual.exists():
@@ -405,7 +405,7 @@ with gr.Blocks(
     )
 
     translate_btn.click(
-        translate,
+        translate_file,
         inputs=[
             file_type,
             file_input,

+ 177 - 4
pdf2zh/high_level.py

@@ -3,21 +3,77 @@
 from typing import BinaryIO
 import numpy as np
 import tqdm
-from pymupdf import Document
+import sys
+from pymupdf import Font, Document
 from pdfminer.pdfpage import PDFPage
 from pdfminer.pdfinterp import PDFResourceManager
 from pdfminer.pdfdocument import PDFDocument
 from pdfminer.pdfparser import PDFParser
+from pdfminer.pdfexceptions import PDFValueError
 from pdf2zh.converter import TranslateConverter
 from pdf2zh.pdfinterp import PDFPageInterpreterEx
-from pymupdf import Font
+from pdf2zh.doclayout import DocLayoutModel
+from pathlib import Path
+from typing import Any, Container, Iterable, List, Optional
+import urllib.request
+import requests
+import tempfile
+import os
 
+model = DocLayoutModel.load_available()
 
-def extract_text_to_fp(
+resfont_map = {
+    "zh-cn": "china-ss",
+    "zh-tw": "china-ts",
+    "zh-hans": "china-ss",
+    "zh-hant": "china-ts",
+    "zh": "china-ss",
+    "ja": "japan-s",
+    "ko": "korea-s",
+}
+
+noto_list = [
+    "am",  # Amharic
+    "ar",  # Arabic
+    "bn",  # Bengali
+    "bg",  # Bulgarian
+    "chr",  # Cherokee
+    "el",  # Greek
+    "gu",  # Gujarati
+    "iw",  # Hebrew
+    "hi",  # Hindi
+    # "ja",  # Japanese
+    "kn",  # Kannada
+    # "ko",  # Korean
+    "ml",  # Malayalam
+    "mr",  # Marathi
+    "ru",  # Russian
+    "sr",  # Serbian
+    # "zh-cn",# SC
+    "ta",  # Tamil
+    "te",  # Telugu
+    "th",  # Thai
+    # "zh-tw",# TC
+    "ur",  # Urdu
+    "uk",  # Ukrainian
+]
+
+
+def check_files(files: List[str]) -> List[str]:
+    """Return the entries of *files* that are local paths missing on disk.
+
+    Entries beginning with ``http://`` or ``https://`` are treated as online
+    files and are never reported as missing.
+    """
+    remote_prefixes = ("http://", "https://")
+    return [
+        path
+        for path in files
+        if not path.startswith(remote_prefixes) and not os.path.exists(path)
+    ]
+
+
+def translate_patch(
     inf: BinaryIO,
     pages=None,
     password: str = "",
-    debug: bool = False,
     page_count: int = 0,
     vfont: str = "",
     vchar: str = "",
@@ -95,3 +151,120 @@ def extract_text_to_fp(
 
     device.close()
     return obj_patch
+
+
+def translate(
+    files: Iterable[str] = (),
+    pages: Optional[Container[int]] = None,
+    password: str = "",
+    vfont: str = "",
+    vchar: str = "",
+    thread: int = 0,
+    lang_in: str = "",
+    lang_out: str = "",
+    service: str = "",
+    callback: object = None,
+    output: str = "",
+    **kwargs: Any,
+):
+    """Translate each PDF in *files* and write the results into *output*.
+
+    For every input, two files are saved in the *output* directory:
+    ``<name>-zh.pdf`` (the patched document) and ``<name>-dual.pdf`` (the
+    original pages combined with the patched pages). A temporary
+    ``<name>-en.pdf`` is written there as well and removed afterwards.
+
+    Args:
+        files: Local paths or ``http(s)://`` URLs of the PDFs to process.
+        lang_out: Target language code; selects the font that is embedded
+            (CJK built-in font, downloaded Noto font, or ``china-ss``).
+        output: Directory the result files are written to.
+        pages, password, vfont, vchar, thread, lang_in, service, callback,
+        kwargs: Not used directly here; they reach ``translate_patch``
+            through ``**locals()``.
+
+    Raises:
+        PDFValueError: If *files* is empty, a local file does not exist, or
+            a download fails.
+    """
+    # Materialize once: the input is walked twice (existence check + loop).
+    files = list(files)
+    if not files:
+        raise PDFValueError("No files to process.")
+
+    missing_files = check_files(files)
+
+    if missing_files:
+        print("The following files do not exist:", file=sys.stderr)
+        for file in missing_files:
+            print(f"  {file}", file=sys.stderr)
+        raise PDFValueError("Some files do not exist.")
+
+    for file in files:
+        # The previous `file is str` test compared the value with the type
+        # object itself, was always False, and left this branch unreachable.
+        if isinstance(file, str) and file.startswith(("http://", "https://")):
+            print("Online files detected, downloading...")
+            try:
+                r = requests.get(file, allow_redirects=True)
+                if r.status_code == 200:
+                    if not os.path.exists("./pdf2zh_files"):
+                        print("Making a temporary dir for downloading PDF files...")
+                        # dirname("./pdf2zh_files") is ".", so the old
+                        # os.mkdir call targeted the current directory.
+                        os.makedirs("./pdf2zh_files", exist_ok=True)
+                    with open("./pdf2zh_files/tmp_download.pdf", "wb") as f:
+                        print(f"Writing the file: {file}...")
+                        f.write(r.content)
+                    # NOTE(review): every URL is stored under the same name,
+                    # so several URLs in one call produce the same output
+                    # file names.
+                    file = "./pdf2zh_files/tmp_download.pdf"
+                else:
+                    r.raise_for_status()
+            except Exception as e:
+                raise PDFValueError(
+                    f"Errors occur in downloading the PDF file. Please check the link(s).\nError:\n{e}"
+                ) from e
+        filename = os.path.splitext(os.path.basename(file))[0]
+
+        # Pick the fonts to embed: "tiro" always, plus one chosen by lang_out.
+        # `noto` and `resfont` are not read again in this function; they are
+        # forwarded to translate_patch via **locals() below.
+        font_list = [("tiro", None)]
+        noto = None
+        if lang_out.lower() in resfont_map:  # CJK
+            resfont = resfont_map[lang_out.lower()]
+            font_list.append((resfont, None))
+        elif lang_out.lower() in noto_list:  # noto
+            resfont = "noto"
+            ttf_path = os.path.join(tempfile.gettempdir(), "GoNotoKurrent-Regular.ttf")
+            if not os.path.exists(ttf_path):
+                print("Downloading Noto font...")
+                urllib.request.urlretrieve(
+                    "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf",
+                    ttf_path,
+                )
+            font_list.append(("noto", ttf_path))
+            noto = Font("noto", ttf_path)
+        else:  # fallback
+            resfont = "china-ss"
+            font_list.append(("china-ss", None))
+
+        doc_en = Document(file)
+        page_count = doc_en.page_count
+        font_id = {}
+        for page in doc_en:
+            for font in font_list:
+                font_id[font[0]] = page.insert_font(font[0], font[1])
+        xreflen = doc_en.xref_length()
+        for xref in range(1, xreflen):
+            # The font dictionary may sit under Resources/ or directly on
+            # the object (resources that belong to an XObject).
+            for label in ["Resources/", ""]:
+                try:  # reading or writing an xref key may fail
+                    font_res = doc_en.xref_get_key(xref, f"{label}Font")
+                    if font_res[0] == "dict":
+                        for font in font_list:
+                            font_exist = doc_en.xref_get_key(
+                                xref, f"{label}Font/{font[0]}"
+                            )
+                            if font_exist[0] == "null":
+                                doc_en.xref_set_key(
+                                    xref,
+                                    f"{label}Font/{font[0]}",
+                                    f"{font_id[font[0]]} 0 R",
+                                )
+                except Exception:
+                    # Deliberate best effort: skip objects that cannot be
+                    # inspected or patched.
+                    pass
+        doc_en.save(Path(output) / f"{filename}-en.pdf")
+
+        # Every local of this function is forwarded as a keyword argument,
+        # so the local names above are part of the call contract with
+        # translate_patch; renaming them changes what it receives.
+        with open(Path(output) / f"{filename}-en.pdf", "rb") as fp:
+            obj_patch: dict = translate_patch(fp, model=model, **locals())
+
+        # Replace each patched content stream in the in-memory document.
+        for obj_id, ops_new in obj_patch.items():
+            doc_en.update_stream(obj_id, ops_new.encode())
+
+        doc_zh = doc_en
+        doc_dual = Document(Path(output) / f"{filename}-en.pdf")
+        doc_dual.insert_file(doc_zh)
+        # Move each appended page to index id * 2 + 1 -- presumably so it
+        # directly follows its source page; confirm against PyMuPDF docs.
+        for id in range(page_count):
+            doc_dual.move_page(page_count + id, id * 2 + 1)
+        doc_zh.save(Path(output) / f"{filename}-zh.pdf", deflate=1)
+        doc_dual.save(Path(output) / f"{filename}-dual.pdf", deflate=1)
+        doc_zh.close()
+        doc_dual.close()
+        os.remove(Path(output) / f"{filename}-en.pdf")

+ 6 - 181
pdf2zh/pdf2zh.py

@@ -6,185 +6,14 @@ output it to plain text, html, xml or tags.
 from __future__ import annotations
 
 import argparse
-import os
 import sys
 import logging
-from pathlib import Path
-from typing import Any, Container, Iterable, List, Optional
-import urllib.request
-from pdfminer.pdfexceptions import PDFValueError
-
-import pymupdf
-import requests
-import tempfile
-
+from typing import List, Optional
 from pdf2zh import __version__, log
-from pdf2zh.high_level import extract_text_to_fp
-from pdf2zh.doclayout import DocLayoutModel
+from pdf2zh.high_level import translate
 
 logging.basicConfig()
 
-model = DocLayoutModel.load_available()
-
-resfont_map = {
-    "zh-cn": "china-ss",
-    "zh-tw": "china-ts",
-    "zh-hans": "china-ss",
-    "zh-hant": "china-ts",
-    "zh": "china-ss",
-    "ja": "japan-s",
-    "ko": "korea-s",
-}
-noto_list = [
-    "am",  # Amharic
-    "ar",  # Arabic
-    "bn",  # Bengali
-    "bg",  # Bulgarian
-    "chr",  # Cherokee
-    "el",  # Greek
-    "gu",  # Gujarati
-    "iw",  # Hebrew
-    "hi",  # Hindi
-    # "ja",  # Japanese
-    "kn",  # Kannada
-    # "ko",  # Korean
-    "ml",  # Malayalam
-    "mr",  # Marathi
-    "ru",  # Russian
-    "sr",  # Serbian
-    # "zh-cn",# SC
-    "ta",  # Tamil
-    "te",  # Telugu
-    "th",  # Thai
-    # "zh-tw",# TC
-    "ur",  # Urdu
-    "uk",  # Ukrainian
-]
-
-
-def check_files(files: List[str]) -> List[str]:
-    files = [
-        f for f in files if not f.startswith("http://")
-    ]  # exclude online files, http
-    files = [
-        f for f in files if not f.startswith("https://")
-    ]  # exclude online files, https
-    missing_files = [file for file in files if not os.path.exists(file)]
-    return missing_files
-
-
-def extract_text(
-    files: Iterable[str] = [],
-    pages: Optional[Container[int]] = None,
-    password: str = "",
-    debug: bool = False,
-    vfont: str = "",
-    vchar: str = "",
-    thread: int = 0,
-    lang_in: str = "",
-    lang_out: str = "",
-    service: str = "",
-    callback: object = None,
-    output: str = "",
-    **kwargs: Any,
-):
-    if debug:
-        log.setLevel(logging.DEBUG)
-
-    if not files:
-        raise PDFValueError("Must provide files to work upon!")
-
-    for file in files:
-        if file is str and (file.startswith("http://") or file.startswith("https://")):
-            print("Online files detected, downloading...")
-            try:
-                r = requests.get(file, allow_redirects=True)
-                if r.status_code == 200:
-                    if not os.path.exists("./pdf2zh_files"):
-                        print("Making a temporary dir for downloading PDF files...")
-                        os.mkdir(os.path.dirname("./pdf2zh_files"))
-                    with open("./pdf2zh_files/tmp_download.pdf", "wb") as f:
-                        print(f"Writing the file: {file}...")
-                        f.write(r.content)
-                    file = "./pdf2zh_files/tmp_download.pdf"
-                else:
-                    r.raise_for_status()
-            except Exception as e:
-                raise PDFValueError(
-                    f"Errors occur in downloading the PDF file. Please check the link(s).\nError:\n{e}"
-                )
-        filename = os.path.splitext(os.path.basename(file))[0]
-
-        font_list = [("tiro", None)]
-        noto = None
-        if lang_out.lower() in resfont_map:  # CJK
-            resfont = resfont_map[lang_out.lower()]
-            font_list.append((resfont, None))
-        elif lang_out.lower() in noto_list:  # noto
-            resfont = "noto"
-            ttf_path = os.path.join(tempfile.gettempdir(), "GoNotoKurrent-Regular.ttf")
-            if not os.path.exists(ttf_path):
-                print("Downloading Noto font...")
-                urllib.request.urlretrieve(
-                    "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf",
-                    ttf_path,
-                )
-            font_list.append(("noto", ttf_path))
-            noto = pymupdf.Font("noto", ttf_path)
-        else:  # fallback
-            resfont = "china-ss"
-            font_list.append(("china-ss", None))
-
-        doc_en = pymupdf.open(file)
-        page_count = doc_en.page_count
-        # font_list = [("china-ss", None), ("tiro", None)]
-        font_id = {}
-        for page in doc_en:
-            for font in font_list:
-                font_id[font[0]] = page.insert_font(font[0], font[1])
-        xreflen = doc_en.xref_length()
-        for xref in range(1, xreflen):
-            for label in ["Resources/", ""]:  # 可能是基于 xobj 的 res
-                try:  # xref 读写可能出错
-                    font_res = doc_en.xref_get_key(xref, f"{label}Font")
-                    if font_res[0] == "dict":
-                        for font in font_list:
-                            font_exist = doc_en.xref_get_key(
-                                xref, f"{label}Font/{font[0]}"
-                            )
-                            if font_exist[0] == "null":
-                                doc_en.xref_set_key(
-                                    xref,
-                                    f"{label}Font/{font[0]}",
-                                    f"{font_id[font[0]]} 0 R",
-                                )
-                except Exception:
-                    pass
-        doc_en.save(Path(output) / f"{filename}-en.pdf")
-
-        with open(Path(output) / f"{filename}-en.pdf", "rb") as fp:
-            obj_patch: dict = extract_text_to_fp(fp, model=model, **locals())
-
-        for obj_id, ops_new in obj_patch.items():
-            # ops_old=doc_en.xref_stream(obj_id)
-            # print(obj_id)
-            # print(ops_old)
-            # print(ops_new.encode())
-            doc_en.update_stream(obj_id, ops_new.encode())
-
-        doc_zh = doc_en
-        doc_dual = pymupdf.open(Path(output) / f"{filename}-en.pdf")
-        doc_dual.insert_file(doc_zh)
-        for id in range(page_count):
-            doc_dual.move_page(page_count + id, id * 2 + 1)
-        doc_zh.save(Path(output) / f"{filename}-zh.pdf", deflate=1)
-        doc_dual.save(Path(output) / f"{filename}-dual.pdf", deflate=1)
-        doc_zh.close()
-        doc_dual.close()
-        os.remove(Path(output) / f"{filename}-en.pdf")
-
-    return
-
 
 def create_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(description=__doc__, add_help=True)
@@ -308,19 +137,15 @@ def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
 def main(args: Optional[List[str]] = None) -> int:
     parsed_args = parse_args(args)
 
-    missing_files = check_files(parsed_args.files)
-    if missing_files:
-        print("The following files do not exist:", file=sys.stderr)
-        for file in missing_files:
-            print(f"  {file}", file=sys.stderr)
-        return -1
+    if parsed_args.debug:
+        log.setLevel(logging.DEBUG)
+
     if parsed_args.interactive:
         from pdf2zh.gui import setup_gui
-
         setup_gui(parsed_args.share)
         return 0
 
-    extract_text(**vars(parsed_args))
+    translate(**vars(parsed_args))
     return 0