1 жил өмнө · 95e19e3ec4
--- a/pdf2zh/gui.py
+++ b/pdf2zh/gui.py
@@ -2,7 +2,7 @@ import os
 
				 import shutil
			
 
				 from pathlib import Path
			
 
				 from pdf2zh import __version__
			
 
				-from pdf2zh.pdf2zh import extract_text
			
 
				+from pdf2zh.high_level import translate
			
 
				 from pdf2zh.translator import (
			
 
				     BaseTranslator,
			
 
				     GoogleTranslator,
			
@@ -111,7 +111,7 @@ def download_with_limit(url, save_path, size_limit):
 
				     return save_path / filename
			
 
				 
			
 
				 
			
 
				-def translate(
			
 
				+def translate_file(
			
 
				     file_type,
			
 
				     file_input,
			
 
				     link_input,
			
@@ -174,7 +174,7 @@ def translate(
 
				         "callback": progress_bar,
			
 
				     }
			
 
				     print(param)
			
 
				-    extract_text(**param)
			
 
				+    translate(**param)
			
 
				     print(f"Files after translation: {os.listdir(output)}")
			
 
				 
			
 
				     if not file_zh.exists() or not file_dual.exists():
			
@@ -405,7 +405,7 @@ with gr.Blocks(
 
				     )
			
 
				 
			
 
				     translate_btn.click(
			
 
				-        translate,
			
 
				+        translate_file,
			
 
				         inputs=[
			
 
				             file_type,
			
 
				             file_input,
			
--- a/pdf2zh/high_level.py
+++ b/pdf2zh/high_level.py
@@ -3,21 +3,77 @@
 
				 from typing import BinaryIO
			
 
				 import numpy as np
			
 
				 import tqdm
			
 
				-from pymupdf import Document
			
 
				+import sys
			
 
				+from pymupdf import Font, Document
			
 
				 from pdfminer.pdfpage import PDFPage
			
 
				 from pdfminer.pdfinterp import PDFResourceManager
			
 
				 from pdfminer.pdfdocument import PDFDocument
			
 
				 from pdfminer.pdfparser import PDFParser
			
 
				+from pdfminer.pdfexceptions import PDFValueError
			
 
				 from pdf2zh.converter import TranslateConverter
			
 
				 from pdf2zh.pdfinterp import PDFPageInterpreterEx
			
 
				-from pymupdf import Font
			
 
				+from pdf2zh.doclayout import DocLayoutModel
			
 
				+from pathlib import Path
			
 
				+from typing import Any, Container, Iterable, List, Optional
			
 
				+import urllib.request
			
 
				+import requests
			
 
				+import tempfile
			
 
				+import os
			
 
				 
			
 
# Layout model instance created once, at import time, by
# DocLayoutModel.load_available(); translate() forwards this same object to
# translate_patch() as the `model` keyword argument.
model = DocLayoutModel.load_available()
			
 
				 
			
 
				-def extract_text_to_fp(
			
 
# Maps a lower-cased output language code to the font name that translate()
# adds to its font list for that language. These fonts are inserted into each
# page by name only (no font file path is supplied for them).
resfont_map = {
    "zh-cn": "china-ss",
    "zh-tw": "china-ts",
    "zh-hans": "china-ss",
    "zh-hant": "china-ts",
    "zh": "china-ss",
    "ja": "japan-s",
    "ko": "korea-s",
}
			
 
				+
			
 
# Output language codes for which translate() embeds the downloaded
# GoNotoKurrent-Regular.ttf font under the name "noto". A code that is in
# neither resfont_map nor this list falls back to "china-ss" in translate().
# The commented-out entries are languages that resfont_map already handles.
noto_list = [
    "am",  # Amharic
    "ar",  # Arabic
    "bn",  # Bengali
    "bg",  # Bulgarian
    "chr",  # Cherokee
    "el",  # Greek
    "gu",  # Gujarati
    "iw",  # Hebrew
    "hi",  # Hindi
    # "ja",  # Japanese
    "kn",  # Kannada
    # "ko",  # Korean
    "ml",  # Malayalam
    "mr",  # Marathi
    "ru",  # Russian
    "sr",  # Serbian
    # "zh-cn",# SC
    "ta",  # Tamil
    "te",  # Telugu
    "th",  # Thai
    # "zh-tw",# TC
    "ur",  # Urdu
    "uk",  # Ukrainian
]
			
 
				+
			
 
				+
			
 
				+def check_files(files: List[str]) -> List[str]:
			
 
				+    files = [
			
 
				+        f for f in files if not f.startswith("http://")
			
 
				+    ]  # exclude online files, http
			
 
				+    files = [
			
 
				+        f for f in files if not f.startswith("https://")
			
 
				+    ]  # exclude online files, https
			
 
				+    missing_files = [file for file in files if not os.path.exists(file)]
			
 
				+    return missing_files
			
 
				+
			
 
				+
			
 
				+def translate_patch(
			
 
				     inf: BinaryIO,
			
 
				     pages=None,
			
 
				     password: str = "",
			
 
				-    debug: bool = False,
			
 
				     page_count: int = 0,
			
 
				     vfont: str = "",
			
 
				     vchar: str = "",
			
@@ -95,3 +151,120 @@ def extract_text_to_fp(
 
				 
			
 
				     device.close()
			
 
				     return obj_patch
			
 
				+
			
 
				+
			
 
				+def translate(
			
 
				+    files: Iterable[str] = [],
			
 
				+    pages: Optional[Container[int]] = None,
			
 
				+    password: str = "",
			
 
				+    vfont: str = "",
			
 
				+    vchar: str = "",
			
 
				+    thread: int = 0,
			
 
				+    lang_in: str = "",
			
 
				+    lang_out: str = "",
			
 
				+    service: str = "",
			
 
				+    callback: object = None,
			
 
				+    output: str = "",
			
 
				+    **kwargs: Any,
			
 
				+):
			
 
				+    if not files:
			
 
				+        raise PDFValueError("No files to process.")
			
 
				+
			
 
				+    missing_files = check_files(files)
			
 
				+
			
 
				+    if missing_files:
			
 
				+        print("The following files do not exist:", file=sys.stderr)
			
 
				+        for file in missing_files:
			
 
				+            print(f"  {file}", file=sys.stderr)
			
 
				+        raise PDFValueError("Some files do not exist.")
			
 
				+
			
 
				+    for file in files:
			
 
				+        if file is str and (file.startswith("http://") or file.startswith("https://")):
			
 
				+            print("Online files detected, downloading...")
			
 
				+            try:
			
 
				+                r = requests.get(file, allow_redirects=True)
			
 
				+                if r.status_code == 200:
			
 
				+                    if not os.path.exists("./pdf2zh_files"):
			
 
				+                        print("Making a temporary dir for downloading PDF files...")
			
 
				+                        os.mkdir(os.path.dirname("./pdf2zh_files"))
			
 
				+                    with open("./pdf2zh_files/tmp_download.pdf", "wb") as f:
			
 
				+                        print(f"Writing the file: {file}...")
			
 
				+                        f.write(r.content)
			
 
				+                    file = "./pdf2zh_files/tmp_download.pdf"
			
 
				+                else:
			
 
				+                    r.raise_for_status()
			
 
				+            except Exception as e:
			
 
				+                raise PDFValueError(
			
 
				+                    f"Errors occur in downloading the PDF file. Please check the link(s).\nError:\n{e}"
			
 
				+                )
			
 
				+        filename = os.path.splitext(os.path.basename(file))[0]
			
 
				+
			
 
				+        font_list = [("tiro", None)]
			
 
				+        noto = None
			
 
				+        if lang_out.lower() in resfont_map:  # CJK
			
 
				+            resfont = resfont_map[lang_out.lower()]
			
 
				+            font_list.append((resfont, None))
			
 
				+        elif lang_out.lower() in noto_list:  # noto
			
 
				+            resfont = "noto"
			
 
				+            ttf_path = os.path.join(tempfile.gettempdir(), "GoNotoKurrent-Regular.ttf")
			
 
				+            if not os.path.exists(ttf_path):
			
 
				+                print("Downloading Noto font...")
			
 
				+                urllib.request.urlretrieve(
			
 
				+                    "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf",
			
 
				+                    ttf_path,
			
 
				+                )
			
 
				+            font_list.append(("noto", ttf_path))
			
 
				+            noto = Font("noto", ttf_path)
			
 
				+        else:  # fallback
			
 
				+            resfont = "china-ss"
			
 
				+            font_list.append(("china-ss", None))
			
 
				+
			
 
				+        doc_en = Document(file)
			
 
				+        page_count = doc_en.page_count
			
 
				+        # font_list = [("china-ss", None), ("tiro", None)]
			
 
				+        font_id = {}
			
 
				+        for page in doc_en:
			
 
				+            for font in font_list:
			
 
				+                font_id[font[0]] = page.insert_font(font[0], font[1])
			
 
				+        xreflen = doc_en.xref_length()
			
 
				+        for xref in range(1, xreflen):
			
 
				+            for label in ["Resources/", ""]:  # 可能是基于 xobj 的 res
			
 
				+                try:  # xref 读写可能出错
			
 
				+                    font_res = doc_en.xref_get_key(xref, f"{label}Font")
			
 
				+                    if font_res[0] == "dict":
			
 
				+                        for font in font_list:
			
 
				+                            font_exist = doc_en.xref_get_key(
			
 
				+                                xref, f"{label}Font/{font[0]}"
			
 
				+                            )
			
 
				+                            if font_exist[0] == "null":
			
 
				+                                doc_en.xref_set_key(
			
 
				+                                    xref,
			
 
				+                                    f"{label}Font/{font[0]}",
			
 
				+                                    f"{font_id[font[0]]} 0 R",
			
 
				+                                )
			
 
				+                except Exception:
			
 
				+                    pass
			
 
				+        doc_en.save(Path(output) / f"{filename}-en.pdf")
			
 
				+
			
 
				+        with open(Path(output) / f"{filename}-en.pdf", "rb") as fp:
			
 
				+            obj_patch: dict = translate_patch(fp, model=model, **locals())
			
 
				+
			
 
				+        for obj_id, ops_new in obj_patch.items():
			
 
				+            # ops_old=doc_en.xref_stream(obj_id)
			
 
				+            # print(obj_id)
			
 
				+            # print(ops_old)
			
 
				+            # print(ops_new.encode())
			
 
				+            doc_en.update_stream(obj_id, ops_new.encode())
			
 
				+
			
 
				+        doc_zh = doc_en
			
 
				+        doc_dual = Document(Path(output) / f"{filename}-en.pdf")
			
 
				+        doc_dual.insert_file(doc_zh)
			
 
				+        for id in range(page_count):
			
 
				+            doc_dual.move_page(page_count + id, id * 2 + 1)
			
 
				+        doc_zh.save(Path(output) / f"{filename}-zh.pdf", deflate=1)
			
 
				+        doc_dual.save(Path(output) / f"{filename}-dual.pdf", deflate=1)
			
 
				+        doc_zh.close()
			
 
				+        doc_dual.close()
			
 
				+        os.remove(Path(output) / f"{filename}-en.pdf")
			
 
				+
			
 
				+    return
			
--- a/pdf2zh/pdf2zh.py
+++ b/pdf2zh/pdf2zh.py
@@ -6,185 +6,14 @@ output it to plain text, html, xml or tags.
 
				 from __future__ import annotations
			
 
				 
			
 
				 import argparse
			
 
				-import os
			
 
				 import sys
			
 
				 import logging
			
 
				-from pathlib import Path
			
 
				-from typing import Any, Container, Iterable, List, Optional
			
 
				-import urllib.request
			
 
				-from pdfminer.pdfexceptions import PDFValueError
			
 
				-
			
 
				-import pymupdf
			
 
				-import requests
			
 
				-import tempfile
			
 
				-
			
 
				+from typing import List, Optional
			
 
				 from pdf2zh import __version__, log
			
 
				-from pdf2zh.high_level import extract_text_to_fp
			
 
				-from pdf2zh.doclayout import DocLayoutModel
			
 
				+from pdf2zh.high_level import translate
			
 
				 
			
 
				 logging.basicConfig()
			
 
				 
			
 
				-model = DocLayoutModel.load_available()
			
 
				-
			
 
				-resfont_map = {
			
 
				-    "zh-cn": "china-ss",
			
 
				-    "zh-tw": "china-ts",
			
 
				-    "zh-hans": "china-ss",
			
 
				-    "zh-hant": "china-ts",
			
 
				-    "zh": "china-ss",
			
 
				-    "ja": "japan-s",
			
 
				-    "ko": "korea-s",
			
 
				-}
			
 
				-noto_list = [
			
 
				-    "am",  # Amharic
			
 
				-    "ar",  # Arabic
			
 
				-    "bn",  # Bengali
			
 
				-    "bg",  # Bulgarian
			
 
				-    "chr",  # Cherokee
			
 
				-    "el",  # Greek
			
 
				-    "gu",  # Gujarati
			
 
				-    "iw",  # Hebrew
			
 
				-    "hi",  # Hindi
			
 
				-    # "ja",  # Japanese
			
 
				-    "kn",  # Kannada
			
 
				-    # "ko",  # Korean
			
 
				-    "ml",  # Malayalam
			
 
				-    "mr",  # Marathi
			
 
				-    "ru",  # Russian
			
 
				-    "sr",  # Serbian
			
 
				-    # "zh-cn",# SC
			
 
				-    "ta",  # Tamil
			
 
				-    "te",  # Telugu
			
 
				-    "th",  # Thai
			
 
				-    # "zh-tw",# TC
			
 
				-    "ur",  # Urdu
			
 
				-    "uk",  # Ukrainian
			
 
				-]
			
 
				-
			
 
				-
			
 
				-def check_files(files: List[str]) -> List[str]:
			
 
				-    files = [
			
 
				-        f for f in files if not f.startswith("http://")
			
 
				-    ]  # exclude online files, http
			
 
				-    files = [
			
 
				-        f for f in files if not f.startswith("https://")
			
 
				-    ]  # exclude online files, https
			
 
				-    missing_files = [file for file in files if not os.path.exists(file)]
			
 
				-    return missing_files
			
 
				-
			
 
				-
			
 
				-def extract_text(
			
 
				-    files: Iterable[str] = [],
			
 
				-    pages: Optional[Container[int]] = None,
			
 
				-    password: str = "",
			
 
				-    debug: bool = False,
			
 
				-    vfont: str = "",
			
 
				-    vchar: str = "",
			
 
				-    thread: int = 0,
			
 
				-    lang_in: str = "",
			
 
				-    lang_out: str = "",
			
 
				-    service: str = "",
			
 
				-    callback: object = None,
			
 
				-    output: str = "",
			
 
				-    **kwargs: Any,
			
 
				-):
			
 
				-    if debug:
			
 
				-        log.setLevel(logging.DEBUG)
			
 
				-
			
 
				-    if not files:
			
 
				-        raise PDFValueError("Must provide files to work upon!")
			
 
				-
			
 
				-    for file in files:
			
 
				-        if file is str and (file.startswith("http://") or file.startswith("https://")):
			
 
				-            print("Online files detected, downloading...")
			
 
				-            try:
			
 
				-                r = requests.get(file, allow_redirects=True)
			
 
				-                if r.status_code == 200:
			
 
				-                    if not os.path.exists("./pdf2zh_files"):
			
 
				-                        print("Making a temporary dir for downloading PDF files...")
			
 
				-                        os.mkdir(os.path.dirname("./pdf2zh_files"))
			
 
				-                    with open("./pdf2zh_files/tmp_download.pdf", "wb") as f:
			
 
				-                        print(f"Writing the file: {file}...")
			
 
				-                        f.write(r.content)
			
 
				-                    file = "./pdf2zh_files/tmp_download.pdf"
			
 
				-                else:
			
 
				-                    r.raise_for_status()
			
 
				-            except Exception as e:
			
 
				-                raise PDFValueError(
			
 
				-                    f"Errors occur in downloading the PDF file. Please check the link(s).\nError:\n{e}"
			
 
				-                )
			
 
				-        filename = os.path.splitext(os.path.basename(file))[0]
			
 
				-
			
 
				-        font_list = [("tiro", None)]
			
 
				-        noto = None
			
 
				-        if lang_out.lower() in resfont_map:  # CJK
			
 
				-            resfont = resfont_map[lang_out.lower()]
			
 
				-            font_list.append((resfont, None))
			
 
				-        elif lang_out.lower() in noto_list:  # noto
			
 
				-            resfont = "noto"
			
 
				-            ttf_path = os.path.join(tempfile.gettempdir(), "GoNotoKurrent-Regular.ttf")
			
 
				-            if not os.path.exists(ttf_path):
			
 
				-                print("Downloading Noto font...")
			
 
				-                urllib.request.urlretrieve(
			
 
				-                    "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf",
			
 
				-                    ttf_path,
			
 
				-                )
			
 
				-            font_list.append(("noto", ttf_path))
			
 
				-            noto = pymupdf.Font("noto", ttf_path)
			
 
				-        else:  # fallback
			
 
				-            resfont = "china-ss"
			
 
				-            font_list.append(("china-ss", None))
			
 
				-
			
 
				-        doc_en = pymupdf.open(file)
			
 
				-        page_count = doc_en.page_count
			
 
				-        # font_list = [("china-ss", None), ("tiro", None)]
			
 
				-        font_id = {}
			
 
				-        for page in doc_en:
			
 
				-            for font in font_list:
			
 
				-                font_id[font[0]] = page.insert_font(font[0], font[1])
			
 
				-        xreflen = doc_en.xref_length()
			
 
				-        for xref in range(1, xreflen):
			
 
				-            for label in ["Resources/", ""]:  # 可能是基于 xobj 的 res
			
 
				-                try:  # xref 读写可能出错
			
 
				-                    font_res = doc_en.xref_get_key(xref, f"{label}Font")
			
 
				-                    if font_res[0] == "dict":
			
 
				-                        for font in font_list:
			
 
				-                            font_exist = doc_en.xref_get_key(
			
 
				-                                xref, f"{label}Font/{font[0]}"
			
 
				-                            )
			
 
				-                            if font_exist[0] == "null":
			
 
				-                                doc_en.xref_set_key(
			
 
				-                                    xref,
			
 
				-                                    f"{label}Font/{font[0]}",
			
 
				-                                    f"{font_id[font[0]]} 0 R",
			
 
				-                                )
			
 
				-                except Exception:
			
 
				-                    pass
			
 
				-        doc_en.save(Path(output) / f"{filename}-en.pdf")
			
 
				-
			
 
				-        with open(Path(output) / f"{filename}-en.pdf", "rb") as fp:
			
 
				-            obj_patch: dict = extract_text_to_fp(fp, model=model, **locals())
			
 
				-
			
 
				-        for obj_id, ops_new in obj_patch.items():
			
 
				-            # ops_old=doc_en.xref_stream(obj_id)
			
 
				-            # print(obj_id)
			
 
				-            # print(ops_old)
			
 
				-            # print(ops_new.encode())
			
 
				-            doc_en.update_stream(obj_id, ops_new.encode())
			
 
				-
			
 
				-        doc_zh = doc_en
			
 
				-        doc_dual = pymupdf.open(Path(output) / f"{filename}-en.pdf")
			
 
				-        doc_dual.insert_file(doc_zh)
			
 
				-        for id in range(page_count):
			
 
				-            doc_dual.move_page(page_count + id, id * 2 + 1)
			
 
				-        doc_zh.save(Path(output) / f"{filename}-zh.pdf", deflate=1)
			
 
				-        doc_dual.save(Path(output) / f"{filename}-dual.pdf", deflate=1)
			
 
				-        doc_zh.close()
			
 
				-        doc_dual.close()
			
 
				-        os.remove(Path(output) / f"{filename}-en.pdf")
			
 
				-
			
 
				-    return
			
 
				-
			
 
				 
			
 
				 def create_parser() -> argparse.ArgumentParser:
			
 
				     parser = argparse.ArgumentParser(description=__doc__, add_help=True)
			
@@ -308,19 +137,15 @@ def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
 
				 def main(args: Optional[List[str]] = None) -> int:
			
 
				     parsed_args = parse_args(args)
			
 
				 
			
 
				-    missing_files = check_files(parsed_args.files)
			
 
				-    if missing_files:
			
 
				-        print("The following files do not exist:", file=sys.stderr)
			
 
				-        for file in missing_files:
			
 
				-            print(f"  {file}", file=sys.stderr)
			
 
				-        return -1
			
 
				+    if parsed_args.debug:
			
 
				+        log.setLevel(logging.DEBUG)
			
 
				+
			
 
				     if parsed_args.interactive:
			
 
				         from pdf2zh.gui import setup_gui
			
 
				-
			
 
				         setup_gui(parsed_args.share)
			
 
				         return 0
			
 
				 
			
 
				-    extract_text(**vars(parsed_args))
			
 
				+    translate(**vars(parsed_args))
			
 
				     return 0