Răsfoiți Sursa

fix (main): permission error

Rongxin 1 an în urmă
părinte
comite
8a66da7842
1 a modificat fișierele cu 39 adăugiri și 27 ștergeri
  1. 39 27
      pdf2zh/pdf2zh.py

+ 39 - 27
pdf2zh/pdf2zh.py

@@ -9,16 +9,17 @@ import argparse
 import logging
 import os
 import sys
+from typing import TYPE_CHECKING, Any, Container, Iterable, List, Optional
+
 import pymupdf
 from huggingface_hub import hf_hub_download
 
 from pdf2zh import __version__
 from pdf2zh.pdfexceptions import PDFValueError
-from typing import Any, Container, Iterable, List, Optional, TYPE_CHECKING
 
 if TYPE_CHECKING:
-    from pdf2zh.utils import AnyIO
     from pdf2zh.layout import LAParams
+    from pdf2zh.utils import AnyIO
 
 OUTPUT_TYPES = ((".htm", "html"), (".html", "html"), (".xml", "xml"), (".tag", "tag"))
 
@@ -69,6 +70,7 @@ def extract_text(
     **kwargs: Any,
 ) -> AnyIO:
     import doclayout_yolo
+
     import pdf2zh.high_level
 
     if not files:
@@ -84,55 +86,64 @@ def extract_text(
     # if not os.path.exists(pth):
     #     print('Downloading...')
     #     urllib.request.urlretrieve("http://huggingface.co/juliozhao/DocLayout-YOLO-DocStructBench/resolve/main/doclayout_yolo_docstructbench_imgsz1024.pt",pth)
-    pth = hf_hub_download(repo_id="juliozhao/DocLayout-YOLO-DocStructBench", filename="doclayout_yolo_docstructbench_imgsz1024.pt")
+    pth = hf_hub_download(
+        repo_id="juliozhao/DocLayout-YOLO-DocStructBench",
+        filename="doclayout_yolo_docstructbench_imgsz1024.pt",
+    )
     model = doclayout_yolo.YOLOv10(pth)
 
     for file in files:
-
         filename = os.path.splitext(os.path.basename(file))[0]
 
         doc_en = pymupdf.open(file)
-        page_count=doc_en.page_count
-        font_list=['china-ss','tiro']
-        font_id={}
+        page_count = doc_en.page_count
+        font_list = ["china-ss", "tiro"]
+        font_id = {}
         for page in doc_en:
             for font in font_list:
-                font_id[font]=page.insert_font(font)
+                font_id[font] = page.insert_font(font)
         xreflen = doc_en.xref_length()
         for xref in range(1, xreflen):
-            for label in ['Resources/','']: # 可能是基于 xobj 的 res
-                try: # xref 读写可能出错
-                    font_res=doc_en.xref_get_key(xref,f'{label}Font')
-                    if font_res[0]=='dict':
+            for label in ["Resources/", ""]:  # 可能是基于 xobj 的 res
+                try:  # xref 读写可能出错
+                    font_res = doc_en.xref_get_key(xref, f"{label}Font")
+                    if font_res[0] == "dict":
                         for font in font_list:
-                            font_exist=doc_en.xref_get_key(xref,f'{label}Font/{font}')
-                            if font_exist[0]=='null':
-                                doc_en.xref_set_key(xref,f'{label}Font/{font}',f'{font_id[font]} 0 R')
+                            font_exist = doc_en.xref_get_key(
+                                xref, f"{label}Font/{font}"
+                            )
+                            if font_exist[0] == "null":
+                                doc_en.xref_set_key(
+                                    xref, f"{label}Font/{font}", f"{font_id[font]} 0 R"
+                                )
                 except:
                     pass
-        doc_en.save(f'{filename}-en.pdf')
+        doc_en.save(f"{filename}-en.pdf")
 
-        with open(f'{filename}-en.pdf', "rb") as fp:
-            obj_patch:dict=pdf2zh.high_level.extract_text_to_fp(fp, **locals())
+        with open(f"{filename}-en.pdf", "rb") as fp:
+            obj_patch: dict = pdf2zh.high_level.extract_text_to_fp(fp, **locals())
 
-        for obj_id,ops_new in obj_patch.items():
+        for obj_id, ops_new in obj_patch.items():
             # ops_old=doc_en.xref_stream(obj_id)
             # print(obj_id)
             # print(ops_old)
             # print(ops_new.encode())
-            doc_en.update_stream(obj_id,ops_new.encode())
+            doc_en.update_stream(obj_id, ops_new.encode())
 
         doc_zh = doc_en
-        doc_dual = pymupdf.open(f'{filename}-en.pdf')
+        doc_dual = pymupdf.open(f"{filename}-en.pdf")
         doc_dual.insert_file(doc_zh)
         for id in range(page_count):
-            doc_dual.move_page(page_count+id,id*2+1)
-        doc_zh.save(f'{filename}-zh.pdf',deflate=1)
-        doc_dual.save(f'{filename}-dual.pdf',deflate=1)
+            doc_dual.move_page(page_count + id, id * 2 + 1)
+        doc_zh.save(f"{filename}-zh.pdf", deflate=1)
+        doc_dual.save(f"{filename}-dual.pdf", deflate=1)
         doc_zh.close()
         doc_dual.close()
-
-        os.remove(f'{filename}-en.pdf')
+        try:  # fix (main): permission error @ https://github.com/Byaidu/PDFMathTranslate/issues/84
+            os.remove(f"{filename}-en.pdf")
+        except Exception as e:
+            print(f"File removal failed due to occupation / not existing, pass.\n{e}")
+            pass
 
     return
 
@@ -249,12 +260,13 @@ def main(args: Optional[List[str]] = None) -> int:
 
     missing_files = check_files(parsed_args.files)
     if missing_files:
-        print(f"The following files do not exist:", file=sys.stderr)
+        print("The following files do not exist:", file=sys.stderr)
         for file in missing_files:
             print(f"  {file}", file=sys.stderr)
         return -1
     if parsed_args.interactive:
         from pdf2zh.gui import setup_gui
+
         setup_gui()
         return 0