|
|
@@ -9,16 +9,17 @@ import argparse
|
|
|
import logging
|
|
|
import os
|
|
|
import sys
|
|
|
+from typing import TYPE_CHECKING, Any, Container, Iterable, List, Optional
|
|
|
+
|
|
|
import pymupdf
|
|
|
from huggingface_hub import hf_hub_download
|
|
|
|
|
|
from pdf2zh import __version__
|
|
|
from pdf2zh.pdfexceptions import PDFValueError
|
|
|
-from typing import Any, Container, Iterable, List, Optional, TYPE_CHECKING
|
|
|
|
|
|
if TYPE_CHECKING:
|
|
|
- from pdf2zh.utils import AnyIO
|
|
|
from pdf2zh.layout import LAParams
|
|
|
+ from pdf2zh.utils import AnyIO
|
|
|
|
|
|
OUTPUT_TYPES = ((".htm", "html"), (".html", "html"), (".xml", "xml"), (".tag", "tag"))
|
|
|
|
|
|
@@ -69,6 +70,7 @@ def extract_text(
|
|
|
**kwargs: Any,
|
|
|
) -> AnyIO:
|
|
|
import doclayout_yolo
|
|
|
+
|
|
|
import pdf2zh.high_level
|
|
|
|
|
|
if not files:
|
|
|
@@ -84,55 +86,64 @@ def extract_text(
|
|
|
# if not os.path.exists(pth):
|
|
|
# print('Downloading...')
|
|
|
# urllib.request.urlretrieve("http://huggingface.co/juliozhao/DocLayout-YOLO-DocStructBench/resolve/main/doclayout_yolo_docstructbench_imgsz1024.pt",pth)
|
|
|
- pth = hf_hub_download(repo_id="juliozhao/DocLayout-YOLO-DocStructBench", filename="doclayout_yolo_docstructbench_imgsz1024.pt")
|
|
|
+ pth = hf_hub_download(
|
|
|
+ repo_id="juliozhao/DocLayout-YOLO-DocStructBench",
|
|
|
+ filename="doclayout_yolo_docstructbench_imgsz1024.pt",
|
|
|
+ )
|
|
|
model = doclayout_yolo.YOLOv10(pth)
|
|
|
|
|
|
for file in files:
|
|
|
-
|
|
|
filename = os.path.splitext(os.path.basename(file))[0]
|
|
|
|
|
|
doc_en = pymupdf.open(file)
|
|
|
- page_count=doc_en.page_count
|
|
|
- font_list=['china-ss','tiro']
|
|
|
- font_id={}
|
|
|
+ page_count = doc_en.page_count
|
|
|
+ font_list = ["china-ss", "tiro"]
|
|
|
+ font_id = {}
|
|
|
for page in doc_en:
|
|
|
for font in font_list:
|
|
|
- font_id[font]=page.insert_font(font)
|
|
|
+ font_id[font] = page.insert_font(font)
|
|
|
xreflen = doc_en.xref_length()
|
|
|
for xref in range(1, xreflen):
|
|
|
- for label in ['Resources/','']: # 可能是基于 xobj 的 res
|
|
|
- try: # xref 读写可能出错
|
|
|
- font_res=doc_en.xref_get_key(xref,f'{label}Font')
|
|
|
- if font_res[0]=='dict':
|
|
|
+ for label in ["Resources/", ""]: # 可能是基于 xobj 的 res
|
|
|
+ try: # xref 读写可能出错
|
|
|
+ font_res = doc_en.xref_get_key(xref, f"{label}Font")
|
|
|
+ if font_res[0] == "dict":
|
|
|
for font in font_list:
|
|
|
- font_exist=doc_en.xref_get_key(xref,f'{label}Font/{font}')
|
|
|
- if font_exist[0]=='null':
|
|
|
- doc_en.xref_set_key(xref,f'{label}Font/{font}',f'{font_id[font]} 0 R')
|
|
|
+ font_exist = doc_en.xref_get_key(
|
|
|
+ xref, f"{label}Font/{font}"
|
|
|
+ )
|
|
|
+ if font_exist[0] == "null":
|
|
|
+ doc_en.xref_set_key(
|
|
|
+ xref, f"{label}Font/{font}", f"{font_id[font]} 0 R"
|
|
|
+ )
|
|
|
except:
|
|
|
pass
|
|
|
- doc_en.save(f'{filename}-en.pdf')
|
|
|
+ doc_en.save(f"{filename}-en.pdf")
|
|
|
|
|
|
- with open(f'{filename}-en.pdf', "rb") as fp:
|
|
|
- obj_patch:dict=pdf2zh.high_level.extract_text_to_fp(fp, **locals())
|
|
|
+ with open(f"{filename}-en.pdf", "rb") as fp:
|
|
|
+ obj_patch: dict = pdf2zh.high_level.extract_text_to_fp(fp, **locals())
|
|
|
|
|
|
- for obj_id,ops_new in obj_patch.items():
|
|
|
+ for obj_id, ops_new in obj_patch.items():
|
|
|
# ops_old=doc_en.xref_stream(obj_id)
|
|
|
# print(obj_id)
|
|
|
# print(ops_old)
|
|
|
# print(ops_new.encode())
|
|
|
- doc_en.update_stream(obj_id,ops_new.encode())
|
|
|
+ doc_en.update_stream(obj_id, ops_new.encode())
|
|
|
|
|
|
doc_zh = doc_en
|
|
|
- doc_dual = pymupdf.open(f'{filename}-en.pdf')
|
|
|
+ doc_dual = pymupdf.open(f"{filename}-en.pdf")
|
|
|
doc_dual.insert_file(doc_zh)
|
|
|
for id in range(page_count):
|
|
|
- doc_dual.move_page(page_count+id,id*2+1)
|
|
|
- doc_zh.save(f'{filename}-zh.pdf',deflate=1)
|
|
|
- doc_dual.save(f'{filename}-dual.pdf',deflate=1)
|
|
|
+ doc_dual.move_page(page_count + id, id * 2 + 1)
|
|
|
+ doc_zh.save(f"{filename}-zh.pdf", deflate=1)
|
|
|
+ doc_dual.save(f"{filename}-dual.pdf", deflate=1)
|
|
|
doc_zh.close()
|
|
|
doc_dual.close()
|
|
|
-
|
|
|
- os.remove(f'{filename}-en.pdf')
|
|
|
+ try: # fix (main): permission error @ https://github.com/Byaidu/PDFMathTranslate/issues/84
|
|
|
+ os.remove(f"{filename}-en.pdf")
|
|
|
+ except Exception as e:
|
|
|
+ print(f"File removal failed due to occupation / not existing, pass.\n{e}")
|
|
|
+ pass
|
|
|
|
|
|
return
|
|
|
|
|
|
@@ -249,12 +260,13 @@ def main(args: Optional[List[str]] = None) -> int:
|
|
|
|
|
|
missing_files = check_files(parsed_args.files)
|
|
|
if missing_files:
|
|
|
- print(f"The following files do not exist:", file=sys.stderr)
|
|
|
+ print("The following files do not exist:", file=sys.stderr)
|
|
|
for file in missing_files:
|
|
|
print(f" {file}", file=sys.stderr)
|
|
|
return -1
|
|
|
if parsed_args.interactive:
|
|
|
from pdf2zh.gui import setup_gui
|
|
|
+
|
|
|
setup_gui()
|
|
|
return 0
|
|
|
|