1 год назад · 963315d2fb
--- a/README.md
+++ b/README.md
@@ -59,9 +59,11 @@ Document merging: [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
 
															 Document parsing: [Pdfminer.six](https://github.com/pdfminer/pdfminer.six)
														
 
															+Document extraction: [MinerU](https://github.com/opendatalab/MinerU)
														
 
															+
														
 
															 Multi-threaded translation: [MathTranslate](https://github.com/SUSYUSTC/MathTranslate)
														
 
															-Layout parsing: [LayoutParser](https://github.com/Layout-Parser/layout-parser)
														
 
															+Layout parsing: [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO)
														
 
															 ## Star History
														
--- a/pdf2zh/__init__.py
+++ b/pdf2zh/__init__.py
@@ -1,2 +1,2 @@
 
															-__version__ = "1.5.2"
														
 
															+__version__ = "1.5.3"
														
 
															 __author__ = "Byaidu"
														
--- a/pdf2zh/converter.py
+++ b/pdf2zh/converter.py
@@ -15,6 +15,7 @@ from typing import (
 
															     cast,
														
 
															 )
														
 
															 import concurrent.futures
														
 
															+import numpy as np
														
 
															 import html
														
 
															 import requests
														
 
															 import unicodedata
														
@@ -434,16 +435,14 @@ class TextConverter(PDFConverter[AnyIO]):
 
															                     if child.matrix[0]==0 and child.matrix[3]==0: # 竖直段落
														
 
															                         cur_v=True
														
 
															                         ind_v=True
														
 
															-                        # print(child.get_text(),child.matrix[:4])
														
 
															-                    for box in self.layout[ltpage.pageid]: # 识别独立公式
														
 
															-                        b=box.block
														
 
															-                        if child.x1>b.x_1 and child.x0<b.x_2 and child.y1>ltpage.height-b.y_2 and child.y0<ltpage.height-b.y_1:
														
 
															-                            cur_v=True
														
 
															-                            ind_v=True
														
 
															-                            if log.isEnabledFor(logging.DEBUG):
														
 
															-                                lstk.append(LTLine(1,(b.x_1,ltpage.height-b.y_2),(b.x_2,ltpage.height-b.y_2)))
														
 
															-                                lstk.append(LTLine(1,(b.x_1,ltpage.height-b.y_1),(b.x_2,ltpage.height-b.y_1)))
														
 
															-                            break
														
 
															+                    layout=self.layout[ltpage.pageid]
														
 
															+                    x0,y0,x1,y1=int(child.x0),int(ltpage.height-child.y0),int(child.x1),int(ltpage.height-child.y1)
														
 
															+                    h,w=layout.shape
														
 
															+                    y0=np.clip(y0,0,h-1);y1=np.clip(y1,0,h-1)
														
 
															+                    x0=np.clip(x0,0,w-1);x1=np.clip(x1,0,w-1)
														
 
															+                    if layout[y0,x0] or layout[y0,x1] or layout[y1,x0] or layout[y1,x1]: # 识别图表和独立公式
														
 
															+                        cur_v=True
														
 
															+                        ind_v=True
														
 
															                     if not cur_v: #and re.match(r'CMR',fontname): # 根治正文 CMR 字体的懒狗编译器，判定括号组是否属于公式
														
 
															                         if vstk and child.get_text()=='(':
														
 
															                             cur_v=True
														
@@ -548,25 +547,12 @@ class TextConverter(PDFConverter[AnyIO]):
 
															             @retry
														
 
															             def worker(s): # 多线程翻译
														
 
															                 try:
														
 
															-                    if sum(map(str.islower,s))>1: # 包含小写字母
														
 
															-                        hash_key_paragraph = cache.deterministic_hash((s,self.lang_in,self.lang_out))
														
 
															-                        new = cache.load_paragraph(hash_key, hash_key_paragraph) # 查询缓存
														
 
															-                        if new is None:
														
 
															-                            # import ollama
														
 
															-                            # response = ollama.chat(model='llama3.2', messages=[
														
 
															-                            #     {
														
 
															-                            #         'role': 'system',
														
 
															-                            #         'content':
														
 
															-                            #             'You are a professional translation engine, please translate the text into a colloquial, professional, elegant and fluent content, without the style of machine translation. You must only translate the text content, never interpret it.',
														
 
															-                            #     },
														
 
															-                            #     { 'role': 'user', 'content': f'Translate into {self.lang_out}:\n"\n{s}\n"' },
														
 
															-                            # ])
														
 
															-                            # new=response['message']['content']
														
 
															-                            new=self.translator.translate(s,self.lang_out,self.lang_in)
														
 
															-                            new=remove_control_characters(new)
														
 
															-                            cache.write_paragraph(hash_key, hash_key_paragraph, new)
														
 
															-                    else:
														
 
															-                        new=s
														
 
															+                    hash_key_paragraph = cache.deterministic_hash((s,self.lang_in,self.lang_out))
														
 
															+                    new = cache.load_paragraph(hash_key, hash_key_paragraph) # 查询缓存
														
 
															+                    if new is None:
														
 
															+                        new=self.translator.translate(s,self.lang_out,self.lang_in)
														
 
															+                        new=remove_control_characters(new)
														
 
															+                        cache.write_paragraph(hash_key, hash_key_paragraph, new)
														
 
															                     return new
														
 
															                 except BaseException as e:
														
 
															                     log.exception(e,exc_info=False)
														
--- a/pdf2zh/high_level.py
+++ b/pdf2zh/high_level.py
@@ -161,8 +161,17 @@ def extract_text_to_fp(
 
															     ), total=total_pages, position=0):
														
 
															         pix = doc_en[page.pageno].get_pixmap()
														
 
															         image = np.fromstring(pix.samples, np.uint8).reshape(pix.height, pix.width, 3)
														
 
															-        page_layout=model.detect(image)
														
 
															-        layout[page.pageno]=page_layout
														
 
															+        page_layout=model.predict(image)[0]
														
 
															+        # kdtree 是不可能 kdtree 的，不如直接渲染成图片，用空间换时间
														
 
															+        box=np.zeros((pix.height, pix.width))
														
 
															+        h,w=box.shape
														
 
															+        for d in page_layout.boxes:
														
 
															+            if page_layout.names[int(d.cls)] in ['abandon','figure','table','isolate_formula','formula_caption']:
														
 
															+                x0,y0,x1,y1=[int(i) for i in d.xyxy.squeeze().long()]
														
 
															+                y0=np.clip(y0,0,h-1);y1=np.clip(y1,0,h-1)
														
 
															+                x0=np.clip(x0,0,w-1);x1=np.clip(x1,0,w-1)
														
 
															+                box[y0:y1,x0:x1]=1
														
 
															+        layout[page.pageno]=box
														
 
															         # print(page.number,page_layout)
														
 
															         page.rotate = (page.rotate + rotation) % 360
														
 
															         interpreter.process_page(page)
														
--- a/pdf2zh/pdf2zh.py
+++ b/pdf2zh/pdf2zh.py
@@ -9,7 +9,7 @@ import os
 
															 import sys
														
 
															 from typing import Any, Container, Iterable, List, Optional
														
 
															 import pymupdf
														
 
															-import layoutparser as lp
														
 
															+import doclayout_yolo
														
 
															 import tempfile
														
 
															 import urllib.request
														
@@ -20,6 +20,8 @@ from pdf2zh.utils import AnyIO
 
															 logging.basicConfig()
														
 
															+doclayout_yolo.utils.LOGGER.setLevel(logging.WARNING)
														
 
															+
														
 
															 OUTPUT_TYPES = ((".htm", "html"), (".html", "html"), (".xml", "xml"), (".tag", "tag"))
														
@@ -64,11 +66,11 @@ def extract_text(
 
															                 output_type = alttype
														
 
															     outfp: AnyIO = sys.stdout
														
 
															-    pth = os.path.join(tempfile.gettempdir(), 'mfd-tf_efficientdet_d0.pth.tar')
														
 
															+    pth = os.path.join(tempfile.gettempdir(), 'doclayout_yolo_docstructbench_imgsz1024.pt')
														
 
															     if not os.path.exists(pth):
														
 
															         print('Downloading...')
														
 
															-        urllib.request.urlretrieve("https://www.dropbox.com/s/dkr22iux7thlhel/mfd-tf_efficientdet_d0.pth.tar?dl=1",pth)
														
 
															-    model = lp.EfficientDetLayoutModel("lp://efficientdet/MFD/tf_efficientdet_d0",pth)
														
 
															+        urllib.request.urlretrieve("https://huggingface.co/juliozhao/DocLayout-YOLO-DocStructBench/resolve/main/doclayout_yolo_docstructbench_imgsz1024.pt",pth)
														
 
															+    model = doclayout_yolo.YOLOv10(pth)
														
 
															     for file in files:
														
--- a/setup.py
+++ b/setup.py
@@ -24,7 +24,7 @@ setup(
 
															         "pymupdf",
														
 
															         "tqdm",
														
 
															         "tenacity",
														
 
															-        "layoutparser[effdet]",
														
 
															+        "doclayout-yolo",
														
 
															         "numpy",
														
 
															     ],
														
 
															     classifiers=[