Byaidu 1 год назад
Родитель
Коммит
963315d2fb
6 изменённых файлов: 37 добавлений и 38 удалений
  1. 3 1
      README.md
  2. 1 1
      pdf2zh/__init__.py
  3. 15 29
      pdf2zh/converter.py
  4. 11 2
      pdf2zh/high_level.py
  5. 6 4
      pdf2zh/pdf2zh.py
  6. 1 1
      setup.py

+ 3 - 1
README.md

@@ -59,9 +59,11 @@ Document merging: [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
 
 Document parsing: [Pdfminer.six](https://github.com/pdfminer/pdfminer.six)
 
+Document extraction: [MinerU](https://github.com/opendatalab/MinerU)
+
 Multi-threaded translation: [MathTranslate](https://github.com/SUSYUSTC/MathTranslate)
 
-Layout parsing: [LayoutParser](https://github.com/Layout-Parser/layout-parser)
+Layout parsing: [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO)
 
 ## Star History
 

+ 1 - 1
pdf2zh/__init__.py

@@ -1,2 +1,2 @@
-__version__ = "1.5.2"
+__version__ = "1.5.3"
 __author__ = "Byaidu"
 __author__ = "Byaidu"

+ 15 - 29
pdf2zh/converter.py

@@ -15,6 +15,7 @@ from typing import (
     cast,
     cast,
 )
 )
 import concurrent.futures
 import concurrent.futures
+import numpy as np
 import html
 import html
 import requests
 import requests
 import unicodedata
 import unicodedata
@@ -434,16 +435,14 @@ class TextConverter(PDFConverter[AnyIO]):
                     if child.matrix[0]==0 and child.matrix[3]==0: # 竖直段落
                     if child.matrix[0]==0 and child.matrix[3]==0: # 竖直段落
                         cur_v=True
                         cur_v=True
                         ind_v=True
                         ind_v=True
-                        # print(child.get_text(),child.matrix[:4])
-                    for box in self.layout[ltpage.pageid]: # 识别独立公式
-                        b=box.block
-                        if child.x1>b.x_1 and child.x0<b.x_2 and child.y1>ltpage.height-b.y_2 and child.y0<ltpage.height-b.y_1:
-                            cur_v=True
-                            ind_v=True
-                            if log.isEnabledFor(logging.DEBUG):
-                                lstk.append(LTLine(1,(b.x_1,ltpage.height-b.y_2),(b.x_2,ltpage.height-b.y_2)))
-                                lstk.append(LTLine(1,(b.x_1,ltpage.height-b.y_1),(b.x_2,ltpage.height-b.y_1)))
-                            break
+                    layout=self.layout[ltpage.pageid]
+                    x0,y0,x1,y1=int(child.x0),int(ltpage.height-child.y0),int(child.x1),int(ltpage.height-child.y1)
+                    h,w=layout.shape
+                    y0=np.clip(y0,0,h-1);y1=np.clip(y1,0,h-1)
+                    x0=np.clip(x0,0,w-1);x1=np.clip(x1,0,w-1)
+                    if layout[y0,x0] or layout[y0,x1] or layout[y1,x0] or layout[y1,x1]: # 识别图表和独立公式
+                        cur_v=True
+                        ind_v=True
                     if not cur_v: #and re.match(r'CMR',fontname): # 根治正文 CMR 字体的懒狗编译器,判定括号组是否属于公式
                     if not cur_v: #and re.match(r'CMR',fontname): # 根治正文 CMR 字体的懒狗编译器,判定括号组是否属于公式
                         if vstk and child.get_text()=='(':
                         if vstk and child.get_text()=='(':
                             cur_v=True
                             cur_v=True
@@ -548,25 +547,12 @@ class TextConverter(PDFConverter[AnyIO]):
             @retry
             @retry
             def worker(s): # 多线程翻译
             def worker(s): # 多线程翻译
                 try:
                 try:
-                    if sum(map(str.islower,s))>1: # 包含小写字母
-                        hash_key_paragraph = cache.deterministic_hash((s,self.lang_in,self.lang_out))
-                        new = cache.load_paragraph(hash_key, hash_key_paragraph) # 查询缓存
-                        if new is None:
-                            # import ollama
-                            # response = ollama.chat(model='llama3.2', messages=[
-                            #     {
-                            #         'role': 'system',
-                            #         'content':
-                            #             'You are a professional translation engine, please translate the text into a colloquial, professional, elegant and fluent content, without the style of machine translation. You must only translate the text content, never interpret it.',
-                            #     },
-                            #     { 'role': 'user', 'content': f'Translate into {self.lang_out}:\n"\n{s}\n"' },
-                            # ])
-                            # new=response['message']['content']
-                            new=self.translator.translate(s,self.lang_out,self.lang_in)
-                            new=remove_control_characters(new)
-                            cache.write_paragraph(hash_key, hash_key_paragraph, new)
-                    else:
-                        new=s
+                    hash_key_paragraph = cache.deterministic_hash((s,self.lang_in,self.lang_out))
+                    new = cache.load_paragraph(hash_key, hash_key_paragraph) # 查询缓存
+                    if new is None:
+                        new=self.translator.translate(s,self.lang_out,self.lang_in)
+                        new=remove_control_characters(new)
+                        cache.write_paragraph(hash_key, hash_key_paragraph, new)
                     return new
                     return new
                 except BaseException as e:
                 except BaseException as e:
                     log.exception(e,exc_info=False)
                     log.exception(e,exc_info=False)

+ 11 - 2
pdf2zh/high_level.py

@@ -161,8 +161,17 @@ def extract_text_to_fp(
     ), total=total_pages, position=0):
     ), total=total_pages, position=0):
         pix = doc_en[page.pageno].get_pixmap()
         pix = doc_en[page.pageno].get_pixmap()
         image = np.fromstring(pix.samples, np.uint8).reshape(pix.height, pix.width, 3)
         image = np.fromstring(pix.samples, np.uint8).reshape(pix.height, pix.width, 3)
-        page_layout=model.detect(image)
-        layout[page.pageno]=page_layout
+        page_layout=model.predict(image)[0]
+        # kdtree 是不可能 kdtree 的,不如直接渲染成图片,用空间换时间
+        box=np.zeros((pix.height, pix.width))
+        h,w=box.shape
+        for d in page_layout.boxes:
+            if page_layout.names[int(d.cls)] in ['abandon','figure','table','isolate_formula','formula_caption']:
+                x0,y0,x1,y1=[int(i) for i in d.xyxy.squeeze().long()]
+                y0=np.clip(y0,0,h-1);y1=np.clip(y1,0,h-1)
+                x0=np.clip(x0,0,w-1);x1=np.clip(x1,0,w-1)
+                box[y0:y1,x0:x1]=1
+        layout[page.pageno]=box
         # print(page.number,page_layout)
         # print(page.number,page_layout)
         page.rotate = (page.rotate + rotation) % 360
         page.rotate = (page.rotate + rotation) % 360
         interpreter.process_page(page)
         interpreter.process_page(page)

+ 6 - 4
pdf2zh/pdf2zh.py

@@ -9,7 +9,7 @@ import os
 import sys
 import sys
 from typing import Any, Container, Iterable, List, Optional
 from typing import Any, Container, Iterable, List, Optional
 import pymupdf
 import pymupdf
-import layoutparser as lp
+import doclayout_yolo
 import tempfile
 import tempfile
 import urllib.request
 import urllib.request
 
 
@@ -20,6 +20,8 @@ from pdf2zh.utils import AnyIO
 
 
 logging.basicConfig()
 logging.basicConfig()
 
 
+doclayout_yolo.utils.LOGGER.setLevel(logging.WARNING)
+
 OUTPUT_TYPES = ((".htm", "html"), (".html", "html"), (".xml", "xml"), (".tag", "tag"))
 OUTPUT_TYPES = ((".htm", "html"), (".html", "html"), (".xml", "xml"), (".tag", "tag"))
 
 
 
 
@@ -64,11 +66,11 @@ def extract_text(
                 output_type = alttype
                 output_type = alttype
 
 
     outfp: AnyIO = sys.stdout
     outfp: AnyIO = sys.stdout
-    pth = os.path.join(tempfile.gettempdir(), 'mfd-tf_efficientdet_d0.pth.tar')
+    pth = os.path.join(tempfile.gettempdir(), 'doclayout_yolo_docstructbench_imgsz1024.pt')
     if not os.path.exists(pth):
     if not os.path.exists(pth):
         print('Downloading...')
         print('Downloading...')
-        urllib.request.urlretrieve("https://www.dropbox.com/s/dkr22iux7thlhel/mfd-tf_efficientdet_d0.pth.tar?dl=1",pth)
-    model = lp.EfficientDetLayoutModel("lp://efficientdet/MFD/tf_efficientdet_d0",pth)
+        urllib.request.urlretrieve("https://huggingface.co/juliozhao/DocLayout-YOLO-DocStructBench/resolve/main/doclayout_yolo_docstructbench_imgsz1024.pt",pth)
+    model = doclayout_yolo.YOLOv10(pth)
 
 
     for file in files:
     for file in files:
 
 

+ 1 - 1
setup.py

@@ -24,7 +24,7 @@ setup(
         "pymupdf",
         "pymupdf",
         "tqdm",
         "tqdm",
         "tenacity",
         "tenacity",
-        "layoutparser[effdet]",
+        "doclayout-yolo",
         "numpy",
         "numpy",
     ],
     ],
     classifiers=[
     classifiers=[