Browse Source

fix figure,dt

Byaidu 1 year ago
parent
commit
d1e561821d
3 changed files with 29 additions and 18 deletions
  1. pdf2zh/__init__.py (+1 -1)
  2. pdf2zh/converter.py (+10 -10)
  3. pdf2zh/pdfinterp.py (+18 -7)

+ 1 - 1
pdf2zh/__init__.py

@@ -1,2 +1,2 @@
-__version__ = "1.4.5"
+__version__ = "1.4.6"
 __author__ = "Byaidu"

+ 10 - 10
pdf2zh/converter.py

@@ -418,7 +418,7 @@ class TextConverter(PDFConverter[AnyIO]):
                         # print(child.get_text(),child.matrix[:4])
                     for box in self.layout[ltpage.pageid]: # 识别独立公式
                         b=box.block
-                        if child.x1>b.x_1 and child.x0<b.x_2 and child.y1>ltpage.height-b.y_2 and child.y0<ltpage.height-b.y_1: # 图像识别的坐标是裁剪之后的,所以需要补偿回去
+                        if child.x1>b.x_1 and child.x0<b.x_2 and child.y1>ltpage.height-b.y_2 and child.y0<ltpage.height-b.y_1:
                             cur_v=True
                             ind_v=True
                             if log.isEnabledFor(logging.DEBUG):
@@ -483,15 +483,6 @@ class TextConverter(PDFConverter[AnyIO]):
                             pstk[-1][0]-=child.size-pstk[-1][4]
                             pstk[-1][4]=child.size
                             pstk[-1][5]=child.font
-                        # 更新段落边界
-                        if child.x0<lt.x0:
-                            pstk[-1][2]=child.x0
-                            lt=child
-                        if child.x1>rt.x1:
-                            pstk[-1][3]=child.x1
-                            rt=child
-                        if child.y0<dt.y0:
-                            dt=child
                         sstk[-1]+=child.get_text()
                     else: # 公式入栈
                         # 可能是 CMR 角标,需要在完全确定 cur_v 之后再计算修正,有些下角标可能需要向下的修正
@@ -499,6 +490,15 @@ class TextConverter(PDFConverter[AnyIO]):
                             if child.x0>xt.x0 and child.y1>xt.y0: # and cur_v: # and child.y0-xt.y0<xt.size: # 行内公式修正,前面已经判定过位于同一段落,所以不需要限制 y 范围
                                 vfix=child.y0-xt.y0
                         vstk.append(child)
+                    # 更新段落边界,段落内换行之后可能是公式开头,如果不更新 dt 后面换行检测会出错
+                    if child.x0<lt.x0:
+                        pstk[-1][2]=child.x0
+                        lt=child
+                    if child.x1>rt.x1:
+                        pstk[-1][3]=child.x1
+                        rt=child
+                    if child.y0<dt.y0:
+                        dt=child
                     xt=child
                     xt_ind=ind_v
                 elif isinstance(child, LTFigure): # 图表

+ 18 - 7
pdf2zh/pdfinterp.py

@@ -46,6 +46,7 @@ from pdf2zh.utils import (
     Rect,
     choplist,
     mult_matrix,
+    apply_matrix_pt,
 )
 
 log = logging.getLogger(__name__)
@@ -566,8 +567,18 @@ class PDFPageInterpreter:
 
     def do_S(self) -> None:
         """Stroke path"""
-        self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
-        self.curpath = []
+        def is_black(color: Color) -> bool:
+            if isinstance(color, Tuple):
+                return sum(color)==0
+            else:
+                return color==0
+        if len(self.curpath)==2 and self.curpath[0][0]=='m' and self.curpath[1][0]=='l' and apply_matrix_pt(self.ctm,self.curpath[0][-2:])[1]==apply_matrix_pt(self.ctm,self.curpath[1][-2:])[1] and is_black(self.graphicstate.scolor): # 独立直线,水平,黑色
+            # print(apply_matrix_pt(self.ctm,self.curpath[0][-2:]),apply_matrix_pt(self.ctm,self.curpath[1][-2:]),self.graphicstate.scolor)
+            self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
+            self.curpath = []
+            return 'n'
+        else:
+            self.curpath = []
 
     def do_s(self) -> None:
         """Close and stroke path"""
@@ -576,7 +587,7 @@ class PDFPageInterpreter:
 
     def do_f(self) -> None:
         """Fill path using nonzero winding number rule"""
-        self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
+        # self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
         self.curpath = []
 
     def do_F(self) -> None:
@@ -584,17 +595,17 @@ class PDFPageInterpreter:
 
     def do_f_a(self) -> None:
         """Fill path using even-odd rule"""
-        self.device.paint_path(self.graphicstate, False, True, True, self.curpath)
+        # self.device.paint_path(self.graphicstate, False, True, True, self.curpath)
         self.curpath = []
 
     def do_B(self) -> None:
         """Fill and stroke path using nonzero winding number rule"""
-        self.device.paint_path(self.graphicstate, True, True, False, self.curpath)
+        # self.device.paint_path(self.graphicstate, True, True, False, self.curpath)
         self.curpath = []
 
     def do_B_a(self) -> None:
         """Fill and stroke path using even-odd rule"""
-        self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
+        # self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
         self.curpath = []
 
     def do_b(self) -> None:
@@ -1033,7 +1044,7 @@ class PDFPageInterpreter:
                         # log.debug("exec: %s %r", name, args)
                         if len(args) == nargs:
                             func(*args)
-                            if not name in ['TJ','Tj','Tm','Tf','l']:
+                            if not name in ['TJ','Tj','Tm','Tf']:
                                 p=" ".join([str(x).replace("\'","") for x in args])
                                 ops+=f'{p} {name} '
                     else: