Byaidu 1 年之前
父节点
当前提交
0cdec98423
共有 3 个文件被更改,包括 27 次插入和 16 次删除
  1. 1 1
      pdf2zh/__init__.py
  2. 7 5
      pdf2zh/converter.py
  3. 19 10
      pdf2zh/pdfinterp.py

+ 1 - 1
pdf2zh/__init__.py

@@ -1,2 +1,2 @@
-__version__ = "1.1.3"
+__version__ = "1.1.4"
 __author__ = "Byaidu"

+ 7 - 5
pdf2zh/converter.py

@@ -501,8 +501,8 @@ class TextConverter(PDFConverter[AnyIO]):
                 else:
                     return "".join(["%02x" % ord(c) for c in cstk])
             for id,new in enumerate(news):
-                x=pstk[id][1];y=pstk[id][0];lt=pstk[id][2];rt=pstk[id][3];ptr=0;size=pstk[id][4];font=pstk[id][5];lb=pstk[id][6];cstk='';fcur=fcur_=None
-                tx=x
+                tx=x=pstk[id][1];y=pstk[id][0];lt=pstk[id][2];rt=pstk[id][3];ptr=0;size=pstk[id][4];font=pstk[id][5];lb=pstk[id][6];cstk='';fcur=fcur_=None
+                log.debug(f"< {y} {x} {lt} {rt} {size} {font.fontname} {lb} > {sstk[id]} | {new}")
                 while True:
                     # print(new,ptr)
                     if ptr==len(new): # 到达段落结尾
@@ -514,7 +514,10 @@ class TextConverter(PDFConverter[AnyIO]):
                     if vy_regex: # 加载公式
                         vid=int(vy_regex.group(1).replace(' ',''))
                         ptr+=len(vy_regex.group(0))
-                        adv=vlen[vid]
+                        if vid<len(vlen):
+                            adv=vlen[vid]
+                        else:
+                            continue # 翻译器可能会自动补个越界的公式标记
                     else: # 加载文字
                         ch=new[ptr]
                         # cid=self.china.decode(ch.encode())
@@ -535,7 +538,7 @@ class TextConverter(PDFConverter[AnyIO]):
                     if vy_regex: # 插入公式
                         fix=0
                         if fcur!=None: # 段落内公式修正
-                            if re.match(r'.*\+(CMEX.*)',var[vid][0].fontname) and var[vid][0].cid in [80,88,112,33]: # 根式、积分与大小求和
+                            if re.match(r'.*\+(CMEX.*)',var[vid][0].fontname) and var[vid][0].cid in [80,88,112,33,82]: # 根式、积分与大小求和
                                 fix=var[vid][0].size*0.85
                             if re.match(r'.*\+(CMSY.*)',var[vid][0].fontname) and var[vid][0].cid in [112]: # 根式
                                 fix=var[vid][0].size*0.85
@@ -563,7 +566,6 @@ class TextConverter(PDFConverter[AnyIO]):
                             cstk+=ch
                     fcur=fcur_
                     x+=adv
-                log.debug(f"< {' '.join([f'{j:.1f}' for j in pstk[id][:5]])} {pstk[id][5].fontname} {pstk[id][6]} > {new}")
             for l in lstk:
                 ops+=f"ET q 1 0 0 1 {l.pts[0][0]} {l.pts[0][1]} cm [] 0 d 0 J {l.linewidth} w 0 0 m {l.pts[1][0]-l.pts[0][0]} {l.pts[1][1]-l.pts[0][1]} l S Q BT "
                 pass

+ 19 - 10
pdf2zh/pdfinterp.py

@@ -265,7 +265,8 @@ class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
             else:
                 raise PSEOF("Unexpected EOF, file truncated?")
             self.fp = BytesIO(strm.get_data())
-            # print('STREAM DATA',strm.get_data())
+            if log.isEnabledFor(logging.DEBUG):
+                log.debug(f'STREAM DATA {strm.get_data()}')
 
     def seek(self, pos: int) -> None:
         self.fillfp()
@@ -683,7 +684,9 @@ class PDFPageInterpreter:
             if settings.STRICT:
                 raise PDFInterpreterError("No colorspace specified!")
             n = 1
-        self.graphicstate.scolor = cast(Color, self.pop(n))
+        args=self.pop(n)
+        self.graphicstate.scolor = cast(Color, args)
+        return args
 
     def do_scn(self) -> None:
         """Set color for nonstroking operations"""
@@ -693,15 +696,17 @@ class PDFPageInterpreter:
             if settings.STRICT:
                 raise PDFInterpreterError("No colorspace specified!")
             n = 1
-        self.graphicstate.ncolor = cast(Color, self.pop(n))
+        args=self.pop(n)
+        self.graphicstate.ncolor = cast(Color, args)
+        return args
 
     def do_SC(self) -> None:
         """Set color for stroking operations"""
-        self.do_SCN()
+        return self.do_SCN()
 
     def do_sc(self) -> None:
         """Set color for nonstroking operations"""
-        self.do_scn()
+        return self.do_scn()
 
     def do_sh(self, name: object) -> None:
         """Paint area defined by shading pattern"""
@@ -975,9 +980,10 @@ class PDFPageInterpreter:
         ops_new=self.device.end_page(page)
         page_objid=page.contents[0].objid
         ops_full=f'{page_objid} 0 obj\n<<>>stream\n{ops_new}{ops_base}\nendstream\nendobj\n' # ops_base 可能有副作用,所以先输出 ops_new
-        # print('OP_BASE',ops_base)
-        # print('OP_NEW',ops_new)
-        # print('OP_FULL',ops_full)
+        if log.isEnabledFor(logging.DEBUG):
+            log.debug(f'OP_BASE {ops_base}')
+            log.debug(f'OP_NEW {ops_new}')
+            log.debug(f'OP_FULL {ops_full}')
         return page_objid,ops_full
 
     def render_contents(
@@ -1031,8 +1037,11 @@ class PDFPageInterpreter:
                                 ops+=f'{p} {name} '
                     else:
                         # log.debug("exec: %s", name)
-                        func()
-                        ops+=f'{name} '
+                        targs=func()
+                        if targs==None:
+                            targs=[]
+                        p=" ".join([str(x).replace("\'","") for x in targs])
+                        ops+=f'{p} {name} '
                 elif settings.STRICT:
                     error_msg = "Unknown operator: %r" % name
                     raise PDFInterpreterError(error_msg)