Procházet zdrojové kódy

fix form,compile,scale

Byaidu před 1 rokem
rodič
revize
acea685986
6 změněných souborů, 48 přidání a 51 odebrání
  1. pdf2zh/__init__.py (+1 -1)
  2. pdf2zh/converter.py (+3 -2)
  3. pdf2zh/high_level.py (+3 -25)
  4. pdf2zh/pdf2zh.py (+26 -12)
  5. pdf2zh/pdfinterp.py (+14 -10)
  6. pdf2zh/utils.py (+1 -1)

+ 1 - 1
pdf2zh/__init__.py

@@ -1,2 +1,2 @@
-__version__ = "1.4.9"
+__version__ = "1.5.0"
 __author__ = "Byaidu"

+ 3 - 2
pdf2zh/converter.py

@@ -120,12 +120,14 @@ class PDFLayoutAnalyzer(PDFTextDevice):
     def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
         self._stack.append(self.cur_item)
         self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))
+        self.cur_item.pageid = self._stack[-1].pageid
 
     def end_figure(self, _: str) -> None:
         fig = self.cur_item
         assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
         self.cur_item = self._stack.pop()
         self.cur_item.add(fig)
+        return self.receive_layout(fig)
 
     def render_image(self, name: str, stream: PDFStream) -> None:
         assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
@@ -429,7 +431,7 @@ class TextConverter(PDFConverter[AnyIO]):
                     fontname=child.fontname.split('+')[-1]
                     if vflag(fontname,child.get_text()): # 识别公式和字符
                         cur_v=True
-                    if child.matrix[:4]==(0,1,-1,0): # 竖直段落
+                    if child.matrix[0]==0 and child.matrix[3]==0: # 竖直段落
                         cur_v=True
                         ind_v=True
                         # print(child.get_text(),child.matrix[:4])
@@ -519,7 +521,6 @@ class TextConverter(PDFConverter[AnyIO]):
                     xt=child
                     xt_ind=ind_v
                 elif isinstance(child, LTFigure): # 图表
-                    # print(f'\n\n[FIGURE] {child.name}')
                     pass
                 elif isinstance(child, LTLine): # 线条
                     if vstk and abs(child.x0-xt.x0)<vmax and child.x1-child.x0<vmax and child.y0==child.y1 or xt_ind: # 公式线条

+ 3 - 25
pdf2zh/high_level.py

@@ -147,8 +147,8 @@ def extract_text_to_fp(
         raise PDFValueError(msg)
 
     assert device is not None
-    interpreter = PDFPageInterpreter(rsrcmgr, device)
     obj_patch={}
+    interpreter = PDFPageInterpreter(rsrcmgr, device, obj_patch)
     if pages:
         total_pages=len(pages)
     else:
@@ -166,32 +166,10 @@ def extract_text_to_fp(
         layout[page.pageno]=page_layout
         # print(page.number,page_layout)
         page.rotate = (page.rotate + rotation) % 360
-        page_objids,ops_full=interpreter.process_page(page)
-        obj_patch[page_objids[0]]=ops_full
-        for objid in range(1,len(page_objids)):
-            obj_patch[page_objids[objid]]=f'{page_objids[objid]} 0 obj\n<<>>\nendobj\n'
-
-    # 用最后一页的page来解析doc
-    objs=[]
-    trailer=page.doc.xrefs[0].get_trailer()
-    for objid in range(1,trailer['Size']):
-        if objid in page.doc.xrefs[0].offsets:
-            (_, start, _) = page.doc.xrefs[0].get_pos(objid)
-            fp=page.doc._parser.fp
-            end, _=page.doc._getobj_parse(start,objid)
-            fp.seek(start)
-            raw=fp.read(end-start)
-            objs.append(raw)
-        else:
-            # print(f'OBJ {objid} missing')
-            objs.append(b'')
-    # 更新页面内容
-    for id,ops in obj_patch.items():
-        objs[id-1]=ops.encode()
-    # 编译文档
-    pdf_compile('output-zh.pdf',objs,trailer)
+        interpreter.process_page(page)
 
     device.close()
+    return obj_patch
 
 
 def extract_text(

+ 26 - 12
pdf2zh/pdf2zh.py

@@ -76,17 +76,32 @@ def extract_text(
 
         doc_en = pymupdf.open(file)
         page_count=doc_en.page_count
+        font_list=['china-ss','tiro']
+        font_id={}
         for page in doc_en:
-            page.insert_font('china-ss')
-            page.insert_font('tiro')
-        doc_en.save('output-en.pdf')
-
-        with open('output-en.pdf', "rb") as fp:
-            pdf2zh.high_level.extract_text_to_fp(fp, **locals())
-
-        doc_en.close()
-        doc_zh = pymupdf.open('output-zh.pdf')
-        doc_dual = pymupdf.open('output-en.pdf')
+            for font in font_list:
+                font_id[font]=page.insert_font(font)
+        xreflen = doc_en.xref_length()
+        for xref in range(1, xreflen):
+            font_res=doc_en.xref_get_key(xref,'Resources/Font')
+            if font_res[0]=='dict':
+                for font in font_list:
+                    font_exist=doc_en.xref_get_key(xref,f'Resources/Font/{font}')
+                    if font_exist[0]=='null':
+                        try:
+                            doc_en.xref_set_key(xref,f'Resources/Font/{font}',f'{font_id[font]} 0 R')
+                        except:
+                            pass
+        doc_en.save(f'{filename}-en.pdf')
+
+        with open(f'{filename}-en.pdf', "rb") as fp:
+            obj_patch:dict=pdf2zh.high_level.extract_text_to_fp(fp, **locals())
+
+        for obj_id,ops_full in obj_patch.items():
+            doc_en.update_stream(obj_id,ops_full.encode())
+
+        doc_zh = doc_en
+        doc_dual = pymupdf.open(f'{filename}-en.pdf')
         doc_dual.insert_file(doc_zh)
         for id in range(page_count):
             doc_dual.move_page(page_count+id,id*2+1)
@@ -95,8 +110,7 @@ def extract_text(
         doc_zh.close()
         doc_dual.close()
 
-        os.remove('output-en.pdf')
-        os.remove('output-zh.pdf')
+        os.remove(f'{filename}-en.pdf')
 
     return
 

+ 14 - 10
pdf2zh/pdfinterp.py

@@ -367,12 +367,13 @@ class PDFPageInterpreter:
     Reference: PDF Reference, Appendix A, Operator Summary
     """
 
-    def __init__(self, rsrcmgr: PDFResourceManager, device: PDFDevice) -> None:
+    def __init__(self, rsrcmgr: PDFResourceManager, device: PDFDevice, obj_patch) -> None:
         self.rsrcmgr = rsrcmgr
         self.device = device
+        self.obj_patch = obj_patch
 
     def dup(self) -> "PDFPageInterpreter":
-        return self.__class__(self.rsrcmgr, self.device)
+        return self.__class__(self.rsrcmgr, self.device, self.obj_patch)
 
     def init_resources(self, resources: Dict[object, object]) -> None:
         """Prepare the fonts and XObjects listed in the Resource attribute."""
@@ -960,12 +961,18 @@ class PDFPageInterpreter:
             else:
                 resources = self.resources.copy()
             self.device.begin_figure(xobjid, bbox, matrix)
-            interpreter.render_contents(
+            ops_base=interpreter.render_contents(
                 resources,
                 [xobj],
                 ctm=mult_matrix(matrix, self.ctm),
             )
-            self.device.end_figure(xobjid)
+            self.device.fontmap=interpreter.fontmap # hack
+            try: # 有的时候 form 字体加不上这里会烂掉
+                ops_new=self.device.end_figure(xobjid)
+                xobjid=self.xobjmap[xobjid].objid
+                self.obj_patch[xobjid]=f'q {ops_base}Q 1 0 0 1 {-self.ctm[4]} {-self.ctm[5]} cm {ops_new}'
+            except:
+                pass
         elif subtype is LITERAL_IMAGE and "Width" in xobj and "Height" in xobj:
             self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
             self.device.render_image(xobjid, xobj)
@@ -993,12 +1000,9 @@ class PDFPageInterpreter:
         ops_new=self.device.end_page(page)
         page_objids=[i.objid for i in page.contents]
         # 上面渲染的时候会根据 cropbox 减掉页面偏移得到真实坐标,这里输出的时候需要用 cm 把页面偏移加回来
-        ops_full=f'{page_objids[0]} 0 obj\n<<>>stream\nq {ops_base}Q 1 0 0 1 {x0} {y0} cm {ops_new}\nendstream\nendobj\n' # ops_base 里可能有图,需要让 ops_new 里的文字覆盖在上面,使用 q/Q 重置位置矩阵
-        # if log.isEnabledFor(logging.DEBUG):
-        #     log.debug(f'OP_BASE {ops_base}')
-        #     log.debug(f'OP_NEW {ops_new}')
-        #     log.debug(f'OP_FULL {ops_full}')
-        return page_objids,ops_full
+        self.obj_patch[page_objids[0]]=f'q {ops_base}Q 1 0 0 1 {x0} {y0} cm {ops_new}' # ops_base 里可能有图,需要让 ops_new 里的文字覆盖在上面,使用 q/Q 重置位置矩阵
+        for objid in range(1,len(page_objids)):
+            self.obj_patch[page_objids[objid]]=''
 
     def render_contents(
         self,

+ 1 - 1
pdf2zh/utils.py

@@ -286,7 +286,7 @@ def apply_matrix_norm(m: Matrix, v: Point) -> Point:
 
 def matrix_scale(m: Matrix) -> float:
     (a, b, c, d, e, f) = m
-    return (a**2+c*2)**0.5
+    return (a**2+c**2)**0.5
 
 #  Utility functions