Byaidu 1 год назад
Родитель
Commit
5c0434a62f
4 изменённых файла: 20 добавлений и 16 удалений
  1. 1 1
      pdf2zh/__init__.py
  2. 3 3
      pdf2zh/converter.py
  3. 10 9
      pdf2zh/pdf2zh.py
  4. 6 3
      pdf2zh/pdfinterp.py

+ 1 - 1
pdf2zh/__init__.py

@@ -1,2 +1,2 @@
-__version__ = "1.5.8"
+__version__ = "1.5.9"
 __author__ = "Byaidu"

+ 3 - 3
pdf2zh/converter.py

@@ -555,7 +555,7 @@ class TextConverter(PDFConverter[AnyIO]):
                         fcur_=None
                         try:
                             if font.widths.get(ord(ch)) and font.to_unichr(ord(ch))==ch:
-                                fcur_=font.fontid # 原字体
+                                fcur_=self.fontid[font] # 原字体
                         except:
                             pass
                         try:
@@ -565,7 +565,7 @@ class TextConverter(PDFConverter[AnyIO]):
                             pass
                         if fcur_==None:
                             fcur_='china-ss' # 默认中文字体
-                        # print(font.fontid,fcur_,ch,font.char_width(ord(ch)))
+                        # print(self.fontid[font],fcur_,ch,font.char_width(ord(ch)))
                         adv=self.fontmap[fcur_].char_width(ord(ch))*size
                         ptr+=1
                     if fcur_!=fcur or vy_regex or x+adv>rt+0.1*size: # 输出文字缓冲区:1.字体更新 2.插入公式 3.到达右边界(可能一整行都被符号化,这里需要考虑浮点误差)
@@ -582,7 +582,7 @@ class TextConverter(PDFConverter[AnyIO]):
                             fix=varf[vid]
                         for vch in var[vid]: # 排版公式字符
                             vc=chr(vch.cid)
-                            ops+=f"/{vch.font.fontid} {vch.size} Tf 1 0 0 1 {x+vch.x0-var[vid][0].x0} {fix+y+vch.y0-var[vid][0].y0} Tm [<{raw_string(vch.font.fontid,vc)}>] TJ "
+                            ops+=f"/{self.fontid[vch.font]} {vch.size} Tf 1 0 0 1 {x+vch.x0-var[vid][0].x0} {fix+y+vch.y0-var[vid][0].y0} Tm [<{raw_string(self.fontid[vch.font],vc)}>] TJ "
                             if log.isEnabledFor(logging.DEBUG):
                                 lstk.append(LTLine(0.1,(_x,_y),(x+vch.x0-var[vid][0].x0,fix+y+vch.y0-var[vid][0].y0)))
                                 _x,_y=x+vch.x0-var[vid][0].x0,fix+y+vch.y0-var[vid][0].y0

+ 10 - 9
pdf2zh/pdf2zh.py

@@ -85,15 +85,16 @@ def extract_text(
                 font_id[font]=page.insert_font(font)
         xreflen = doc_en.xref_length()
         for xref in range(1, xreflen):
-            font_res=doc_en.xref_get_key(xref,'Resources/Font')
-            if font_res[0]=='dict':
-                for font in font_list:
-                    font_exist=doc_en.xref_get_key(xref,f'Resources/Font/{font}')
-                    if font_exist[0]=='null':
-                        try:
-                            doc_en.xref_set_key(xref,f'Resources/Font/{font}',f'{font_id[font]} 0 R')
-                        except:
-                            pass
+            for label in ['Resources/','']: # 可能是基于 xobj 的 res
+                font_res=doc_en.xref_get_key(xref,f'{label}Font')
+                if font_res[0]=='dict':
+                    for font in font_list:
+                        font_exist=doc_en.xref_get_key(xref,f'{label}Font/{font}')
+                        if font_exist[0]=='null':
+                            try:
+                                doc_en.xref_set_key(xref,f'{label}Font/{font}',f'{font_id[font]} 0 R')
+                            except:
+                                pass
         doc_en.save(f'{filename}-en.pdf')
 
         with open(f'{filename}-en.pdf', "rb") as fp:

+ 6 - 3
pdf2zh/pdfinterp.py

@@ -380,6 +380,7 @@ class PDFPageInterpreter:
         """Prepare the fonts and XObjects listed in the Resource attribute."""
         self.resources = resources
         self.fontmap: Dict[object, PDFFont] = {}
+        self.fontid: Dict[PDFFont, object] = {}
         self.xobjmap = {}
         self.csmap: Dict[str, PDFColorSpace] = PREDEFINED_COLORSPACE.copy()
         if not resources:
@@ -406,7 +407,7 @@ class PDFPageInterpreter:
                         objid = spec.objid
                     spec = dict_value(spec)
                     self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
-                    self.fontmap[fontid].fontid=fontid # hack
+                    self.fontid[self.fontmap[fontid]]=fontid
             elif k == "ColorSpace":
                 for csid, spec in dict_value(v).items():
                     colorspace = get_colorspace(resolve1(spec))
@@ -969,7 +970,8 @@ class PDFPageInterpreter:
                 ctm=ctm,
             )
             try: # 有的时候 form 字体加不上这里会烂掉
-                self.device.fontmap=interpreter.fontmap # hack
+                self.device.fontid=interpreter.fontid
+                self.device.fontmap=interpreter.fontmap
                 ops_new=self.device.end_figure(xobjid)
                 ctm_inv=np.linalg.inv(np.array(ctm[:4]).reshape(2,2))
                 pos_inv=-np.mat(ctm[4:])*ctm_inv
@@ -1001,7 +1003,8 @@ class PDFPageInterpreter:
             ctm = (1, 0, 0, 1, -x0, -y0)
         self.device.begin_page(page, ctm)
         ops_base=self.render_contents(page.resources, page.contents, ctm=ctm)
-        self.device.fontmap=self.fontmap # hack
+        self.device.fontid=self.fontid
+        self.device.fontmap=self.fontmap
         ops_new=self.device.end_page(page)
         page_objids=[i.objid for i in page.contents]
         # 上面渲染的时候会根据 cropbox 减掉页面偏移得到真实坐标,这里输出的时候需要用 cm 把页面偏移加回来