пре 1 година · 4caf3df43f
--- a/pdf2zh/converter.py
+++ b/pdf2zh/converter.py
@@ -1,5 +1,5 @@
 
				 from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager
			
 
				-from pdfminer.pdffont import PDFFont, PDFCIDFont
			
 
				+from pdfminer.pdffont import PDFCIDFont
			
 
				 from pdfminer.converter import PDFConverter
			
 
				 from pdfminer.pdffont import PDFUnicodeNotDefined
			
 
				 from pdfminer.utils import apply_matrix_pt, mult_matrix
			
@@ -105,13 +105,12 @@ class PDFConverterEx(PDFConverter):
 
				 
			
 
				 
			
 
				 class Paragraph:
			
 
				-    def __init__(self, y, x, x0, x1, size, font, brk):
			
 
				+    def __init__(self, y, x, x0, x1, size, brk):
			
 
				         self.y: float = y  # 初始纵坐标
			
 
				         self.x: float = x  # 初始横坐标
			
 
				         self.x0: float = x0  # 左边界
			
 
				         self.x1: float = x1  # 右边界
			
 
				         self.size: float = size  # 字体大小
			
 
				-        self.font: PDFFont = font  # 字体
			
 
				         self.brk: bool = brk  # 换行标记
			
 
				 
			
 
				 
			
@@ -258,21 +257,14 @@ class TranslateConverter(PDFConverterEx):
 
				                             pstk[-1].brk = True
			
 
				                     else:                           # 根据当前字符构建一个新的段落
			
 
				                         sstk.append("")
			
 
				-                        pstk.append(Paragraph(child.y0, child.x0, child.x0, child.x0, child.size, child.font, False))
			
 
				+                        pstk.append(Paragraph(child.y0, child.x0, child.x0, child.x0, child.size, False))
			
 
				                 if not cur_v:                                               # 文字入栈
			
 
				                     if (                                                    # 根据当前字符修正段落属性
			
 
				                         child.size > pstk[-1].size / 0.79                   # 1. 当前字符显著比段落字体大
			
 
				                         or len(sstk[-1].strip()) == 1                       # 2. 当前字符为段落第二个文字（考虑首字母放大的情况）
			
 
				-                        or vflag(pstk[-1].font.fontname, "")                # 3. 段落字体为公式字体
			
 
				-                        or re.match(                                        # 4. 段落字体为粗体
			
 
				-                            r"(.*Medi|.*Bold)",
			
 
				-                            pstk[-1].font.fontname,
			
 
				-                            re.IGNORECASE,
			
 
				-                        )
			
 
				                     ):
			
 
				                         pstk[-1].y -= child.size - pstk[-1].size             # hack 这个段落纵向位置的修正有问题，不过先凑合用吧
			
 
				                         pstk[-1].size = child.size
			
 
				-                        pstk[-1].font = child.font
			
 
				                     sstk[-1] += child.get_text()
			
 
				                 else:                                                       # 公式入栈
			
 
				                     if (                                                    # 根据公式左侧的文字修正公式的纵向偏移
			
@@ -358,18 +350,17 @@ class TranslateConverter(PDFConverterEx):
 
				         _x, _y = 0, 0
			
 
				         for id, new in enumerate(news):
			
 
				             x: float = pstk[id].x           # 段落初始横坐标
			
 
				-            y: float = pstk[id].y           # 段落上边界
			
 
				+            y: float = pstk[id].y           # 段落初始纵坐标
			
 
				             x0: float = pstk[id].x0         # 段落左边界
			
 
				             x1: float = pstk[id].x1         # 段落右边界
			
 
				             size: float = pstk[id].size     # 段落字体大小
			
 
				-            font: PDFFont = pstk[id].font   # 段落字体
			
 
				-            brk: bool = pstk[id].brk        # 段落属性
			
 
				+            brk: bool = pstk[id].brk        # 段落换行标记
			
 
				             cstk: str = ""                  # 当前文字栈
			
 
				-            fcur: str = None                # 当前字体ID
			
 
				+            fcur: str = None                # 当前字体 ID
			
 
				             tx = x
			
 
				             fcur_ = fcur
			
 
				             ptr = 0
			
 
				-            log.debug(f"< {y} {x} {x0} {x1} {size} {font.fontname} {brk} > {sstk[id]} | {new}")
			
 
				+            log.debug(f"< {y} {x} {x0} {x1} {size} {brk} > {sstk[id]} | {new}")
			
 
				             while ptr < len(new):
			
 
				                 vy_regex = re.match(
			
 
				                     r"\$?\s*v([\d\s]+)\$", new[ptr:], re.IGNORECASE
			
@@ -387,12 +378,6 @@ class TranslateConverter(PDFConverterEx):
 
				                 else:  # 加载文字
			
 
				                     ch = new[ptr]
			
 
				                     fcur_ = None
			
 
				-                    # 原字体编码容易出问题，这里直接放弃掉
			
 
				-                    # try:
			
 
				-                    #     if font.widths.get(ord(ch)) and font.to_unichr(ord(ch))==ch:
			
 
				-                    #         fcur_=self.fontid[font] # 原字体
			
 
				-                    # except:
			
 
				-                    #     pass
			
 
				                     try:
			
 
				                         if fcur_ is None and self.fontmap["tiro"].to_unichr(ord(ch)) == ch:
			
 
				                             fcur_ = "tiro"  # 默认拉丁字体
			
@@ -400,7 +385,6 @@ class TranslateConverter(PDFConverterEx):
 
				                         pass
			
 
				                     if fcur_ is None:
			
 
				                         fcur_ = self.resfont  # 默认非拉丁字体
			
 
				-                    # print(self.fontid[font],fcur_,ch,font.char_width(ord(ch)))
			
 
				                     if fcur_ == 'noto':
			
 
				                         adv = self.noto.char_lengths(ch, size)[0]
			
 
				                     else: