1 rok temu · f30133e040
--- a/pdf2zh/converter.py
+++ b/pdf2zh/converter.py
@@ -1,4 +1,5 @@
 
				 from typing import Dict, List
			
 
				+from enum import Enum
			
 
				 
			
 
				 from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager
			
 
				 from pdfminer.pdffont import PDFCIDFont
			
@@ -117,11 +118,13 @@ class PDFConverterEx(PDFConverter):
 
				 
			
 
				 
			
 
				 class Paragraph:
			
 
				-    def __init__(self, y, x, x0, x1, size, brk):
			
 
				+    def __init__(self, y, x, x0, x1, y0, y1, size, brk):
			
 
				         self.y: float = y  # 初始纵坐标
			
 
				         self.x: float = x  # 初始横坐标
			
 
				         self.x0: float = x0  # 左边界
			
 
				         self.x1: float = x1  # 右边界
			
 
				+        self.y0: float = y0  # 上边界
			
 
				+        self.y1: float = y1  # 下边界
			
 
				         self.size: float = size  # 字体大小
			
 
				         self.brk: bool = brk  # 换行标记
			
 
				 
			
@@ -186,6 +189,8 @@ class TranslateConverter(PDFConverterEx):
 
				         vmax: float = ltpage.width / 4  # 行内公式最大宽度
			
 
				         ops: str = ""                   # 渲染结果
			
 
				 
			
 
				+
			
 
				+
			
 
				         def vflag(font: str, char: str):    # 匹配公式（和角标）字体
			
 
				             if isinstance(font, bytes):     # 不一定能 decode，直接转 str
			
 
				                 try:
			
@@ -287,7 +292,7 @@ class TranslateConverter(PDFConverterEx):
 
				                             pstk[-1].brk = True
			
 
				                     else:                           # 根据当前字符构建一个新的段落
			
 
				                         sstk.append("")
			
 
				-                        pstk.append(Paragraph(child.y0, child.x0, child.x0, child.x0, child.size, False))
			
 
				+                        pstk.append(Paragraph(child.y0, child.x0, child.x0, child.x0, child.y0, child.y1, child.size, False))
			
 
				                 if not cur_v:                                               # 文字入栈
			
 
				                     if (                                                    # 根据当前字符修正段落属性
			
 
				                         child.size > pstk[-1].size                          # 1. 当前字符比段落字体大
			
@@ -307,6 +312,8 @@ class TranslateConverter(PDFConverterEx):
 
				                 # 更新段落边界，因为段落内换行之后可能是公式开头，所以要在外边处理
			
 
				                 pstk[-1].x0 = min(pstk[-1].x0, child.x0)
			
 
				                 pstk[-1].x1 = max(pstk[-1].x1, child.x1)
			
 
				+                pstk[-1].y0 = min(pstk[-1].y0, child.y0)
			
 
				+                pstk[-1].y1 = max(pstk[-1].y1, child.y1)
			
 
				                 # 更新上一个字符
			
 
				                 xt = child
			
 
				                 xt_cls = cls
			
@@ -369,20 +376,36 @@ class TranslateConverter(PDFConverterEx):
 
				             else:
			
 
				                 return "".join(["%02x" % ord(c) for c in cstk])
			
 
				 
			
 
				+        # 根据目标语言获取默认行距
			
 
				+        LANG_LINEHEIGHT_MAP = {
			
 
				+            "zh-cn": 1.4, "zh-tw": 1.4, "zh-hans": 1.4, "zh-hant": 1.4, "zh": 1.4,
			
 
				+            "ja": 1.1, "ko": 1.2, "en": 1.2, "ar": 1.0, "ru": 0.8, "uk": 0.8, "ta": 0.8
			
 
				+        }
			
 
				+        default_line_height = LANG_LINEHEIGHT_MAP.get(self.translator.lang_out.lower(), 1.1) # 小语种默认1.1
			
 
				+
			
 
				         _x, _y = 0, 0
			
 
				+
			
 
				+        ops_list = []
			
 
				+        gen_op_txt = lambda font, size, x, y, rtxt: f"/{font} {size:f} Tf 1 0 0 1 {x:f} {y:f} Tm [<{rtxt}>] TJ "
			
 
				+        gen_op_line = lambda x, y, xlen, ylen, linewidth: f"ET q 1 0 0 1 {x:f} {y:f} cm [] 0 d 0 J {linewidth:f} w 0 0 m {xlen:f} {ylen:f} l S Q BT "
			
 
				         for id, new in enumerate(news):
			
 
				-            x: float = pstk[id].x           # 段落初始横坐标
			
 
				-            y: float = pstk[id].y           # 段落初始纵坐标
			
 
				-            x0: float = pstk[id].x0         # 段落左边界
			
 
				-            x1: float = pstk[id].x1         # 段落右边界
			
 
				-            size: float = pstk[id].size     # 段落字体大小
			
 
				-            brk: bool = pstk[id].brk        # 段落换行标记
			
 
				-            cstk: str = ""                  # 当前文字栈
			
 
				-            fcur: str = None                # 当前字体 ID
			
 
				+            x: float = pstk[id].x                       # 段落初始横坐标
			
 
				+            y: float = pstk[id].y                       # 段落初始纵坐标
			
 
				+            x0: float = pstk[id].x0                     # 段落左边界
			
 
				+            x1: float = pstk[id].x1                     # 段落右边界
			
 
				+            height: float = pstk[id].y1 - pstk[id].y0   # 段落高度
			
 
				+            size: float = pstk[id].size                 # 段落字体大小
			
 
				+            brk: bool = pstk[id].brk                    # 段落换行标记
			
 
				+            cstk: str = ""                              # 当前文字栈
			
 
				+            fcur: str = None                            # 当前字体 ID
			
 
				+            lidx = 0                                    # 记录换行次数
			
 
				             tx = x
			
 
				             fcur_ = fcur
			
 
				             ptr = 0
			
 
				             log.debug(f"< {y} {x} {x0} {x1} {size} {brk} > {sstk[id]} | {new}")
			
 
				+
			
 
				+            ops_vals: list[dict] = []
			
 
				+
			
 
				             while ptr < len(new):
			
 
				                 vy_regex = re.match(
			
 
				                     r"\{\s*v([\d\s]+)\}", new[ptr:], re.IGNORECASE
			
@@ -418,25 +441,48 @@ class TranslateConverter(PDFConverterEx):
 
				                     or x + adv > x1 + 0.1 * size    # 3. 到达右边界（可能一整行都被符号化，这里需要考虑浮点误差）
			
 
				                 ):
			
 
				                     if cstk:
			
 
				-                        ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
			
 
				+                        ops_vals.append({
			
 
				+                            "type": OpType.TEXT,
			
 
				+                            "font": fcur,
			
 
				+                            "size": size,
			
 
				+                            "x": tx,
			
 
				+                            "dy": 0,
			
 
				+                            "rtxt": raw_string(fcur, cstk),
			
 
				+                            "lidx": lidx
			
 
				+                        })
			
 
				                         cstk = ""
			
 
				                 if brk and x + adv > x1 + 0.1 * size:  # 到达右边界且原文段落存在换行
			
 
				                     x = x0
			
 
				-                    lang_space = {"zh-cn": 1.4, "zh-tw": 1.4, "zh-hans": 1.4, "zh-hant": 1.4, "zh": 1.4, "ja": 1.1, "ko": 1.2, "en": 1.2, "ar": 1.0, "ru": 0.8, "uk": 0.8, "ta": 0.8}
			
 
				-                    y -= size * lang_space.get(self.translator.lang_out.lower(), 1.1)  # 小语种大多适配 1.1
			
 
				+                    lidx += 1
			
 
				                 if vy_regex:  # 插入公式
			
 
				                     fix = 0
			
 
				                     if fcur is not None:  # 段落内公式修正纵向偏移
			
 
				                         fix = varf[vid]
			
 
				                     for vch in var[vid]:  # 排版公式字符
			
 
				                         vc = chr(vch.cid)
			
 
				-                        ops += f"/{self.fontid[vch.font]} {vch.size:f} Tf 1 0 0 1 {x + vch.x0 - var[vid][0].x0:f} {fix + y + vch.y0 - var[vid][0].y0:f} Tm [<{raw_string(self.fontid[vch.font], vc)}>] TJ "
			
 
				+                        ops_vals.append({
			
 
				+                            "type": OpType.TEXT,
			
 
				+                            "font": self.fontid[vch.font],
			
 
				+                            "size": vch.size,
			
 
				+                            "x": x + vch.x0 - var[vid][0].x0,
			
 
				+                            "dy": fix + vch.y0 - var[vid][0].y0,
			
 
				+                            "rtxt": raw_string(self.fontid[vch.font], vc),
			
 
				+                            "lidx": lidx
			
 
				+                        })
			
 
				                         if log.isEnabledFor(logging.DEBUG):
			
 
				                             lstk.append(LTLine(0.1, (_x, _y), (x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0)))
			
 
				                             _x, _y = x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0
			
 
				                     for l in varl[vid]:  # 排版公式线条
			
 
				                         if l.linewidth < 5:  # hack 有的文档会用粗线条当图片背景
			
 
				-                            ops += f"ET q 1 0 0 1 {l.pts[0][0] + x - var[vid][0].x0:f} {l.pts[0][1] + fix + y - var[vid][0].y0:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
			
 
				+                            ops_vals.append({
			
 
				+                                "type": OpType.LINE,
			
 
				+                                "x": l.pts[0][0] + x - var[vid][0].x0,
			
 
				+                                "dy": l.pts[0][1] + fix - var[vid][0].y0,
			
 
				+                                "linewidth": l.linewidth,
			
 
				+                                "xlen": l.pts[1][0] - l.pts[0][0],
			
 
				+                                "ylen": l.pts[1][1] - l.pts[0][1],
			
 
				+                                "lidx": lidx
			
 
				+                            })
			
 
				                 else:  # 插入文字缓冲区
			
 
				                     if not cstk:  # 单行开头
			
 
				                         tx = x
			
@@ -454,9 +500,34 @@ class TranslateConverter(PDFConverterEx):
 
				                     _x, _y = x, y
			
 
				             # 处理结尾
			
 
				             if cstk:
			
 
				-                ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
			
 
				+                ops_vals.append({
			
 
				+                    "type": OpType.TEXT,
			
 
				+                    "font": fcur,
			
 
				+                    "size": size,
			
 
				+                    "x": tx,
			
 
				+                    "dy": 0,
			
 
				+                    "rtxt": raw_string(fcur, cstk),
			
 
				+                    "lidx": lidx
			
 
				+                })
			
 
				+
			
 
				+            line_height = default_line_height
			
 
				+
			
 
				+            while (lidx + 1) * size * line_height > height and line_height >= 1:
			
 
				+                line_height -= 0.05
			
 
				+
			
 
				+            for vals in ops_vals:
			
 
				+                if vals["type"] == OpType.TEXT:
			
 
				+                    ops_list.append(gen_op_txt(vals["font"], vals["size"], vals["x"], vals["dy"] + y - vals["lidx"] * size * line_height, vals["rtxt"]))
			
 
				+                elif vals["type"] == OpType.LINE:
			
 
				+                    ops_list.append(gen_op_line(vals["x"], vals["dy"] + y - vals["lidx"] * size * line_height, vals["xlen"], vals["ylen"], vals["linewidth"]))
			
 
				+
			
 
				         for l in lstk:  # 排版全局线条
			
 
				             if l.linewidth < 5:  # hack 有的文档会用粗线条当图片背景
			
 
				-                ops += f"ET q 1 0 0 1 {l.pts[0][0]:f} {l.pts[0][1]:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
			
 
				-        ops = f"BT {ops}ET "
			
 
				+                ops_list.append(gen_op_line(l.pts[0][0], l.pts[0][1], l.pts[1][0] - l.pts[0][0], l.pts[1][1] - l.pts[0][1], l.linewidth))
			
 
				+
			
 
				+        ops = f"BT {''.join(ops_list)}ET "
			
 
				         return ops
			
 
				+
			
 
				+class OpType(Enum):
			
 
				+    TEXT = "text"
			
 
				+    LINE = "line"