瀏覽代碼

feat: dynamic line space

timel 1 年之前
父節點
當前提交
32128ec13d
共有 1 個文件被更改,包括 85 次插入和 18 次删除
  1. +85 -18
      pdf2zh/converter.py

+ 85 - 18
pdf2zh/converter.py

@@ -117,11 +117,13 @@ class PDFConverterEx(PDFConverter):
 
 
 class Paragraph:
-    def __init__(self, y, x, x0, x1, size, brk):
+    def __init__(self, y, x, x0, x1, y0, y1, size, brk):
         self.y: float = y  # 初始纵坐标
         self.x: float = x  # 初始横坐标
         self.x0: float = x0  # 左边界
         self.x1: float = x1  # 右边界
+        self.y0: float = y0  # 上边界
+        self.y1: float = y1  # 下边界
         self.size: float = size  # 字体大小
         self.brk: bool = brk  # 换行标记
 
@@ -186,6 +188,8 @@ class TranslateConverter(PDFConverterEx):
         vmax: float = ltpage.width / 4  # 行内公式最大宽度
         ops: str = ""                   # 渲染结果
 
+
+
         def vflag(font: str, char: str):    # 匹配公式(和角标)字体
             if isinstance(font, bytes):     # 不一定能 decode,直接转 str
                 font = str(font)
@@ -284,7 +288,7 @@ class TranslateConverter(PDFConverterEx):
                             pstk[-1].brk = True
                     else:                           # 根据当前字符构建一个新的段落
                         sstk.append("")
-                        pstk.append(Paragraph(child.y0, child.x0, child.x0, child.x0, child.size, False))
+                        pstk.append(Paragraph(child.y0, child.x0, child.x0, child.x0, child.y0, child.y1, child.size, False))
                 if not cur_v:                                               # 文字入栈
                     if (                                                    # 根据当前字符修正段落属性
                         child.size > pstk[-1].size                          # 1. 当前字符比段落字体大
@@ -304,6 +308,8 @@ class TranslateConverter(PDFConverterEx):
                 # 更新段落边界,因为段落内换行之后可能是公式开头,所以要在外边处理
                 pstk[-1].x0 = min(pstk[-1].x0, child.x0)
                 pstk[-1].x1 = max(pstk[-1].x1, child.x1)
+                pstk[-1].y0 = min(pstk[-1].y0, child.y0)
+                pstk[-1].y1 = max(pstk[-1].y1, child.y1)
                 # 更新上一个字符
                 xt = child
                 xt_cls = cls
@@ -366,20 +372,36 @@ class TranslateConverter(PDFConverterEx):
             else:
                 return "".join(["%02x" % ord(c) for c in cstk])
 
+        # 根据目标语言获取默认行距
+        lang_space = {
+            "zh-cn": 1.4, "zh-tw": 1.4, "zh-hans": 1.4, "zh-hant": 1.4, "zh": 1.4,
+            "ja": 1.1, "ko": 1.2, "en": 1.2, "ar": 1.0, "ru": 0.8, "uk": 0.8, "ta": 0.8
+        }
+        default_line_spacing = lang_space.get(self.translator.lang_out.lower(), 1.1)
+
         _x, _y = 0, 0
+
+        ops_list = []
+        gen_op_txt = lambda font, size, x, y, rtxt: f"/{font} {size:f} Tf 1 0 0 1 {x:f} {y:f} Tm [<{rtxt}>] TJ "
+        gen_op_line = lambda x, y, xlen, ylen, linewidth: f"ET q 1 0 0 1 {x:f} {y:f} cm [] 0 d 0 J {linewidth:f} w 0 0 m {xlen:f} {ylen:f} l S Q BT "
         for id, new in enumerate(news):
-            x: float = pstk[id].x           # 段落初始横坐标
-            y: float = pstk[id].y           # 段落初始纵坐标
-            x0: float = pstk[id].x0         # 段落左边界
-            x1: float = pstk[id].x1         # 段落右边界
-            size: float = pstk[id].size     # 段落字体大小
-            brk: bool = pstk[id].brk        # 段落换行标记
-            cstk: str = ""                  # 当前文字栈
-            fcur: str = None                # 当前字体 ID
+            x: float = pstk[id].x                       # 段落初始横坐标
+            y: float = pstk[id].y                       # 段落初始纵坐标
+            x0: float = pstk[id].x0                     # 段落左边界
+            x1: float = pstk[id].x1                     # 段落右边界
+            height: float = pstk[id].y1 - pstk[id].y0   # 段落高度
+            size: float = pstk[id].size                 # 段落字体大小
+            brk: bool = pstk[id].brk                    # 段落换行标记
+            cstk: str = ""                              # 当前文字栈
+            fcur: str = None                            # 当前字体 ID
+            line = 0                                    # 记录换行次数
             tx = x
             fcur_ = fcur
             ptr = 0
             log.debug(f"< {y} {x} {x0} {x1} {size} {brk} > {sstk[id]} | {new}")
+
+            ops_vals: list[dict] = []
+
             while ptr < len(new):
                 vy_regex = re.match(
                     r"\{\s*v([\d\s]+)\}", new[ptr:], re.IGNORECASE
@@ -415,25 +437,48 @@ class TranslateConverter(PDFConverterEx):
                     or x + adv > x1 + 0.1 * size    # 3. 到达右边界(可能一整行都被符号化,这里需要考虑浮点误差)
                 ):
                     if cstk:
-                        ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
+                        ops_vals.append({
+                            "type": "text",
+                            "font": fcur,
+                            "size": size,
+                            "x": tx,
+                            "dy": 0,
+                            "rtxt": raw_string(fcur, cstk),
+                            "line": line
+                        })
                         cstk = ""
                 if brk and x + adv > x1 + 0.1 * size:  # 到达右边界且原文段落存在换行
                     x = x0
-                    lang_space = {"zh-cn": 1.4, "zh-tw": 1.4, "zh-hans": 1.4, "zh-hant": 1.4, "zh": 1.4, "ja": 1.1, "ko": 1.2, "en": 1.2, "ar": 1.0, "ru": 0.8, "uk": 0.8, "ta": 0.8}
-                    y -= size * lang_space.get(self.translator.lang_out.lower(), 1.1)  # 小语种大多适配 1.1
+                    line += 1
                 if vy_regex:  # 插入公式
                     fix = 0
                     if fcur is not None:  # 段落内公式修正纵向偏移
                         fix = varf[vid]
                     for vch in var[vid]:  # 排版公式字符
                         vc = chr(vch.cid)
-                        ops += f"/{self.fontid[vch.font]} {vch.size:f} Tf 1 0 0 1 {x + vch.x0 - var[vid][0].x0:f} {fix + y + vch.y0 - var[vid][0].y0:f} Tm [<{raw_string(self.fontid[vch.font], vc)}>] TJ "
+                        ops_vals.append({
+                            "type": "text",
+                            "font": self.fontid[vch.font],
+                            "size": vch.size,
+                            "x": x + vch.x0 - var[vid][0].x0,
+                            "dy": fix + vch.y0 - var[vid][0].y0,
+                            "rtxt": raw_string(self.fontid[vch.font], vc),
+                            "line": line
+                        })
                         if log.isEnabledFor(logging.DEBUG):
                             lstk.append(LTLine(0.1, (_x, _y), (x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0)))
                             _x, _y = x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0
                     for l in varl[vid]:  # 排版公式线条
                         if l.linewidth < 5:  # hack 有的文档会用粗线条当图片背景
-                            ops += f"ET q 1 0 0 1 {l.pts[0][0] + x - var[vid][0].x0:f} {l.pts[0][1] + fix + y - var[vid][0].y0:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
+                            ops_vals.append({
+                                "type": "formula",
+                                "x": l.pts[0][0] + x - var[vid][0].x0,
+                                "dy": l.pts[0][1] + fix - var[vid][0].y0,
+                                "linewidth": l.linewidth,
+                                "xlen": l.pts[1][0] - l.pts[0][0],
+                                "ylen": l.pts[1][1] - l.pts[0][1],
+                                "line": line
+                            })
                 else:  # 插入文字缓冲区
                     if not cstk:  # 单行开头
                         tx = x
@@ -451,9 +496,31 @@ class TranslateConverter(PDFConverterEx):
                     _x, _y = x, y
             # 处理结尾
             if cstk:
-                ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
+                ops_vals.append({
+                    "type": "text",
+                    "font": fcur,
+                    "size": size,
+                    "x": tx,
+                    "dy": 0,
+                    "rtxt": raw_string(fcur, cstk),
+                    "line": line
+                })
+
+            line_spacing = default_line_spacing
+
+            while (line + 1) * size * line_spacing > height and line_spacing >= 1:
+                line_spacing -= 0.05
+
+            for vals in ops_vals:
+                match vals["type"]:
+                    case "text":
+                        ops_list.append(gen_op_txt(vals["font"], vals["size"], vals["x"], vals["dy"] + y - vals["line"] * size * line_spacing, vals["rtxt"]))
+                    case "formula":
+                        ops_list.append(gen_op_line(vals["x"], vals["dy"] + y - vals["line"] * size * line_spacing, vals["xlen"], vals["ylen"], vals["linewidth"]))
+
         for l in lstk:  # 排版全局线条
             if l.linewidth < 5:  # hack 有的文档会用粗线条当图片背景
-                ops += f"ET q 1 0 0 1 {l.pts[0][0]:f} {l.pts[0][1]:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
-        ops = f"BT {ops}ET "
+                ops_list.append(gen_op_line(l.pts[0][0], l.pts[0][1], l.pts[1][0] - l.pts[0][0], l.pts[1][1] - l.pts[0][1], l.linewidth))
+
+        ops = f"BT {''.join(ops_list)}ET "
         return ops