Przeglądaj źródła

Merge pull request #431 from timelic/dev/dynamic-line-space

feat: dynamic line space
Byaidu 1 rok temu
rodzic
commit
f30133e040
1 zmieniony plik: 89 dodań i 18 usunięć
  1. 89 18
      pdf2zh/converter.py

+ 89 - 18
pdf2zh/converter.py

@@ -1,4 +1,5 @@
 from typing import Dict, List
+from enum import Enum
 
 from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager
 from pdfminer.pdffont import PDFCIDFont
@@ -117,11 +118,13 @@ class PDFConverterEx(PDFConverter):
 
 
 class Paragraph:
-    def __init__(self, y, x, x0, x1, size, brk):
+    def __init__(self, y, x, x0, x1, y0, y1, size, brk):
         self.y: float = y  # 初始纵坐标
         self.x: float = x  # 初始横坐标
         self.x0: float = x0  # 左边界
         self.x1: float = x1  # 右边界
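+        self.y0: float = y0  # lower edge (minimum y0 over the paragraph's characters)
+        self.y1: float = y1  # upper edge (maximum y1 over the paragraph's characters)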
         self.size: float = size  # 字体大小
         self.brk: bool = brk  # 换行标记
 
@@ -186,6 +189,8 @@ class TranslateConverter(PDFConverterEx):
         vmax: float = ltpage.width / 4  # 行内公式最大宽度
         ops: str = ""                   # 渲染结果
 
+
+
         def vflag(font: str, char: str):    # 匹配公式(和角标)字体
             if isinstance(font, bytes):     # 不一定能 decode,直接转 str
                 try:
@@ -287,7 +292,7 @@ class TranslateConverter(PDFConverterEx):
                             pstk[-1].brk = True
                     else:                           # 根据当前字符构建一个新的段落
                         sstk.append("")
-                        pstk.append(Paragraph(child.y0, child.x0, child.x0, child.x0, child.size, False))
+                        pstk.append(Paragraph(child.y0, child.x0, child.x0, child.x0, child.y0, child.y1, child.size, False))
                 if not cur_v:                                               # 文字入栈
                     if (                                                    # 根据当前字符修正段落属性
                         child.size > pstk[-1].size                          # 1. 当前字符比段落字体大
@@ -307,6 +312,8 @@ class TranslateConverter(PDFConverterEx):
                 # 更新段落边界,因为段落内换行之后可能是公式开头,所以要在外边处理
                 pstk[-1].x0 = min(pstk[-1].x0, child.x0)
                 pstk[-1].x1 = max(pstk[-1].x1, child.x1)
+                pstk[-1].y0 = min(pstk[-1].y0, child.y0)
+                pstk[-1].y1 = max(pstk[-1].y1, child.y1)
                 # 更新上一个字符
                 xt = child
                 xt_cls = cls
@@ -369,20 +376,36 @@ class TranslateConverter(PDFConverterEx):
             else:
                 return "".join(["%02x" % ord(c) for c in cstk])
 
+        # 根据目标语言获取默认行距
+        LANG_LINEHEIGHT_MAP = {
+            "zh-cn": 1.4, "zh-tw": 1.4, "zh-hans": 1.4, "zh-hant": 1.4, "zh": 1.4,
+            "ja": 1.1, "ko": 1.2, "en": 1.2, "ar": 1.0, "ru": 0.8, "uk": 0.8, "ta": 0.8
+        }
+        default_line_height = LANG_LINEHEIGHT_MAP.get(self.translator.lang_out.lower(), 1.1) # 小语种默认1.1
+
         _x, _y = 0, 0
+
+        ops_list = []
+        gen_op_txt = lambda font, size, x, y, rtxt: f"/{font} {size:f} Tf 1 0 0 1 {x:f} {y:f} Tm [<{rtxt}>] TJ "
+        gen_op_line = lambda x, y, xlen, ylen, linewidth: f"ET q 1 0 0 1 {x:f} {y:f} cm [] 0 d 0 J {linewidth:f} w 0 0 m {xlen:f} {ylen:f} l S Q BT "
         for id, new in enumerate(news):
-            x: float = pstk[id].x           # 段落初始横坐标
-            y: float = pstk[id].y           # 段落初始纵坐标
-            x0: float = pstk[id].x0         # 段落左边界
-            x1: float = pstk[id].x1         # 段落右边界
-            size: float = pstk[id].size     # 段落字体大小
-            brk: bool = pstk[id].brk        # 段落换行标记
-            cstk: str = ""                  # 当前文字栈
-            fcur: str = None                # 当前字体 ID
+            x: float = pstk[id].x                       # 段落初始横坐标
+            y: float = pstk[id].y                       # 段落初始纵坐标
+            x0: float = pstk[id].x0                     # 段落左边界
+            x1: float = pstk[id].x1                     # 段落右边界
+            height: float = pstk[id].y1 - pstk[id].y0   # 段落高度
+            size: float = pstk[id].size                 # 段落字体大小
+            brk: bool = pstk[id].brk                    # 段落换行标记
+            cstk: str = ""                              # 当前文字栈
+            fcur: str = None                            # 当前字体 ID
+            lidx = 0                                    # 记录换行次数
             tx = x
             fcur_ = fcur
             ptr = 0
             log.debug(f"< {y} {x} {x0} {x1} {size} {brk} > {sstk[id]} | {new}")
+
+            ops_vals: list[dict] = []
+
             while ptr < len(new):
                 vy_regex = re.match(
                     r"\{\s*v([\d\s]+)\}", new[ptr:], re.IGNORECASE
@@ -418,25 +441,48 @@ class TranslateConverter(PDFConverterEx):
                     or x + adv > x1 + 0.1 * size    # 3. 到达右边界(可能一整行都被符号化,这里需要考虑浮点误差)
                 ):
                     if cstk:
-                        ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
+                        ops_vals.append({
+                            "type": OpType.TEXT,
+                            "font": fcur,
+                            "size": size,
+                            "x": tx,
+                            "dy": 0,
+                            "rtxt": raw_string(fcur, cstk),
+                            "lidx": lidx
+                        })
                         cstk = ""
                 if brk and x + adv > x1 + 0.1 * size:  # 到达右边界且原文段落存在换行
                     x = x0
-                    lang_space = {"zh-cn": 1.4, "zh-tw": 1.4, "zh-hans": 1.4, "zh-hant": 1.4, "zh": 1.4, "ja": 1.1, "ko": 1.2, "en": 1.2, "ar": 1.0, "ru": 0.8, "uk": 0.8, "ta": 0.8}
-                    y -= size * lang_space.get(self.translator.lang_out.lower(), 1.1)  # 小语种大多适配 1.1
+                    lidx += 1
                 if vy_regex:  # 插入公式
                     fix = 0
                     if fcur is not None:  # 段落内公式修正纵向偏移
                         fix = varf[vid]
                     for vch in var[vid]:  # 排版公式字符
                         vc = chr(vch.cid)
-                        ops += f"/{self.fontid[vch.font]} {vch.size:f} Tf 1 0 0 1 {x + vch.x0 - var[vid][0].x0:f} {fix + y + vch.y0 - var[vid][0].y0:f} Tm [<{raw_string(self.fontid[vch.font], vc)}>] TJ "
+                        ops_vals.append({
+                            "type": OpType.TEXT,
+                            "font": self.fontid[vch.font],
+                            "size": vch.size,
+                            "x": x + vch.x0 - var[vid][0].x0,
+                            "dy": fix + vch.y0 - var[vid][0].y0,
+                            "rtxt": raw_string(self.fontid[vch.font], vc),
+                            "lidx": lidx
+                        })
                         if log.isEnabledFor(logging.DEBUG):
                             lstk.append(LTLine(0.1, (_x, _y), (x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0)))
                             _x, _y = x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0
                     for l in varl[vid]:  # 排版公式线条
                         if l.linewidth < 5:  # hack 有的文档会用粗线条当图片背景
-                            ops += f"ET q 1 0 0 1 {l.pts[0][0] + x - var[vid][0].x0:f} {l.pts[0][1] + fix + y - var[vid][0].y0:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
+                            ops_vals.append({
+                                "type": OpType.LINE,
+                                "x": l.pts[0][0] + x - var[vid][0].x0,
+                                "dy": l.pts[0][1] + fix - var[vid][0].y0,
+                                "linewidth": l.linewidth,
+                                "xlen": l.pts[1][0] - l.pts[0][0],
+                                "ylen": l.pts[1][1] - l.pts[0][1],
+                                "lidx": lidx
+                            })
                 else:  # 插入文字缓冲区
                     if not cstk:  # 单行开头
                         tx = x
@@ -454,9 +500,34 @@ class TranslateConverter(PDFConverterEx):
                     _x, _y = x, y
             # 处理结尾
             if cstk:
-                ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
+                ops_vals.append({
+                    "type": OpType.TEXT,
+                    "font": fcur,
+                    "size": size,
+                    "x": tx,
+                    "dy": 0,
+                    "rtxt": raw_string(fcur, cstk),
+                    "lidx": lidx
+                })
+
+            line_height = default_line_height
+
+            while (lidx + 1) * size * line_height > height and line_height >= 1:
+                line_height -= 0.05
+
+            for vals in ops_vals:
+                if vals["type"] == OpType.TEXT:
+                    ops_list.append(gen_op_txt(vals["font"], vals["size"], vals["x"], vals["dy"] + y - vals["lidx"] * size * line_height, vals["rtxt"]))
+                elif vals["type"] == OpType.LINE:
+                    ops_list.append(gen_op_line(vals["x"], vals["dy"] + y - vals["lidx"] * size * line_height, vals["xlen"], vals["ylen"], vals["linewidth"]))
+
         for l in lstk:  # 排版全局线条
             if l.linewidth < 5:  # hack 有的文档会用粗线条当图片背景
-                ops += f"ET q 1 0 0 1 {l.pts[0][0]:f} {l.pts[0][1]:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
-        ops = f"BT {ops}ET "
+                ops_list.append(gen_op_line(l.pts[0][0], l.pts[0][1], l.pts[1][0] - l.pts[0][0], l.pts[1][1] - l.pts[0][1], l.linewidth))
+
+        ops = f"BT {''.join(ops_list)}ET "
         return ops
+
+class OpType(Enum):
+    TEXT = "text"
+    LINE = "line"