há 1 ano atrás · f30133e040
--- a/pdf2zh/converter.py
+++ b/pdf2zh/converter.py
@@ -1,4 +1,5 @@
 
															 from typing import Dict, List
														
 
															+from enum import Enum
														
 
															 from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager
														
 
															 from pdfminer.pdffont import PDFCIDFont
														
@@ -117,11 +118,13 @@ class PDFConverterEx(PDFConverter):
 
															 class Paragraph:
														
 
															-    def __init__(self, y, x, x0, x1, size, brk):
														
 
															+    def __init__(self, y, x, x0, x1, y0, y1, size, brk):
														
 
															         self.y: float = y  # 初始纵坐标
														
 
															         self.x: float = x  # 初始横坐标
														
 
															         self.x0: float = x0  # 左边界
														
 
															         self.x1: float = x1  # 右边界
														
 
															+        self.y0: float = y0  # 上边界
														
 
															+        self.y1: float = y1  # 下边界
														
 
															         self.size: float = size  # 字体大小
														
 
															         self.brk: bool = brk  # 换行标记
														
@@ -186,6 +189,8 @@ class TranslateConverter(PDFConverterEx):
 
															         vmax: float = ltpage.width / 4  # 行内公式最大宽度
														
 
															         ops: str = ""                   # 渲染结果
														
 
															+
														
 
															+
														
 
															         def vflag(font: str, char: str):    # 匹配公式（和角标）字体
														
 
															             if isinstance(font, bytes):     # 不一定能 decode，直接转 str
														
 
															                 try:
														
@@ -287,7 +292,7 @@ class TranslateConverter(PDFConverterEx):
 
															                             pstk[-1].brk = True
														
 
															                     else:                           # 根据当前字符构建一个新的段落
														
 
															                         sstk.append("")
														
 
															-                        pstk.append(Paragraph(child.y0, child.x0, child.x0, child.x0, child.size, False))
														
 
															+                        pstk.append(Paragraph(child.y0, child.x0, child.x0, child.x0, child.y0, child.y1, child.size, False))
														
 
															                 if not cur_v:                                               # 文字入栈
														
 
															                     if (                                                    # 根据当前字符修正段落属性
														
 
															                         child.size > pstk[-1].size                          # 1. 当前字符比段落字体大
														
@@ -307,6 +312,8 @@ class TranslateConverter(PDFConverterEx):
 
															                 # 更新段落边界，因为段落内换行之后可能是公式开头，所以要在外边处理
														
 
															                 pstk[-1].x0 = min(pstk[-1].x0, child.x0)
														
 
															                 pstk[-1].x1 = max(pstk[-1].x1, child.x1)
														
 
															+                pstk[-1].y0 = min(pstk[-1].y0, child.y0)
														
 
															+                pstk[-1].y1 = max(pstk[-1].y1, child.y1)
														
 
															                 # 更新上一个字符
														
 
															                 xt = child
														
 
															                 xt_cls = cls
														
@@ -369,20 +376,36 @@ class TranslateConverter(PDFConverterEx):
 
															             else:
														
 
															                 return "".join(["%02x" % ord(c) for c in cstk])
														
 
															+        # 根据目标语言获取默认行距
														
 
															+        LANG_LINEHEIGHT_MAP = {
														
 
															+            "zh-cn": 1.4, "zh-tw": 1.4, "zh-hans": 1.4, "zh-hant": 1.4, "zh": 1.4,
														
 
															+            "ja": 1.1, "ko": 1.2, "en": 1.2, "ar": 1.0, "ru": 0.8, "uk": 0.8, "ta": 0.8
														
 
															+        }
														
 
															+        default_line_height = LANG_LINEHEIGHT_MAP.get(self.translator.lang_out.lower(), 1.1) # 小语种默认1.1
														
 
															+
														
 
															         _x, _y = 0, 0
														
 
															+
														
 
															+        ops_list = []
														
 
															+        gen_op_txt = lambda font, size, x, y, rtxt: f"/{font} {size:f} Tf 1 0 0 1 {x:f} {y:f} Tm [<{rtxt}>] TJ "
														
 
															+        gen_op_line = lambda x, y, xlen, ylen, linewidth: f"ET q 1 0 0 1 {x:f} {y:f} cm [] 0 d 0 J {linewidth:f} w 0 0 m {xlen:f} {ylen:f} l S Q BT "
														
 
															         for id, new in enumerate(news):
														
 
															-            x: float = pstk[id].x           # 段落初始横坐标
														
 
															-            y: float = pstk[id].y           # 段落初始纵坐标
														
 
															-            x0: float = pstk[id].x0         # 段落左边界
														
 
															-            x1: float = pstk[id].x1         # 段落右边界
														
 
															-            size: float = pstk[id].size     # 段落字体大小
														
 
															-            brk: bool = pstk[id].brk        # 段落换行标记
														
 
															-            cstk: str = ""                  # 当前文字栈
														
 
															-            fcur: str = None                # 当前字体 ID
														
 
															+            x: float = pstk[id].x                       # 段落初始横坐标
														
 
															+            y: float = pstk[id].y                       # 段落初始纵坐标
														
 
															+            x0: float = pstk[id].x0                     # 段落左边界
														
 
															+            x1: float = pstk[id].x1                     # 段落右边界
														
 
															+            height: float = pstk[id].y1 - pstk[id].y0   # 段落高度
														
 
															+            size: float = pstk[id].size                 # 段落字体大小
														
 
															+            brk: bool = pstk[id].brk                    # 段落换行标记
														
 
															+            cstk: str = ""                              # 当前文字栈
														
 
															+            fcur: str = None                            # 当前字体 ID
														
 
															+            lidx = 0                                    # 记录换行次数
														
 
															             tx = x
														
 
															             fcur_ = fcur
														
 
															             ptr = 0
														
 
															             log.debug(f"< {y} {x} {x0} {x1} {size} {brk} > {sstk[id]} | {new}")
														
 
															+
														
 
															+            ops_vals: list[dict] = []
														
 
															+
														
 
															             while ptr < len(new):
														
 
															                 vy_regex = re.match(
														
 
															                     r"\{\s*v([\d\s]+)\}", new[ptr:], re.IGNORECASE
														
@@ -418,25 +441,48 @@ class TranslateConverter(PDFConverterEx):
 
															                     or x + adv > x1 + 0.1 * size    # 3. 到达右边界（可能一整行都被符号化，这里需要考虑浮点误差）
														
 
															                 ):
														
 
															                     if cstk:
														
 
															-                        ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
														
 
															+                        ops_vals.append({
														
 
															+                            "type": OpType.TEXT,
														
 
															+                            "font": fcur,
														
 
															+                            "size": size,
														
 
															+                            "x": tx,
														
 
															+                            "dy": 0,
														
 
															+                            "rtxt": raw_string(fcur, cstk),
														
 
															+                            "lidx": lidx
														
 
															+                        })
														
 
															                         cstk = ""
														
 
															                 if brk and x + adv > x1 + 0.1 * size:  # 到达右边界且原文段落存在换行
														
 
															                     x = x0
														
 
															-                    lang_space = {"zh-cn": 1.4, "zh-tw": 1.4, "zh-hans": 1.4, "zh-hant": 1.4, "zh": 1.4, "ja": 1.1, "ko": 1.2, "en": 1.2, "ar": 1.0, "ru": 0.8, "uk": 0.8, "ta": 0.8}
														
 
															-                    y -= size * lang_space.get(self.translator.lang_out.lower(), 1.1)  # 小语种大多适配 1.1
														
 
															+                    lidx += 1
														
 
															                 if vy_regex:  # 插入公式
														
 
															                     fix = 0
														
 
															                     if fcur is not None:  # 段落内公式修正纵向偏移
														
 
															                         fix = varf[vid]
														
 
															                     for vch in var[vid]:  # 排版公式字符
														
 
															                         vc = chr(vch.cid)
														
 
															-                        ops += f"/{self.fontid[vch.font]} {vch.size:f} Tf 1 0 0 1 {x + vch.x0 - var[vid][0].x0:f} {fix + y + vch.y0 - var[vid][0].y0:f} Tm [<{raw_string(self.fontid[vch.font], vc)}>] TJ "
														
 
															+                        ops_vals.append({
														
 
															+                            "type": OpType.TEXT,
														
 
															+                            "font": self.fontid[vch.font],
														
 
															+                            "size": vch.size,
														
 
															+                            "x": x + vch.x0 - var[vid][0].x0,
														
 
															+                            "dy": fix + vch.y0 - var[vid][0].y0,
														
 
															+                            "rtxt": raw_string(self.fontid[vch.font], vc),
														
 
															+                            "lidx": lidx
														
 
															+                        })
														
 
															                         if log.isEnabledFor(logging.DEBUG):
														
 
															                             lstk.append(LTLine(0.1, (_x, _y), (x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0)))
														
 
															                             _x, _y = x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0
														
 
															                     for l in varl[vid]:  # 排版公式线条
														
 
															                         if l.linewidth < 5:  # hack 有的文档会用粗线条当图片背景
														
 
															-                            ops += f"ET q 1 0 0 1 {l.pts[0][0] + x - var[vid][0].x0:f} {l.pts[0][1] + fix + y - var[vid][0].y0:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
														
 
															+                            ops_vals.append({
														
 
															+                                "type": OpType.LINE,
														
 
															+                                "x": l.pts[0][0] + x - var[vid][0].x0,
														
 
															+                                "dy": l.pts[0][1] + fix - var[vid][0].y0,
														
 
															+                                "linewidth": l.linewidth,
														
 
															+                                "xlen": l.pts[1][0] - l.pts[0][0],
														
 
															+                                "ylen": l.pts[1][1] - l.pts[0][1],
														
 
															+                                "lidx": lidx
														
 
															+                            })
														
 
															                 else:  # 插入文字缓冲区
														
 
															                     if not cstk:  # 单行开头
														
 
															                         tx = x
														
@@ -454,9 +500,34 @@ class TranslateConverter(PDFConverterEx):
 
															                     _x, _y = x, y
														
 
															             # 处理结尾
														
 
															             if cstk:
														
 
															-                ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
														
 
															+                ops_vals.append({
														
 
															+                    "type": OpType.TEXT,
														
 
															+                    "font": fcur,
														
 
															+                    "size": size,
														
 
															+                    "x": tx,
														
 
															+                    "dy": 0,
														
 
															+                    "rtxt": raw_string(fcur, cstk),
														
 
															+                    "lidx": lidx
														
 
															+                })
														
 
															+
														
 
															+            line_height = default_line_height
														
 
															+
														
 
															+            while (lidx + 1) * size * line_height > height and line_height >= 1:
														
 
															+                line_height -= 0.05
														
 
															+
														
 
															+            for vals in ops_vals:
														
 
															+                if vals["type"] == OpType.TEXT:
														
 
															+                    ops_list.append(gen_op_txt(vals["font"], vals["size"], vals["x"], vals["dy"] + y - vals["lidx"] * size * line_height, vals["rtxt"]))
														
 
															+                elif vals["type"] == OpType.LINE:
														
 
															+                    ops_list.append(gen_op_line(vals["x"], vals["dy"] + y - vals["lidx"] * size * line_height, vals["xlen"], vals["ylen"], vals["linewidth"]))
														
 
															+
														
 
															         for l in lstk:  # 排版全局线条
														
 
															             if l.linewidth < 5:  # hack 有的文档会用粗线条当图片背景
														
 
															-                ops += f"ET q 1 0 0 1 {l.pts[0][0]:f} {l.pts[0][1]:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
														
 
															-        ops = f"BT {ops}ET "
														
 
															+                ops_list.append(gen_op_line(l.pts[0][0], l.pts[0][1], l.pts[1][0] - l.pts[0][0], l.pts[1][1] - l.pts[0][1], l.linewidth))
														
 
															+
														
 
															+        ops = f"BT {''.join(ops_list)}ET "
														
 
															         return ops
														
 
															+
														
 
															+class OpType(Enum):
														
 
															+    TEXT = "text"
														
 
															+    LINE = "line"