|
@@ -1,4 +1,5 @@
|
|
|
from typing import Dict, List
|
|
from typing import Dict, List
|
|
|
|
|
+from enum import Enum
|
|
|
|
|
|
|
|
from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager
|
|
from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager
|
|
|
from pdfminer.pdffont import PDFCIDFont
|
|
from pdfminer.pdffont import PDFCIDFont
|
|
@@ -117,11 +118,13 @@ class PDFConverterEx(PDFConverter):
|
|
|
|
|
|
|
|
|
|
|
|
|
class Paragraph:
|
|
class Paragraph:
|
|
|
- def __init__(self, y, x, x0, x1, size, brk):
|
|
|
|
|
|
|
+ def __init__(self, y, x, x0, x1, y0, y1, size, brk):
|
|
|
self.y: float = y # 初始纵坐标
|
|
self.y: float = y # 初始纵坐标
|
|
|
self.x: float = x # 初始横坐标
|
|
self.x: float = x # 初始横坐标
|
|
|
self.x0: float = x0 # 左边界
|
|
self.x0: float = x0 # 左边界
|
|
|
self.x1: float = x1 # 右边界
|
|
self.x1: float = x1 # 右边界
|
|
|
|
|
+ self.y0: float = y0 # 上边界
|
|
|
|
|
+ self.y1: float = y1 # 下边界
|
|
|
self.size: float = size # 字体大小
|
|
self.size: float = size # 字体大小
|
|
|
self.brk: bool = brk # 换行标记
|
|
self.brk: bool = brk # 换行标记
|
|
|
|
|
|
|
@@ -186,6 +189,8 @@ class TranslateConverter(PDFConverterEx):
|
|
|
vmax: float = ltpage.width / 4 # 行内公式最大宽度
|
|
vmax: float = ltpage.width / 4 # 行内公式最大宽度
|
|
|
ops: str = "" # 渲染结果
|
|
ops: str = "" # 渲染结果
|
|
|
|
|
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
def vflag(font: str, char: str): # 匹配公式(和角标)字体
|
|
def vflag(font: str, char: str): # 匹配公式(和角标)字体
|
|
|
if isinstance(font, bytes): # 不一定能 decode,直接转 str
|
|
if isinstance(font, bytes): # 不一定能 decode,直接转 str
|
|
|
try:
|
|
try:
|
|
@@ -287,7 +292,7 @@ class TranslateConverter(PDFConverterEx):
|
|
|
pstk[-1].brk = True
|
|
pstk[-1].brk = True
|
|
|
else: # 根据当前字符构建一个新的段落
|
|
else: # 根据当前字符构建一个新的段落
|
|
|
sstk.append("")
|
|
sstk.append("")
|
|
|
- pstk.append(Paragraph(child.y0, child.x0, child.x0, child.x0, child.size, False))
|
|
|
|
|
|
|
+ pstk.append(Paragraph(child.y0, child.x0, child.x0, child.x0, child.y0, child.y1, child.size, False))
|
|
|
if not cur_v: # 文字入栈
|
|
if not cur_v: # 文字入栈
|
|
|
if ( # 根据当前字符修正段落属性
|
|
if ( # 根据当前字符修正段落属性
|
|
|
child.size > pstk[-1].size # 1. 当前字符比段落字体大
|
|
child.size > pstk[-1].size # 1. 当前字符比段落字体大
|
|
@@ -307,6 +312,8 @@ class TranslateConverter(PDFConverterEx):
|
|
|
# 更新段落边界,因为段落内换行之后可能是公式开头,所以要在外边处理
|
|
# 更新段落边界,因为段落内换行之后可能是公式开头,所以要在外边处理
|
|
|
pstk[-1].x0 = min(pstk[-1].x0, child.x0)
|
|
pstk[-1].x0 = min(pstk[-1].x0, child.x0)
|
|
|
pstk[-1].x1 = max(pstk[-1].x1, child.x1)
|
|
pstk[-1].x1 = max(pstk[-1].x1, child.x1)
|
|
|
|
|
+ pstk[-1].y0 = min(pstk[-1].y0, child.y0)
|
|
|
|
|
+ pstk[-1].y1 = max(pstk[-1].y1, child.y1)
|
|
|
# 更新上一个字符
|
|
# 更新上一个字符
|
|
|
xt = child
|
|
xt = child
|
|
|
xt_cls = cls
|
|
xt_cls = cls
|
|
@@ -369,20 +376,36 @@ class TranslateConverter(PDFConverterEx):
|
|
|
else:
|
|
else:
|
|
|
return "".join(["%02x" % ord(c) for c in cstk])
|
|
return "".join(["%02x" % ord(c) for c in cstk])
|
|
|
|
|
|
|
|
|
|
+ # 根据目标语言获取默认行距
|
|
|
|
|
+ LANG_LINEHEIGHT_MAP = {
|
|
|
|
|
+ "zh-cn": 1.4, "zh-tw": 1.4, "zh-hans": 1.4, "zh-hant": 1.4, "zh": 1.4,
|
|
|
|
|
+ "ja": 1.1, "ko": 1.2, "en": 1.2, "ar": 1.0, "ru": 0.8, "uk": 0.8, "ta": 0.8
|
|
|
|
|
+ }
|
|
|
|
|
+ default_line_height = LANG_LINEHEIGHT_MAP.get(self.translator.lang_out.lower(), 1.1) # 小语种默认1.1
|
|
|
|
|
+
|
|
|
_x, _y = 0, 0
|
|
_x, _y = 0, 0
|
|
|
|
|
+
|
|
|
|
|
+ ops_list = []
|
|
|
|
|
+ gen_op_txt = lambda font, size, x, y, rtxt: f"/{font} {size:f} Tf 1 0 0 1 {x:f} {y:f} Tm [<{rtxt}>] TJ "
|
|
|
|
|
+ gen_op_line = lambda x, y, xlen, ylen, linewidth: f"ET q 1 0 0 1 {x:f} {y:f} cm [] 0 d 0 J {linewidth:f} w 0 0 m {xlen:f} {ylen:f} l S Q BT "
|
|
|
for id, new in enumerate(news):
|
|
for id, new in enumerate(news):
|
|
|
- x: float = pstk[id].x # 段落初始横坐标
|
|
|
|
|
- y: float = pstk[id].y # 段落初始纵坐标
|
|
|
|
|
- x0: float = pstk[id].x0 # 段落左边界
|
|
|
|
|
- x1: float = pstk[id].x1 # 段落右边界
|
|
|
|
|
- size: float = pstk[id].size # 段落字体大小
|
|
|
|
|
- brk: bool = pstk[id].brk # 段落换行标记
|
|
|
|
|
- cstk: str = "" # 当前文字栈
|
|
|
|
|
- fcur: str = None # 当前字体 ID
|
|
|
|
|
|
|
+ x: float = pstk[id].x # 段落初始横坐标
|
|
|
|
|
+ y: float = pstk[id].y # 段落初始纵坐标
|
|
|
|
|
+ x0: float = pstk[id].x0 # 段落左边界
|
|
|
|
|
+ x1: float = pstk[id].x1 # 段落右边界
|
|
|
|
|
+ height: float = pstk[id].y1 - pstk[id].y0 # 段落高度
|
|
|
|
|
+ size: float = pstk[id].size # 段落字体大小
|
|
|
|
|
+ brk: bool = pstk[id].brk # 段落换行标记
|
|
|
|
|
+ cstk: str = "" # 当前文字栈
|
|
|
|
|
+ fcur: str = None # 当前字体 ID
|
|
|
|
|
+ lidx = 0 # 记录换行次数
|
|
|
tx = x
|
|
tx = x
|
|
|
fcur_ = fcur
|
|
fcur_ = fcur
|
|
|
ptr = 0
|
|
ptr = 0
|
|
|
log.debug(f"< {y} {x} {x0} {x1} {size} {brk} > {sstk[id]} | {new}")
|
|
log.debug(f"< {y} {x} {x0} {x1} {size} {brk} > {sstk[id]} | {new}")
|
|
|
|
|
+
|
|
|
|
|
+ ops_vals: list[dict] = []
|
|
|
|
|
+
|
|
|
while ptr < len(new):
|
|
while ptr < len(new):
|
|
|
vy_regex = re.match(
|
|
vy_regex = re.match(
|
|
|
r"\{\s*v([\d\s]+)\}", new[ptr:], re.IGNORECASE
|
|
r"\{\s*v([\d\s]+)\}", new[ptr:], re.IGNORECASE
|
|
@@ -418,25 +441,48 @@ class TranslateConverter(PDFConverterEx):
|
|
|
or x + adv > x1 + 0.1 * size # 3. 到达右边界(可能一整行都被符号化,这里需要考虑浮点误差)
|
|
or x + adv > x1 + 0.1 * size # 3. 到达右边界(可能一整行都被符号化,这里需要考虑浮点误差)
|
|
|
):
|
|
):
|
|
|
if cstk:
|
|
if cstk:
|
|
|
- ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
|
|
|
|
|
|
|
+ ops_vals.append({
|
|
|
|
|
+ "type": OpType.TEXT,
|
|
|
|
|
+ "font": fcur,
|
|
|
|
|
+ "size": size,
|
|
|
|
|
+ "x": tx,
|
|
|
|
|
+ "dy": 0,
|
|
|
|
|
+ "rtxt": raw_string(fcur, cstk),
|
|
|
|
|
+ "lidx": lidx
|
|
|
|
|
+ })
|
|
|
cstk = ""
|
|
cstk = ""
|
|
|
if brk and x + adv > x1 + 0.1 * size: # 到达右边界且原文段落存在换行
|
|
if brk and x + adv > x1 + 0.1 * size: # 到达右边界且原文段落存在换行
|
|
|
x = x0
|
|
x = x0
|
|
|
- lang_space = {"zh-cn": 1.4, "zh-tw": 1.4, "zh-hans": 1.4, "zh-hant": 1.4, "zh": 1.4, "ja": 1.1, "ko": 1.2, "en": 1.2, "ar": 1.0, "ru": 0.8, "uk": 0.8, "ta": 0.8}
|
|
|
|
|
- y -= size * lang_space.get(self.translator.lang_out.lower(), 1.1) # 小语种大多适配 1.1
|
|
|
|
|
|
|
+ lidx += 1
|
|
|
if vy_regex: # 插入公式
|
|
if vy_regex: # 插入公式
|
|
|
fix = 0
|
|
fix = 0
|
|
|
if fcur is not None: # 段落内公式修正纵向偏移
|
|
if fcur is not None: # 段落内公式修正纵向偏移
|
|
|
fix = varf[vid]
|
|
fix = varf[vid]
|
|
|
for vch in var[vid]: # 排版公式字符
|
|
for vch in var[vid]: # 排版公式字符
|
|
|
vc = chr(vch.cid)
|
|
vc = chr(vch.cid)
|
|
|
- ops += f"/{self.fontid[vch.font]} {vch.size:f} Tf 1 0 0 1 {x + vch.x0 - var[vid][0].x0:f} {fix + y + vch.y0 - var[vid][0].y0:f} Tm [<{raw_string(self.fontid[vch.font], vc)}>] TJ "
|
|
|
|
|
|
|
+ ops_vals.append({
|
|
|
|
|
+ "type": OpType.TEXT,
|
|
|
|
|
+ "font": self.fontid[vch.font],
|
|
|
|
|
+ "size": vch.size,
|
|
|
|
|
+ "x": x + vch.x0 - var[vid][0].x0,
|
|
|
|
|
+ "dy": fix + vch.y0 - var[vid][0].y0,
|
|
|
|
|
+ "rtxt": raw_string(self.fontid[vch.font], vc),
|
|
|
|
|
+ "lidx": lidx
|
|
|
|
|
+ })
|
|
|
if log.isEnabledFor(logging.DEBUG):
|
|
if log.isEnabledFor(logging.DEBUG):
|
|
|
lstk.append(LTLine(0.1, (_x, _y), (x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0)))
|
|
lstk.append(LTLine(0.1, (_x, _y), (x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0)))
|
|
|
_x, _y = x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0
|
|
_x, _y = x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0
|
|
|
for l in varl[vid]: # 排版公式线条
|
|
for l in varl[vid]: # 排版公式线条
|
|
|
if l.linewidth < 5: # hack 有的文档会用粗线条当图片背景
|
|
if l.linewidth < 5: # hack 有的文档会用粗线条当图片背景
|
|
|
- ops += f"ET q 1 0 0 1 {l.pts[0][0] + x - var[vid][0].x0:f} {l.pts[0][1] + fix + y - var[vid][0].y0:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
|
|
|
|
|
|
|
+ ops_vals.append({
|
|
|
|
|
+ "type": OpType.LINE,
|
|
|
|
|
+ "x": l.pts[0][0] + x - var[vid][0].x0,
|
|
|
|
|
+ "dy": l.pts[0][1] + fix - var[vid][0].y0,
|
|
|
|
|
+ "linewidth": l.linewidth,
|
|
|
|
|
+ "xlen": l.pts[1][0] - l.pts[0][0],
|
|
|
|
|
+ "ylen": l.pts[1][1] - l.pts[0][1],
|
|
|
|
|
+ "lidx": lidx
|
|
|
|
|
+ })
|
|
|
else: # 插入文字缓冲区
|
|
else: # 插入文字缓冲区
|
|
|
if not cstk: # 单行开头
|
|
if not cstk: # 单行开头
|
|
|
tx = x
|
|
tx = x
|
|
@@ -454,9 +500,34 @@ class TranslateConverter(PDFConverterEx):
|
|
|
_x, _y = x, y
|
|
_x, _y = x, y
|
|
|
# 处理结尾
|
|
# 处理结尾
|
|
|
if cstk:
|
|
if cstk:
|
|
|
- ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
|
|
|
|
|
|
|
+ ops_vals.append({
|
|
|
|
|
+ "type": OpType.TEXT,
|
|
|
|
|
+ "font": fcur,
|
|
|
|
|
+ "size": size,
|
|
|
|
|
+ "x": tx,
|
|
|
|
|
+ "dy": 0,
|
|
|
|
|
+ "rtxt": raw_string(fcur, cstk),
|
|
|
|
|
+ "lidx": lidx
|
|
|
|
|
+ })
|
|
|
|
|
+
|
|
|
|
|
+ line_height = default_line_height
|
|
|
|
|
+
|
|
|
|
|
+ while (lidx + 1) * size * line_height > height and line_height >= 1:
|
|
|
|
|
+ line_height -= 0.05
|
|
|
|
|
+
|
|
|
|
|
+ for vals in ops_vals:
|
|
|
|
|
+ if vals["type"] == OpType.TEXT:
|
|
|
|
|
+ ops_list.append(gen_op_txt(vals["font"], vals["size"], vals["x"], vals["dy"] + y - vals["lidx"] * size * line_height, vals["rtxt"]))
|
|
|
|
|
+ elif vals["type"] == OpType.LINE:
|
|
|
|
|
+ ops_list.append(gen_op_line(vals["x"], vals["dy"] + y - vals["lidx"] * size * line_height, vals["xlen"], vals["ylen"], vals["linewidth"]))
|
|
|
|
|
+
|
|
|
for l in lstk: # 排版全局线条
|
|
for l in lstk: # 排版全局线条
|
|
|
if l.linewidth < 5: # hack 有的文档会用粗线条当图片背景
|
|
if l.linewidth < 5: # hack 有的文档会用粗线条当图片背景
|
|
|
- ops += f"ET q 1 0 0 1 {l.pts[0][0]:f} {l.pts[0][1]:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
|
|
|
|
|
- ops = f"BT {ops}ET "
|
|
|
|
|
|
|
+ ops_list.append(gen_op_line(l.pts[0][0], l.pts[0][1], l.pts[1][0] - l.pts[0][0], l.pts[1][1] - l.pts[0][1], l.linewidth))
|
|
|
|
|
+
|
|
|
|
|
+ ops = f"BT {''.join(ops_list)}ET "
|
|
|
return ops
|
|
return ops
|
|
|
|
|
+
|
|
|
|
|
+class OpType(Enum):
|
|
|
|
|
+ TEXT = "text"
|
|
|
|
|
+ LINE = "line"
|