|
|
@@ -281,7 +281,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
|
|
graphicstate,
|
|
|
)
|
|
|
self.cur_item.add(item)
|
|
|
- item.cid = cid # hack
|
|
|
+ item.cid = cid # hack 插入原字符编码
|
|
|
return item.adv
|
|
|
|
|
|
def handle_undefined_char(self, font: PDFFont, cid: int) -> str:
|
|
|
@@ -404,342 +404,304 @@ class TextConverter(PDFConverter[AnyIO]):
|
|
|
else:
|
|
|
cast(TextIO, self.outfp).write(text)
|
|
|
|
|
|
+ # fmt: off
|
|
|
def receive_layout(self, ltpage: LTPage):
|
|
|
- def render(item: LTItem) -> None:
|
|
|
- xt = None # 上一个字符
|
|
|
- sstk = [] # 段落文字栈
|
|
|
- vstk = [] # 公式符号组
|
|
|
- vlstk = [] # 公式线条组
|
|
|
- vfix = 0 # 公式纵向偏移
|
|
|
- vbkt = 0 # 段落公式括号计数
|
|
|
- pstk = [] # 段落属性栈
|
|
|
- lstk = [] # 全局线条栈
|
|
|
- var = [] # 公式符号组栈
|
|
|
- varl = [] # 公式线条组栈
|
|
|
- varf = [] # 公式纵向偏移栈
|
|
|
- vlen = [] # 公式宽度栈
|
|
|
- xt_cls = -1 # 上一个字符所属段落
|
|
|
- vmax = ltpage.width / 4 # 行内公式最大宽度
|
|
|
- ops = "" # 渲染结果
|
|
|
-
|
|
|
- def vflag(font, char): # 匹配公式(和角标)字体
|
|
|
- if re.match(r"\(cid:", char):
|
|
|
+ xt = None # 上一个字符
|
|
|
+ sstk = [] # 段落文字栈
|
|
|
+ vstk = [] # 公式符号组
|
|
|
+ vlstk = [] # 公式线条组
|
|
|
+ vfix = 0 # 公式纵向偏移
|
|
|
+ vbkt = 0 # 段落公式括号计数
|
|
|
+ pstk = [] # 段落属性栈
|
|
|
+ lstk = [] # 全局线条栈
|
|
|
+ var = [] # 公式符号组栈
|
|
|
+ varl = [] # 公式线条组栈
|
|
|
+ varf = [] # 公式纵向偏移栈
|
|
|
+ vlen = [] # 公式宽度栈
|
|
|
+ xt_cls = -1 # 上一个字符所属段落
|
|
|
+ vmax = ltpage.width / 4 # 行内公式最大宽度
|
|
|
+ ops = "" # 渲染结果
|
|
|
+
|
|
|
+ def vflag(font, char): # 匹配公式(和角标)字体
|
|
|
+ if re.match(r"\(cid:", char):
|
|
|
+ return True
|
|
|
+ # 基于字体名规则的判定
|
|
|
+ if self.vfont:
|
|
|
+ if re.match(self.vfont, font):
|
|
|
return True
|
|
|
- if self.vfont:
|
|
|
- if re.match(self.vfont, font):
|
|
|
- return True
|
|
|
- else:
|
|
|
- if re.match(
|
|
|
- r"(CM[^R]|MS|XY|MT|BL|RM|EU|LA|RS|LINE|TeX-|rsfs|txsy|wasy|.*Mono|.*Code|.*Ital|.*Sym)",
|
|
|
- font,
|
|
|
- ):
|
|
|
- return True
|
|
|
- if self.vchar:
|
|
|
- if re.match(self.vchar, char):
|
|
|
- return True
|
|
|
- else:
|
|
|
- if (
|
|
|
- char
|
|
|
- and char != " "
|
|
|
- and (
|
|
|
- unicodedata.category(char[0])
|
|
|
- in ["Lm", "Mn", "Sk", "Sm", "Zl", "Zp", "Zs"]
|
|
|
- or ord(char[0]) in range(0x370, 0x400)
|
|
|
- )
|
|
|
- ): # 文字修饰符、数学符号、分隔符号、希腊字母
|
|
|
- return True
|
|
|
- return False
|
|
|
-
|
|
|
- ptr = 0
|
|
|
- item = list(item)
|
|
|
- while ptr < len(item): # 识别文字和公式
|
|
|
- child = item[ptr]
|
|
|
- if isinstance(child, LTChar):
|
|
|
- cur_v = False # 公式
|
|
|
- fontname = child.fontname.split("+")[-1]
|
|
|
- layout = self.layout[ltpage.pageid]
|
|
|
- h, w = (
|
|
|
- layout.shape
|
|
|
- ) # ltpage.height 可能是 fig 里面的高度,这里统一用 layout.shape
|
|
|
- cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(
|
|
|
- int(child.y0), 0, h - 1
|
|
|
+ else:
|
|
|
+ if re.match( # latex 字体
|
|
|
+ r"(CM[^R]|MS|XY|MT|BL|RM|EU|LA|RS|LINE|TeX-|rsfs|txsy|wasy|.*Mono|.*Code|.*Ital|.*Sym)",
|
|
|
+ font,
|
|
|
+ ):
|
|
|
+ return True
|
|
|
+ # 基于字符集规则的判定
|
|
|
+ if self.vchar:
|
|
|
+ if re.match(self.vchar, char):
|
|
|
+ return True
|
|
|
+ else:
|
|
|
+ if (
|
|
|
+ char
|
|
|
+ and char != " " # 非空格
|
|
|
+ and (
|
|
|
+ unicodedata.category(char[0])
|
|
|
+ in ["Lm", "Mn", "Sk", "Sm", "Zl", "Zp", "Zs"] # 文字修饰符、数学符号、分隔符号
|
|
|
+ or ord(char[0]) in range(0x370, 0x400) # 希腊字母
|
|
|
)
|
|
|
- cls = layout[cy, cx]
|
|
|
- # if log.isEnabledFor(logging.DEBUG):
|
|
|
- # ops+=f'ET [] 0 d 0 J 0.1 w {child.x0:f}
|
|
|
- # {child.y0:f} {child.x1-child.x0:f} {child.y1-child.y0:f} re S Q BT '
|
|
|
- if (
|
|
|
- cls == 0
|
|
|
- or (cls == xt_cls and child.size < pstk[-1][4] * 0.79)
|
|
|
- or vflag(fontname, child.get_text())
|
|
|
- or (child.matrix[0] == 0 and child.matrix[3] == 0)
|
|
|
- ): # 有 0.76 的角标和 0.799 的大写,这里用 0.79 取中
|
|
|
+ ):
|
|
|
+ return True
|
|
|
+ return False
|
|
|
+
|
|
|
+ ############################################################
|
|
|
+ # A. 原文档解析
|
|
|
+ ptr = 0
|
|
|
+ item = list(ltpage)
|
|
|
+ while ptr < len(item):
|
|
|
+ child = item[ptr]
|
|
|
+ if isinstance(child, LTChar):
|
|
|
+ cur_v = False
|
|
|
+ fontname = child.fontname.split("+")[-1]
|
|
|
+ layout = self.layout[ltpage.pageid]
|
|
|
+ # ltpage.height 可能是 fig 里面的高度,这里统一用 layout.shape
|
|
|
+ h, w = layout.shape
|
|
|
+ # 读取当前字符在 layout 中的类别
|
|
|
+ cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
|
|
|
+ cls = layout[cy, cx]
|
|
|
+ if ( # 判定当前字符是否属于公式
|
|
|
+ cls == 0 # 1. 类别为保留区域
|
|
|
+ or (cls == xt_cls and len(sstk[-1].strip()) > 1 and child.size < pstk[-1][4] * 0.79) # 2. 角标字体,有 0.76 的角标和 0.799 的大写,这里用 0.79 取中,同时考虑首字母放大的情况
|
|
|
+ or vflag(fontname, child.get_text()) # 3. 公式字体
|
|
|
+ or (child.matrix[0] == 0 and child.matrix[3] == 0) # 4. 垂直字体
|
|
|
+ ):
|
|
|
+ cur_v = True
|
|
|
+ # 判定括号组是否属于公式
|
|
|
+ if not cur_v:
|
|
|
+ if vstk and child.get_text() == "(":
|
|
|
cur_v = True
|
|
|
- if not cur_v: # 判定括号组是否属于公式
|
|
|
- if vstk and child.get_text() == "(":
|
|
|
- cur_v = True
|
|
|
- vbkt += 1
|
|
|
- if vbkt and child.get_text() == ")":
|
|
|
- cur_v = True
|
|
|
- vbkt -= 1
|
|
|
- if (
|
|
|
- not cur_v
|
|
|
- or cls != xt_cls
|
|
|
- or (abs(child.x0 - xt.x0) > vmax and cls != 0)
|
|
|
- ): # 公式结束、段落边界、公式换行
|
|
|
- if vstk: # 公式出栈
|
|
|
- sstk[-1] += f"$v{len(var)}$"
|
|
|
- if (
|
|
|
- not cur_v
|
|
|
- and cls == xt_cls
|
|
|
- and child.x0 > max([vch.x0 for vch in vstk])
|
|
|
- ): # and child.y1>vstk[0].y0: # 段落内公式转文字,行内公式修正
|
|
|
- vfix = vstk[0].y0 - child.y0
|
|
|
- var.append(vstk)
|
|
|
- varl.append(vlstk)
|
|
|
- varf.append(vfix)
|
|
|
- vstk = []
|
|
|
- vlstk = []
|
|
|
- vfix = 0
|
|
|
- if not vstk: # 非公式或是公式开头
|
|
|
- if cls == xt_cls: # 同一段落
|
|
|
- if child.x0 > xt.x1 + 1: # 行内空格
|
|
|
- sstk[-1] += " "
|
|
|
- elif child.x1 < xt.x0: # 换行空格
|
|
|
- sstk[-1] += " "
|
|
|
- pstk[-1][6] = True # 标记原文段落存在换行
|
|
|
- else:
|
|
|
- sstk.append("")
|
|
|
- pstk.append(
|
|
|
- [
|
|
|
- child.y0,
|
|
|
- child.x0,
|
|
|
- child.x0,
|
|
|
- child.x0,
|
|
|
- child.size,
|
|
|
- child.font,
|
|
|
- False,
|
|
|
- ]
|
|
|
- )
|
|
|
- if not cur_v: # 文字入栈
|
|
|
- if (
|
|
|
- child.size > pstk[-1][4] / 0.79
|
|
|
- or vflag(pstk[-1][5].fontname.split("+")[-1], "")
|
|
|
- or re.match(
|
|
|
- r"(.*Medi|.*Bold)",
|
|
|
- pstk[-1][5].fontname.split("+")[-1],
|
|
|
- re.IGNORECASE,
|
|
|
- )
|
|
|
- ): # 小字体、公式或粗体开头,后续接文字,需要校正字体
|
|
|
- pstk[-1][0] -= child.size - pstk[-1][4]
|
|
|
- pstk[-1][4] = child.size
|
|
|
- pstk[-1][5] = child.font
|
|
|
- sstk[-1] += child.get_text()
|
|
|
- else: # 公式入栈
|
|
|
- if (
|
|
|
- not vstk and cls == xt_cls and child.x0 > xt.x0
|
|
|
- ): # and child.y1>xt.y0: # 段落内文字转公式,行内公式修正
|
|
|
- vfix = child.y0 - xt.y0
|
|
|
- vstk.append(child)
|
|
|
- # 更新段落边界,段落内换行之后可能是公式开头
|
|
|
- pstk[-1][2] = min(pstk[-1][2], child.x0)
|
|
|
- pstk[-1][3] = max(pstk[-1][3], child.x1)
|
|
|
- xt = child
|
|
|
- xt_cls = cls
|
|
|
- elif isinstance(child, LTFigure): # 图表
|
|
|
- pass
|
|
|
- elif isinstance(child, LTLine): # 线条
|
|
|
- layout = self.layout[ltpage.pageid]
|
|
|
- h, w = (
|
|
|
- layout.shape
|
|
|
- ) # ltpage.height 可能是 fig 里面的高度,这里统一用 layout.shape
|
|
|
- cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(
|
|
|
- int(child.y0), 0, h - 1
|
|
|
- )
|
|
|
- cls = layout[cy, cx]
|
|
|
- if vstk and cls == xt_cls: # 公式线条
|
|
|
- vlstk.append(child)
|
|
|
- else: # 全局线条
|
|
|
- lstk.append(child)
|
|
|
- else:
|
|
|
- # print(child)
|
|
|
- pass
|
|
|
- ptr += 1
|
|
|
- # 处理结尾
|
|
|
- if vstk: # 公式出栈
|
|
|
- sstk[-1] += f"$v{len(var)}$"
|
|
|
- var.append(vstk)
|
|
|
- varl.append(vlstk)
|
|
|
- varf.append(vfix)
|
|
|
- log.debug("\n==========[VSTACK]==========\n")
|
|
|
- for id, v in enumerate(var): # 计算公式宽度
|
|
|
- l = max([vch.x1 for vch in v]) - v[0].x0 # noqa: E741
|
|
|
- log.debug(
|
|
|
- f'< {l:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[id])} > $v{id}$ = {"".join([ch.get_text() for ch in v])}' # noqa: E501
|
|
|
+ vbkt += 1
|
|
|
+ if vbkt and child.get_text() == ")":
|
|
|
+ cur_v = True
|
|
|
+ vbkt -= 1
|
|
|
+ if ( # 判定当前公式是否结束
|
|
|
+ not cur_v # 1. 当前字符不属于公式
|
|
|
+ or cls != xt_cls # 2. 当前字符与前一个字符不属于同一段落
|
|
|
+ or (abs(child.x0 - xt.x0) > vmax and cls != 0) # 3. 段落内换行,可能是一长串斜体的段落,也可能是段内分式换行,这里设个阈值进行区分
|
|
|
+ ):
|
|
|
+ if vstk:
|
|
|
+ if ( # 根据公式右侧的文字修正公式的纵向偏移
|
|
|
+ not cur_v # 1. 当前字符不属于公式
|
|
|
+ and cls == xt_cls # 2. 当前字符与前一个字符属于同一段落
|
|
|
+ and child.x0 > max([vch.x0 for vch in vstk]) # 3. 当前字符在公式右侧
|
|
|
+ ):
|
|
|
+ vfix = vstk[0].y0 - child.y0
|
|
|
+ sstk[-1] += f"$v{len(var)}$"
|
|
|
+ var.append(vstk)
|
|
|
+ varl.append(vlstk)
|
|
|
+ varf.append(vfix)
|
|
|
+ vstk = []
|
|
|
+ vlstk = []
|
|
|
+ vfix = 0
|
|
|
+ # 当前字符不属于公式或当前字符是公式的第一个字符
|
|
|
+ if not vstk:
|
|
|
+ if cls == xt_cls: # 当前字符与前一个字符属于同一段落
|
|
|
+ if child.x0 > xt.x1 + 1: # 添加行内空格
|
|
|
+ sstk[-1] += " "
|
|
|
+ elif child.x1 < xt.x0: # 添加换行空格并标记原文段落存在换行
|
|
|
+ sstk[-1] += " "
|
|
|
+ pstk[-1][6] = True
|
|
|
+ else: # 根据当前字符构建一个新的段落
|
|
|
+ sstk.append("")
|
|
|
+ pstk.append([child.y0,child.x0,child.x0,child.x0,child.size,child.font,False,])
|
|
|
+ if not cur_v: # 文字入栈
|
|
|
+ if ( # 根据当前字符修正段落属性
|
|
|
+ child.size > pstk[-1][4] / 0.79 # 1. 当前字符显著比段落字体大
|
|
|
+ or len(sstk[-1].strip()) == 1 # 2. 当前字符为段落第二个文字(考虑首字母放大的情况)
|
|
|
+ or vflag(pstk[-1][5].fontname.split("+")[-1], "") # 3. 段落字体为公式字体
|
|
|
+ or re.match( # 4. 段落字体为粗体
|
|
|
+ r"(.*Medi|.*Bold)",
|
|
|
+ pstk[-1][5].fontname.split("+")[-1],
|
|
|
+ re.IGNORECASE,
|
|
|
+ )
|
|
|
+ ):
|
|
|
+ pstk[-1][0] -= child.size - pstk[-1][4] # hack 这个段落纵向位置的修正有问题,不过先凑合用吧
|
|
|
+ pstk[-1][4] = child.size
|
|
|
+ pstk[-1][5] = child.font
|
|
|
+ sstk[-1] += child.get_text()
|
|
|
+ else: # 公式入栈
|
|
|
+ if ( # 根据公式右侧的文字修正公式的纵向偏移
|
|
|
+ not vstk # 1. 当前字符是公式的第一个字符
|
|
|
+ and cls == xt_cls # 2. 当前字符与前一个字符属于同一段落
|
|
|
+ and child.x0 > xt.x0 # 3. 当前字符在前一个字符右侧
|
|
|
+ ):
|
|
|
+ vfix = child.y0 - xt.y0
|
|
|
+ vstk.append(child)
|
|
|
+ # 更新段落边界,因为段落内换行之后可能是公式开头,所以要在外边处理
|
|
|
+ pstk[-1][2] = min(pstk[-1][2], child.x0)
|
|
|
+ pstk[-1][3] = max(pstk[-1][3], child.x1)
|
|
|
+ # 更新上一个字符
|
|
|
+ xt = child
|
|
|
+ xt_cls = cls
|
|
|
+ elif isinstance(child, LTFigure): # 图表
|
|
|
+ pass
|
|
|
+ elif isinstance(child, LTLine): # 线条
|
|
|
+ layout = self.layout[ltpage.pageid]
|
|
|
+ # ltpage.height 可能是 fig 里面的高度,这里统一用 layout.shape
|
|
|
+ h, w = layout.shape
|
|
|
+ # 读取当前线条在 layout 中的类别
|
|
|
+ cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
|
|
|
+ cls = layout[cy, cx]
|
|
|
+ if vstk and cls == xt_cls: # 公式线条
|
|
|
+ vlstk.append(child)
|
|
|
+ else: # 全局线条
|
|
|
+ lstk.append(child)
|
|
|
+ else:
|
|
|
+ pass
|
|
|
+ ptr += 1
|
|
|
+ # 处理结尾
|
|
|
+ if vstk: # 公式出栈
|
|
|
+ sstk[-1] += f"$v{len(var)}$"
|
|
|
+ var.append(vstk)
|
|
|
+ varl.append(vlstk)
|
|
|
+ varf.append(vfix)
|
|
|
+ log.debug("\n==========[VSTACK]==========\n")
|
|
|
+ for id, v in enumerate(var): # 计算公式宽度
|
|
|
+ l = max([vch.x1 for vch in v]) - v[0].x0 # noqa: E741
|
|
|
+ log.debug(f'< {l:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[id])} > $v{id}$ = {"".join([ch.get_text() for ch in v])}')
|
|
|
+ vlen.append(l)
|
|
|
+
|
|
|
+ ############################################################
|
|
|
+ # B. 段落翻译
|
|
|
+ log.debug("\n==========[SSTACK]==========\n")
|
|
|
+ hash_key = cache.deterministic_hash("PDFMathTranslate")
|
|
|
+ cache.create_cache(hash_key)
|
|
|
+ @retry(wait=wait_fixed(1))
|
|
|
+ def worker(s): # 多线程翻译
|
|
|
+ try:
|
|
|
+ hash_key_paragraph = cache.deterministic_hash(
|
|
|
+ (s, str(self.translator))
|
|
|
)
|
|
|
- vlen.append(l)
|
|
|
- log.debug("\n==========[SSTACK]==========\n")
|
|
|
- hash_key = cache.deterministic_hash("PDFMathTranslate")
|
|
|
- cache.create_cache(hash_key)
|
|
|
-
|
|
|
- @retry(wait=wait_fixed(1))
|
|
|
- def worker(s): # 多线程翻译
|
|
|
- try:
|
|
|
- hash_key_paragraph = cache.deterministic_hash(
|
|
|
- (s, str(self.translator))
|
|
|
- )
|
|
|
- new = cache.load_paragraph(hash_key, hash_key_paragraph) # 查询缓存
|
|
|
- if new is None:
|
|
|
- new = self.translator.translate(s)
|
|
|
- new = remove_control_characters(new)
|
|
|
- cache.write_paragraph(hash_key, hash_key_paragraph, new)
|
|
|
- return new
|
|
|
- except BaseException as e:
|
|
|
- if log.isEnabledFor(logging.DEBUG):
|
|
|
- log.exception(e)
|
|
|
- else:
|
|
|
- log.exception(e, exc_info=False)
|
|
|
- raise e
|
|
|
-
|
|
|
- with concurrent.futures.ThreadPoolExecutor(
|
|
|
- max_workers=self.thread
|
|
|
- ) as executor:
|
|
|
- news = list(executor.map(worker, sstk))
|
|
|
-
|
|
|
- def raw_string(fcur, cstk): # 编码字符串
|
|
|
- if isinstance(self.fontmap[fcur], PDFCIDFont): # 判断编码长度
|
|
|
- return "".join(["%04x" % ord(c) for c in cstk])
|
|
|
+ new = cache.load_paragraph(hash_key, hash_key_paragraph) # 查询缓存
|
|
|
+ if new is None:
|
|
|
+ new = self.translator.translate(s)
|
|
|
+ new = remove_control_characters(new)
|
|
|
+ cache.write_paragraph(hash_key, hash_key_paragraph, new)
|
|
|
+ return new
|
|
|
+ except BaseException as e:
|
|
|
+ if log.isEnabledFor(logging.DEBUG):
|
|
|
+ log.exception(e)
|
|
|
else:
|
|
|
- return "".join(["%02x" % ord(c) for c in cstk])
|
|
|
-
|
|
|
- _x, _y = 0, 0
|
|
|
- for id, new in enumerate(news): # 排版文字和公式
|
|
|
- tx = x = pstk[id][1]
|
|
|
- y = pstk[id][0]
|
|
|
- lt = pstk[id][2]
|
|
|
- rt = pstk[id][3]
|
|
|
- ptr = 0
|
|
|
- size = pstk[id][4]
|
|
|
- font = pstk[id][5]
|
|
|
- lb = pstk[id][6] # 段落属性
|
|
|
- cstk = "" # 单行文字栈
|
|
|
- fcur = fcur_ = None # 单行字体
|
|
|
- log.debug(
|
|
|
- f"< {y} {x} {lt} {rt} {size} {font.fontname} {lb} > {sstk[id]} | {new}"
|
|
|
- )
|
|
|
- while True:
|
|
|
- if ptr == len(new): # 到达段落结尾
|
|
|
- if cstk:
|
|
|
- ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
|
|
|
- break
|
|
|
- vy_regex = re.match(
|
|
|
- r"\$?\s*v([\d\s]+)\$", new[ptr:], re.IGNORECASE
|
|
|
- ) # 匹配 $vn$ 公式标记,前面的 $ 有的时候会被丢掉
|
|
|
- mod = False # 当前公式是否为文字修饰符
|
|
|
- if vy_regex: # 加载公式
|
|
|
- ptr += len(vy_regex.group(0))
|
|
|
- try:
|
|
|
- vid = int(vy_regex.group(1).replace(" ", ""))
|
|
|
- adv = vlen[vid]
|
|
|
- except Exception:
|
|
|
- continue # 翻译器可能会自动补个越界的公式标记
|
|
|
- if len(var[vid]) == 1 and unicodedata.category(
|
|
|
- var[vid][0].get_text()[0]
|
|
|
- ) in [
|
|
|
- "Lm",
|
|
|
- "Mn",
|
|
|
- "Sk",
|
|
|
- ]: # 文字修饰符
|
|
|
- mod = True
|
|
|
- else: # 加载文字
|
|
|
- ch = new[ptr]
|
|
|
- # if font.char_width(ord(ch)):
|
|
|
- fcur_ = None
|
|
|
- # 原字体编码容易出问题,这里直接放弃掉
|
|
|
- # try:
|
|
|
- # if font.widths.get(ord(ch)) and font.to_unichr(ord(ch))==ch:
|
|
|
- # fcur_=self.fontid[font] # 原字体
|
|
|
- # except:
|
|
|
- # pass
|
|
|
- try:
|
|
|
- if (
|
|
|
- fcur_ is None
|
|
|
- and self.fontmap["tiro"].to_unichr(ord(ch)) == ch
|
|
|
- ):
|
|
|
- fcur_ = "tiro" # 默认英文字体
|
|
|
- except Exception:
|
|
|
- pass
|
|
|
- if fcur_ is None:
|
|
|
- fcur_ = "china-ss" # 默认中文字体
|
|
|
- # print(self.fontid[font],fcur_,ch,font.char_width(ord(ch)))
|
|
|
- adv = self.fontmap[fcur_].char_width(ord(ch)) * size
|
|
|
- ptr += 1
|
|
|
- if (
|
|
|
- fcur_ != fcur or vy_regex or x + adv > rt + 0.1 * size
|
|
|
- ): # 输出文字缓冲区:1.字体更新 2.插入公式 3.到达右边界(可能一整行都被符号化,这里需要考虑浮点误差)
|
|
|
- if cstk:
|
|
|
- ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
|
|
|
- cstk = ""
|
|
|
- if lb and x + adv > rt + 0.1 * size: # 到达右边界且原文段落存在换行
|
|
|
- x = lt
|
|
|
- lang_space = {
|
|
|
- "zh-CN": 1.4,
|
|
|
- "zh-TW": 1.4,
|
|
|
- "ja": 1.1,
|
|
|
- "ko": 1.2,
|
|
|
- "en": 1.2,
|
|
|
- } # CJK
|
|
|
- y -= size * lang_space.get(
|
|
|
- self.translator.lang_out, 1.1
|
|
|
- ) # 小语种大多适配 1.1
|
|
|
- if vy_regex: # 插入公式
|
|
|
- fix = 0
|
|
|
- if fcur is not None: # 段落内公式修正纵向偏移
|
|
|
- fix = varf[vid]
|
|
|
- for vch in var[vid]: # 排版公式字符
|
|
|
- vc = chr(vch.cid)
|
|
|
- ops += f"/{self.fontid[vch.font]} {vch.size:f} Tf 1 0 0 1 {x + vch.x0 - var[vid][0].x0:f} {fix + y + vch.y0 - var[vid][0].y0:f} Tm [<{raw_string(self.fontid[vch.font], vc)}>] TJ " # noqa: E501
|
|
|
- if log.isEnabledFor(logging.DEBUG):
|
|
|
- lstk.append(
|
|
|
- LTLine(
|
|
|
- 0.1,
|
|
|
- (_x, _y),
|
|
|
- (
|
|
|
- x + vch.x0 - var[vid][0].x0,
|
|
|
- fix + y + vch.y0 - var[vid][0].y0,
|
|
|
- ),
|
|
|
- )
|
|
|
- )
|
|
|
- _x, _y = (
|
|
|
- x + vch.x0 - var[vid][0].x0,
|
|
|
- fix + y + vch.y0 - var[vid][0].y0,
|
|
|
- )
|
|
|
- for l in varl[vid]: # 排版公式线条 # noqa: E741
|
|
|
- if l.linewidth < 5: # hack
|
|
|
- ops += f"ET q 1 0 0 1 {l.pts[0][0] + x - var[vid][0].x0:f} {l.pts[0][1] + fix + y - var[vid][0].y0:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT " # noqa: E501
|
|
|
- else: # 插入文字缓冲区
|
|
|
- if not cstk: # 单行开头
|
|
|
- tx = x
|
|
|
- if x == lt and ch == " ": # 消除段落换行空格
|
|
|
- adv = 0
|
|
|
- else:
|
|
|
- cstk += ch
|
|
|
+ log.exception(e, exc_info=False)
|
|
|
+ raise e
|
|
|
+ with concurrent.futures.ThreadPoolExecutor(
|
|
|
+ max_workers=self.thread
|
|
|
+ ) as executor:
|
|
|
+ news = list(executor.map(worker, sstk))
|
|
|
+
|
|
|
+ ############################################################
|
|
|
+ # C. 新文档排版
|
|
|
+ def raw_string(fcur, cstk): # 编码字符串
|
|
|
+ if isinstance(self.fontmap[fcur], PDFCIDFont): # 判断编码长度
|
|
|
+ return "".join(["%04x" % ord(c) for c in cstk])
|
|
|
+ else:
|
|
|
+ return "".join(["%02x" % ord(c) for c in cstk])
|
|
|
+ _x, _y = 0, 0
|
|
|
+ for id, new in enumerate(news):
|
|
|
+ tx = x = pstk[id][1] # 段落初始横坐标
|
|
|
+ y = pstk[id][0] # 段落上边界
|
|
|
+ lt = pstk[id][2] # 段落左边界
|
|
|
+ rt = pstk[id][3] # 段落右边界
|
|
|
+ size = pstk[id][4] # 段落字体大小
|
|
|
+ font = pstk[id][5] # 段落字体
|
|
|
+ lb = pstk[id][6] # 段落属性
|
|
|
+ cstk = "" # 当前文字栈
|
|
|
+ fcur = fcur_ = None # 当前字体
|
|
|
+ ptr = 0
|
|
|
+ log.debug(f"< {y} {x} {lt} {rt} {size} {font.fontname} {lb} > {sstk[id]} | {new}")
|
|
|
+ while True:
|
|
|
+ if ptr == len(new): # 到达段落结尾
|
|
|
+ if cstk:
|
|
|
+ ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
|
|
|
+ break
|
|
|
+ vy_regex = re.match(
|
|
|
+ r"\$?\s*v([\d\s]+)\$", new[ptr:], re.IGNORECASE
|
|
|
+ ) # 匹配 $vn$ 公式标记,前面的 $ 有的时候会被丢掉
|
|
|
+ mod = False # 当前公式是否为文字修饰符
|
|
|
+ if vy_regex: # 加载公式
|
|
|
+ ptr += len(vy_regex.group(0))
|
|
|
+ try:
|
|
|
+ vid = int(vy_regex.group(1).replace(" ", ""))
|
|
|
+ adv = vlen[vid]
|
|
|
+ except Exception:
|
|
|
+ continue # 翻译器可能会自动补个越界的公式标记
|
|
|
+ if len(var[vid]) == 1 and unicodedata.category(
|
|
|
+ var[vid][0].get_text()[0]
|
|
|
+ ) in ["Lm","Mn","Sk",]: # 文字修饰符
|
|
|
+ mod = True
|
|
|
+ else: # 加载文字
|
|
|
+ ch = new[ptr]
|
|
|
+ # if font.char_width(ord(ch)):
|
|
|
+ fcur_ = None
|
|
|
+ # 原字体编码容易出问题,这里直接放弃掉
|
|
|
+ # try:
|
|
|
+ # if font.widths.get(ord(ch)) and font.to_unichr(ord(ch))==ch:
|
|
|
+ # fcur_=self.fontid[font] # 原字体
|
|
|
+ # except:
|
|
|
+ # pass
|
|
|
+ try:
|
|
|
+ if fcur_ is None and self.fontmap["tiro"].to_unichr(ord(ch)) == ch:
|
|
|
+ fcur_ = "tiro" # 默认英文字体
|
|
|
+ except Exception:
|
|
|
+ pass
|
|
|
+ if fcur_ is None:
|
|
|
+ fcur_ = "china-ss" # 默认中文字体
|
|
|
+ # print(self.fontid[font],fcur_,ch,font.char_width(ord(ch)))
|
|
|
+ adv = self.fontmap[fcur_].char_width(ord(ch)) * size
|
|
|
+ ptr += 1
|
|
|
+ if ( # 输出文字缓冲区
|
|
|
+ fcur_ != fcur # 1. 字体更新
|
|
|
+ or vy_regex # 2. 插入公式
|
|
|
+ or x + adv > rt + 0.1 * size # 3. 到达右边界(可能一整行都被符号化,这里需要考虑浮点误差)
|
|
|
+ ):
|
|
|
+ if cstk:
|
|
|
+ ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
|
|
|
+ cstk = ""
|
|
|
+ if lb and x + adv > rt + 0.1 * size: # 到达右边界且原文段落存在换行
|
|
|
+ x = lt
|
|
|
+ lang_space = {"zh-CN": 1.4,"zh-TW": 1.4,"ja": 1.1,"ko": 1.2,"en": 1.2} # CJK
|
|
|
+ y -= size * lang_space.get(self.translator.lang_out, 1.1) # 小语种大多适配 1.1
|
|
|
+ if vy_regex: # 插入公式
|
|
|
+ fix = 0
|
|
|
+ if fcur is not None: # 段落内公式修正纵向偏移
|
|
|
+ fix = varf[vid]
|
|
|
+ for vch in var[vid]: # 排版公式字符
|
|
|
+ vc = chr(vch.cid)
|
|
|
+ ops += f"/{self.fontid[vch.font]} {vch.size:f} Tf 1 0 0 1 {x + vch.x0 - var[vid][0].x0:f} {fix + y + vch.y0 - var[vid][0].y0:f} Tm [<{raw_string(self.fontid[vch.font], vc)}>] TJ " # noqa: E501
|
|
|
+ if log.isEnabledFor(logging.DEBUG):
|
|
|
+ lstk.append(LTLine(0.1,(_x, _y),(x + vch.x0 - var[vid][0].x0,fix + y + vch.y0 - var[vid][0].y0,)))
|
|
|
+ _x, _y = x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0
|
|
|
+ for l in varl[vid]: # 排版公式线条 # noqa: E741
|
|
|
+ if l.linewidth < 5: # hack 有的文档会用粗线条当图片背景
|
|
|
+ ops += f"ET q 1 0 0 1 {l.pts[0][0] + x - var[vid][0].x0:f} {l.pts[0][1] + fix + y - var[vid][0].y0:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT " # noqa: E501
|
|
|
+ else: # 插入文字缓冲区
|
|
|
+ if not cstk: # 单行开头
|
|
|
+ tx = x
|
|
|
+ if x == lt and ch == " ": # 消除段落换行空格
|
|
|
+ adv = 0
|
|
|
else:
|
|
|
cstk += ch
|
|
|
- if mod: # 文字修饰符
|
|
|
- adv = 0
|
|
|
- fcur = fcur_
|
|
|
- x += adv
|
|
|
- if log.isEnabledFor(logging.DEBUG):
|
|
|
- lstk.append(LTLine(0.1, (_x, _y), (x, y)))
|
|
|
- _x, _y = x, y
|
|
|
- for l in lstk: # 排版全局线条 # noqa: E741
|
|
|
- if l.linewidth < 5: # hack
|
|
|
- ops += f"ET q 1 0 0 1 {l.pts[0][0]:f} {l.pts[0][1]:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT " # noqa: E501
|
|
|
- ops = f"BT {ops}ET "
|
|
|
- return ops
|
|
|
-
|
|
|
- ops = render(ltpage)
|
|
|
+ else:
|
|
|
+ cstk += ch
|
|
|
+ if mod: # 文字修饰符
|
|
|
+ adv = 0
|
|
|
+ fcur = fcur_
|
|
|
+ x += adv
|
|
|
+ if log.isEnabledFor(logging.DEBUG):
|
|
|
+ lstk.append(LTLine(0.1, (_x, _y), (x, y)))
|
|
|
+ _x, _y = x, y
|
|
|
+ for l in lstk: # 排版全局线条 # noqa: E741
|
|
|
+ if l.linewidth < 5: # hack 有的文档会用粗线条当图片背景
|
|
|
+ ops += f"ET q 1 0 0 1 {l.pts[0][0]:f} {l.pts[0][1]:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT " # noqa: E501
|
|
|
+ ops = f"BT {ops}ET "
|
|
|
return ops
|
|
|
|
|
|
# Some dummy functions to save memory/CPU when all that is wanted
|