|
|
@@ -1,3 +1,45 @@
|
|
|
+from pdf2zh.utils import (
|
|
|
+ AnyIO,
|
|
|
+ Matrix,
|
|
|
+ PathSegment,
|
|
|
+ Point,
|
|
|
+ Rect,
|
|
|
+ apply_matrix_pt,
|
|
|
+ bbox2str,
|
|
|
+ enc,
|
|
|
+ make_compat_str,
|
|
|
+ mult_matrix,
|
|
|
+ matrix_scale,
|
|
|
+)
|
|
|
+from pdf2zh.pdftypes import PDFStream
|
|
|
+from pdf2zh.pdfpage import PDFPage
|
|
|
+from pdf2zh.pdfinterp import PDFGraphicState, PDFResourceManager
|
|
|
+from pdf2zh.pdffont import PDFFont, PDFUnicodeNotDefined, PDFCIDFont
|
|
|
+from pdf2zh.pdfexceptions import PDFValueError
|
|
|
+from pdf2zh.pdfdevice import PDFTextDevice
|
|
|
+from pdf2zh.pdfcolor import PDFColorSpace
|
|
|
+from pdf2zh.layout import (
|
|
|
+ LAParams,
|
|
|
+ LTAnno,
|
|
|
+ LTChar,
|
|
|
+ LTComponent,
|
|
|
+ LTCurve,
|
|
|
+ LTFigure,
|
|
|
+ LTImage,
|
|
|
+ LTItem,
|
|
|
+ LTLayoutContainer,
|
|
|
+ LTLine,
|
|
|
+ LTPage,
|
|
|
+ LTRect,
|
|
|
+ LTText,
|
|
|
+ LTTextBox,
|
|
|
+ LTTextBoxVertical,
|
|
|
+ LTTextGroup,
|
|
|
+ LTTextLine,
|
|
|
+ TextGroupElement,
|
|
|
+)
|
|
|
+from pdf2zh.image import ImageWriter
|
|
|
+from pdf2zh import utils
|
|
|
import io
|
|
|
import logging
|
|
|
import re
|
|
|
@@ -28,55 +70,15 @@ from pdf2zh.translator import (
|
|
|
OpenAITranslator,
|
|
|
AzureTranslator,
|
|
|
)
|
|
|
+
|
|
|
+
|
|
|
def remove_control_characters(s):
|
|
|
- return "".join(ch for ch in s if unicodedata.category(ch)[0]!="C")
|
|
|
+ return "".join(ch for ch in s if unicodedata.category(ch)[0] != "C")
|
|
|
|
|
|
-from pdf2zh import utils
|
|
|
-from pdf2zh.image import ImageWriter
|
|
|
-from pdf2zh.layout import (
|
|
|
- LAParams,
|
|
|
- LTAnno,
|
|
|
- LTChar,
|
|
|
- LTComponent,
|
|
|
- LTContainer,
|
|
|
- LTCurve,
|
|
|
- LTFigure,
|
|
|
- LTImage,
|
|
|
- LTItem,
|
|
|
- LTLayoutContainer,
|
|
|
- LTLine,
|
|
|
- LTPage,
|
|
|
- LTRect,
|
|
|
- LTText,
|
|
|
- LTTextBox,
|
|
|
- LTTextBoxVertical,
|
|
|
- LTTextGroup,
|
|
|
- LTTextLine,
|
|
|
- TextGroupElement,
|
|
|
-)
|
|
|
-from pdf2zh.pdfcolor import PDFColorSpace
|
|
|
-from pdf2zh.pdfdevice import PDFTextDevice
|
|
|
-from pdf2zh.pdfexceptions import PDFValueError
|
|
|
-from pdf2zh.pdffont import PDFFont, PDFUnicodeNotDefined, PDFCIDFont
|
|
|
-from pdf2zh.pdfinterp import PDFGraphicState, PDFResourceManager
|
|
|
-from pdf2zh.pdfpage import PDFPage
|
|
|
-from pdf2zh.pdftypes import PDFStream
|
|
|
-from pdf2zh.utils import (
|
|
|
- AnyIO,
|
|
|
- Matrix,
|
|
|
- PathSegment,
|
|
|
- Point,
|
|
|
- Rect,
|
|
|
- apply_matrix_pt,
|
|
|
- bbox2str,
|
|
|
- enc,
|
|
|
- make_compat_str,
|
|
|
- mult_matrix,
|
|
|
- matrix_scale,
|
|
|
-)
|
|
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
|
|
+
|
|
|
class PDFLayoutAnalyzer(PDFTextDevice):
|
|
|
cur_item: LTLayoutContainer
|
|
|
ctm: Matrix
|
|
|
@@ -188,7 +190,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
|
|
# Note: 'ml', in conditional above, is a frequent anomaly
|
|
|
# that we want to support.
|
|
|
line = LTLine(
|
|
|
- gstate.linewidth*matrix_scale(self.ctm),
|
|
|
+ gstate.linewidth * matrix_scale(self.ctm),
|
|
|
pts[0],
|
|
|
pts[1],
|
|
|
stroke,
|
|
|
@@ -210,7 +212,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
|
|
) or (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)
|
|
|
if is_closed_loop and has_square_coordinates:
|
|
|
rect = LTRect(
|
|
|
- gstate.linewidth*matrix_scale(self.ctm),
|
|
|
+ gstate.linewidth * matrix_scale(self.ctm),
|
|
|
(*pts[0], *pts[2]),
|
|
|
stroke,
|
|
|
fill,
|
|
|
@@ -223,7 +225,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
|
|
self.cur_item.add(rect)
|
|
|
else:
|
|
|
curve = LTCurve(
|
|
|
- gstate.linewidth*matrix_scale(self.ctm),
|
|
|
+ gstate.linewidth * matrix_scale(self.ctm),
|
|
|
pts,
|
|
|
stroke,
|
|
|
fill,
|
|
|
@@ -236,7 +238,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
|
|
self.cur_item.add(curve)
|
|
|
else:
|
|
|
curve = LTCurve(
|
|
|
- gstate.linewidth*matrix_scale(self.ctm),
|
|
|
+ gstate.linewidth * matrix_scale(self.ctm),
|
|
|
pts,
|
|
|
stroke,
|
|
|
fill,
|
|
|
@@ -279,7 +281,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
|
|
graphicstate,
|
|
|
)
|
|
|
self.cur_item.add(item)
|
|
|
- item.cid=cid # hack
|
|
|
+ item.cid = cid # hack
|
|
|
return item.adv
|
|
|
|
|
|
def handle_undefined_char(self, font: PDFFont, cid: int) -> str:
|
|
|
@@ -355,7 +357,7 @@ class TextConverter(PDFConverter[AnyIO]):
|
|
|
vfont: str = None,
|
|
|
vchar: str = None,
|
|
|
thread: int = 0,
|
|
|
- layout = {},
|
|
|
+ layout={},
|
|
|
lang_in: str = "",
|
|
|
lang_out: str = "",
|
|
|
service: str = "",
|
|
|
@@ -367,7 +369,7 @@ class TextConverter(PDFConverter[AnyIO]):
|
|
|
self.vchar = vchar
|
|
|
self.thread = thread
|
|
|
self.layout = layout
|
|
|
- param=service.split(':',1)
|
|
|
+ param = service.split(":", 1)
|
|
|
if param[0] == "google":
|
|
|
self.translator: BaseTranslator = GoogleTranslator(
|
|
|
service, lang_out, lang_in, None
|
|
|
@@ -384,11 +386,11 @@ class TextConverter(PDFConverter[AnyIO]):
|
|
|
self.translator: BaseTranslator = OllamaTranslator(
|
|
|
service, lang_out, lang_in, param[1]
|
|
|
)
|
|
|
- elif param[0] == 'openai':
|
|
|
+ elif param[0] == "openai":
|
|
|
self.translator: BaseTranslator = OpenAITranslator(
|
|
|
service, lang_out, lang_in, param[1]
|
|
|
)
|
|
|
- elif param[0] == 'azure':
|
|
|
+ elif param[0] == "azure":
|
|
|
self.translator: BaseTranslator = AzureTranslator(
|
|
|
service, lang_out, lang_in, None
|
|
|
)
|
|
|
@@ -404,173 +406,255 @@ class TextConverter(PDFConverter[AnyIO]):
|
|
|
|
|
|
def receive_layout(self, ltpage: LTPage):
|
|
|
def render(item: LTItem) -> None:
|
|
|
- xt=None # 上一个字符
|
|
|
- sstk=[] # 段落文字栈
|
|
|
- vstk=[] # 公式符号组
|
|
|
- vlstk=[] # 公式线条组
|
|
|
- vfix=0 # 公式纵向偏移
|
|
|
- vbkt=0 # 段落公式括号计数
|
|
|
- pstk=[] # 段落属性栈
|
|
|
- lstk=[] # 全局线条栈
|
|
|
- var=[] # 公式符号组栈
|
|
|
- varl=[] # 公式线条组栈
|
|
|
- varf=[] # 公式纵向偏移栈
|
|
|
- vlen=[] # 公式宽度栈
|
|
|
- xt_cls=-1 # 上一个字符所属段落
|
|
|
- vmax=ltpage.width/4 # 行内公式最大宽度
|
|
|
- ops="" # 渲染结果
|
|
|
- def vflag(font,char): # 匹配公式(和角标)字体
|
|
|
- if re.match(r'\(cid:',char):
|
|
|
+ xt = None # 上一个字符
|
|
|
+ sstk = [] # 段落文字栈
|
|
|
+ vstk = [] # 公式符号组
|
|
|
+ vlstk = [] # 公式线条组
|
|
|
+ vfix = 0 # 公式纵向偏移
|
|
|
+ vbkt = 0 # 段落公式括号计数
|
|
|
+ pstk = [] # 段落属性栈
|
|
|
+ lstk = [] # 全局线条栈
|
|
|
+ var = [] # 公式符号组栈
|
|
|
+ varl = [] # 公式线条组栈
|
|
|
+ varf = [] # 公式纵向偏移栈
|
|
|
+ vlen = [] # 公式宽度栈
|
|
|
+ xt_cls = -1 # 上一个字符所属段落
|
|
|
+ vmax = ltpage.width / 4 # 行内公式最大宽度
|
|
|
+ ops = "" # 渲染结果
|
|
|
+
|
|
|
+ def vflag(font, char): # 匹配公式(和角标)字体
|
|
|
+ if re.match(r"\(cid:", char):
|
|
|
return True
|
|
|
if self.vfont:
|
|
|
- if re.match(self.vfont,font):
|
|
|
+ if re.match(self.vfont, font):
|
|
|
return True
|
|
|
else:
|
|
|
- if re.match(r'(CM[^R]|MS|XY|MT|BL|RM|EU|LA|RS|LINE|TeX-|rsfs|txsy|wasy|.*Mono|.*Code|.*Ital|.*Sym)',font):
|
|
|
+ if re.match(
|
|
|
+ r"(CM[^R]|MS|XY|MT|BL|RM|EU|LA|RS|LINE|TeX-|rsfs|txsy|wasy|.*Mono|.*Code|.*Ital|.*Sym)",
|
|
|
+ font,
|
|
|
+ ):
|
|
|
return True
|
|
|
if self.vchar:
|
|
|
- if re.match(self.vchar,char):
|
|
|
+ if re.match(self.vchar, char):
|
|
|
return True
|
|
|
else:
|
|
|
- if char and char!=' ' and (unicodedata.category(char[0]) in ['Lm','Mn','Sk','Sm','Zl','Zp','Zs'] or ord(char[0]) in range(0x370,0x400)): # 文字修饰符、数学符号、分隔符号、希腊字母
|
|
|
+ if (
|
|
|
+ char
|
|
|
+ and char != " "
|
|
|
+ and (
|
|
|
+ unicodedata.category(char[0])
|
|
|
+ in ["Lm", "Mn", "Sk", "Sm", "Zl", "Zp", "Zs"]
|
|
|
+ or ord(char[0]) in range(0x370, 0x400)
|
|
|
+ )
|
|
|
+ ): # 文字修饰符、数学符号、分隔符号、希腊字母
|
|
|
return True
|
|
|
return False
|
|
|
- ptr=0
|
|
|
- item=list(item)
|
|
|
- while ptr<len(item): # 识别文字和公式
|
|
|
- child=item[ptr]
|
|
|
+
|
|
|
+ ptr = 0
|
|
|
+ item = list(item)
|
|
|
+ while ptr < len(item): # 识别文字和公式
|
|
|
+ child = item[ptr]
|
|
|
if isinstance(child, LTChar):
|
|
|
- cur_v=False # 公式
|
|
|
- fontname=child.fontname.split('+')[-1]
|
|
|
- layout=self.layout[ltpage.pageid]
|
|
|
- h,w=layout.shape # ltpage.height 可能是 fig 里面的高度,这里统一用 layout.shape
|
|
|
- cx,cy=np.clip(int(child.x0),0,w-1),np.clip(int(child.y0),0,h-1)
|
|
|
- cls=layout[cy,cx]
|
|
|
+ cur_v = False # 公式
|
|
|
+ fontname = child.fontname.split("+")[-1]
|
|
|
+ layout = self.layout[ltpage.pageid]
|
|
|
+ h, w = (
|
|
|
+ layout.shape
|
|
|
+ ) # ltpage.height 可能是 fig 里面的高度,这里统一用 layout.shape
|
|
|
+ cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(
|
|
|
+ int(child.y0), 0, h - 1
|
|
|
+ )
|
|
|
+ cls = layout[cy, cx]
|
|
|
# if log.isEnabledFor(logging.DEBUG):
|
|
|
- # ops+=f'ET [] 0 d 0 J 0.1 w {child.x0:f} {child.y0:f} {child.x1-child.x0:f} {child.y1-child.y0:f} re S Q BT '
|
|
|
- if cls==0 or (cls==xt_cls and child.size<pstk[-1][4]*0.79) or vflag(fontname,child.get_text()) or (child.matrix[0]==0 and child.matrix[3]==0): # 有 0.76 的角标和 0.799 的大写,这里用 0.79 取中
|
|
|
- cur_v=True
|
|
|
- if not cur_v: # 判定括号组是否属于公式
|
|
|
- if vstk and child.get_text()=='(':
|
|
|
- cur_v=True
|
|
|
- vbkt+=1
|
|
|
- if vbkt and child.get_text()==')':
|
|
|
- cur_v=True
|
|
|
- vbkt-=1
|
|
|
- if not cur_v or cls!=xt_cls or (abs(child.x0-xt.x0)>vmax and cls!=0): # 公式结束、段落边界、公式换行
|
|
|
- if vstk: # 公式出栈
|
|
|
- sstk[-1]+=f'$v{len(var)}$'
|
|
|
- if not cur_v and cls==xt_cls and child.x0>max([vch.x0 for vch in vstk]): # and child.y1>vstk[0].y0: # 段落内公式转文字,行内公式修正
|
|
|
- vfix=vstk[0].y0-child.y0
|
|
|
+ # ops+=f'ET [] 0 d 0 J 0.1 w {child.x0:f}
|
|
|
+ # {child.y0:f} {child.x1-child.x0:f} {child.y1-child.y0:f} re S Q BT '
|
|
|
+ if (
|
|
|
+ cls == 0
|
|
|
+ or (cls == xt_cls and child.size < pstk[-1][4] * 0.79)
|
|
|
+ or vflag(fontname, child.get_text())
|
|
|
+ or (child.matrix[0] == 0 and child.matrix[3] == 0)
|
|
|
+ ): # 有 0.76 的角标和 0.799 的大写,这里用 0.79 取中
|
|
|
+ cur_v = True
|
|
|
+ if not cur_v: # 判定括号组是否属于公式
|
|
|
+ if vstk and child.get_text() == "(":
|
|
|
+ cur_v = True
|
|
|
+ vbkt += 1
|
|
|
+ if vbkt and child.get_text() == ")":
|
|
|
+ cur_v = True
|
|
|
+ vbkt -= 1
|
|
|
+ if (
|
|
|
+ not cur_v
|
|
|
+ or cls != xt_cls
|
|
|
+ or (abs(child.x0 - xt.x0) > vmax and cls != 0)
|
|
|
+ ): # 公式结束、段落边界、公式换行
|
|
|
+ if vstk: # 公式出栈
|
|
|
+ sstk[-1] += f"$v{len(var)}$"
|
|
|
+ if (
|
|
|
+ not cur_v
|
|
|
+ and cls == xt_cls
|
|
|
+ and child.x0 > max([vch.x0 for vch in vstk])
|
|
|
+ ): # and child.y1>vstk[0].y0: # 段落内公式转文字,行内公式修正
|
|
|
+ vfix = vstk[0].y0 - child.y0
|
|
|
var.append(vstk)
|
|
|
varl.append(vlstk)
|
|
|
varf.append(vfix)
|
|
|
- vstk=[]
|
|
|
- vlstk=[]
|
|
|
- vfix=0
|
|
|
- if not vstk: # 非公式或是公式开头
|
|
|
- if cls==xt_cls: # 同一段落
|
|
|
- if child.x0 > xt.x1 + 1: # 行内空格
|
|
|
- sstk[-1]+=' '
|
|
|
- elif child.x1 < xt.x0: # 换行空格
|
|
|
- sstk[-1]+=' '
|
|
|
- pstk[-1][6]=True # 标记原文段落存在换行
|
|
|
+ vstk = []
|
|
|
+ vlstk = []
|
|
|
+ vfix = 0
|
|
|
+ if not vstk: # 非公式或是公式开头
|
|
|
+ if cls == xt_cls: # 同一段落
|
|
|
+ if child.x0 > xt.x1 + 1: # 行内空格
|
|
|
+ sstk[-1] += " "
|
|
|
+ elif child.x1 < xt.x0: # 换行空格
|
|
|
+ sstk[-1] += " "
|
|
|
+ pstk[-1][6] = True # 标记原文段落存在换行
|
|
|
else:
|
|
|
sstk.append("")
|
|
|
- pstk.append([child.y0,child.x0,child.x0,child.x0,child.size,child.font,False])
|
|
|
- if not cur_v: # 文字入栈
|
|
|
- if child.size>pstk[-1][4]/0.79 or vflag(pstk[-1][5].fontname.split('+')[-1],'') or re.match(r'(.*Medi|.*Bold)',pstk[-1][5].fontname.split('+')[-1],re.IGNORECASE): # 小字体、公式或粗体开头,后续接文字,需要校正字体
|
|
|
- pstk[-1][0]-=child.size-pstk[-1][4]
|
|
|
- pstk[-1][4]=child.size
|
|
|
- pstk[-1][5]=child.font
|
|
|
- sstk[-1]+=child.get_text()
|
|
|
- else: # 公式入栈
|
|
|
- if not vstk and cls==xt_cls and child.x0>xt.x0: # and child.y1>xt.y0: # 段落内文字转公式,行内公式修正
|
|
|
- vfix=child.y0-xt.y0
|
|
|
+ pstk.append(
|
|
|
+ [
|
|
|
+ child.y0,
|
|
|
+ child.x0,
|
|
|
+ child.x0,
|
|
|
+ child.x0,
|
|
|
+ child.size,
|
|
|
+ child.font,
|
|
|
+ False,
|
|
|
+ ]
|
|
|
+ )
|
|
|
+ if not cur_v: # 文字入栈
|
|
|
+ if (
|
|
|
+ child.size > pstk[-1][4] / 0.79
|
|
|
+ or vflag(pstk[-1][5].fontname.split("+")[-1], "")
|
|
|
+ or re.match(
|
|
|
+ r"(.*Medi|.*Bold)",
|
|
|
+ pstk[-1][5].fontname.split("+")[-1],
|
|
|
+ re.IGNORECASE,
|
|
|
+ )
|
|
|
+ ): # 小字体、公式或粗体开头,后续接文字,需要校正字体
|
|
|
+ pstk[-1][0] -= child.size - pstk[-1][4]
|
|
|
+ pstk[-1][4] = child.size
|
|
|
+ pstk[-1][5] = child.font
|
|
|
+ sstk[-1] += child.get_text()
|
|
|
+ else: # 公式入栈
|
|
|
+ if (
|
|
|
+ not vstk and cls == xt_cls and child.x0 > xt.x0
|
|
|
+ ): # and child.y1>xt.y0: # 段落内文字转公式,行内公式修正
|
|
|
+ vfix = child.y0 - xt.y0
|
|
|
vstk.append(child)
|
|
|
# 更新段落边界,段落内换行之后可能是公式开头
|
|
|
- pstk[-1][2]=min(pstk[-1][2],child.x0)
|
|
|
- pstk[-1][3]=max(pstk[-1][3],child.x1)
|
|
|
- xt=child
|
|
|
- xt_cls=cls
|
|
|
- elif isinstance(child, LTFigure): # 图表
|
|
|
+ pstk[-1][2] = min(pstk[-1][2], child.x0)
|
|
|
+ pstk[-1][3] = max(pstk[-1][3], child.x1)
|
|
|
+ xt = child
|
|
|
+ xt_cls = cls
|
|
|
+ elif isinstance(child, LTFigure): # 图表
|
|
|
pass
|
|
|
- elif isinstance(child, LTLine): # 线条
|
|
|
- layout=self.layout[ltpage.pageid]
|
|
|
- h,w=layout.shape # ltpage.height 可能是 fig 里面的高度,这里统一用 layout.shape
|
|
|
- cx,cy=np.clip(int(child.x0),0,w-1),np.clip(int(child.y0),0,h-1)
|
|
|
- cls=layout[cy,cx]
|
|
|
- if vstk and cls==xt_cls: # 公式线条
|
|
|
+ elif isinstance(child, LTLine): # 线条
|
|
|
+ layout = self.layout[ltpage.pageid]
|
|
|
+ h, w = (
|
|
|
+ layout.shape
|
|
|
+ ) # ltpage.height 可能是 fig 里面的高度,这里统一用 layout.shape
|
|
|
+ cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(
|
|
|
+ int(child.y0), 0, h - 1
|
|
|
+ )
|
|
|
+ cls = layout[cy, cx]
|
|
|
+ if vstk and cls == xt_cls: # 公式线条
|
|
|
vlstk.append(child)
|
|
|
- else: # 全局线条
|
|
|
+ else: # 全局线条
|
|
|
lstk.append(child)
|
|
|
else:
|
|
|
# print(child)
|
|
|
pass
|
|
|
- ptr+=1
|
|
|
+ ptr += 1
|
|
|
# 处理结尾
|
|
|
- if vstk: # 公式出栈
|
|
|
- sstk[-1]+=f'$v{len(var)}$'
|
|
|
+ if vstk: # 公式出栈
|
|
|
+ sstk[-1] += f"$v{len(var)}$"
|
|
|
var.append(vstk)
|
|
|
varl.append(vlstk)
|
|
|
varf.append(vfix)
|
|
|
- log.debug('\n==========[VSTACK]==========\n')
|
|
|
- for id,v in enumerate(var): # 计算公式宽度
|
|
|
- l=max([vch.x1 for vch in v])-v[0].x0
|
|
|
- log.debug(f'< {l:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[id])} > $v{id}$ = {"".join([ch.get_text() for ch in v])}')
|
|
|
+ log.debug("\n==========[VSTACK]==========\n")
|
|
|
+ for id, v in enumerate(var): # 计算公式宽度
|
|
|
+ l = max([vch.x1 for vch in v]) - v[0].x0 # noqa: E741
|
|
|
+ log.debug(
|
|
|
+ f'< {l:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[id])} > $v{id}$ = {"".join([ch.get_text() for ch in v])}' # noqa: E501
|
|
|
+ )
|
|
|
vlen.append(l)
|
|
|
- log.debug('\n==========[SSTACK]==========\n')
|
|
|
- hash_key=cache.deterministic_hash("PDFMathTranslate")
|
|
|
+ log.debug("\n==========[SSTACK]==========\n")
|
|
|
+ hash_key = cache.deterministic_hash("PDFMathTranslate")
|
|
|
cache.create_cache(hash_key)
|
|
|
+
|
|
|
@retry(wait=wait_fixed(1))
|
|
|
- def worker(s): # 多线程翻译
|
|
|
+ def worker(s): # 多线程翻译
|
|
|
try:
|
|
|
- hash_key_paragraph = cache.deterministic_hash((s,str(self.translator)))
|
|
|
- new = cache.load_paragraph(hash_key, hash_key_paragraph) # 查询缓存
|
|
|
+ hash_key_paragraph = cache.deterministic_hash(
|
|
|
+ (s, str(self.translator))
|
|
|
+ )
|
|
|
+ new = cache.load_paragraph(hash_key, hash_key_paragraph) # 查询缓存
|
|
|
if new is None:
|
|
|
- new=self.translator.translate(s)
|
|
|
- new=remove_control_characters(new)
|
|
|
+ new = self.translator.translate(s)
|
|
|
+ new = remove_control_characters(new)
|
|
|
cache.write_paragraph(hash_key, hash_key_paragraph, new)
|
|
|
return new
|
|
|
except BaseException as e:
|
|
|
if log.isEnabledFor(logging.DEBUG):
|
|
|
log.exception(e)
|
|
|
else:
|
|
|
- log.exception(e,exc_info=False)
|
|
|
+ log.exception(e, exc_info=False)
|
|
|
raise e
|
|
|
- with concurrent.futures.ThreadPoolExecutor(max_workers=self.thread) as executor:
|
|
|
+
|
|
|
+ with concurrent.futures.ThreadPoolExecutor(
|
|
|
+ max_workers=self.thread
|
|
|
+ ) as executor:
|
|
|
news = list(executor.map(worker, sstk))
|
|
|
- def raw_string(fcur,cstk): # 编码字符串
|
|
|
- if isinstance(self.fontmap[fcur],PDFCIDFont): # 判断编码长度
|
|
|
+
|
|
|
+ def raw_string(fcur, cstk): # 编码字符串
|
|
|
+ if isinstance(self.fontmap[fcur], PDFCIDFont): # 判断编码长度
|
|
|
return "".join(["%04x" % ord(c) for c in cstk])
|
|
|
else:
|
|
|
return "".join(["%02x" % ord(c) for c in cstk])
|
|
|
- _x,_y=0,0
|
|
|
- for id,new in enumerate(news): # 排版文字和公式
|
|
|
- tx=x=pstk[id][1];y=pstk[id][0];lt=pstk[id][2];rt=pstk[id][3];ptr=0;size=pstk[id][4];font=pstk[id][5];lb=pstk[id][6] # 段落属性
|
|
|
- cstk='' # 单行文字栈
|
|
|
- fcur=fcur_=None # 单行字体
|
|
|
- log.debug(f"< {y} {x} {lt} {rt} {size} {font.fontname} {lb} > {sstk[id]} | {new}")
|
|
|
+
|
|
|
+ _x, _y = 0, 0
|
|
|
+ for id, new in enumerate(news): # 排版文字和公式
|
|
|
+ tx = x = pstk[id][1]
|
|
|
+ y = pstk[id][0]
|
|
|
+ lt = pstk[id][2]
|
|
|
+ rt = pstk[id][3]
|
|
|
+ ptr = 0
|
|
|
+ size = pstk[id][4]
|
|
|
+ font = pstk[id][5]
|
|
|
+ lb = pstk[id][6] # 段落属性
|
|
|
+ cstk = "" # 单行文字栈
|
|
|
+ fcur = fcur_ = None # 单行字体
|
|
|
+ log.debug(
|
|
|
+ f"< {y} {x} {lt} {rt} {size} {font.fontname} {lb} > {sstk[id]} | {new}"
|
|
|
+ )
|
|
|
while True:
|
|
|
- if ptr==len(new): # 到达段落结尾
|
|
|
+ if ptr == len(new): # 到达段落结尾
|
|
|
if cstk:
|
|
|
- ops+=f'/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur,cstk)}>] TJ '
|
|
|
+ ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
|
|
|
break
|
|
|
- vy_regex=re.match(r'\$?\s*v([\d\s]+)\$',new[ptr:],re.IGNORECASE) # 匹配 $vn$ 公式标记,前面的 $ 有的时候会被丢掉
|
|
|
- mod=False # 当前公式是否为文字修饰符
|
|
|
- if vy_regex: # 加载公式
|
|
|
- ptr+=len(vy_regex.group(0))
|
|
|
+ vy_regex = re.match(
|
|
|
+ r"\$?\s*v([\d\s]+)\$", new[ptr:], re.IGNORECASE
|
|
|
+ ) # 匹配 $vn$ 公式标记,前面的 $ 有的时候会被丢掉
|
|
|
+ mod = False # 当前公式是否为文字修饰符
|
|
|
+ if vy_regex: # 加载公式
|
|
|
+ ptr += len(vy_regex.group(0))
|
|
|
try:
|
|
|
- vid=int(vy_regex.group(1).replace(' ',''))
|
|
|
- adv=vlen[vid]
|
|
|
- except:
|
|
|
- continue # 翻译器可能会自动补个越界的公式标记
|
|
|
- if len(var[vid])==1 and unicodedata.category(var[vid][0].get_text()[0]) in ['Lm','Mn','Sk']: # 文字修饰符
|
|
|
- mod=True
|
|
|
- else: # 加载文字
|
|
|
- ch=new[ptr]
|
|
|
+ vid = int(vy_regex.group(1).replace(" ", ""))
|
|
|
+ adv = vlen[vid]
|
|
|
+ except Exception:
|
|
|
+ continue # 翻译器可能会自动补个越界的公式标记
|
|
|
+ if len(var[vid]) == 1 and unicodedata.category(
|
|
|
+ var[vid][0].get_text()[0]
|
|
|
+ ) in [
|
|
|
+ "Lm",
|
|
|
+ "Mn",
|
|
|
+ "Sk",
|
|
|
+ ]: # 文字修饰符
|
|
|
+ mod = True
|
|
|
+ else: # 加载文字
|
|
|
+ ch = new[ptr]
|
|
|
# if font.char_width(ord(ch)):
|
|
|
- fcur_=None
|
|
|
+ fcur_ = None
|
|
|
# 原字体编码容易出问题,这里直接放弃掉
|
|
|
# try:
|
|
|
# if font.widths.get(ord(ch)) and font.to_unichr(ord(ch))==ch:
|
|
|
@@ -578,58 +662,84 @@ class TextConverter(PDFConverter[AnyIO]):
|
|
|
# except:
|
|
|
# pass
|
|
|
try:
|
|
|
- if fcur_==None and self.fontmap['tiro'].to_unichr(ord(ch))==ch:
|
|
|
- fcur_='tiro' # 默认英文字体
|
|
|
- except:
|
|
|
+ if (
|
|
|
+ fcur_ is None
|
|
|
+ and self.fontmap["tiro"].to_unichr(ord(ch)) == ch
|
|
|
+ ):
|
|
|
+ fcur_ = "tiro" # 默认英文字体
|
|
|
+ except Exception:
|
|
|
pass
|
|
|
- if fcur_==None:
|
|
|
- fcur_='china-ss' # 默认中文字体
|
|
|
+ if fcur_ is None:
|
|
|
+ fcur_ = "china-ss" # 默认中文字体
|
|
|
# print(self.fontid[font],fcur_,ch,font.char_width(ord(ch)))
|
|
|
- adv=self.fontmap[fcur_].char_width(ord(ch))*size
|
|
|
- ptr+=1
|
|
|
- if fcur_!=fcur or vy_regex or x+adv>rt+0.1*size: # 输出文字缓冲区:1.字体更新 2.插入公式 3.到达右边界(可能一整行都被符号化,这里需要考虑浮点误差)
|
|
|
+ adv = self.fontmap[fcur_].char_width(ord(ch)) * size
|
|
|
+ ptr += 1
|
|
|
+ if (
|
|
|
+ fcur_ != fcur or vy_regex or x + adv > rt + 0.1 * size
|
|
|
+ ): # 输出文字缓冲区:1.字体更新 2.插入公式 3.到达右边界(可能一整行都被符号化,这里需要考虑浮点误差)
|
|
|
if cstk:
|
|
|
- ops+=f'/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur,cstk)}>] TJ '
|
|
|
- cstk=''
|
|
|
- if lb and x+adv>rt+0.1*size: # 到达右边界且原文段落存在换行
|
|
|
- x=lt
|
|
|
- lang_space={'zh-CN':1.4,'zh-TW':1.4,'ja':1.1,'ko':1.2,'en':1.2} # CJK
|
|
|
- y-=size*lang_space.get(self.translator.lang_out,1.1) # 小语种大多适配 1.1
|
|
|
- if vy_regex: # 插入公式
|
|
|
- fix=0
|
|
|
- if fcur!=None: # 段落内公式修正纵向偏移
|
|
|
- fix=varf[vid]
|
|
|
- for vch in var[vid]: # 排版公式字符
|
|
|
- vc=chr(vch.cid)
|
|
|
- ops+=f"/{self.fontid[vch.font]} {vch.size:f} Tf 1 0 0 1 {x+vch.x0-var[vid][0].x0:f} {fix+y+vch.y0-var[vid][0].y0:f} Tm [<{raw_string(self.fontid[vch.font],vc)}>] TJ "
|
|
|
+ ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
|
|
|
+ cstk = ""
|
|
|
+ if lb and x + adv > rt + 0.1 * size: # 到达右边界且原文段落存在换行
|
|
|
+ x = lt
|
|
|
+ lang_space = {
|
|
|
+ "zh-CN": 1.4,
|
|
|
+ "zh-TW": 1.4,
|
|
|
+ "ja": 1.1,
|
|
|
+ "ko": 1.2,
|
|
|
+ "en": 1.2,
|
|
|
+ } # CJK
|
|
|
+ y -= size * lang_space.get(
|
|
|
+ self.translator.lang_out, 1.1
|
|
|
+ ) # 小语种大多适配 1.1
|
|
|
+ if vy_regex: # 插入公式
|
|
|
+ fix = 0
|
|
|
+ if fcur is not None: # 段落内公式修正纵向偏移
|
|
|
+ fix = varf[vid]
|
|
|
+ for vch in var[vid]: # 排版公式字符
|
|
|
+ vc = chr(vch.cid)
|
|
|
+ ops += f"/{self.fontid[vch.font]} {vch.size:f} Tf 1 0 0 1 {x + vch.x0 - var[vid][0].x0:f} {fix + y + vch.y0 - var[vid][0].y0:f} Tm [<{raw_string(self.fontid[vch.font], vc)}>] TJ " # noqa: E501
|
|
|
if log.isEnabledFor(logging.DEBUG):
|
|
|
- lstk.append(LTLine(0.1,(_x,_y),(x+vch.x0-var[vid][0].x0,fix+y+vch.y0-var[vid][0].y0)))
|
|
|
- _x,_y=x+vch.x0-var[vid][0].x0,fix+y+vch.y0-var[vid][0].y0
|
|
|
- for l in varl[vid]: # 排版公式线条
|
|
|
- if l.linewidth<5: # hack
|
|
|
- ops+=f"ET q 1 0 0 1 {l.pts[0][0]+x-var[vid][0].x0:f} {l.pts[0][1]+fix+y-var[vid][0].y0:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0]-l.pts[0][0]:f} {l.pts[1][1]-l.pts[0][1]:f} l S Q BT "
|
|
|
- else: # 插入文字缓冲区
|
|
|
- if not cstk: # 单行开头
|
|
|
- tx=x
|
|
|
- if x==lt and ch==' ': # 消除段落换行空格
|
|
|
- adv=0
|
|
|
+ lstk.append(
|
|
|
+ LTLine(
|
|
|
+ 0.1,
|
|
|
+ (_x, _y),
|
|
|
+ (
|
|
|
+ x + vch.x0 - var[vid][0].x0,
|
|
|
+ fix + y + vch.y0 - var[vid][0].y0,
|
|
|
+ ),
|
|
|
+ )
|
|
|
+ )
|
|
|
+ _x, _y = (
|
|
|
+ x + vch.x0 - var[vid][0].x0,
|
|
|
+ fix + y + vch.y0 - var[vid][0].y0,
|
|
|
+ )
|
|
|
+ for l in varl[vid]: # 排版公式线条 # noqa: E741
|
|
|
+ if l.linewidth < 5: # hack
|
|
|
+ ops += f"ET q 1 0 0 1 {l.pts[0][0] + x - var[vid][0].x0:f} {l.pts[0][1] + fix + y - var[vid][0].y0:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT " # noqa: E501
|
|
|
+ else: # 插入文字缓冲区
|
|
|
+ if not cstk: # 单行开头
|
|
|
+ tx = x
|
|
|
+ if x == lt and ch == " ": # 消除段落换行空格
|
|
|
+ adv = 0
|
|
|
else:
|
|
|
- cstk+=ch
|
|
|
+ cstk += ch
|
|
|
else:
|
|
|
- cstk+=ch
|
|
|
- if mod: # 文字修饰符
|
|
|
- adv=0
|
|
|
- fcur=fcur_
|
|
|
- x+=adv
|
|
|
+ cstk += ch
|
|
|
+ if mod: # 文字修饰符
|
|
|
+ adv = 0
|
|
|
+ fcur = fcur_
|
|
|
+ x += adv
|
|
|
if log.isEnabledFor(logging.DEBUG):
|
|
|
- lstk.append(LTLine(0.1,(_x,_y),(x,y)))
|
|
|
- _x,_y=x,y
|
|
|
- for l in lstk: # 排版全局线条
|
|
|
- if l.linewidth<5: # hack
|
|
|
- ops+=f"ET q 1 0 0 1 {l.pts[0][0]:f} {l.pts[0][1]:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0]-l.pts[0][0]:f} {l.pts[1][1]-l.pts[0][1]:f} l S Q BT "
|
|
|
- ops=f'BT {ops}ET '
|
|
|
+ lstk.append(LTLine(0.1, (_x, _y), (x, y)))
|
|
|
+ _x, _y = x, y
|
|
|
+ for l in lstk: # 排版全局线条 # noqa: E741
|
|
|
+ if l.linewidth < 5: # hack
|
|
|
+ ops += f"ET q 1 0 0 1 {l.pts[0][0]:f} {l.pts[0][1]:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT " # noqa: E501
|
|
|
+ ops = f"BT {ops}ET "
|
|
|
return ops
|
|
|
- ops=render(ltpage)
|
|
|
+
|
|
|
+ ops = render(ltpage)
|
|
|
return ops
|
|
|
|
|
|
# Some dummy functions to save memory/CPU when all that is wanted
|