пре 1 година · 3394aee69f
--- a/pdf2zh/converter.py
+++ b/pdf2zh/converter.py
@@ -281,7 +281,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
 
				             graphicstate,
			
 
				         )
			
 
				         self.cur_item.add(item)
			
 
				-        item.cid = cid  # hack
			
 
				+        item.cid = cid  # hack 插入原字符编码
			
 
				         return item.adv
			
 
				 
			
 
				     def handle_undefined_char(self, font: PDFFont, cid: int) -> str:
			
@@ -404,342 +404,304 @@ class TextConverter(PDFConverter[AnyIO]):
 
				         else:
			
 
				             cast(TextIO, self.outfp).write(text)
			
 
				 
			
 
				+    # fmt: off
			
 
				     def receive_layout(self, ltpage: LTPage):
			
 
				-        def render(item: LTItem) -> None:
			
 
				-            xt = None  # 上一个字符
			
 
				-            sstk = []  # 段落文字栈
			
 
				-            vstk = []  # 公式符号组
			
 
				-            vlstk = []  # 公式线条组
			
 
				-            vfix = 0  # 公式纵向偏移
			
 
				-            vbkt = 0  # 段落公式括号计数
			
 
				-            pstk = []  # 段落属性栈
			
 
				-            lstk = []  # 全局线条栈
			
 
				-            var = []  # 公式符号组栈
			
 
				-            varl = []  # 公式线条组栈
			
 
				-            varf = []  # 公式纵向偏移栈
			
 
				-            vlen = []  # 公式宽度栈
			
 
				-            xt_cls = -1  # 上一个字符所属段落
			
 
				-            vmax = ltpage.width / 4  # 行内公式最大宽度
			
 
				-            ops = ""  # 渲染结果
			
 
				-
			
 
				-            def vflag(font, char):  # 匹配公式（和角标）字体
			
 
				-                if re.match(r"\(cid:", char):
			
 
				+        xt = None   # 上一个字符
			
 
				+        sstk = []   # 段落文字栈
			
 
				+        vstk = []   # 公式符号组
			
 
				+        vlstk = []  # 公式线条组
			
 
				+        vfix = 0    # 公式纵向偏移
			
 
				+        vbkt = 0    # 段落公式括号计数
			
 
				+        pstk = []   # 段落属性栈
			
 
				+        lstk = []   # 全局线条栈
			
 
				+        var = []    # 公式符号组栈
			
 
				+        varl = []   # 公式线条组栈
			
 
				+        varf = []   # 公式纵向偏移栈
			
 
				+        vlen = []   # 公式宽度栈
			
 
				+        xt_cls = -1 # 上一个字符所属段落
			
 
				+        vmax = ltpage.width / 4 # 行内公式最大宽度
			
 
				+        ops = ""    # 渲染结果
			
 
				+
			
 
				+        def vflag(font, char):  # 匹配公式（和角标）字体
			
 
				+            if re.match(r"\(cid:", char):
			
 
				+                return True
			
 
				+            # 基于字体名规则的判定
			
 
				+            if self.vfont:
			
 
				+                if re.match(self.vfont, font):
			
 
				                     return True
			
 
				-                if self.vfont:
			
 
				-                    if re.match(self.vfont, font):
			
 
				-                        return True
			
 
				-                else:
			
 
				-                    if re.match(
			
 
				-                        r"(CM[^R]|MS|XY|MT|BL|RM|EU|LA|RS|LINE|TeX-|rsfs|txsy|wasy|.*Mono|.*Code|.*Ital|.*Sym)",
			
 
				-                        font,
			
 
				-                    ):
			
 
				-                        return True
			
 
				-                if self.vchar:
			
 
				-                    if re.match(self.vchar, char):
			
 
				-                        return True
			
 
				-                else:
			
 
				-                    if (
			
 
				-                        char
			
 
				-                        and char != " "
			
 
				-                        and (
			
 
				-                            unicodedata.category(char[0])
			
 
				-                            in ["Lm", "Mn", "Sk", "Sm", "Zl", "Zp", "Zs"]
			
 
				-                            or ord(char[0]) in range(0x370, 0x400)
			
 
				-                        )
			
 
				-                    ):  # 文字修饰符、数学符号、分隔符号、希腊字母
			
 
				-                        return True
			
 
				-                return False
			
 
				-
			
 
				-            ptr = 0
			
 
				-            item = list(item)
			
 
				-            while ptr < len(item):  # 识别文字和公式
			
 
				-                child = item[ptr]
			
 
				-                if isinstance(child, LTChar):
			
 
				-                    cur_v = False  # 公式
			
 
				-                    fontname = child.fontname.split("+")[-1]
			
 
				-                    layout = self.layout[ltpage.pageid]
			
 
				-                    h, w = (
			
 
				-                        layout.shape
			
 
				-                    )  # ltpage.height 可能是 fig 里面的高度，这里统一用 layout.shape
			
 
				-                    cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(
			
 
				-                        int(child.y0), 0, h - 1
			
 
				+            else:
			
 
				+                if re.match(                                            # latex 字体
			
 
				+                    r"(CM[^R]|MS|XY|MT|BL|RM|EU|LA|RS|LINE|TeX-|rsfs|txsy|wasy|.*Mono|.*Code|.*Ital|.*Sym)",
			
 
				+                    font,
			
 
				+                ):
			
 
				+                    return True
			
 
				+            # 基于字符集规则的判定
			
 
				+            if self.vchar:
			
 
				+                if re.match(self.vchar, char):
			
 
				+                    return True
			
 
				+            else:
			
 
				+                if (
			
 
				+                    char
			
 
				+                    and char != " "                                     # 非空格
			
 
				+                    and (
			
 
				+                        unicodedata.category(char[0])
			
 
				+                        in ["Lm", "Mn", "Sk", "Sm", "Zl", "Zp", "Zs"]   # 文字修饰符、数学符号、分隔符号
			
 
				+                        or ord(char[0]) in range(0x370, 0x400)          # 希腊字母
			
 
				                     )
			
 
				-                    cls = layout[cy, cx]
			
 
				-                    # if log.isEnabledFor(logging.DEBUG):
			
 
				-                    # ops+=f'ET [] 0 d 0 J 0.1 w {child.x0:f}
			
 
				-                    # {child.y0:f} {child.x1-child.x0:f} {child.y1-child.y0:f} re S Q BT '
			
 
				-                    if (
			
 
				-                        cls == 0
			
 
				-                        or (cls == xt_cls and child.size < pstk[-1][4] * 0.79)
			
 
				-                        or vflag(fontname, child.get_text())
			
 
				-                        or (child.matrix[0] == 0 and child.matrix[3] == 0)
			
 
				-                    ):  # 有 0.76 的角标和 0.799 的大写，这里用 0.79 取中
			
 
				+                ):
			
 
				+                    return True
			
 
				+            return False
			
 
				+
			
 
				+        ############################################################
			
 
				+        # A. 原文档解析
			
 
				+        ptr = 0
			
 
				+        item = list(ltpage)
			
 
				+        while ptr < len(item):
			
 
				+            child = item[ptr]
			
 
				+            if isinstance(child, LTChar):
			
 
				+                cur_v = False
			
 
				+                fontname = child.fontname.split("+")[-1]
			
 
				+                layout = self.layout[ltpage.pageid]
			
 
				+                # ltpage.height 可能是 fig 里面的高度，这里统一用 layout.shape
			
 
				+                h, w = layout.shape
			
 
				+                # 读取当前字符在 layout 中的类别
			
 
				+                cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
			
 
				+                cls = layout[cy, cx]
			
 
				+                if (                                                                                        # 判定当前字符是否属于公式
			
 
				+                    cls == 0                                                                                # 1. 类别为保留区域
			
 
				+                    or (cls == xt_cls and len(sstk[-1].strip()) > 1 and child.size < pstk[-1][4] * 0.79)    # 2. 角标字体，有 0.76 的角标和 0.799 的大写，这里用 0.79 取中，同时考虑首字母放大的情况
			
 
				+                    or vflag(fontname, child.get_text())                                                    # 3. 公式字体
			
 
				+                    or (child.matrix[0] == 0 and child.matrix[3] == 0)                                      # 4. 垂直字体
			
 
				+                ):
			
 
				+                    cur_v = True
			
 
				+                # 判定括号组是否属于公式
			
 
				+                if not cur_v:
			
 
				+                    if vstk and child.get_text() == "(":
			
 
				                         cur_v = True
			
 
				-                    if not cur_v:  # 判定括号组是否属于公式
			
 
				-                        if vstk and child.get_text() == "(":
			
 
				-                            cur_v = True
			
 
				-                            vbkt += 1
			
 
				-                        if vbkt and child.get_text() == ")":
			
 
				-                            cur_v = True
			
 
				-                            vbkt -= 1
			
 
				-                    if (
			
 
				-                        not cur_v
			
 
				-                        or cls != xt_cls
			
 
				-                        or (abs(child.x0 - xt.x0) > vmax and cls != 0)
			
 
				-                    ):  # 公式结束、段落边界、公式换行
			
 
				-                        if vstk:  # 公式出栈
			
 
				-                            sstk[-1] += f"$v{len(var)}$"
			
 
				-                            if (
			
 
				-                                not cur_v
			
 
				-                                and cls == xt_cls
			
 
				-                                and child.x0 > max([vch.x0 for vch in vstk])
			
 
				-                            ):  # and child.y1>vstk[0].y0: # 段落内公式转文字，行内公式修正
			
 
				-                                vfix = vstk[0].y0 - child.y0
			
 
				-                            var.append(vstk)
			
 
				-                            varl.append(vlstk)
			
 
				-                            varf.append(vfix)
			
 
				-                            vstk = []
			
 
				-                            vlstk = []
			
 
				-                            vfix = 0
			
 
				-                    if not vstk:  # 非公式或是公式开头
			
 
				-                        if cls == xt_cls:  # 同一段落
			
 
				-                            if child.x0 > xt.x1 + 1:  # 行内空格
			
 
				-                                sstk[-1] += " "
			
 
				-                            elif child.x1 < xt.x0:  # 换行空格
			
 
				-                                sstk[-1] += " "
			
 
				-                                pstk[-1][6] = True  # 标记原文段落存在换行
			
 
				-                        else:
			
 
				-                            sstk.append("")
			
 
				-                            pstk.append(
			
 
				-                                [
			
 
				-                                    child.y0,
			
 
				-                                    child.x0,
			
 
				-                                    child.x0,
			
 
				-                                    child.x0,
			
 
				-                                    child.size,
			
 
				-                                    child.font,
			
 
				-                                    False,
			
 
				-                                ]
			
 
				-                            )
			
 
				-                    if not cur_v:  # 文字入栈
			
 
				-                        if (
			
 
				-                            child.size > pstk[-1][4] / 0.79
			
 
				-                            or vflag(pstk[-1][5].fontname.split("+")[-1], "")
			
 
				-                            or re.match(
			
 
				-                                r"(.*Medi|.*Bold)",
			
 
				-                                pstk[-1][5].fontname.split("+")[-1],
			
 
				-                                re.IGNORECASE,
			
 
				-                            )
			
 
				-                        ):  # 小字体、公式或粗体开头，后续接文字，需要校正字体
			
 
				-                            pstk[-1][0] -= child.size - pstk[-1][4]
			
 
				-                            pstk[-1][4] = child.size
			
 
				-                            pstk[-1][5] = child.font
			
 
				-                        sstk[-1] += child.get_text()
			
 
				-                    else:  # 公式入栈
			
 
				-                        if (
			
 
				-                            not vstk and cls == xt_cls and child.x0 > xt.x0
			
 
				-                        ):  # and child.y1>xt.y0: # 段落内文字转公式，行内公式修正
			
 
				-                            vfix = child.y0 - xt.y0
			
 
				-                        vstk.append(child)
			
 
				-                    # 更新段落边界，段落内换行之后可能是公式开头
			
 
				-                    pstk[-1][2] = min(pstk[-1][2], child.x0)
			
 
				-                    pstk[-1][3] = max(pstk[-1][3], child.x1)
			
 
				-                    xt = child
			
 
				-                    xt_cls = cls
			
 
				-                elif isinstance(child, LTFigure):  # 图表
			
 
				-                    pass
			
 
				-                elif isinstance(child, LTLine):  # 线条
			
 
				-                    layout = self.layout[ltpage.pageid]
			
 
				-                    h, w = (
			
 
				-                        layout.shape
			
 
				-                    )  # ltpage.height 可能是 fig 里面的高度，这里统一用 layout.shape
			
 
				-                    cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(
			
 
				-                        int(child.y0), 0, h - 1
			
 
				-                    )
			
 
				-                    cls = layout[cy, cx]
			
 
				-                    if vstk and cls == xt_cls:  # 公式线条
			
 
				-                        vlstk.append(child)
			
 
				-                    else:  # 全局线条
			
 
				-                        lstk.append(child)
			
 
				-                else:
			
 
				-                    # print(child)
			
 
				-                    pass
			
 
				-                ptr += 1
			
 
				-            # 处理结尾
			
 
				-            if vstk:  # 公式出栈
			
 
				-                sstk[-1] += f"$v{len(var)}$"
			
 
				-                var.append(vstk)
			
 
				-                varl.append(vlstk)
			
 
				-                varf.append(vfix)
			
 
				-            log.debug("\n==========[VSTACK]==========\n")
			
 
				-            for id, v in enumerate(var):  # 计算公式宽度
			
 
				-                l = max([vch.x1 for vch in v]) - v[0].x0  # noqa: E741
			
 
				-                log.debug(
			
 
				-                    f'< {l:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[id])} > $v{id}$ = {"".join([ch.get_text() for ch in v])}'  # noqa: E501
			
 
				+                        vbkt += 1
			
 
				+                    if vbkt and child.get_text() == ")":
			
 
				+                        cur_v = True
			
 
				+                        vbkt -= 1
			
 
				+                if (                                                        # 判定当前公式是否结束
			
 
				+                    not cur_v                                               # 1. 当前字符不属于公式
			
 
				+                    or cls != xt_cls                                        # 2. 当前字符与前一个字符不属于同一段落
			
 
				+                    or (abs(child.x0 - xt.x0) > vmax and cls != 0)          # 3. 段落内换行，可能是一长串斜体的段落，也可能是段内分式换行，这里设个阈值进行区分
			
 
				+                ):
			
 
				+                    if vstk:
			
 
				+                        if (                                                # 根据公式右侧的文字修正公式的纵向偏移
			
 
				+                            not cur_v                                       # 1. 当前字符不属于公式
			
 
				+                            and cls == xt_cls                               # 2. 当前字符与前一个字符属于同一段落
			
 
				+                            and child.x0 > max([vch.x0 for vch in vstk])    # 3. 当前字符在公式右侧
			
 
				+                        ):
			
 
				+                            vfix = vstk[0].y0 - child.y0
			
 
				+                        sstk[-1] += f"$v{len(var)}$"
			
 
				+                        var.append(vstk)
			
 
				+                        varl.append(vlstk)
			
 
				+                        varf.append(vfix)
			
 
				+                        vstk = []
			
 
				+                        vlstk = []
			
 
				+                        vfix = 0
			
 
				+                # 当前字符不属于公式或当前字符是公式的第一个字符
			
 
				+                if not vstk:
			
 
				+                    if cls == xt_cls:               # 当前字符与前一个字符属于同一段落
			
 
				+                        if child.x0 > xt.x1 + 1:    # 添加行内空格
			
 
				+                            sstk[-1] += " "
			
 
				+                        elif child.x1 < xt.x0:      # 添加换行空格并标记原文段落存在换行
			
 
				+                            sstk[-1] += " "
			
 
				+                            pstk[-1][6] = True
			
 
				+                    else:                           # 根据当前字符构建一个新的段落
			
 
				+                        sstk.append("")
			
 
				+                        pstk.append([child.y0,child.x0,child.x0,child.x0,child.size,child.font,False,])
			
 
				+                if not cur_v:                                               # 文字入栈
			
 
				+                    if (                                                    # 根据当前字符修正段落属性
			
 
				+                        child.size > pstk[-1][4] / 0.79                     # 1. 当前字符显著比段落字体大
			
 
				+                        or len(sstk[-1].strip()) == 1                       # 2. 当前字符为段落第二个文字（考虑首字母放大的情况）
			
 
				+                        or vflag(pstk[-1][5].fontname.split("+")[-1], "")   # 3. 段落字体为公式字体
			
 
				+                        or re.match(                                        # 4. 段落字体为粗体
			
 
				+                            r"(.*Medi|.*Bold)",
			
 
				+                            pstk[-1][5].fontname.split("+")[-1],
			
 
				+                            re.IGNORECASE,
			
 
				+                        )
			
 
				+                    ):
			
 
				+                        pstk[-1][0] -= child.size - pstk[-1][4]             # hack 这个段落纵向位置的修正有问题，不过先凑合用吧
			
 
				+                        pstk[-1][4] = child.size
			
 
				+                        pstk[-1][5] = child.font
			
 
				+                    sstk[-1] += child.get_text()
			
 
				+                else:                                                       # 公式入栈
			
 
				+                    if (                                                    # 根据公式右侧的文字修正公式的纵向偏移
			
 
				+                        not vstk                                            # 1. 当前字符是公式的第一个字符
			
 
				+                        and cls == xt_cls                                   # 2. 当前字符与前一个字符属于同一段落
			
 
				+                        and child.x0 > xt.x0                                # 3. 当前字符在前一个字符右侧
			
 
				+                    ):
			
 
				+                        vfix = child.y0 - xt.y0
			
 
				+                    vstk.append(child)
			
 
				+                # 更新段落边界，因为段落内换行之后可能是公式开头，所以要在外边处理
			
 
				+                pstk[-1][2] = min(pstk[-1][2], child.x0)
			
 
				+                pstk[-1][3] = max(pstk[-1][3], child.x1)
			
 
				+                # 更新上一个字符
			
 
				+                xt = child
			
 
				+                xt_cls = cls
			
 
				+            elif isinstance(child, LTFigure):   # 图表
			
 
				+                pass
			
 
				+            elif isinstance(child, LTLine):     # 线条
			
 
				+                layout = self.layout[ltpage.pageid]
			
 
				+                # ltpage.height 可能是 fig 里面的高度，这里统一用 layout.shape
			
 
				+                h, w = layout.shape
			
 
				+                # 读取当前线条在 layout 中的类别
			
 
				+                cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
			
 
				+                cls = layout[cy, cx]
			
 
				+                if vstk and cls == xt_cls:      # 公式线条
			
 
				+                    vlstk.append(child)
			
 
				+                else:                           # 全局线条
			
 
				+                    lstk.append(child)
			
 
				+            else:
			
 
				+                pass
			
 
				+            ptr += 1
			
 
				+        # 处理结尾
			
 
				+        if vstk:    # 公式出栈
			
 
				+            sstk[-1] += f"$v{len(var)}$"
			
 
				+            var.append(vstk)
			
 
				+            varl.append(vlstk)
			
 
				+            varf.append(vfix)
			
 
				+        log.debug("\n==========[VSTACK]==========\n")
			
 
				+        for id, v in enumerate(var):  # 计算公式宽度
			
 
				+            l = max([vch.x1 for vch in v]) - v[0].x0  # noqa: E741
			
 
				+            log.debug(f'< {l:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[id])} > $v{id}$ = {"".join([ch.get_text() for ch in v])}')
			
 
				+            vlen.append(l)
			
 
				+
			
 
				+        ############################################################
			
 
				+        # B. 段落翻译
			
 
				+        log.debug("\n==========[SSTACK]==========\n")
			
 
				+        hash_key = cache.deterministic_hash("PDFMathTranslate")
			
 
				+        cache.create_cache(hash_key)
			
 
				+        @retry(wait=wait_fixed(1))
			
 
				+        def worker(s):  # 多线程翻译
			
 
				+            try:
			
 
				+                hash_key_paragraph = cache.deterministic_hash(
			
 
				+                    (s, str(self.translator))
			
 
				                 )
			
 
				-                vlen.append(l)
			
 
				-            log.debug("\n==========[SSTACK]==========\n")
			
 
				-            hash_key = cache.deterministic_hash("PDFMathTranslate")
			
 
				-            cache.create_cache(hash_key)
			
 
				-
			
 
				-            @retry(wait=wait_fixed(1))
			
 
				-            def worker(s):  # 多线程翻译
			
 
				-                try:
			
 
				-                    hash_key_paragraph = cache.deterministic_hash(
			
 
				-                        (s, str(self.translator))
			
 
				-                    )
			
 
				-                    new = cache.load_paragraph(hash_key, hash_key_paragraph)  # 查询缓存
			
 
				-                    if new is None:
			
 
				-                        new = self.translator.translate(s)
			
 
				-                        new = remove_control_characters(new)
			
 
				-                        cache.write_paragraph(hash_key, hash_key_paragraph, new)
			
 
				-                    return new
			
 
				-                except BaseException as e:
			
 
				-                    if log.isEnabledFor(logging.DEBUG):
			
 
				-                        log.exception(e)
			
 
				-                    else:
			
 
				-                        log.exception(e, exc_info=False)
			
 
				-                    raise e
			
 
				-
			
 
				-            with concurrent.futures.ThreadPoolExecutor(
			
 
				-                max_workers=self.thread
			
 
				-            ) as executor:
			
 
				-                news = list(executor.map(worker, sstk))
			
 
				-
			
 
				-            def raw_string(fcur, cstk):  # 编码字符串
			
 
				-                if isinstance(self.fontmap[fcur], PDFCIDFont):  # 判断编码长度
			
 
				-                    return "".join(["%04x" % ord(c) for c in cstk])
			
 
				+                new = cache.load_paragraph(hash_key, hash_key_paragraph)  # 查询缓存
			
 
				+                if new is None:
			
 
				+                    new = self.translator.translate(s)
			
 
				+                    new = remove_control_characters(new)
			
 
				+                    cache.write_paragraph(hash_key, hash_key_paragraph, new)
			
 
				+                return new
			
 
				+            except BaseException as e:
			
 
				+                if log.isEnabledFor(logging.DEBUG):
			
 
				+                    log.exception(e)
			
 
				                 else:
			
 
				-                    return "".join(["%02x" % ord(c) for c in cstk])
			
 
				-
			
 
				-            _x, _y = 0, 0
			
 
				-            for id, new in enumerate(news):  # 排版文字和公式
			
 
				-                tx = x = pstk[id][1]
			
 
				-                y = pstk[id][0]
			
 
				-                lt = pstk[id][2]
			
 
				-                rt = pstk[id][3]
			
 
				-                ptr = 0
			
 
				-                size = pstk[id][4]
			
 
				-                font = pstk[id][5]
			
 
				-                lb = pstk[id][6]  # 段落属性
			
 
				-                cstk = ""  # 单行文字栈
			
 
				-                fcur = fcur_ = None  # 单行字体
			
 
				-                log.debug(
			
 
				-                    f"< {y} {x} {lt} {rt} {size} {font.fontname} {lb} > {sstk[id]} | {new}"
			
 
				-                )
			
 
				-                while True:
			
 
				-                    if ptr == len(new):  # 到达段落结尾
			
 
				-                        if cstk:
			
 
				-                            ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
			
 
				-                        break
			
 
				-                    vy_regex = re.match(
			
 
				-                        r"\$?\s*v([\d\s]+)\$", new[ptr:], re.IGNORECASE
			
 
				-                    )  # 匹配 $vn$ 公式标记，前面的 $ 有的时候会被丢掉
			
 
				-                    mod = False  # 当前公式是否为文字修饰符
			
 
				-                    if vy_regex:  # 加载公式
			
 
				-                        ptr += len(vy_regex.group(0))
			
 
				-                        try:
			
 
				-                            vid = int(vy_regex.group(1).replace(" ", ""))
			
 
				-                            adv = vlen[vid]
			
 
				-                        except Exception:
			
 
				-                            continue  # 翻译器可能会自动补个越界的公式标记
			
 
				-                        if len(var[vid]) == 1 and unicodedata.category(
			
 
				-                            var[vid][0].get_text()[0]
			
 
				-                        ) in [
			
 
				-                            "Lm",
			
 
				-                            "Mn",
			
 
				-                            "Sk",
			
 
				-                        ]:  # 文字修饰符
			
 
				-                            mod = True
			
 
				-                    else:  # 加载文字
			
 
				-                        ch = new[ptr]
			
 
				-                        # if font.char_width(ord(ch)):
			
 
				-                        fcur_ = None
			
 
				-                        # 原字体编码容易出问题，这里直接放弃掉
			
 
				-                        # try:
			
 
				-                        #     if font.widths.get(ord(ch)) and font.to_unichr(ord(ch))==ch:
			
 
				-                        #         fcur_=self.fontid[font] # 原字体
			
 
				-                        # except:
			
 
				-                        #     pass
			
 
				-                        try:
			
 
				-                            if (
			
 
				-                                fcur_ is None
			
 
				-                                and self.fontmap["tiro"].to_unichr(ord(ch)) == ch
			
 
				-                            ):
			
 
				-                                fcur_ = "tiro"  # 默认英文字体
			
 
				-                        except Exception:
			
 
				-                            pass
			
 
				-                        if fcur_ is None:
			
 
				-                            fcur_ = "china-ss"  # 默认中文字体
			
 
				-                        # print(self.fontid[font],fcur_,ch,font.char_width(ord(ch)))
			
 
				-                        adv = self.fontmap[fcur_].char_width(ord(ch)) * size
			
 
				-                        ptr += 1
			
 
				-                    if (
			
 
				-                        fcur_ != fcur or vy_regex or x + adv > rt + 0.1 * size
			
 
				-                    ):  # 输出文字缓冲区：1.字体更新 2.插入公式 3.到达右边界（可能一整行都被符号化，这里需要考虑浮点误差）
			
 
				-                        if cstk:
			
 
				-                            ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
			
 
				-                            cstk = ""
			
 
				-                    if lb and x + adv > rt + 0.1 * size:  # 到达右边界且原文段落存在换行
			
 
				-                        x = lt
			
 
				-                        lang_space = {
			
 
				-                            "zh-CN": 1.4,
			
 
				-                            "zh-TW": 1.4,
			
 
				-                            "ja": 1.1,
			
 
				-                            "ko": 1.2,
			
 
				-                            "en": 1.2,
			
 
				-                        }  # CJK
			
 
				-                        y -= size * lang_space.get(
			
 
				-                            self.translator.lang_out, 1.1
			
 
				-                        )  # 小语种大多适配 1.1
			
 
				-                    if vy_regex:  # 插入公式
			
 
				-                        fix = 0
			
 
				-                        if fcur is not None:  # 段落内公式修正纵向偏移
			
 
				-                            fix = varf[vid]
			
 
				-                        for vch in var[vid]:  # 排版公式字符
			
 
				-                            vc = chr(vch.cid)
			
 
				-                            ops += f"/{self.fontid[vch.font]} {vch.size:f} Tf 1 0 0 1 {x + vch.x0 - var[vid][0].x0:f} {fix + y + vch.y0 - var[vid][0].y0:f} Tm [<{raw_string(self.fontid[vch.font], vc)}>] TJ "  # noqa: E501
			
 
				-                            if log.isEnabledFor(logging.DEBUG):
			
 
				-                                lstk.append(
			
 
				-                                    LTLine(
			
 
				-                                        0.1,
			
 
				-                                        (_x, _y),
			
 
				-                                        (
			
 
				-                                            x + vch.x0 - var[vid][0].x0,
			
 
				-                                            fix + y + vch.y0 - var[vid][0].y0,
			
 
				-                                        ),
			
 
				-                                    )
			
 
				-                                )
			
 
				-                                _x, _y = (
			
 
				-                                    x + vch.x0 - var[vid][0].x0,
			
 
				-                                    fix + y + vch.y0 - var[vid][0].y0,
			
 
				-                                )
			
 
				-                        for l in varl[vid]:  # 排版公式线条 # noqa: E741
			
 
				-                            if l.linewidth < 5:  # hack
			
 
				-                                ops += f"ET q 1 0 0 1 {l.pts[0][0] + x - var[vid][0].x0:f} {l.pts[0][1] + fix + y - var[vid][0].y0:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "  # noqa: E501
			
 
				-                    else:  # 插入文字缓冲区
			
 
				-                        if not cstk:  # 单行开头
			
 
				-                            tx = x
			
 
				-                            if x == lt and ch == " ":  # 消除段落换行空格
			
 
				-                                adv = 0
			
 
				-                            else:
			
 
				-                                cstk += ch
			
 
				+                    log.exception(e, exc_info=False)
			
 
				+                raise e
			
 
				+        with concurrent.futures.ThreadPoolExecutor(
			
 
				+            max_workers=self.thread
			
 
				+        ) as executor:
			
 
				+            news = list(executor.map(worker, sstk))
			
 
				+
			
 
				+        ############################################################
			
 
				+        # C. 新文档排版
			
 
				+        def raw_string(fcur, cstk):  # 编码字符串
			
 
				+            if isinstance(self.fontmap[fcur], PDFCIDFont):  # 判断编码长度
			
 
				+                return "".join(["%04x" % ord(c) for c in cstk])
			
 
				+            else:
			
 
				+                return "".join(["%02x" % ord(c) for c in cstk])
			
 
				+        _x, _y = 0, 0
			
 
				+        for id, new in enumerate(news):
			
 
				+            tx = x = pstk[id][1]    # 段落初始横坐标
			
 
				+            y = pstk[id][0]         # 段落上边界
			
 
				+            lt = pstk[id][2]        # 段落左边界
			
 
				+            rt = pstk[id][3]        # 段落右边界
			
 
				+            size = pstk[id][4]      # 段落字体大小
			
 
				+            font = pstk[id][5]      # 段落字体
			
 
				+            lb = pstk[id][6]        # 段落属性
			
 
				+            cstk = ""               # 当前文字栈
			
 
				+            fcur = fcur_ = None     # 当前字体
			
 
				+            ptr = 0
			
 
				+            log.debug(f"< {y} {x} {lt} {rt} {size} {font.fontname} {lb} > {sstk[id]} | {new}")
			
 
				+            while True:
			
 
				+                if ptr == len(new):  # 到达段落结尾
			
 
				+                    if cstk:
			
 
				+                        ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
			
 
				+                    break
			
 
				+                vy_regex = re.match(
			
 
				+                    r"\$?\s*v([\d\s]+)\$", new[ptr:], re.IGNORECASE
			
 
				+                )  # 匹配 $vn$ 公式标记，前面的 $ 有的时候会被丢掉
			
 
				+                mod = False  # 当前公式是否为文字修饰符
			
 
				+                if vy_regex:  # 加载公式
			
 
				+                    ptr += len(vy_regex.group(0))
			
 
				+                    try:
			
 
				+                        vid = int(vy_regex.group(1).replace(" ", ""))
			
 
				+                        adv = vlen[vid]
			
 
				+                    except Exception:
			
 
				+                        continue  # 翻译器可能会自动补个越界的公式标记
			
 
				+                    if len(var[vid]) == 1 and unicodedata.category(
			
 
				+                        var[vid][0].get_text()[0]
			
 
				+                    ) in ["Lm","Mn","Sk",]:  # 文字修饰符
			
 
				+                        mod = True
			
 
				+                else:  # 加载文字
			
 
				+                    ch = new[ptr]
			
 
				+                    # if font.char_width(ord(ch)):
			
 
				+                    fcur_ = None
			
 
				+                    # 原字体编码容易出问题，这里直接放弃掉
			
 
				+                    # try:
			
 
				+                    #     if font.widths.get(ord(ch)) and font.to_unichr(ord(ch))==ch:
			
 
				+                    #         fcur_=self.fontid[font] # 原字体
			
 
				+                    # except:
			
 
				+                    #     pass
			
 
				+                    try:
			
 
				+                        if fcur_ is None and self.fontmap["tiro"].to_unichr(ord(ch)) == ch:
			
 
				+                            fcur_ = "tiro"  # 默认英文字体
			
 
				+                    except Exception:
			
 
				+                        pass
			
 
				+                    if fcur_ is None:
			
 
				+                        fcur_ = "china-ss"  # 默认中文字体
			
 
				+                    # print(self.fontid[font],fcur_,ch,font.char_width(ord(ch)))
			
 
				+                    adv = self.fontmap[fcur_].char_width(ord(ch)) * size
			
 
				+                    ptr += 1
			
 
				+                if (                                # 输出文字缓冲区
			
 
				+                    fcur_ != fcur                   # 1. 字体更新
			
 
				+                    or vy_regex                     # 2. 插入公式
			
 
				+                    or x + adv > rt + 0.1 * size    # 3. 到达右边界（可能一整行都被符号化，这里需要考虑浮点误差）
			
 
				+                ):
			
 
				+                    if cstk:
			
 
				+                        ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
			
 
				+                        cstk = ""
			
 
				+                if lb and x + adv > rt + 0.1 * size:  # 到达右边界且原文段落存在换行
			
 
				+                    x = lt
			
 
				+                    lang_space = {"zh-CN": 1.4,"zh-TW": 1.4,"ja": 1.1,"ko": 1.2,"en": 1.2}  # CJK
			
 
				+                    y -= size * lang_space.get(self.translator.lang_out, 1.1)  # 小语种大多适配 1.1
			
 
				+                if vy_regex:  # 插入公式
			
 
				+                    fix = 0
			
 
				+                    if fcur is not None:  # 段落内公式修正纵向偏移
			
 
				+                        fix = varf[vid]
			
 
				+                    for vch in var[vid]:  # 排版公式字符
			
 
				+                        vc = chr(vch.cid)
			
 
				+                        ops += f"/{self.fontid[vch.font]} {vch.size:f} Tf 1 0 0 1 {x + vch.x0 - var[vid][0].x0:f} {fix + y + vch.y0 - var[vid][0].y0:f} Tm [<{raw_string(self.fontid[vch.font], vc)}>] TJ "  # noqa: E501
			
 
				+                        if log.isEnabledFor(logging.DEBUG):
			
 
				+                            lstk.append(LTLine(0.1,(_x, _y),(x + vch.x0 - var[vid][0].x0,fix + y + vch.y0 - var[vid][0].y0,)))
			
 
				+                            _x, _y = x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0
			
 
				+                    for l in varl[vid]:  # 排版公式线条 # noqa: E741
			
 
				+                        if l.linewidth < 5:  # hack 有的文档会用粗线条当图片背景
			
 
				+                            ops += f"ET q 1 0 0 1 {l.pts[0][0] + x - var[vid][0].x0:f} {l.pts[0][1] + fix + y - var[vid][0].y0:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "  # noqa: E501
			
 
				+                else:  # 插入文字缓冲区
			
 
				+                    if not cstk:  # 单行开头
			
 
				+                        tx = x
			
 
				+                        if x == lt and ch == " ":  # 消除段落换行空格
			
 
				+                            adv = 0
			
 
				                         else:
			
 
				                             cstk += ch
			
 
				-                    if mod:  # 文字修饰符
			
 
				-                        adv = 0
			
 
				-                    fcur = fcur_
			
 
				-                    x += adv
			
 
				-                    if log.isEnabledFor(logging.DEBUG):
			
 
				-                        lstk.append(LTLine(0.1, (_x, _y), (x, y)))
			
 
				-                        _x, _y = x, y
			
 
				-            for l in lstk:  # 排版全局线条 # noqa: E741
			
 
				-                if l.linewidth < 5:  # hack
			
 
				-                    ops += f"ET q 1 0 0 1 {l.pts[0][0]:f} {l.pts[0][1]:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "  # noqa: E501
			
 
				-            ops = f"BT {ops}ET "
			
 
				-            return ops
			
 
				-
			
 
				-        ops = render(ltpage)
			
 
				+                    else:
			
 
				+                        cstk += ch
			
 
				+                if mod:  # 文字修饰符
			
 
				+                    adv = 0
			
 
				+                fcur = fcur_
			
 
				+                x += adv
			
 
				+                if log.isEnabledFor(logging.DEBUG):
			
 
				+                    lstk.append(LTLine(0.1, (_x, _y), (x, y)))
			
 
				+                    _x, _y = x, y
			
 
				+        for l in lstk:  # 排版全局线条 # noqa: E741
			
 
				+            if l.linewidth < 5:  # hack 有的文档会用粗线条当图片背景
			
 
				+                ops += f"ET q 1 0 0 1 {l.pts[0][0]:f} {l.pts[0][1]:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "  # noqa: E501
			
 
				+        ops = f"BT {ops}ET "
			
 
				         return ops
			
 
				 
			
 
				     # Some dummy functions to save memory/CPU when all that is wanted
			
--- a/pdf2zh/high_level.py
+++ b/pdf2zh/high_level.py
@@ -205,7 +205,7 @@ def extract_text_to_fp(
 
				             # print(page.number,page_layout)
			
 
				             page.rotate = (page.rotate + rotation) % 360
			
 
				             # 新建一个 xref 存放新指令流
			
 
				-            page.page_xref = doc_en.get_new_xref()  # hack
			
 
				+            page.page_xref = doc_en.get_new_xref()  # hack 插入页面的新 xref
			
 
				             doc_en.update_object(page.page_xref, "<<>>")
			
 
				             doc_en.update_stream(page.page_xref, b"")
			
 
				             doc_en[page.pageno].set_contents(page.page_xref)
			
--- a/pdf2zh/pdf2zh.py
+++ b/pdf2zh/pdf2zh.py
@@ -8,7 +8,6 @@ from __future__ import annotations
 
				 import argparse
			
 
				 import logging
			
 
				 import os
			
 
				-import subprocess
			
 
				 import sys
			
 
				 from pathlib import Path
			
 
				 from typing import TYPE_CHECKING, Any, Container, Iterable, List, Optional
			
@@ -93,45 +92,7 @@ def extract_text(
 
				     for file in files:
			
 
				         filename = os.path.splitext(os.path.basename(file))[0]
			
 
				 
			
 
				-        def convert_to_pdfa(input_pdf_path, output_pdfa_path):
			
 
				-            """
			
 
				-            Converts a PDF to PDF/A format using Ghostscript.
			
 
				-            Args:
			
 
				-                input_pdf_path (str): Path to the input PDF file.
			
 
				-                output_pdfa_path (str): Path where the PDF/A file will be saved.
			
 
				-            """
			
 
				-            try:
			
 
				-                # Ghostscript command for conversion
			
 
				-                command = [
			
 
				-                    "gs",
			
 
				-                    "-dPDFA",
			
 
				-                    "-dBATCH",
			
 
				-                    "-dNOPAUSE",
			
 
				-                    "-dNOOUTERSAVE",
			
 
				-                    "-sDEVICE=pdfwrite",
			
 
				-                    "-sOutputFile=" + output_pdfa_path,
			
 
				-                    "-dPDFACompatibilityPolicy=1",
			
 
				-                    input_pdf_path,
			
 
				-                ]
			
 
				-
			
 
				-                # Run the command
			
 
				-                subprocess.run(command, check=True)
			
 
				-                print(
			
 
				-                    f"Successfully converted {input_pdf_path} to PDF/A at {output_pdfa_path}"
			
 
				-                )
			
 
				-            except subprocess.CalledProcessError as e:
			
 
				-                print(f"Error during conversion: {e}")
			
 
				-            except FileNotFoundError:
			
 
				-                print("Ghostscript is not installed or not found in the PATH.")
			
 
				-
			
 
				-        try:
			
 
				-            file_pdfa = f"{str(file)}-pdfa.pdf"
			
 
				-            convert_to_pdfa(file, file_pdfa)
			
 
				-            doc_en = pymupdf.open(file_pdfa)
			
 
				-        except Exception as e:
			
 
				-            print(f"Error converting PDF: {e}")
			
 
				-            doc_en = pymupdf.open(file)
			
 
				-
			
 
				+        doc_en = pymupdf.open(file)
			
 
				         page_count = doc_en.page_count
			
 
				         font_list = ["china-ss", "tiro"]
			
 
				         font_id = {}