Browse Source

fix: first char

Byaidu 1 year ago
parent
commit
3394aee69f
3 changed files with 293 additions and 370 deletions
  1. +291 -329
      pdf2zh/converter.py
  2. +1 -1
      pdf2zh/high_level.py
  3. +1 -40
      pdf2zh/pdf2zh.py

+ 291 - 329
pdf2zh/converter.py

@@ -281,7 +281,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
             graphicstate,
         )
         self.cur_item.add(item)
-        item.cid = cid  # hack
+        item.cid = cid  # hack 插入原字符编码
         return item.adv
 
     def handle_undefined_char(self, font: PDFFont, cid: int) -> str:
@@ -404,342 +404,304 @@ class TextConverter(PDFConverter[AnyIO]):
         else:
             cast(TextIO, self.outfp).write(text)
 
+    # fmt: off
     def receive_layout(self, ltpage: LTPage):
-        def render(item: LTItem) -> None:
-            xt = None  # 上一个字符
-            sstk = []  # 段落文字栈
-            vstk = []  # 公式符号组
-            vlstk = []  # 公式线条组
-            vfix = 0  # 公式纵向偏移
-            vbkt = 0  # 段落公式括号计数
-            pstk = []  # 段落属性栈
-            lstk = []  # 全局线条栈
-            var = []  # 公式符号组栈
-            varl = []  # 公式线条组栈
-            varf = []  # 公式纵向偏移栈
-            vlen = []  # 公式宽度栈
-            xt_cls = -1  # 上一个字符所属段落
-            vmax = ltpage.width / 4  # 行内公式最大宽度
-            ops = ""  # 渲染结果
-
-            def vflag(font, char):  # 匹配公式(和角标)字体
-                if re.match(r"\(cid:", char):
+        xt = None   # 上一个字符
+        sstk = []   # 段落文字栈
+        vstk = []   # 公式符号组
+        vlstk = []  # 公式线条组
+        vfix = 0    # 公式纵向偏移
+        vbkt = 0    # 段落公式括号计数
+        pstk = []   # 段落属性栈
+        lstk = []   # 全局线条栈
+        var = []    # 公式符号组栈
+        varl = []   # 公式线条组栈
+        varf = []   # 公式纵向偏移栈
+        vlen = []   # 公式宽度栈
+        xt_cls = -1 # 上一个字符所属段落
+        vmax = ltpage.width / 4 # 行内公式最大宽度
+        ops = ""    # 渲染结果
+
+        def vflag(font, char):  # 匹配公式(和角标)字体
+            if re.match(r"\(cid:", char):
+                return True
+            # 基于字体名规则的判定
+            if self.vfont:
+                if re.match(self.vfont, font):
                     return True
-                if self.vfont:
-                    if re.match(self.vfont, font):
-                        return True
-                else:
-                    if re.match(
-                        r"(CM[^R]|MS|XY|MT|BL|RM|EU|LA|RS|LINE|TeX-|rsfs|txsy|wasy|.*Mono|.*Code|.*Ital|.*Sym)",
-                        font,
-                    ):
-                        return True
-                if self.vchar:
-                    if re.match(self.vchar, char):
-                        return True
-                else:
-                    if (
-                        char
-                        and char != " "
-                        and (
-                            unicodedata.category(char[0])
-                            in ["Lm", "Mn", "Sk", "Sm", "Zl", "Zp", "Zs"]
-                            or ord(char[0]) in range(0x370, 0x400)
-                        )
-                    ):  # 文字修饰符、数学符号、分隔符号、希腊字母
-                        return True
-                return False
-
-            ptr = 0
-            item = list(item)
-            while ptr < len(item):  # 识别文字和公式
-                child = item[ptr]
-                if isinstance(child, LTChar):
-                    cur_v = False  # 公式
-                    fontname = child.fontname.split("+")[-1]
-                    layout = self.layout[ltpage.pageid]
-                    h, w = (
-                        layout.shape
-                    )  # ltpage.height 可能是 fig 里面的高度,这里统一用 layout.shape
-                    cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(
-                        int(child.y0), 0, h - 1
+            else:
+                if re.match(                                            # latex 字体
+                    r"(CM[^R]|MS|XY|MT|BL|RM|EU|LA|RS|LINE|TeX-|rsfs|txsy|wasy|.*Mono|.*Code|.*Ital|.*Sym)",
+                    font,
+                ):
+                    return True
+            # 基于字符集规则的判定
+            if self.vchar:
+                if re.match(self.vchar, char):
+                    return True
+            else:
+                if (
+                    char
+                    and char != " "                                     # 非空格
+                    and (
+                        unicodedata.category(char[0])
+                        in ["Lm", "Mn", "Sk", "Sm", "Zl", "Zp", "Zs"]   # 文字修饰符、数学符号、分隔符号
+                        or ord(char[0]) in range(0x370, 0x400)          # 希腊字母
                     )
-                    cls = layout[cy, cx]
-                    # if log.isEnabledFor(logging.DEBUG):
-                    # ops+=f'ET [] 0 d 0 J 0.1 w {child.x0:f}
-                    # {child.y0:f} {child.x1-child.x0:f} {child.y1-child.y0:f} re S Q BT '
-                    if (
-                        cls == 0
-                        or (cls == xt_cls and child.size < pstk[-1][4] * 0.79)
-                        or vflag(fontname, child.get_text())
-                        or (child.matrix[0] == 0 and child.matrix[3] == 0)
-                    ):  # 有 0.76 的角标和 0.799 的大写,这里用 0.79 取中
+                ):
+                    return True
+            return False
+
+        ############################################################
+        # A. 原文档解析
+        ptr = 0
+        item = list(ltpage)
+        while ptr < len(item):
+            child = item[ptr]
+            if isinstance(child, LTChar):
+                cur_v = False
+                fontname = child.fontname.split("+")[-1]
+                layout = self.layout[ltpage.pageid]
+                # ltpage.height 可能是 fig 里面的高度,这里统一用 layout.shape
+                h, w = layout.shape
+                # 读取当前字符在 layout 中的类别
+                cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
+                cls = layout[cy, cx]
+                if (                                                                                        # 判定当前字符是否属于公式
+                    cls == 0                                                                                # 1. 类别为保留区域
+                    or (cls == xt_cls and len(sstk[-1].strip()) > 1 and child.size < pstk[-1][4] * 0.79)    # 2. 角标字体,有 0.76 的角标和 0.799 的大写,这里用 0.79 取中,同时考虑首字母放大的情况
+                    or vflag(fontname, child.get_text())                                                    # 3. 公式字体
+                    or (child.matrix[0] == 0 and child.matrix[3] == 0)                                      # 4. 垂直字体
+                ):
+                    cur_v = True
+                # 判定括号组是否属于公式
+                if not cur_v:
+                    if vstk and child.get_text() == "(":
                         cur_v = True
-                    if not cur_v:  # 判定括号组是否属于公式
-                        if vstk and child.get_text() == "(":
-                            cur_v = True
-                            vbkt += 1
-                        if vbkt and child.get_text() == ")":
-                            cur_v = True
-                            vbkt -= 1
-                    if (
-                        not cur_v
-                        or cls != xt_cls
-                        or (abs(child.x0 - xt.x0) > vmax and cls != 0)
-                    ):  # 公式结束、段落边界、公式换行
-                        if vstk:  # 公式出栈
-                            sstk[-1] += f"$v{len(var)}$"
-                            if (
-                                not cur_v
-                                and cls == xt_cls
-                                and child.x0 > max([vch.x0 for vch in vstk])
-                            ):  # and child.y1>vstk[0].y0: # 段落内公式转文字,行内公式修正
-                                vfix = vstk[0].y0 - child.y0
-                            var.append(vstk)
-                            varl.append(vlstk)
-                            varf.append(vfix)
-                            vstk = []
-                            vlstk = []
-                            vfix = 0
-                    if not vstk:  # 非公式或是公式开头
-                        if cls == xt_cls:  # 同一段落
-                            if child.x0 > xt.x1 + 1:  # 行内空格
-                                sstk[-1] += " "
-                            elif child.x1 < xt.x0:  # 换行空格
-                                sstk[-1] += " "
-                                pstk[-1][6] = True  # 标记原文段落存在换行
-                        else:
-                            sstk.append("")
-                            pstk.append(
-                                [
-                                    child.y0,
-                                    child.x0,
-                                    child.x0,
-                                    child.x0,
-                                    child.size,
-                                    child.font,
-                                    False,
-                                ]
-                            )
-                    if not cur_v:  # 文字入栈
-                        if (
-                            child.size > pstk[-1][4] / 0.79
-                            or vflag(pstk[-1][5].fontname.split("+")[-1], "")
-                            or re.match(
-                                r"(.*Medi|.*Bold)",
-                                pstk[-1][5].fontname.split("+")[-1],
-                                re.IGNORECASE,
-                            )
-                        ):  # 小字体、公式或粗体开头,后续接文字,需要校正字体
-                            pstk[-1][0] -= child.size - pstk[-1][4]
-                            pstk[-1][4] = child.size
-                            pstk[-1][5] = child.font
-                        sstk[-1] += child.get_text()
-                    else:  # 公式入栈
-                        if (
-                            not vstk and cls == xt_cls and child.x0 > xt.x0
-                        ):  # and child.y1>xt.y0: # 段落内文字转公式,行内公式修正
-                            vfix = child.y0 - xt.y0
-                        vstk.append(child)
-                    # 更新段落边界,段落内换行之后可能是公式开头
-                    pstk[-1][2] = min(pstk[-1][2], child.x0)
-                    pstk[-1][3] = max(pstk[-1][3], child.x1)
-                    xt = child
-                    xt_cls = cls
-                elif isinstance(child, LTFigure):  # 图表
-                    pass
-                elif isinstance(child, LTLine):  # 线条
-                    layout = self.layout[ltpage.pageid]
-                    h, w = (
-                        layout.shape
-                    )  # ltpage.height 可能是 fig 里面的高度,这里统一用 layout.shape
-                    cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(
-                        int(child.y0), 0, h - 1
-                    )
-                    cls = layout[cy, cx]
-                    if vstk and cls == xt_cls:  # 公式线条
-                        vlstk.append(child)
-                    else:  # 全局线条
-                        lstk.append(child)
-                else:
-                    # print(child)
-                    pass
-                ptr += 1
-            # 处理结尾
-            if vstk:  # 公式出栈
-                sstk[-1] += f"$v{len(var)}$"
-                var.append(vstk)
-                varl.append(vlstk)
-                varf.append(vfix)
-            log.debug("\n==========[VSTACK]==========\n")
-            for id, v in enumerate(var):  # 计算公式宽度
-                l = max([vch.x1 for vch in v]) - v[0].x0  # noqa: E741
-                log.debug(
-                    f'< {l:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[id])} > $v{id}$ = {"".join([ch.get_text() for ch in v])}'  # noqa: E501
+                        vbkt += 1
+                    if vbkt and child.get_text() == ")":
+                        cur_v = True
+                        vbkt -= 1
+                if (                                                        # 判定当前公式是否结束
+                    not cur_v                                               # 1. 当前字符不属于公式
+                    or cls != xt_cls                                        # 2. 当前字符与前一个字符不属于同一段落
+                    or (abs(child.x0 - xt.x0) > vmax and cls != 0)          # 3. 段落内换行,可能是一长串斜体的段落,也可能是段内分式换行,这里设个阈值进行区分
+                ):
+                    if vstk:
+                        if (                                                # 根据公式右侧的文字修正公式的纵向偏移
+                            not cur_v                                       # 1. 当前字符不属于公式
+                            and cls == xt_cls                               # 2. 当前字符与前一个字符属于同一段落
+                            and child.x0 > max([vch.x0 for vch in vstk])    # 3. 当前字符在公式右侧
+                        ):
+                            vfix = vstk[0].y0 - child.y0
+                        sstk[-1] += f"$v{len(var)}$"
+                        var.append(vstk)
+                        varl.append(vlstk)
+                        varf.append(vfix)
+                        vstk = []
+                        vlstk = []
+                        vfix = 0
+                # 当前字符不属于公式或当前字符是公式的第一个字符
+                if not vstk:
+                    if cls == xt_cls:               # 当前字符与前一个字符属于同一段落
+                        if child.x0 > xt.x1 + 1:    # 添加行内空格
+                            sstk[-1] += " "
+                        elif child.x1 < xt.x0:      # 添加换行空格并标记原文段落存在换行
+                            sstk[-1] += " "
+                            pstk[-1][6] = True
+                    else:                           # 根据当前字符构建一个新的段落
+                        sstk.append("")
+                        pstk.append([child.y0,child.x0,child.x0,child.x0,child.size,child.font,False,])
+                if not cur_v:                                               # 文字入栈
+                    if (                                                    # 根据当前字符修正段落属性
+                        child.size > pstk[-1][4] / 0.79                     # 1. 当前字符显著比段落字体大
+                        or len(sstk[-1].strip()) == 1                       # 2. 当前字符为段落第二个文字(考虑首字母放大的情况)
+                        or vflag(pstk[-1][5].fontname.split("+")[-1], "")   # 3. 段落字体为公式字体
+                        or re.match(                                        # 4. 段落字体为粗体
+                            r"(.*Medi|.*Bold)",
+                            pstk[-1][5].fontname.split("+")[-1],
+                            re.IGNORECASE,
+                        )
+                    ):
+                        pstk[-1][0] -= child.size - pstk[-1][4]             # hack 这个段落纵向位置的修正有问题,不过先凑合用吧
+                        pstk[-1][4] = child.size
+                        pstk[-1][5] = child.font
+                    sstk[-1] += child.get_text()
+                else:                                                       # 公式入栈
+                    if (                                                    # 根据公式右侧的文字修正公式的纵向偏移
+                        not vstk                                            # 1. 当前字符是公式的第一个字符
+                        and cls == xt_cls                                   # 2. 当前字符与前一个字符属于同一段落
+                        and child.x0 > xt.x0                                # 3. 当前字符在前一个字符右侧
+                    ):
+                        vfix = child.y0 - xt.y0
+                    vstk.append(child)
+                # 更新段落边界,因为段落内换行之后可能是公式开头,所以要在外边处理
+                pstk[-1][2] = min(pstk[-1][2], child.x0)
+                pstk[-1][3] = max(pstk[-1][3], child.x1)
+                # 更新上一个字符
+                xt = child
+                xt_cls = cls
+            elif isinstance(child, LTFigure):   # 图表
+                pass
+            elif isinstance(child, LTLine):     # 线条
+                layout = self.layout[ltpage.pageid]
+                # ltpage.height 可能是 fig 里面的高度,这里统一用 layout.shape
+                h, w = layout.shape
+                # 读取当前线条在 layout 中的类别
+                cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
+                cls = layout[cy, cx]
+                if vstk and cls == xt_cls:      # 公式线条
+                    vlstk.append(child)
+                else:                           # 全局线条
+                    lstk.append(child)
+            else:
+                pass
+            ptr += 1
+        # 处理结尾
+        if vstk:    # 公式出栈
+            sstk[-1] += f"$v{len(var)}$"
+            var.append(vstk)
+            varl.append(vlstk)
+            varf.append(vfix)
+        log.debug("\n==========[VSTACK]==========\n")
+        for id, v in enumerate(var):  # 计算公式宽度
+            l = max([vch.x1 for vch in v]) - v[0].x0  # noqa: E741
+            log.debug(f'< {l:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[id])} > $v{id}$ = {"".join([ch.get_text() for ch in v])}')
+            vlen.append(l)
+
+        ############################################################
+        # B. 段落翻译
+        log.debug("\n==========[SSTACK]==========\n")
+        hash_key = cache.deterministic_hash("PDFMathTranslate")
+        cache.create_cache(hash_key)
+        @retry(wait=wait_fixed(1))
+        def worker(s):  # 多线程翻译
+            try:
+                hash_key_paragraph = cache.deterministic_hash(
+                    (s, str(self.translator))
                 )
-                vlen.append(l)
-            log.debug("\n==========[SSTACK]==========\n")
-            hash_key = cache.deterministic_hash("PDFMathTranslate")
-            cache.create_cache(hash_key)
-
-            @retry(wait=wait_fixed(1))
-            def worker(s):  # 多线程翻译
-                try:
-                    hash_key_paragraph = cache.deterministic_hash(
-                        (s, str(self.translator))
-                    )
-                    new = cache.load_paragraph(hash_key, hash_key_paragraph)  # 查询缓存
-                    if new is None:
-                        new = self.translator.translate(s)
-                        new = remove_control_characters(new)
-                        cache.write_paragraph(hash_key, hash_key_paragraph, new)
-                    return new
-                except BaseException as e:
-                    if log.isEnabledFor(logging.DEBUG):
-                        log.exception(e)
-                    else:
-                        log.exception(e, exc_info=False)
-                    raise e
-
-            with concurrent.futures.ThreadPoolExecutor(
-                max_workers=self.thread
-            ) as executor:
-                news = list(executor.map(worker, sstk))
-
-            def raw_string(fcur, cstk):  # 编码字符串
-                if isinstance(self.fontmap[fcur], PDFCIDFont):  # 判断编码长度
-                    return "".join(["%04x" % ord(c) for c in cstk])
+                new = cache.load_paragraph(hash_key, hash_key_paragraph)  # 查询缓存
+                if new is None:
+                    new = self.translator.translate(s)
+                    new = remove_control_characters(new)
+                    cache.write_paragraph(hash_key, hash_key_paragraph, new)
+                return new
+            except BaseException as e:
+                if log.isEnabledFor(logging.DEBUG):
+                    log.exception(e)
                 else:
-                    return "".join(["%02x" % ord(c) for c in cstk])
-
-            _x, _y = 0, 0
-            for id, new in enumerate(news):  # 排版文字和公式
-                tx = x = pstk[id][1]
-                y = pstk[id][0]
-                lt = pstk[id][2]
-                rt = pstk[id][3]
-                ptr = 0
-                size = pstk[id][4]
-                font = pstk[id][5]
-                lb = pstk[id][6]  # 段落属性
-                cstk = ""  # 单行文字栈
-                fcur = fcur_ = None  # 单行字体
-                log.debug(
-                    f"< {y} {x} {lt} {rt} {size} {font.fontname} {lb} > {sstk[id]} | {new}"
-                )
-                while True:
-                    if ptr == len(new):  # 到达段落结尾
-                        if cstk:
-                            ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
-                        break
-                    vy_regex = re.match(
-                        r"\$?\s*v([\d\s]+)\$", new[ptr:], re.IGNORECASE
-                    )  # 匹配 $vn$ 公式标记,前面的 $ 有的时候会被丢掉
-                    mod = False  # 当前公式是否为文字修饰符
-                    if vy_regex:  # 加载公式
-                        ptr += len(vy_regex.group(0))
-                        try:
-                            vid = int(vy_regex.group(1).replace(" ", ""))
-                            adv = vlen[vid]
-                        except Exception:
-                            continue  # 翻译器可能会自动补个越界的公式标记
-                        if len(var[vid]) == 1 and unicodedata.category(
-                            var[vid][0].get_text()[0]
-                        ) in [
-                            "Lm",
-                            "Mn",
-                            "Sk",
-                        ]:  # 文字修饰符
-                            mod = True
-                    else:  # 加载文字
-                        ch = new[ptr]
-                        # if font.char_width(ord(ch)):
-                        fcur_ = None
-                        # 原字体编码容易出问题,这里直接放弃掉
-                        # try:
-                        #     if font.widths.get(ord(ch)) and font.to_unichr(ord(ch))==ch:
-                        #         fcur_=self.fontid[font] # 原字体
-                        # except:
-                        #     pass
-                        try:
-                            if (
-                                fcur_ is None
-                                and self.fontmap["tiro"].to_unichr(ord(ch)) == ch
-                            ):
-                                fcur_ = "tiro"  # 默认英文字体
-                        except Exception:
-                            pass
-                        if fcur_ is None:
-                            fcur_ = "china-ss"  # 默认中文字体
-                        # print(self.fontid[font],fcur_,ch,font.char_width(ord(ch)))
-                        adv = self.fontmap[fcur_].char_width(ord(ch)) * size
-                        ptr += 1
-                    if (
-                        fcur_ != fcur or vy_regex or x + adv > rt + 0.1 * size
-                    ):  # 输出文字缓冲区:1.字体更新 2.插入公式 3.到达右边界(可能一整行都被符号化,这里需要考虑浮点误差)
-                        if cstk:
-                            ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
-                            cstk = ""
-                    if lb and x + adv > rt + 0.1 * size:  # 到达右边界且原文段落存在换行
-                        x = lt
-                        lang_space = {
-                            "zh-CN": 1.4,
-                            "zh-TW": 1.4,
-                            "ja": 1.1,
-                            "ko": 1.2,
-                            "en": 1.2,
-                        }  # CJK
-                        y -= size * lang_space.get(
-                            self.translator.lang_out, 1.1
-                        )  # 小语种大多适配 1.1
-                    if vy_regex:  # 插入公式
-                        fix = 0
-                        if fcur is not None:  # 段落内公式修正纵向偏移
-                            fix = varf[vid]
-                        for vch in var[vid]:  # 排版公式字符
-                            vc = chr(vch.cid)
-                            ops += f"/{self.fontid[vch.font]} {vch.size:f} Tf 1 0 0 1 {x + vch.x0 - var[vid][0].x0:f} {fix + y + vch.y0 - var[vid][0].y0:f} Tm [<{raw_string(self.fontid[vch.font], vc)}>] TJ "  # noqa: E501
-                            if log.isEnabledFor(logging.DEBUG):
-                                lstk.append(
-                                    LTLine(
-                                        0.1,
-                                        (_x, _y),
-                                        (
-                                            x + vch.x0 - var[vid][0].x0,
-                                            fix + y + vch.y0 - var[vid][0].y0,
-                                        ),
-                                    )
-                                )
-                                _x, _y = (
-                                    x + vch.x0 - var[vid][0].x0,
-                                    fix + y + vch.y0 - var[vid][0].y0,
-                                )
-                        for l in varl[vid]:  # 排版公式线条 # noqa: E741
-                            if l.linewidth < 5:  # hack
-                                ops += f"ET q 1 0 0 1 {l.pts[0][0] + x - var[vid][0].x0:f} {l.pts[0][1] + fix + y - var[vid][0].y0:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "  # noqa: E501
-                    else:  # 插入文字缓冲区
-                        if not cstk:  # 单行开头
-                            tx = x
-                            if x == lt and ch == " ":  # 消除段落换行空格
-                                adv = 0
-                            else:
-                                cstk += ch
+                    log.exception(e, exc_info=False)
+                raise e
+        with concurrent.futures.ThreadPoolExecutor(
+            max_workers=self.thread
+        ) as executor:
+            news = list(executor.map(worker, sstk))
+
+        ############################################################
+        # C. 新文档排版
+        def raw_string(fcur, cstk):  # 编码字符串
+            if isinstance(self.fontmap[fcur], PDFCIDFont):  # 判断编码长度
+                return "".join(["%04x" % ord(c) for c in cstk])
+            else:
+                return "".join(["%02x" % ord(c) for c in cstk])
+        _x, _y = 0, 0
+        for id, new in enumerate(news):
+            tx = x = pstk[id][1]    # 段落初始横坐标
+            y = pstk[id][0]         # 段落上边界
+            lt = pstk[id][2]        # 段落左边界
+            rt = pstk[id][3]        # 段落右边界
+            size = pstk[id][4]      # 段落字体大小
+            font = pstk[id][5]      # 段落字体
+            lb = pstk[id][6]        # 段落属性
+            cstk = ""               # 当前文字栈
+            fcur = fcur_ = None     # 当前字体
+            ptr = 0
+            log.debug(f"< {y} {x} {lt} {rt} {size} {font.fontname} {lb} > {sstk[id]} | {new}")
+            while True:
+                if ptr == len(new):  # 到达段落结尾
+                    if cstk:
+                        ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
+                    break
+                vy_regex = re.match(
+                    r"\$?\s*v([\d\s]+)\$", new[ptr:], re.IGNORECASE
+                )  # 匹配 $vn$ 公式标记,前面的 $ 有的时候会被丢掉
+                mod = False  # 当前公式是否为文字修饰符
+                if vy_regex:  # 加载公式
+                    ptr += len(vy_regex.group(0))
+                    try:
+                        vid = int(vy_regex.group(1).replace(" ", ""))
+                        adv = vlen[vid]
+                    except Exception:
+                        continue  # 翻译器可能会自动补个越界的公式标记
+                    if len(var[vid]) == 1 and unicodedata.category(
+                        var[vid][0].get_text()[0]
+                    ) in ["Lm","Mn","Sk",]:  # 文字修饰符
+                        mod = True
+                else:  # 加载文字
+                    ch = new[ptr]
+                    # if font.char_width(ord(ch)):
+                    fcur_ = None
+                    # 原字体编码容易出问题,这里直接放弃掉
+                    # try:
+                    #     if font.widths.get(ord(ch)) and font.to_unichr(ord(ch))==ch:
+                    #         fcur_=self.fontid[font] # 原字体
+                    # except:
+                    #     pass
+                    try:
+                        if fcur_ is None and self.fontmap["tiro"].to_unichr(ord(ch)) == ch:
+                            fcur_ = "tiro"  # 默认英文字体
+                    except Exception:
+                        pass
+                    if fcur_ is None:
+                        fcur_ = "china-ss"  # 默认中文字体
+                    # print(self.fontid[font],fcur_,ch,font.char_width(ord(ch)))
+                    adv = self.fontmap[fcur_].char_width(ord(ch)) * size
+                    ptr += 1
+                if (                                # 输出文字缓冲区
+                    fcur_ != fcur                   # 1. 字体更新
+                    or vy_regex                     # 2. 插入公式
+                    or x + adv > rt + 0.1 * size    # 3. 到达右边界(可能一整行都被符号化,这里需要考虑浮点误差)
+                ):
+                    if cstk:
+                        ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
+                        cstk = ""
+                if lb and x + adv > rt + 0.1 * size:  # 到达右边界且原文段落存在换行
+                    x = lt
+                    lang_space = {"zh-CN": 1.4,"zh-TW": 1.4,"ja": 1.1,"ko": 1.2,"en": 1.2}  # CJK
+                    y -= size * lang_space.get(self.translator.lang_out, 1.1)  # 小语种大多适配 1.1
+                if vy_regex:  # 插入公式
+                    fix = 0
+                    if fcur is not None:  # 段落内公式修正纵向偏移
+                        fix = varf[vid]
+                    for vch in var[vid]:  # 排版公式字符
+                        vc = chr(vch.cid)
+                        ops += f"/{self.fontid[vch.font]} {vch.size:f} Tf 1 0 0 1 {x + vch.x0 - var[vid][0].x0:f} {fix + y + vch.y0 - var[vid][0].y0:f} Tm [<{raw_string(self.fontid[vch.font], vc)}>] TJ "  # noqa: E501
+                        if log.isEnabledFor(logging.DEBUG):
+                            lstk.append(LTLine(0.1,(_x, _y),(x + vch.x0 - var[vid][0].x0,fix + y + vch.y0 - var[vid][0].y0,)))
+                            _x, _y = x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0
+                    for l in varl[vid]:  # 排版公式线条 # noqa: E741
+                        if l.linewidth < 5:  # hack 有的文档会用粗线条当图片背景
+                            ops += f"ET q 1 0 0 1 {l.pts[0][0] + x - var[vid][0].x0:f} {l.pts[0][1] + fix + y - var[vid][0].y0:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "  # noqa: E501
+                else:  # 插入文字缓冲区
+                    if not cstk:  # 单行开头
+                        tx = x
+                        if x == lt and ch == " ":  # 消除段落换行空格
+                            adv = 0
                         else:
                             cstk += ch
-                    if mod:  # 文字修饰符
-                        adv = 0
-                    fcur = fcur_
-                    x += adv
-                    if log.isEnabledFor(logging.DEBUG):
-                        lstk.append(LTLine(0.1, (_x, _y), (x, y)))
-                        _x, _y = x, y
-            for l in lstk:  # 排版全局线条 # noqa: E741
-                if l.linewidth < 5:  # hack
-                    ops += f"ET q 1 0 0 1 {l.pts[0][0]:f} {l.pts[0][1]:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "  # noqa: E501
-            ops = f"BT {ops}ET "
-            return ops
-
-        ops = render(ltpage)
+                    else:
+                        cstk += ch
+                if mod:  # 文字修饰符
+                    adv = 0
+                fcur = fcur_
+                x += adv
+                if log.isEnabledFor(logging.DEBUG):
+                    lstk.append(LTLine(0.1, (_x, _y), (x, y)))
+                    _x, _y = x, y
+        for l in lstk:  # 排版全局线条 # noqa: E741
+            if l.linewidth < 5:  # hack 有的文档会用粗线条当图片背景
+                ops += f"ET q 1 0 0 1 {l.pts[0][0]:f} {l.pts[0][1]:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "  # noqa: E501
+        ops = f"BT {ops}ET "
         return ops
 
     # Some dummy functions to save memory/CPU when all that is wanted

+ 1 - 1
pdf2zh/high_level.py

@@ -205,7 +205,7 @@ def extract_text_to_fp(
             # print(page.number,page_layout)
             page.rotate = (page.rotate + rotation) % 360
             # 新建一个 xref 存放新指令流
-            page.page_xref = doc_en.get_new_xref()  # hack
+            page.page_xref = doc_en.get_new_xref()  # hack 插入页面的新 xref
             doc_en.update_object(page.page_xref, "<<>>")
             doc_en.update_stream(page.page_xref, b"")
             doc_en[page.pageno].set_contents(page.page_xref)

+ 1 - 40
pdf2zh/pdf2zh.py

@@ -8,7 +8,6 @@ from __future__ import annotations
 import argparse
 import logging
 import os
-import subprocess
 import sys
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Container, Iterable, List, Optional
@@ -93,45 +92,7 @@ def extract_text(
     for file in files:
         filename = os.path.splitext(os.path.basename(file))[0]
 
-        def convert_to_pdfa(input_pdf_path, output_pdfa_path):
-            """
-            Converts a PDF to PDF/A format using Ghostscript.
-            Args:
-                input_pdf_path (str): Path to the input PDF file.
-                output_pdfa_path (str): Path where the PDF/A file will be saved.
-            """
-            try:
-                # Ghostscript command for conversion
-                command = [
-                    "gs",
-                    "-dPDFA",
-                    "-dBATCH",
-                    "-dNOPAUSE",
-                    "-dNOOUTERSAVE",
-                    "-sDEVICE=pdfwrite",
-                    "-sOutputFile=" + output_pdfa_path,
-                    "-dPDFACompatibilityPolicy=1",
-                    input_pdf_path,
-                ]
-
-                # Run the command
-                subprocess.run(command, check=True)
-                print(
-                    f"Successfully converted {input_pdf_path} to PDF/A at {output_pdfa_path}"
-                )
-            except subprocess.CalledProcessError as e:
-                print(f"Error during conversion: {e}")
-            except FileNotFoundError:
-                print("Ghostscript is not installed or not found in the PATH.")
-
-        try:
-            file_pdfa = f"{str(file)}-pdfa.pdf"
-            convert_to_pdfa(file, file_pdfa)
-            doc_en = pymupdf.open(file_pdfa)
-        except Exception as e:
-            print(f"Error converting PDF: {e}")
-            doc_en = pymupdf.open(file)
-
+        doc_en = pymupdf.open(file)
         page_count = doc_en.page_count
         font_list = ["china-ss", "tiro"]
         font_id = {}