Jelajahi Sumber

feat (cli): add support for non-pdf/a document

Rongxin 11 bulan lalu
induk
melakukan
6b293ab38e
7 mengubah file dengan 92 tambahan dan 21 penghapusan
  1. 4 3
      README.md
  2. 77 16
      pdf2zh/high_level.py
  3. 10 2
      pdf2zh/pdf2zh.py
  4. 1 0
      pyproject.toml
  5. TEMPAT SAMPAH
      test/file/translate.cli.font.unknown.pdf
  6. TEMPAT SAMPAH
      test/file/translate.cli.plain.text.pdf
  7. TEMPAT SAMPAH
      test/file/translate.cli.text.with.figure.pdf

+ 4 - 3
README.md

@@ -43,9 +43,9 @@ Feel free to provide feedback in [GitHub Issues](https://github.com/Byaidu/PDFMa
 
 <h2 id="updates">Updates</h2>
 
-- [Nov. 26 2024] CLI now supports online file(s) _(by [@reycn](https://github.com/reycn))_
-- [Nov. 24 2024] [ONNX](https://github.com/onnx/onnx) support to reduce dependency sizes _(by [@Wybxc](https://github.com/Wybxc))_
-- [Nov. 23 2024] 🌟 [Public Service](#demo) online! _(by [@Byaidu](https://github.com/Byaidu))_
+- [Dec. 19 2024] Non-PDF/A documents are now supported using `-cp` _(by [@reycn](https://github.com/reycn))_
+- [Dec. 13 2024] Additional support for backend _(by [@YadominJinta](https://github.com/YadominJinta))_
+- [Dec. 10 2024] The translator now supports OpenAI models on Azure _(by [@yidasanqian](https://github.com/yidasanqian))_
 
 <h2 id="preview">Preview</h2>
 
@@ -184,6 +184,7 @@ In the following table, we list all advanced options for reference:
 | `-t`           | [Multi-threads](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#threads)                | `pdf2zh example.pdf -t 1`                      |
 | `-o`           | Output dir                                                                                                    | `pdf2zh example.pdf -o output`                 |
 | `-f`, `-c`     | [Exceptions](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#exceptions)                | `pdf2zh example.pdf -f "(MS.*)"`               |
+| `-cp`          | Compatibility Mode                                                                                            | `pdf2zh example.pdf --compatible`              |
 | `--share`      | Public link                                                                                                   | `pdf2zh -i --share`                            |
 | `--authorized` | Authorization                                                                                                 | `pdf2zh -i --authorized users.txt [auth.html]` |
 | `--prompt`     | [Custom Prompt](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#prompt)                 | `pdf2zh --prompt [prompt.txt]`                 |

+ 77 - 16
pdf2zh/high_level.py

@@ -1,27 +1,28 @@
 """Functions that can be used for the most common use-cases for pdf2zh.six"""
 
 import asyncio
+import io
+import os
+import sys
+import tempfile
+import urllib.request
 from asyncio import CancelledError
-from typing import BinaryIO
+from pathlib import Path
+from typing import Any, BinaryIO, List, Optional
+
 import numpy as np
+import requests
 import tqdm
-import sys
-from pymupdf import Font, Document
-from pdfminer.pdfpage import PDFPage
-from pdfminer.pdfinterp import PDFResourceManager
 from pdfminer.pdfdocument import PDFDocument
-from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfexceptions import PDFValueError
+from pdfminer.pdfinterp import PDFResourceManager
+from pdfminer.pdfpage import PDFPage
+from pdfminer.pdfparser import PDFParser
+from pymupdf import Document, Font
+
 from pdf2zh.converter import TranslateConverter
-from pdf2zh.pdfinterp import PDFPageInterpreterEx
 from pdf2zh.doclayout import DocLayoutModel
-from pathlib import Path
-from typing import Any, List, Optional
-import urllib.request
-import requests
-import tempfile
-import os
-import io
+from pdf2zh.pdfinterp import PDFPageInterpreterEx
 
 model = DocLayoutModel.load_available()
 
@@ -136,7 +137,7 @@ def translate_patch(
             h, w = box.shape
             vcls = ["abandon", "figure", "table", "isolate_formula", "formula_caption"]
             for i, d in enumerate(page_layout.boxes):
-                if not page_layout.names[int(d.cls)] in vcls:
+                if page_layout.names[int(d.cls)] not in vcls:
                     x0, y0, x1, y1 = d.xyxy.squeeze()
                     x0, y0, x1, y1 = (
                         np.clip(int(x0 - 1), 0, w - 1),
@@ -246,6 +247,56 @@ def translate_stream(
     return doc_zh.write(deflate=1), doc_en.write(deflate=1)
 
 
+def convert_to_pdfa(input_path, output_path):
+    """
+    Tag a PDF with PDF/A-2B identification and save a linearized copy.
+
+    Only document-level metadata is touched: the DocInfo entries are copied
+    into the XMP packet, the ``pdfaid:part`` / ``pdfaid:conformance`` markers
+    are set, and an sRGB OutputIntent dictionary is added to the document
+    root. Page content is not rewritten and no ICC profile is embedded here,
+    so the output is not guaranteed to pass a strict PDF/A validator.
+
+    Args:
+        input_path: Path to source PDF file
+        output_path: Path to save PDF/A file
+    """
+    # Imported here rather than at module level, so pikepdf is only loaded
+    # when this function actually runs.
+    from pikepdf import Dictionary, Name, Pdf
+
+    pdfa_part = "2"
+    pdfa_conformance = "B"
+
+    # The context manager closes the file even if tagging or saving raises.
+    with Pdf.open(input_path) as pdf:
+        # Mirror the DocInfo entries into XMP, then add the PDF/A markers.
+        with pdf.open_metadata() as meta:
+            meta.load_from_docinfo(pdf.docinfo)
+            meta["pdfaid:part"] = pdfa_part
+            meta["pdfaid:conformance"] = pdfa_conformance
+
+        # Create OutputIntent dictionary
+        output_intent = Dictionary(
+            {
+                "/Type": Name("/OutputIntent"),
+                "/S": Name("/GTS_PDFA1"),
+                "/OutputConditionIdentifier": "sRGB IEC61966-2.1",
+                "/RegistryName": "http://www.color.org",
+                "/Info": "sRGB IEC61966-2.1",
+            }
+        )
+
+        # Add output intent to PDF root, keeping any intents already present
+        if "/OutputIntents" not in pdf.Root:
+            pdf.Root.OutputIntents = [output_intent]
+        else:
+            pdf.Root.OutputIntents.append(output_intent)
+
+        # Save the tagged copy
+        pdf.save(output_path, linearize=True)
+
+
 def translate(
     files: list[str],
     output: str = "",
@@ -257,6 +308,7 @@ def translate(
     vfont: str = "",
     vchar: str = "",
     callback: object = None,
+    compatible: bool = False,
     cancellation_event: asyncio.Event = None,
     **kwarg: Any,
 ):
@@ -294,7 +346,15 @@ def translate(
                 )
         filename = os.path.splitext(os.path.basename(file))[0]
 
-        doc_raw = open(file, "rb")
+        # If the commandline has specified converting to PDF/A format
+        ## --compatible / -cp
+        if compatible:
+            file_pdfa = file.replace(".pdf", "-pdfa.pdf")
+            print(f"Converting {file} to PDF/A format...")
+            convert_to_pdfa(file, file_pdfa)
+            doc_raw = open(file_pdfa, "rb")
+        else:
+            doc_raw = open(file, "rb")
         s_raw = doc_raw.read()
         s_mono, s_dual = translate_stream(
             s_raw,
@@ -311,3 +371,4 @@ def translate(
         result_files.append((str(file_mono), str(file_dual)))
 
     return result_files
+    return result_files

+ 10 - 2
pdf2zh/pdf2zh.py

@@ -6,12 +6,13 @@ output it to plain text, html, xml or tags.
 from __future__ import annotations
 
 import argparse
-import sys
 import logging
+import sys
+from string import Template
 from typing import List, Optional
+
 from pdf2zh import __version__, log
 from pdf2zh.high_level import translate
-from string import Template
 
 
 def create_parser() -> argparse.ArgumentParser:
@@ -128,6 +129,13 @@ def create_parser() -> argparse.ArgumentParser:
         help="user custom prompt.",
     )
 
+    parse_params.add_argument(
+        "--compatible",
+        "-cp",
+        action="store_true",
+        help="Convert the PDF file into PDF/A format to improve compatibility.",
+    )
+
     return parser
 
 

+ 1 - 0
pyproject.toml

@@ -28,6 +28,7 @@ dependencies = [
     "tencentcloud-sdk-python",
     "pdfminer.six>=20240706",
     "gradio_pdf",
+    "pikepdf",
 ]
 
 [project.optional-dependencies]

TEMPAT SAMPAH
test/file/translate.cli.font.unknown.pdf


TEMPAT SAMPAH
test/file/translate.cli.plain.text.pdf


TEMPAT SAMPAH
test/file/translate.cli.text.with.figure.pdf