|
|
@@ -1,13 +1,14 @@
|
|
|
import os
|
|
|
-import re
|
|
|
-import subprocess
|
|
|
-import tempfile
|
|
|
+import shutil
|
|
|
from pathlib import Path
|
|
|
from pdf2zh import __version__
|
|
|
+from pdf2zh.pdf2zh import extract_text
|
|
|
|
|
|
import gradio as gr
|
|
|
import numpy as np
|
|
|
import pymupdf
|
|
|
+import tqdm
|
|
|
+import requests
|
|
|
|
|
|
# Map service names to pdf2zh service options
|
|
|
service_map = {
|
|
|
@@ -29,6 +30,37 @@ lang_map = {
|
|
|
"Spanish": "es",
|
|
|
"Italian": "it",
|
|
|
}
|
|
|
# Page-range choices shown in the UI, mapped to the page-index lists handed to
# the translator ("First" is index 0; "All" maps to None).
page_map = {
    "All": None,
    "First": [0],
    "First 5 pages": list(range(5)),
}

# Demo mode is enabled by any non-empty PDF2ZH_DEMO environment value.
flag_demo = bool(os.environ.get("PDF2ZH_DEMO"))

# reCAPTCHA key pair, read from the environment. Bound unconditionally so the
# names always exist (they are None when unset); they are only used in demo mode.
client_key = os.environ.get("PDF2ZH_CLIENT_KEY")
server_key = os.environ.get("PDF2ZH_SERVER_KEY")

if flag_demo:
    # The demo deployment narrows the offered services and page ranges.
    service_map = {
        "Google": "google",
    }
    page_map = {
        "First": [0],
        "First 20 pages": list(range(20)),
    }
|
|
|
+
|
|
|
+
|
|
|
def verify_recaptcha(response):
    """Check a reCAPTCHA token against Google's siteverify endpoint.

    Args:
        response: Token string produced by the client-side reCAPTCHA widget.

    Returns:
        The ``"success"`` field of the siteverify JSON reply. ``False`` when
        the token is empty or when the verification request itself fails
        (network error, timeout, non-JSON reply), so callers fail closed.
    """
    # An empty token can never verify; skip the network round-trip.
    if not response:
        return False

    recaptcha_url = "https://www.google.com/recaptcha/api/siteverify"

    # server_key is the secret half of the key pair: send it, never log it.
    data = {"secret": server_key, "response": response}
    try:
        # Without an explicit timeout a stalled connection would block forever.
        result = requests.post(recaptcha_url, data=data, timeout=10).json()
    except (requests.RequestException, ValueError) as exc:
        print('reCAPTCHA', f"verification request failed: {exc}")
        return False

    success = result.get("success")
    print('reCAPTCHA', success)
    return success
|
|
|
|
|
|
|
|
|
def pdf_preview(file):
|
|
|
@@ -42,146 +74,76 @@ def pdf_preview(file):
|
|
|
def upload_file(file, service, progress=gr.Progress()):
    """Validate an uploaded file and render its preview.

    Returns a 4-tuple ``(file, preview_image, update, update)``. On success
    both trailing ``gr.update`` values set ``visible=True``; when the file is
    missing or the preview cannot be produced, the first two items are
    ``None`` and both updates set ``visible=False``.

    ``service`` and ``progress`` are accepted but not used in this body.
    """

    def _rejected():
        # Result for every failure path: nothing to show, both targets hidden.
        return None, None, gr.update(visible=False), gr.update(visible=False)

    if not (file and os.path.exists(file)):
        return _rejected()

    try:
        # Convert first page for preview
        outcome = (
            file,
            pdf_preview(file),
            gr.update(visible=True),
            gr.update(visible=True),
        )
    except Exception as e:
        print(f"Error converting PDF: {e}")
        return _rejected()
    return outcome
|
|
|
|
|
|
|
|
|
def translate(
|
|
|
- file_path, service, model_id, lang, page_range, extra_args, progress=gr.Progress()
|
|
|
+ file_path, service, model_id, lang, page_range, recaptcha_response, progress=gr.Progress()
|
|
|
):
|
|
|
"""Translate PDF content using selected service."""
|
|
|
if not file_path:
|
|
|
- return (
|
|
|
- None,
|
|
|
- None,
|
|
|
- None,
|
|
|
- gr.update(visible=False),
|
|
|
- gr.update(visible=False),
|
|
|
- gr.update(visible=False),
|
|
|
- )
|
|
|
+ raise gr.Error('No input')
|
|
|
|
|
|
- progress(0, desc="Starting translation...")
|
|
|
-
|
|
|
- # Create a temporary working directory using Gradio's file utilities
|
|
|
- with tempfile.TemporaryDirectory() as temp_dir:
|
|
|
- # Create safe paths using pathlib
|
|
|
- temp_path = Path(temp_dir)
|
|
|
- input_pdf = temp_path / "input.pdf"
|
|
|
-
|
|
|
- # Copy input file to temp directory
|
|
|
- progress(0.2, desc="Preparing files...")
|
|
|
- with open(file_path, "rb") as src, open(input_pdf, "wb") as dst:
|
|
|
- dst.write(src.read())
|
|
|
-
|
|
|
- selected_service = service_map.get(service, "google")
|
|
|
- lang_to = lang_map.get(lang, "zh")
|
|
|
-
|
|
|
- # Execute translation in temp directory with real-time progress
|
|
|
- progress(0.3, desc=f"Starting translation with {selected_service}...")
|
|
|
-
|
|
|
- # Create output directory for translated files
|
|
|
- output_dir = Path("gradio_files") / "outputs"
|
|
|
- output_dir.mkdir(parents=True, exist_ok=True)
|
|
|
- final_output = output_dir / f"translated_{os.path.basename(file_path)}"
|
|
|
- final_output_dual = output_dir / f"dual_{os.path.basename(file_path)}"
|
|
|
-
|
|
|
- # Prepare extra arguments
|
|
|
- extra_args = extra_args.strip()
|
|
|
- # Add page range arguments
|
|
|
- if page_range == "All":
|
|
|
- extra_args += ""
|
|
|
- elif page_range == "First":
|
|
|
- extra_args += " -p 1"
|
|
|
- elif page_range == "First 5 pages":
|
|
|
- extra_args += " -p 1-5"
|
|
|
-
|
|
|
- # Execute translation command
|
|
|
- if selected_service == "google":
|
|
|
- lang_to = "zh-CN" if lang_to == "zh" else lang_to
|
|
|
-
|
|
|
- if selected_service in ["ollama", "openai"]:
|
|
|
- command = f'cd "{temp_path}" && pdf2zh "{input_pdf}" -lo {lang_to} -s {selected_service}:{model_id} {extra_args}'
|
|
|
- else:
|
|
|
- command = f'cd "{temp_path}" && pdf2zh "{input_pdf}" -lo {lang_to} -s {selected_service} {extra_args}'
|
|
|
- print(f"Executing command: {command}")
|
|
|
- print(f"Files in temp directory: {os.listdir(temp_path)}")
|
|
|
-
|
|
|
- process = subprocess.Popen(
|
|
|
- command,
|
|
|
- shell=True,
|
|
|
- stdout=subprocess.PIPE,
|
|
|
- stderr=subprocess.STDOUT,
|
|
|
- universal_newlines=True,
|
|
|
- )
|
|
|
-
|
|
|
- # Monitor progress from command output
|
|
|
- while True:
|
|
|
- output = process.stdout.readline()
|
|
|
- if output == "" and process.poll() is not None:
|
|
|
- break
|
|
|
- if output:
|
|
|
- print(f"Command output: {output.strip()}")
|
|
|
- # Look for percentage in output
|
|
|
- match = re.search(r"(\d+)%", output.strip())
|
|
|
- if match:
|
|
|
- percent = int(match.group(1))
|
|
|
- # Map command progress (0-100%) to our progress range (30-80%)
|
|
|
- progress_val = 0.3 + (percent * 0.5 / 100)
|
|
|
- progress(progress_val, desc=f"Translating content: {percent}%")
|
|
|
-
|
|
|
- # Get the return code
|
|
|
- return_code = process.poll()
|
|
|
- print(f"Command completed with return code: {return_code}")
|
|
|
-
|
|
|
- # Check if translation was successful
|
|
|
- translated_file = temp_path / "input-zh.pdf" # <= Do not change filename
|
|
|
- dual_file = temp_path / "input-dual.pdf"
|
|
|
- print(f"Files after translation: {os.listdir(temp_path)}")
|
|
|
-
|
|
|
- if not translated_file.exists() and not dual_file.exists():
|
|
|
- print("Translation failed: No output files found")
|
|
|
- return (
|
|
|
- None,
|
|
|
- None,
|
|
|
- None,
|
|
|
- gr.update(visible=False),
|
|
|
- gr.update(visible=False),
|
|
|
- gr.update(visible=False),
|
|
|
- )
|
|
|
+ if flag_demo and not verify_recaptcha(recaptcha_response):
|
|
|
+ raise gr.Error('reCAPTCHA fail')
|
|
|
|
|
|
- # Copy the translated files to permanent locations
|
|
|
- progress(0.8, desc="Saving translated files...")
|
|
|
-
|
|
|
- if translated_file.exists():
|
|
|
- with open(translated_file, "rb") as src, open(final_output, "wb") as dst:
|
|
|
- dst.write(src.read())
|
|
|
+ progress(0, desc="Starting translation...")
|
|
|
|
|
|
- if dual_file.exists():
|
|
|
- with open(dual_file, "rb") as src, open(final_output_dual, "wb") as dst:
|
|
|
- dst.write(src.read())
|
|
|
+ output = Path("pdf2zh_files")
|
|
|
+ output.mkdir(parents=True, exist_ok=True)
|
|
|
+ filename = os.path.splitext(os.path.basename(file_path))[0]
|
|
|
+ file_en = output / f"{filename}.pdf"
|
|
|
+ file_zh = output / f"{filename}-zh.pdf"
|
|
|
+ file_dual = output / f"{filename}-dual.pdf"
|
|
|
+ shutil.copyfile(file_path, file_en)
|
|
|
+
|
|
|
+ selected_service = service_map.get(service, "google")
|
|
|
+ selected_page = page_map.get(page_range, [1])
|
|
|
+ lang_to = lang_map.get(lang, "zh")
|
|
|
+ if selected_service == "google":
|
|
|
+ lang_to = "zh-CN" if lang_to == "zh" else lang_to
|
|
|
+
|
|
|
+ print(f"Files before translation: {os.listdir(output)}")
|
|
|
+ def progress_bar(t:tqdm.tqdm):
|
|
|
+ progress(t.n/t.total, desc="Translating...")
|
|
|
+
|
|
|
+ param={
|
|
|
+ 'files':[file_en],
|
|
|
+ 'pages':selected_page,
|
|
|
+ 'lang_in':'auto',
|
|
|
+ 'lang_out':lang_to,
|
|
|
+ 'service':f"{selected_service}:{model_id}",
|
|
|
+ 'output':output,
|
|
|
+ 'thread':4,
|
|
|
+ 'callback':progress_bar,
|
|
|
+ }
|
|
|
+ print(param)
|
|
|
+ extract_text(**param)
|
|
|
+ print(f"Files after translation: {os.listdir(output)}")
|
|
|
+
|
|
|
+ if not file_zh.exists() or not file_dual.exists():
|
|
|
+ raise gr.Error('No output')
|
|
|
|
|
|
- # Generate preview of translated PDF
|
|
|
- progress(0.9, desc="Generating preview...")
|
|
|
- try:
|
|
|
- translated_preview = pdf_preview(str(final_output))
|
|
|
- except Exception as e:
|
|
|
- print(f"Error generating preview: {e}")
|
|
|
- translated_preview = None
|
|
|
+ try:
|
|
|
+ translated_preview = pdf_preview(str(file_zh))
|
|
|
+ except Exception as e:
|
|
|
+ raise gr.Error('No preview')
|
|
|
|
|
|
progress(1.0, desc="Translation complete!")
|
|
|
+
|
|
|
return (
|
|
|
- str(final_output),
|
|
|
+ str(file_zh),
|
|
|
translated_preview,
|
|
|
- str(final_output_dual),
|
|
|
+ str(file_dual),
|
|
|
gr.update(visible=True),
|
|
|
gr.update(visible=True),
|
|
|
gr.update(visible=True),
|
|
|
@@ -239,12 +201,12 @@ with gr.Blocks(
|
|
|
transition: background-color 0.2s ease-in;
|
|
|
}
|
|
|
|
|
|
-.progress-bar-wrap {
|
|
|
- border-radius: 8px !important;
|
|
|
-}
|
|
|
-.progress-bar {
|
|
|
- border-radius: 8px !important;
|
|
|
-}
|
|
|
+ .progress-bar-wrap {
|
|
|
+ border-radius: 8px !important;
|
|
|
+ }
|
|
|
+ .progress-bar {
|
|
|
+ border-radius: 8px !important;
|
|
|
+ }
|
|
|
|
|
|
# .input-file label {
|
|
|
# color: #165DFF !important;
|
|
|
@@ -259,12 +221,22 @@ with gr.Blocks(
|
|
|
# color: #165DFF !important;
|
|
|
# }
|
|
|
""",
|
|
|
+ head='''
|
|
|
+ <script src="https://www.google.com/recaptcha/api.js" async defer></script>
|
|
|
+ <script type="text/javascript">
|
|
|
+ var onVerify = function(token) {
|
|
|
+ el=document.getElementById('verify').getElementsByTagName('textarea')[0];
|
|
|
+ el.value=token;
|
|
|
+ el.dispatchEvent(new Event('input'));
|
|
|
+ };
|
|
|
+ </script>
|
|
|
+ ''' if flag_demo else None
|
|
|
) as demo:
|
|
|
- gr.Markdown("# PDFMathTranslate")
|
|
|
+ gr.Markdown("# [PDFMathTranslate @ Github](https://github.com/Byaidu/PDFMathTranslate)")
|
|
|
|
|
|
with gr.Row():
|
|
|
with gr.Column(scale=1):
|
|
|
- gr.Markdown("## File")
|
|
|
+ gr.Markdown("## File | < 5 MB" if flag_demo else "## File")
|
|
|
file_input = gr.File(
|
|
|
label="Document",
|
|
|
file_count="single",
|
|
|
@@ -279,12 +251,6 @@ with gr.Blocks(
|
|
|
choices=service_map.keys(),
|
|
|
value="Google",
|
|
|
)
|
|
|
- # lang_src = gr.Dropdown(
|
|
|
- # label="Source Language",
|
|
|
- # info="Which translation service to use. Some require keys",
|
|
|
- # choices=["Google", "DeepL", "DeepLX", "Ollama", "Azure"],
|
|
|
- # value="Google",
|
|
|
- # )
|
|
|
lang_to = gr.Dropdown(
|
|
|
label="Translate to",
|
|
|
info="Which language to translate to (optional)",
|
|
|
@@ -292,10 +258,10 @@ with gr.Blocks(
|
|
|
value="Chinese",
|
|
|
)
|
|
|
page_range = gr.Radio(
|
|
|
- ["All", "First", "First 5 pages"],
|
|
|
+ choices=page_map.keys(),
|
|
|
label="Pages",
|
|
|
info="Translate the full document or just few pages (optional)",
|
|
|
- value="All",
|
|
|
+ value=list(page_map.keys())[0],
|
|
|
)
|
|
|
model_id = gr.Textbox(
|
|
|
label="Model ID",
|
|
|
@@ -303,11 +269,6 @@ with gr.Blocks(
|
|
|
# value="gemma2",
|
|
|
visible=False, # hide by default
|
|
|
)
|
|
|
- extra_args = gr.Textbox(
|
|
|
- label="Advanced Arguments",
|
|
|
- info="Extra arguments supported in commandline (optional)",
|
|
|
- value="",
|
|
|
- )
|
|
|
envs_status = "<span class='env-success'>- Properly configured.</span><br>"
|
|
|
|
|
|
def details_wrapper(text_markdown):
|
|
|
@@ -374,6 +335,11 @@ with gr.Blocks(
|
|
|
output_file_dual = gr.File(
|
|
|
label="Download Translation (Dual)", visible=False
|
|
|
)
|
|
|
+ recaptcha_response = gr.Textbox(label="reCAPTCHA Response", elem_id='verify', visible=False)
|
|
|
+ if flag_demo:
|
|
|
+ recaptcha_box=gr.HTML(f'<div class="g-recaptcha" data-sitekey="{client_key}" data-callback="onVerify"></div>', visible=False)
|
|
|
+ else:
|
|
|
+ recaptcha_box=gr.HTML()
|
|
|
translate_btn = gr.Button("Translate", variant="primary", visible=False)
|
|
|
tech_details_tog = gr.Markdown(
|
|
|
details_wrapper(envs_status),
|
|
|
@@ -389,12 +355,12 @@ with gr.Blocks(
|
|
|
file_input.upload(
|
|
|
upload_file,
|
|
|
inputs=[file_input, service],
|
|
|
- outputs=[file_input, preview, translate_btn],
|
|
|
+ outputs=[file_input, preview, translate_btn, recaptcha_box],
|
|
|
)
|
|
|
|
|
|
translate_btn.click(
|
|
|
translate,
|
|
|
- inputs=[file_input, service, model_id, lang_to, page_range, extra_args],
|
|
|
+ inputs=[file_input, service, model_id, lang_to, page_range, recaptcha_response],
|
|
|
outputs=[
|
|
|
output_file,
|
|
|
preview,
|
|
|
@@ -407,15 +373,19 @@ with gr.Blocks(
|
|
|
|
|
|
|
|
|
def setup_gui(share=False):
    """Launch the Gradio app.

    In demo mode the app is served on 0.0.0.0 with a 5 MB upload cap and
    ``share`` is not consulted. Otherwise launching is attempted on 0.0.0.0,
    then on 127.0.0.1, and finally with Gradio's defaults and ``share=True``.

    Args:
        share: Forwarded to the first two non-demo launch attempts.
    """
    # NOTE(review): presumably imported up front to warm a cache (per the
    # original "# cache" remark) -- confirm before removing.
    import doclayout_yolo  # cache

    if flag_demo:
        demo.launch(server_name="0.0.0.0", max_file_size='5mb', inbrowser=True)
        return

    try:
        demo.launch(server_name="0.0.0.0", debug=True, inbrowser=True, share=share)
    except Exception:
        print("Error launching GUI using 0.0.0.0.\nThis may be caused by global mode of proxy software.")
        try:
            demo.launch(server_name="127.0.0.1", debug=True, inbrowser=True, share=share)
        except Exception:
            print("Error launching GUI using 127.0.0.1.\nThis may be caused by global mode of proxy software.")
            # Last resort: default host, with a public share link forced on.
            demo.launch(debug=True, inbrowser=True, share=True)
|
|
|
|
|
|
# For auto-reloading while developing
|
|
|
if __name__ == "__main__":
|