Byaidu il y a 1 an
Parent
commit
a79b39ef27
4 fichiers modifiés avec 90 ajouts et 10 suppressions
  1. 39 2
      README.md
  2. 39 0
      README_zh-CN.md
  3. 10 6
      pdf2zh/backend.py
  4. 2 2
      pdf2zh/pdf2zh.py

+ 39 - 2
README.md

@@ -233,6 +233,45 @@ Use `-t` to specify how many threads to use in translation:
 pdf2zh example.pdf -t 1
 ```
 
+<h2 id="api">API</h2>
+
+### Python
+
+```python
+from pdf2zh import translate, translate_stream
+
+params = {"lang_in": "en", "lang_out": "zh", "service": "google", "thread": 4}
+doc_mono, doc_dual = translate(files=["example.pdf"], **params)
+with open("example.pdf", "rb") as f:
+    stream_mono, stream_dual = translate_stream(stream=f.read(), **params)
+```
+
+### HTTP
+
+```bash
+pip install pdf2zh[backend]
+pdf2zh --flask
+pdf2zh --celery worker
+```
+
+```bash
+curl http://localhost:11008/v1/translate -F "file=@example.pdf" -F "data={\"lang_in\":\"en\",\"lang_out\":\"zh\",\"service\":\"google\",\"thread\":4}"
+{"id":"d9894125-2f4e-45ea-9d93-1a9068d2045a"}
+
+curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a
+{"info":{"n":13,"total":506},"state":"PROGRESS"}
+
+curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a
+{"state":"SUCCESS"}
+
+curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a/mono --output example-mono.pdf
+
+curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a/dual --output example-dual.pdf
+
+curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a -X DELETE
+```
+
 <h2 id="todo">TODO</h2>
 
 - [ ] Parse layout with DocLayNet based models, [PaddleX](https://github.com/PaddlePaddle/PaddleX/blob/17cc27ac3842e7880ca4aad92358d3ef8555429a/paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py#L81), [PaperMage](https://github.com/allenai/papermage/blob/9cd4bb48cbedab45d0f7a455711438f1632abebe/README.md?plain=1#L102), [SAM2](https://github.com/facebookresearch/sam2)
@@ -247,8 +286,6 @@ pdf2zh example.pdf -t 1
 
 - [ ] Support non-PDF/A files
 
-- [ ] Provide API interface
-
 - [ ] Plugins of [Zotero](https://github.com/zotero/zotero) and [Obsidian](https://github.com/obsidianmd/obsidian-releases)
 
 <h2 id="acknowledgement">Acknowledgements</h2>

+ 39 - 0
README_zh-CN.md

@@ -233,6 +233,45 @@ pdf2zh example.pdf -f "(CM[^RT].*|MS.*|.*Ital)" -c "(\(|\||\)|\+|=|\d|[\u0080-\u
 pdf2zh example.pdf -t 1
 ```
 
+<h2 id="api">API</h2>
+
+### Python
+
+```python
+from pdf2zh import translate, translate_stream
+
+params = {"lang_in": "en", "lang_out": "zh", "service": "google", "thread": 4}
+doc_mono, doc_dual = translate(files=["example.pdf"], **params)
+with open("example.pdf", "rb") as f:
+    stream_mono, stream_dual = translate_stream(stream=f.read(), **params)
+```
+
+### HTTP
+
+```bash
+pip install pdf2zh[backend]
+pdf2zh --flask
+pdf2zh --celery worker
+```
+
+```bash
+curl http://localhost:11008/v1/translate -F "file=@example.pdf" -F "data={\"lang_in\":\"en\",\"lang_out\":\"zh\",\"service\":\"google\",\"thread\":4}"
+{"id":"d9894125-2f4e-45ea-9d93-1a9068d2045a"}
+
+curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a
+{"info":{"n":13,"total":506},"state":"PROGRESS"}
+
+curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a
+{"state":"SUCCESS"}
+
+curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a/mono --output example-mono.pdf
+
+curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a/dual --output example-dual.pdf
+
+curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a -X DELETE
+```
+
 <h2 id="acknowledgement">致谢</h2>
 
 - 文档合并:[PyMuPDF](https://github.com/pymupdf/PyMuPDF)

+ 10 - 6
pdf2zh/backend.py

@@ -5,6 +5,7 @@ from celery.result import AsyncResult
 from pdf2zh import translate_stream
 import tqdm
 import json
+import io
 
 flask_app = Flask("pdf2zh")
 flask_app.config.from_mapping(
@@ -61,20 +62,23 @@ def create_translate_tasks():
     return {"id": task.id}
 
 
-@flask_app.route("/v1/tasks/<id>", methods=["GET"])
+@flask_app.route("/v1/translate/<id>", methods=["GET"])
 def get_translate_task(id: str):
     result: AsyncResult = celery_app.AsyncResult(id)
-    return {"state": result.state, "info": result.info}
+    if str(result.state) == "PROGRESS":
+        return {"state": str(result.state), "info": result.info}
+    else:
+        return {"state": str(result.state)}
 
 
-@flask_app.route("/v1/tasks/<id>", methods=["DELETE"])
+@flask_app.route("/v1/translate/<id>", methods=["DELETE"])
 def delete_translate_task(id: str):
     result: AsyncResult = celery_app.AsyncResult(id)
     result.revoke(terminate=True)
-    return {"state": result.state, "info": result.info}
+    return {"state": str(result.state)}
 
 
-@flask_app.route("/v1/tasks/<id>/<format>")
+@flask_app.route("/v1/translate/<id>/<format>")
 def get_translate_result(id: str, format: str):
     result = celery_app.AsyncResult(id)
     if not result.ready():
@@ -83,7 +87,7 @@ def get_translate_result(id: str, format: str):
         return {"error": "task failed"}, 400
     doc_mono, doc_dual = result.get()
     to_send = doc_mono if format == "mono" else doc_dual
-    return send_file(to_send, "application/pdf")
+    return send_file(io.BytesIO(to_send), "application/pdf")
 
 
 if __name__ == "__main__":

+ 2 - 2
pdf2zh/pdf2zh.py

@@ -152,13 +152,13 @@ def main(args: Optional[List[str]] = None) -> int:
     if parsed_args.flask:
         from pdf2zh.backend import flask_app
 
-        flask_app.run()
+        flask_app.run(port=11008)
         return 0
 
     if parsed_args.celery:
         from pdf2zh.backend import celery_app
 
-        celery_app.start(argv=["worker", "--pool=prefork"])
+        celery_app.start(argv=sys.argv[2:])
         return 0
 
     translate(**vars(parsed_args))