|
|
@@ -24,15 +24,7 @@ from pdf2zh.converter import TranslateConverter
|
|
|
from pdf2zh.doclayout import OnnxModel
|
|
|
from pdf2zh.pdfinterp import PDFPageInterpreterEx
|
|
|
|
|
|
-resfont_map = {
|
|
|
- "zh-cn": "china-ss",
|
|
|
- "zh-tw": "china-ts",
|
|
|
- "zh-hans": "china-ss",
|
|
|
- "zh-hant": "china-ts",
|
|
|
- "zh": "china-ss",
|
|
|
- "ja": "japan-s",
|
|
|
- "ko": "korea-s",
|
|
|
-}
|
|
|
+NOTO_NAME = "noto"
|
|
|
|
|
|
noto_list = [
|
|
|
"am", # Amharic
|
|
|
@@ -44,18 +36,14 @@ noto_list = [
|
|
|
"gu", # Gujarati
|
|
|
"iw", # Hebrew
|
|
|
"hi", # Hindi
|
|
|
- # "ja", # Japanese
|
|
|
"kn", # Kannada
|
|
|
- # "ko", # Korean
|
|
|
"ml", # Malayalam
|
|
|
"mr", # Marathi
|
|
|
"ru", # Russian
|
|
|
"sr", # Serbian
|
|
|
- # "zh-cn",# SC
|
|
|
"ta", # Tamil
|
|
|
"te", # Telugu
|
|
|
"th", # Thai
|
|
|
- # "zh-tw",# TC
|
|
|
"ur", # Urdu
|
|
|
"uk", # Ukrainian
|
|
|
]
|
|
|
@@ -82,7 +70,7 @@ def translate_patch(
|
|
|
lang_in: str = "",
|
|
|
lang_out: str = "",
|
|
|
service: str = "",
|
|
|
- resfont: str = "",
|
|
|
+ noto_name: str = "",
|
|
|
noto: Font = None,
|
|
|
callback: object = None,
|
|
|
cancellation_event: asyncio.Event = None,
|
|
|
@@ -102,7 +90,7 @@ def translate_patch(
|
|
|
lang_in,
|
|
|
lang_out,
|
|
|
service,
|
|
|
- resfont,
|
|
|
+ noto_name,
|
|
|
noto,
|
|
|
envs,
|
|
|
prompt,
|
|
|
@@ -186,35 +174,18 @@ def translate_stream(
|
|
|
**kwarg: Any,
|
|
|
):
|
|
|
font_list = [("tiro", None)]
|
|
|
- noto = None
|
|
|
- if lang_out.lower() in resfont_map: # CJK
|
|
|
- resfont = resfont_map[lang_out.lower()]
|
|
|
- font_list.append((resfont, None))
|
|
|
- elif lang_out.lower() in noto_list: # noto
|
|
|
- resfont = "noto"
|
|
|
- # docker
|
|
|
- ttf_path = os.environ.get("NOTO_FONT_PATH", "/app/GoNotoKurrent-Regular.ttf")
|
|
|
-
|
|
|
- if not os.path.exists(ttf_path):
|
|
|
- ttf_path = os.path.join(tempfile.gettempdir(), "GoNotoKurrent-Regular.ttf")
|
|
|
- if not os.path.exists(ttf_path):
|
|
|
- print("Downloading Noto font...")
|
|
|
- urllib.request.urlretrieve(
|
|
|
- "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf",
|
|
|
- ttf_path,
|
|
|
- )
|
|
|
- font_list.append(("noto", ttf_path))
|
|
|
- noto = Font("noto", ttf_path)
|
|
|
- else: # fallback
|
|
|
- resfont = "china-ss"
|
|
|
- font_list.append(("china-ss", None))
|
|
|
+
|
|
|
+ font_path = download_remote_fonts(lang_out.lower())
|
|
|
+ noto_name = NOTO_NAME
|
|
|
+ noto = Font(noto_name, font_path)
|
|
|
+ font_list.append((noto_name, font_path))
|
|
|
|
|
|
doc_en = Document(stream=stream)
|
|
|
stream = io.BytesIO()
|
|
|
doc_en.save(stream)
|
|
|
doc_zh = Document(stream=stream)
|
|
|
page_count = doc_zh.page_count
|
|
|
- # font_list = [("china-ss", None), ("tiro", None)]
|
|
|
+ # font_list = [("GoNotoKurrent-Regular.ttf", font_path), ("tiro", None)]
|
|
|
font_id = {}
|
|
|
for page in doc_zh:
|
|
|
for font in font_list:
|
|
|
@@ -237,6 +208,7 @@ def translate_stream(
|
|
|
pass
|
|
|
|
|
|
fp = io.BytesIO()
|
|
|
+
|
|
|
doc_zh.save(fp)
|
|
|
obj_patch: dict = translate_patch(fp, **locals())
|
|
|
|
|
|
@@ -251,7 +223,12 @@ def translate_stream(
|
|
|
for id in range(page_count):
|
|
|
doc_en.move_page(page_count + id, id * 2 + 1)
|
|
|
|
|
|
- return doc_zh.write(deflate=1), doc_en.write(deflate=1)
|
|
|
+ doc_zh.subset_fonts(fallback=True)
|
|
|
+ doc_en.subset_fonts(fallback=True)
|
|
|
+ return (
|
|
|
+ doc_zh.write(deflate=True, garbage=3, use_objstms=1),
|
|
|
+ doc_en.write(deflate=True, garbage=3, use_objstms=1),
|
|
|
+ )
|
|
|
|
|
|
|
|
|
def convert_to_pdfa(input_path, output_path):
|
|
|
@@ -386,3 +363,31 @@ def translate(
|
|
|
result_files.append((str(file_mono), str(file_dual)))
|
|
|
|
|
|
return result_files
|
|
|
+
|
|
|
+
|
|
|
+def download_remote_fonts(lang: str):
|
|
|
+ URL_PREFIX = "https://github.com/timelic/source-han-serif/releases/download/main/"
|
|
|
+ LANG_NAME_MAP = {
|
|
|
+ **{la: "GoNotoKurrent-Regular.ttf" for la in noto_list},
|
|
|
+ **{
|
|
|
+ la: f"SourceHanSerif{region}-Regular.ttf"
|
|
|
+ for region, langs in {
|
|
|
+ "CN": ["zh-cn", "zh-hans", "zh"],
|
|
|
+ "TW": ["zh-tw", "zh-hant"],
|
|
|
+ "JP": ["ja"],
|
|
|
+ "KR": ["ko"],
|
|
|
+ }.items()
|
|
|
+ for la in langs
|
|
|
+ },
|
|
|
+ }
|
|
|
+ font_name = LANG_NAME_MAP.get(lang, "GoNotoKurrent-Regular.ttf")
|
|
|
+
|
|
|
+ # docker
|
|
|
+ font_path = os.environ.get("NOTO_FONT_PATH", Path("/app", font_name).as_posix())
|
|
|
+ if not Path(font_path).exists():
|
|
|
+ font_path = Path(tempfile.gettempdir(), font_name).as_posix()
|
|
|
+ if not Path(font_path).exists():
|
|
|
+ print(f"Downloading {font_name}...")
|
|
|
+ urllib.request.urlretrieve(f"{URL_PREFIX}{font_name}", font_path)
|
|
|
+
|
|
|
+ return font_path
|