|
|
@@ -21,23 +21,10 @@ from pdfminer.pdfpage import PDFPage
|
|
|
from pdfminer.pdfparser import PDFParser
|
|
|
from pymupdf import Document, Font
|
|
|
|
|
|
-from pdf2zh.converter import TranslateConverter, shs_name, noto_name
|
|
|
+from pdf2zh.converter import TranslateConverter, noto_name
|
|
|
from pdf2zh.doclayout import OnnxModel
|
|
|
from pdf2zh.pdfinterp import PDFPageInterpreterEx
|
|
|
|
|
|
-# FIXME
|
|
|
-USE_SHS_FONT = True
|
|
|
-
|
|
|
-resfont_map = {
|
|
|
- "zh-cn": "china-ss",
|
|
|
- "zh-tw": "china-ts",
|
|
|
- "zh-hans": "china-ss",
|
|
|
- "zh-hant": "china-ts",
|
|
|
- "zh": "china-ss",
|
|
|
- "ja": "japan-s",
|
|
|
- "ko": "korea-s",
|
|
|
-}
|
|
|
-
|
|
|
noto_list = [
|
|
|
"am", # Amharic
|
|
|
"ar", # Arabic
|
|
|
@@ -48,18 +35,14 @@ noto_list = [
|
|
|
"gu", # Gujarati
|
|
|
"iw", # Hebrew
|
|
|
"hi", # Hindi
|
|
|
- # "ja", # Japanese
|
|
|
"kn", # Kannada
|
|
|
- # "ko", # Korean
|
|
|
"ml", # Malayalam
|
|
|
"mr", # Marathi
|
|
|
"ru", # Russian
|
|
|
"sr", # Serbian
|
|
|
- # "zh-cn",# SC
|
|
|
"ta", # Tamil
|
|
|
"te", # Telugu
|
|
|
"th", # Thai
|
|
|
- # "zh-tw",# TC
|
|
|
"ur", # Urdu
|
|
|
"uk", # Ukrainian
|
|
|
]
|
|
|
@@ -87,7 +70,6 @@ def translate_patch(
|
|
|
lang_out: str = "",
|
|
|
service: str = "",
|
|
|
resfont: str = "",
|
|
|
- shs: Font = None,
|
|
|
noto: Font = None,
|
|
|
callback: object = None,
|
|
|
cancellation_event: asyncio.Event = None,
|
|
|
@@ -108,7 +90,6 @@ def translate_patch(
|
|
|
lang_out,
|
|
|
service,
|
|
|
resfont,
|
|
|
- shs,
|
|
|
noto,
|
|
|
envs,
|
|
|
prompt,
|
|
|
@@ -192,51 +173,18 @@ def translate_stream(
|
|
|
**kwarg: Any,
|
|
|
):
|
|
|
font_list = [("tiro", None)]
|
|
|
- noto = None
|
|
|
- shs = None
|
|
|
- ttf_path = None
|
|
|
- if lang_out.lower() in resfont_map: # CJK
|
|
|
- if not USE_SHS_FONT:
|
|
|
- resfont = resfont_map[lang_out.lower()]
|
|
|
- else:
|
|
|
- resfont = shs_name
|
|
|
- # docker
|
|
|
- ttf_path = os.environ.get("SHS_FONT_PATH", "/app/SourceHanSerif-Medium.ttc")
|
|
|
- if not os.path.exists(ttf_path):
|
|
|
- ttf_path = os.path.join(
|
|
|
- tempfile.gettempdir(), "SourceHanSerif-Medium.ttc"
|
|
|
- )
|
|
|
- if not os.path.exists(ttf_path):
|
|
|
- print("Downloading SourceHanSerif font...")
|
|
|
- urllib.request.urlretrieve(
|
|
|
- "https://github.com/timelic/source-han-serif/releases/download/main/SourceHanSerif-Medium.ttc",
|
|
|
- ttf_path,
|
|
|
- )
|
|
|
- shs = Font(shs_name, ttf_path)
|
|
|
- elif lang_out.lower() in noto_list: # noto
|
|
|
- resfont = noto_name
|
|
|
- # docker
|
|
|
- ttf_path = os.environ.get("NOTO_FONT_PATH", "/app/GoNotoKurrent-Regular.ttf")
|
|
|
-
|
|
|
- if not os.path.exists(ttf_path):
|
|
|
- ttf_path = os.path.join(tempfile.gettempdir(), "GoNotoKurrent-Regular.ttf")
|
|
|
- if not os.path.exists(ttf_path):
|
|
|
- print("Downloading Noto font...")
|
|
|
- urllib.request.urlretrieve(
|
|
|
- "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf",
|
|
|
- ttf_path,
|
|
|
- )
|
|
|
- noto = Font(noto_name, ttf_path)
|
|
|
- else: # fallback
|
|
|
- resfont = "china-ss"
|
|
|
- font_list.append((resfont, ttf_path))
|
|
|
+
|
|
|
+ font_path = download_remote_fonts(lang_out.lower())
|
|
|
+ resfont = noto_name
|
|
|
+ noto = Font(noto_name, font_path)
|
|
|
+ font_list.append((resfont, font_path))
|
|
|
|
|
|
doc_en = Document(stream=stream)
|
|
|
stream = io.BytesIO()
|
|
|
doc_en.save(stream)
|
|
|
doc_zh = Document(stream=stream)
|
|
|
page_count = doc_zh.page_count
|
|
|
- # font_list = [("china-ss", None), ("tiro", None)]
|
|
|
+ # font_list = [("GoNotoKurrent-Regular.ttf", font_path), ("tiro", None)]
|
|
|
font_id = {}
|
|
|
for page in doc_zh:
|
|
|
for font in font_list:
|
|
|
@@ -414,3 +362,31 @@ def translate(
|
|
|
result_files.append((str(file_mono), str(file_dual)))
|
|
|
|
|
|
return result_files
|
|
|
+
|
|
|
+
|
|
|
+def download_remote_fonts(lang: str):
|
|
|
+ URL_PREFIX = "https://github.com/timelic/source-han-serif/releases/download/main/"
|
|
|
+ LANG_NAME_MAP = {
|
|
|
+ **{la: "GoNotoKurrent-Regular.ttf" for la in noto_list},
|
|
|
+ **{
|
|
|
+ la: f"SourceHanSerif{region}-Regular.ttf"
|
|
|
+ for region, langs in {
|
|
|
+ "CN": ["zh-cn", "zh-hans", "zh"],
|
|
|
+ "TW": ["zh-tw", "zh-hant"],
|
|
|
+ "JP": ["ja"],
|
|
|
+ "KR": ["ko"],
|
|
|
+ }.items()
|
|
|
+ for la in langs
|
|
|
+ },
|
|
|
+ }
|
|
|
+ font_name = LANG_NAME_MAP.get(lang, "GoNotoKurrent-Regular.ttf")
|
|
|
+
|
|
|
+ # docker
|
|
|
+ font_path = os.environ.get("NOTO_FONT_PATH", Path("/app", font_name).as_posix())
|
|
|
+ if not Path(font_path).exists():
|
|
|
+ font_path = Path(tempfile.gettempdir(), font_name).as_posix()
|
|
|
+ if not Path(font_path).exists():
|
|
|
+ print(f"Downloading {font_name}...")
|
|
|
+ urllib.request.urlretrieve(f"{URL_PREFIX}{font_name}", font_path)
|
|
|
+
|
|
|
+ return font_path
|