| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263 |
- from pathlib import Path
- import subprocess
- from mylib.base import ensure_output_dir
def pandoc_html2docx(html_file_path: Path, output_file_path: Path,
                     pandoc_exe: "str | Path | None" = None) -> bool:
    """Convert one HTML file to DOCX by invoking the pandoc command-line tool.

    Runs the equivalent of ``pandoc -f html -t docx -o <output> <input>``.
    Failures are reported on stdout instead of being raised, so a batch
    caller can carry on with the remaining files.

    Args:
        html_file_path: Path of the HTML file to convert.
        output_file_path: Path where the DOCX file is written.
        pandoc_exe: Pandoc executable to run. When omitted, the legacy
            project-local install is used if it exists on this machine,
            otherwise the ``pandoc`` found on PATH.

    Returns:
        True if pandoc exited with status 0, False if it failed or could
        not be started.
    """
    import shutil  # function-scope import, same style as find_pandoc_exe

    if pandoc_exe is None:
        legacy_exe = r'K:\code\upwork\zhang_crawl_bio\venv\Library\bin\pandoc.exe'
        if Path(legacy_exe).is_file():
            pandoc_exe = legacy_exe
        else:
            # Fall back to PATH. If pandoc is not there either, keep the
            # legacy path so the failure is reported by the handler below.
            pandoc_exe = shutil.which("pandoc") or legacy_exe

    # Build the conversion command as an argument list (no shell involved).
    cmd = [
        str(pandoc_exe),
        "-f", "html",
        "-t", "docx",
        "-o", str(output_file_path),
        str(html_file_path),
    ]
    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        # pandoc started but exited with a non-zero status.
        print("转换失败!")
        print(e)
        return False
    except Exception as e:
        # Deliberate best-effort boundary: e.g. the executable is missing.
        print("发生错误!")
        print(e)
        return False
    print("转换成功!")
    return True
def find_pandoc_exe() -> "str | None":
    """Locate pandoc on PATH, print its location and version, and return it.

    Returns:
        The path of the pandoc executable as reported by ``shutil.which``,
        or None when pandoc is not on PATH. The path is returned even if
        the ``--version`` probe fails, since the executable was found.
    """
    import shutil

    # Search PATH for the pandoc executable.
    pandoc_path = shutil.which('pandoc')
    if pandoc_path is None:
        print("Pandoc not found in the current environment.")
        return None

    print(f"Found pandoc at: {pandoc_path}")

    # Probe the executable. check=True makes a non-zero exit raise
    # CalledProcessError; OSError covers a file that cannot be started.
    try:
        result = subprocess.run(
            [pandoc_path, '--version'],
            capture_output=True,
            text=True,
            check=True,
        )
    except (subprocess.CalledProcessError, OSError) as e:
        print(f"Error running pandoc: {e}")
    else:
        print("Pandoc version:")
        print(result.stdout)
    return pandoc_path
def main_convert(dir_path: "Path | None" = None):
    """Convert every ``.html`` file of one crawl output folder to DOCX.

    Reads the files directly inside ``<dir_path>/all_page_html`` and writes
    ``<dir_path>/all_page_pandoc_docx/<stem>.docx`` for each of them via
    ``pandoc_html2docx``.

    Args:
        dir_path: Crawl output folder to process. Defaults to the original
            hard-coded "Acalypha malabarica essential oil" folder.

    Raises:
        FileNotFoundError: If the ``all_page_html`` folder does not exist.
    """
    if dir_path is None:
        dir_path = Path(r"K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil")
    dir_path = Path(dir_path)

    all_html_dir = dir_path / "all_page_html"
    all_page_pandoc_docx_dir = dir_path / "all_page_pandoc_docx"
    # Presumably creates the output folder if missing; confirm in mylib.base.
    ensure_output_dir(all_page_pandoc_docx_dir)

    # Sort the listing so files are converted in a reproducible order.
    for file in sorted(all_html_dir.iterdir()):
        # Only regular files with an .html suffix are converted.
        if file.is_file() and file.suffix == ".html":
            output_file_path = all_page_pandoc_docx_dir / (file.stem + ".docx")
            pandoc_html2docx(file, output_file_path)
# Script entry point: when run directly, only report where pandoc is
# installed and which version it is. The batch conversion (main_convert)
# is not started from here.
if __name__ == "__main__":
    find_pandoc_exe()
|