# pandoc_t.py 2.2 KB
import shutil
import subprocess
from pathlib import Path

from mylib.base import ensure_output_dir
  4. def pandoc_html2docx(html_file_path: Path, output_file_path: Path):
  5. # pandoc -f html -t docx -o 0.docx "K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil\all_paper\0.html"
  6. pandoc_exe = r'K:\code\upwork\zhang_crawl_bio\venv\Library\bin\pandoc.exe'
  7. # 执行转换指令
  8. cmd = [
  9. pandoc_exe,
  10. "-f", "html",
  11. "-t", "docx",
  12. "-o", str(output_file_path),
  13. str(html_file_path)
  14. ]
  15. try:
  16. subprocess.run(cmd, check=True)
  17. print("转换成功!")
  18. except subprocess.CalledProcessError as e:
  19. print("转换失败!")
  20. print(e)
  21. except Exception as e:
  22. print("发生错误!")
  23. print(e)
  24. def find_pandoc_exe():
  25. import shutil
  26. import subprocess
  27. # 搜索 pandoc 可执行文件的位置
  28. pandoc_path = shutil.which('pandoc')
  29. if pandoc_path:
  30. print(f"Found pandoc at: {pandoc_path}")
  31. # 调用 pandoc
  32. try:
  33. result = subprocess.run([pandoc_path, '--version'], capture_output=True, text=True)
  34. print("Pandoc version:")
  35. print(result.stdout)
  36. except subprocess.CalledProcessError as e:
  37. print(f"Error running pandoc: {e}")
  38. else:
  39. print("Pandoc not found in the current environment.")
  40. def main_convert():
  41. dir_path = Path(r"K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil")
  42. # '''
  43. # pandoc -f html -t docx -o 0.docx "K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil\all_paper\0.html"
  44. # '''
  45. all_html_dir = dir_path / "all_page_html"
  46. all_page_pandoc_docx_dir = dir_path / "all_page_pandoc_docx"
  47. ensure_output_dir(all_page_pandoc_docx_dir)
  48. # 遍历 html 文件夹下的所有文件
  49. for file in all_html_dir.iterdir():
  50. # 如果是文件且后缀为 .html
  51. if file.is_file() and file.suffix == ".html":
  52. # 构造输出文件路径
  53. output_file_path = all_page_pandoc_docx_dir / (file.stem + ".docx")
  54. # 调用 pandoc_html2docx 函数进行转换
  55. pandoc_html2docx(file, output_file_path)
  56. if __name__ == "__main__":
  57. find_pandoc_exe()