docling_t.py 1.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041
  1. import os
  2. from docling.document_converter import DocumentConverter
  3. from docx import Document
  4. from bs4 import BeautifulSoup
  def html_to_docx(html_content, output_docx_path) -> None:
      """Write the paragraphs, headings and lists of an HTML string to a DOCX file.

      Only ``<p>``, ``<h1>``-``<h6>``, ``<ul>`` and ``<ol>`` elements are
      transferred, in document order; every other element is ignored. Only the
      plain text of each element is copied, so inline formatting (bold, links,
      ...) is dropped.

      Args:
          html_content: HTML markup to convert.
          output_docx_path: Destination passed to ``Document.save``.

      Note:
          The text of a list item that itself contains a nested list still
          includes the nested items' text, because ``get_text()`` collects all
          descendant strings.
      """
      # Create a new, empty DOCX document.
      doc = Document()

      # Parse the HTML content.
      soup = BeautifulSoup(html_content, 'html.parser')

      # Copy the supported elements into the DOCX document.
      wanted_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol']
      for element in soup.find_all(wanted_tags):
          if element.name == 'p':
              doc.add_paragraph(element.get_text())
          elif element.name in ('ul', 'ol'):
              # python-docx resolves styles by *name* ('List Bullet'), not by
              # style id ('ListBullet'); lookup by id is deprecated.
              list_style = 'List Bullet' if element.name == 'ul' else 'List Number'
              for li in element.find_all('li'):
                  # Nested lists are visited by the outer loop on their own, so
                  # only take items whose nearest list ancestor is this list;
                  # otherwise nested items would be written twice.
                  if li.find_parent(['ul', 'ol']) is not element:
                      continue
                  doc.add_paragraph(li.get_text(), style=list_style)
          else:
              # Remaining tags are h1..h6: the digit is the heading level.
              level = int(element.name[1])
              doc.add_heading(element.get_text(), level=level)

      # Save the DOCX document.
      doc.save(output_docx_path)
  def main(source=None):
      """Convert a document with docling and print it as Markdown.

      Args:
          source: Document to convert, given as a local path or a URL. When
              omitted, the local HTML file this script was written against
              is used.
      """
      if source is None:
          # Default input: local HTML file path.
          source = r"K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil\all_paper\0.html"

      converter = DocumentConverter()
      result = converter.convert(source)

      # Export the converted document as Markdown.
      markdown_content = result.document.export_to_markdown()
      print(markdown_content)


  # Run the conversion only when this file is executed as a script.
  if __name__ == "__main__":
      main()