| 1234567891011121314151617181920212223242526272829303132333435363738394041 |
- import os
- from docling.document_converter import DocumentConverter
- from docx import Document
- from bs4 import BeautifulSoup
def _own_item_text(item, owner_list):
    """Return the text of list item *item* excluding any nested list.

    A string counts as the item's own text when its nearest ``<ul>``/``<ol>``
    ancestor is *owner_list* (the list the item belongs to). Strings that sit
    inside a nested list have that nested list as nearest ancestor and are
    skipped, so they are not repeated in the parent item's paragraph.
    """
    return "".join(
        text
        for text in item.strings
        if text.find_parent(['ul', 'ol']) is owner_list
    )


def html_to_docx(html_content, output_docx_path):
    """Convert simple HTML into a DOCX file.

    Paragraphs (``<p>``), headings (``<h1>``-``<h6>``) and list items
    (``<li>`` inside ``<ul>``/``<ol>``) are written, in document order, to a
    new document that is saved at *output_docx_path*. Other elements are
    ignored.

    Args:
        html_content: HTML markup to parse.
        output_docx_path: Destination passed to ``Document.save``.
    """
    # Start from an empty DOCX document.
    doc = Document()

    # Parse the HTML with BeautifulSoup.
    soup = BeautifulSoup(html_content, 'html.parser')

    # Walk the supported elements in document order.
    for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol']):
        if element.name == 'p':
            doc.add_paragraph(element.get_text())
        elif element.name in ('ul', 'ol'):
            # A nested list is rendered while walking its outermost list;
            # handling it again here would emit its items twice.
            if element.find_parent(['ul', 'ol']) is not None:
                continue
            for li in element.find_all('li'):
                # Style each item after the list it directly belongs to, so
                # an <ol> nested in a <ul> is still numbered.
                owner = li.find_parent(['ul', 'ol'])
                # Use the style *names*; 'ListBullet'/'ListNumber' are style
                # ids, which python-docx only resolves via a deprecated
                # fallback that emits a warning.
                style = 'List Bullet' if owner.name == 'ul' else 'List Number'
                doc.add_paragraph(_own_item_text(li, owner), style=style)
        else:
            # Remaining tags are h1-h6; the digit is the heading level.
            level = int(element.name[1])
            doc.add_heading(element.get_text(), level=level)

    # Write the DOCX document to disk.
    doc.save(output_docx_path)
def main():
    """Convert a fixed local HTML file with docling and print it as Markdown."""
    # Hard-coded local HTML input. Per the original note, the converter takes
    # a document "per local path or URL", e.g. "https://arxiv.org/pdf/2408.09869".
    html_path = r"K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil\all_paper\0.html"

    conversion = DocumentConverter().convert(html_path)

    # Export the converted document to Markdown and write it to stdout.
    print(conversion.document.export_to_markdown())
-
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|