@@ -0,0 +1,197 @@
+import asyncio
+import os
+import pickle
+import subprocess
+import sys
+from pathlib import Path
+from typing import List
+
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig, CrawlResult
+from docling.document_converter import DocumentConverter
+
+from search_keyward import test_dir_links_not_local
+from mylib.base import ensure_output_dir, load_from_pickle, save_to_file
+from mylib.settings import *
+
+__location__ = os.path.dirname(os.path.abspath(__file__))
+__output__ = os.path.join(__location__, "output")
+
+# Append the parent directory to the module search path
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+
+async def crawl_parallel(urls: List[str], max_concurrent: int = 3):
+    # Minimal browser config
+    browser_config = BrowserConfig(
+        headless=True,
+        verbose=False,
+        extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
+    )
+    crawl_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
+
+    # Create the crawler instance
+    crawler = AsyncWebCrawler(config=browser_config)
+    await crawler.start()
+
+    results = []
+    try:
+        # Crawl in batches of at most max_concurrent URLs
+        for i in range(0, len(urls), max_concurrent):
+            batch = urls[i : i + max_concurrent]
+            tasks = []
+            for j, url in enumerate(batch):
+                # Unique session_id per concurrent sub-task
+                session_id = f"parallel_session_{i + j}"
+                task = crawler.arun(url=url, config=crawl_config, session_id=session_id)
+                tasks.append(task)
+
+            # Gather results; exceptions are returned as list items rather than raised
+            batch_results = await asyncio.gather(*tasks, return_exceptions=True)
+            results.extend(batch_results)
+    finally:
+        await crawler.close()
+
+    return results
+
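+# Because crawl_parallel gathers with return_exceptions=True, the list it returns can
+# mix CrawlResult objects and Exception instances. A minimal sketch for separating the
+# two (the helper name is illustrative, not part of the original pipeline):
+def split_crawl_results(results: list):
+    # Keep successful CrawlResult objects apart from failed attempts
+    ok = [r for r in results if not isinstance(r, Exception)]
+    failed = [r for r in results if isinstance(r, Exception)]
+    return ok, failed
+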
+async def multi_crawl_one_dir_html(dir_path: Path):
+    links = await test_dir_links_not_local(dir_path)
+    urls = [link['href'] for link in links]
+
+    # Perform parallel crawling
+    results = await crawl_parallel(urls, max_concurrent=10)
+
+    # Save results using pickle
+    output_file = os.path.join(dir_path, "crawl_results.pkl")
+    with open(output_file, "wb") as f:
+        pickle.dump(results, f)
+
+    print(f"Crawling results saved to {output_file}")
+    return output_file
+
+def docling_html2md(html_file_path: Path):
+    converter = DocumentConverter()
+    result = converter.convert(html_file_path)
+
+    # Export to Markdown
+    markdown_content = result.document.export_to_markdown()
+    return markdown_content
+
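+# docling_html2md builds a fresh DocumentConverter on every call. If converter setup
+# turns out to be a bottleneck, a single module-level instance could be shared instead
+# (a sketch under that assumption, not a measured optimization):
+# _converter = DocumentConverter()
+# def docling_html2md(html_file_path: Path):
+#     return _converter.convert(html_file_path).document.export_to_markdown()
+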
+def pandoc_html2md(html_file_path: Path, output_file_path: Path):
+    # pandoc -f html -t markdown -o 0.md "K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil\all_paper\0.html"
+    pandoc_exe = r'K:\code\upwork\zhang_crawl_bio\venv\Library\bin\pandoc.exe'
+    # Build the conversion command
+    cmd = [
+        pandoc_exe,
+        "-f", "html",
+        "-t", "markdown",
+        "-o", str(output_file_path),
+        str(html_file_path)
+    ]
+    try:
+        subprocess.run(cmd, check=True)
+        print("Conversion succeeded!")
+    except subprocess.CalledProcessError as e:
+        print("Conversion failed!")
+        print(e)
+    except Exception as e:
+        print("An unexpected error occurred!")
+        print(e)
+
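+# The hardcoded pandoc_exe path above only works on this machine. A more portable
+# lookup (assumes pandoc is available on PATH, falling back to the pinned path):
+# import shutil
+# pandoc_exe = shutil.which("pandoc") or r'K:\code\upwork\zhang_crawl_bio\venv\Library\bin\pandoc.exe'
+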
+async def load_result(dir_path: Path):
+    results: List[CrawlResult] = load_from_pickle(dir_path / "crawl_results.pkl")
+    print(f"len {len(results)}")
+    output_all_page_html_dir = dir_path / "all_page_html"
+    ensure_output_dir(output_all_page_html_dir)
+    output_all_crawl_md_dir = dir_path / "all_crawl_md"
+    ensure_output_dir(output_all_crawl_md_dir)
+    output_all_docling_markdown_dir = dir_path / "all_docling_markdown"
+    ensure_output_dir(output_all_docling_markdown_dir)
+    output_all_pandoc_markdown_dir = dir_path / "all_pandoc_markdown"
+    ensure_output_dir(output_all_pandoc_markdown_dir)
+
+    for idx, result in enumerate(results):
+        # Entries may be Exception instances because crawl_parallel gathers with
+        # return_exceptions=True; skip failed crawls
+        if isinstance(result, Exception):
+            print(f"{idx} skipped (crawl failed): {result}")
+            continue
+        print(f"{idx} {result.url}")
+        # Save the HTML file
+        html_file_path = output_all_page_html_dir / f"{idx}.html"
+        save_to_file(result.html, html_file_path)
+
+        # If markdown content exists, prepend the source URL as the first line and save it
+        if result.markdown:
+            markdown = f"[{result.url}]({result.url})\n\n{result.markdown}"
+            save_to_file(markdown, output_all_crawl_md_dir / f"{idx}.md")
+
+        try:
+            # Convert the HTML to Markdown with docling and save it under all_docling_markdown
+            docling_md_path = output_all_docling_markdown_dir / f"{idx}.md"
+            docling_md = docling_html2md(html_file_path)
+            # Prepend the source URL as the first line of the Markdown
+            docling_md = f"[{result.url}]({result.url})\n\n" + docling_md
+            save_to_file(docling_md, docling_md_path)
+        except Exception as e:
+            print(f"Error converting HTML to Markdown using docling: {e}")
+            print(f"html_file_path {html_file_path}")
+
+        md_file_path = output_all_pandoc_markdown_dir / f"{idx}.md"
+        pandoc_html2md(html_file_path, md_file_path)
+        # Prepend the source URL to the Markdown file pandoc just wrote; the file is
+        # missing if the pandoc conversion failed
+        if md_file_path.exists():
+            with open(md_file_path, 'r', encoding='utf-8') as f:
+                md_content = f.read()
+            md_content = f"[{result.url}]({result.url})\n\n" + md_content
+            save_to_file(md_content, md_file_path)
+
+    return results
+
+def convert_all_md_to_docx(dir_path: Path):
+    # List all sub-directories under the given path
+    folders = [f for f in dir_path.iterdir() if f.is_dir()]
+    for folder in folders:
+        # Skip the folder if it contains no Markdown files; exclude a previously
+        # merged all.md so re-runs do not merge the file into itself
+        md_files = [f for f in folder.iterdir() if f.suffix == '.md' and f.name != 'all.md']
+        if not md_files:
+            continue
+        # Sort numerically (0.md, 1.md, ..., 10.md) rather than lexicographically
+        md_files.sort(key=lambda p: int(p.stem) if p.stem.isdigit() else float('inf'))
+        md_files_to_one_file_content = ""
+        for md_file in md_files:
+            with open(md_file, 'r', encoding='utf-8') as f:
+                md_content = f.read()
+            md_files_to_one_file_content += md_content + "\n\n"
+        all_md_save_path = save_to_file(md_files_to_one_file_content, folder / "all.md")
+        # pandoc -f markdown -t docx -o all.docx "K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil\all_paper\all.md"
+        # Convert to DOCX with pandoc and save into the folder
+        pandoc_exe = r'K:\code\upwork\zhang_crawl_bio\venv\Library\bin\pandoc.exe'
+        pandoc_cmd = [
+            pandoc_exe,
+            "-f", "markdown",
+            "-t", "docx",
+            "-o", str(folder / "all.docx"),
+            str(all_md_save_path),
+        ]
+        print(pandoc_cmd)
+        try:
+            subprocess.run(pandoc_cmd, check=True)
+            print(f"Conversion succeeded! {all_md_save_path}")
+        except subprocess.CalledProcessError as e:
+            print(f"Conversion failed! {all_md_save_path}")
+            print(e)
+
+async def main():
+    dir_path = Path(r"K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil")
+    # Pipeline stages; earlier stages are commented out once their outputs exist
+    # await multi_crawl_one_dir_html(dir_path)
+    # await load_result(dir_path)
+    convert_all_md_to_docx(dir_path)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())