# crawl_multi.py
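"""Batch-crawl the non-local links collected for each search-keyword directory
with crawl4ai, save the raw HTML and crawler markdown, convert the HTML to
Markdown with docling and pandoc, and finally merge each folder's markdown
files into a single all.md / all.docx via pandoc."""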

import asyncio
import os
import re
import subprocess
import sys
import pickle
from pathlib import Path
from typing import List

from dotenv import load_dotenv
from crawl4ai import *
from search_keyward import test_dir_links_not_local
from mylib.base import ensure_output_dir, load_from_pickle, save_to_file
from config.settings import *
import markdown
from docx import Document
from docx.shared import Pt
from docx.oxml import OxmlElement
from docx.oxml.ns import qn, nsdecls
from docx.oxml.shared import OxmlElement, qn
from docling.document_converter import DocumentConverter

__location__ = os.path.dirname(os.path.abspath(__file__))
__output__ = os.path.join(__location__, "output")

# Append parent directory to system path
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)

load_dotenv()


async def crawl_parallel(urls: List[str], max_concurrent: int = 3):
    # Minimal browser config
    browser_config = BrowserConfig(
        headless=True,
        verbose=False,
        extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
    )
    crawl_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)

    # Create the crawler instance
    crawler = AsyncWebCrawler(config=browser_config)
    await crawler.start()

    results = []
    try:
        for i in range(0, len(urls), max_concurrent):
            batch = urls[i : i + max_concurrent]
            tasks = []
            for j, url in enumerate(batch):
                # Unique session_id per concurrent sub-task
                session_id = f"parallel_session_{i + j}"
                task = crawler.arun(url=url, config=crawl_config, session_id=session_id)
                tasks.append(task)
            # Gather results
            batch_results = await asyncio.gather(*tasks, return_exceptions=True)
            results.extend(batch_results)
    finally:
        await crawler.close()
    return results
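
# Because asyncio.gather(..., return_exceptions=True) is used above, the list
# returned by crawl_parallel() can mix CrawlResult objects with Exception
# instances; load_result() below assumes successful CrawlResult entries.
# Minimal usage sketch (hypothetical URL):
#
#   results = asyncio.run(crawl_parallel(["https://example.com"], max_concurrent=2))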


async def multi_crawl_one_dir_html(dir_path: Path):
    links = await test_dir_links_not_local(dir_path)
    urls = [link['href'] for link in links]

    # Save results using pickle
    output_file = dir_path / "crawl_results.pkl"
    if output_file.exists():
        print(f"{output_file} already exists. Skipping...")
        return output_file

    # Perform parallel crawling
    results = await crawl_parallel(urls, max_concurrent=10)
    with open(output_file, "wb") as f:
        pickle.dump(results, f)
    print(f"Crawling results saved to {output_file}")
    return output_file
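
# Note: crawl results are cached per directory as crawl_results.pkl; delete
# that file to force a fresh crawl of that directory's non-local links.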


def docling_html2md(html_file_path: Path):
    source = html_file_path
    converter = DocumentConverter()
    result = converter.convert(source)
    # Export to Markdown
    markdown_content = result.document.export_to_markdown()
    # print(markdown_content)
    return markdown_content


def pandoc_html2md(html_file_path: Path, output_file_path: Path):
    # pandoc -f html -t markdown -o 0.md "K:\code\upwork\zhang_crawl_bio\output\google_search\Acalypha menavody essential oil\all_page_html\1.html"
    pandoc_exe = r'K:\code\upwork\zhang_crawl_bio\venv\Library\bin\pandoc.exe'
    # Run the conversion command
    cmd = [
        pandoc_exe,
        "-f", "html",
        "-t", "markdown",
        "-o", str(output_file_path),
        str(html_file_path),
    ]
    try:
        subprocess.run(cmd, check=True)
        print("Conversion succeeded!")
    except subprocess.CalledProcessError as e:
        print("Conversion failed!")
        print(e)
    except Exception as e:
        print("An unexpected error occurred!")
        print(e)


async def load_result(dir_path: Path):
    results: List[CrawlResult] = load_from_pickle(dir_path / "crawl_results.pkl")
    print(f"len {len(results)}")

    output_all_page_html_dir = dir_path / "all_page_html"
    ensure_output_dir(output_all_page_html_dir)
    output_all_crawl_md_dir = dir_path / "all_crawl_md"
    ensure_output_dir(output_all_crawl_md_dir)
    output_all_docling_markdown_dir = dir_path / "all_docling_markdown"
    ensure_output_dir(output_all_docling_markdown_dir)
    output_all_pandoc_markdown_dir = dir_path / "all_pandoc_markdown"
    ensure_output_dir(output_all_pandoc_markdown_dir)

    converter = DocumentConverter()
    for idx, result in enumerate(results):
        print(f"{idx} {result.url}")

        # Save the raw HTML
        html_file_path = output_all_page_html_dir / f"{idx}.html"
        if not html_file_path.exists():
            save_to_file(result.html, html_file_path)

        # If the crawl produced markdown content, save it
        if result.markdown:
            # Prepend the source URL as the first line of the markdown
            markdown = f"[{result.url}]({result.url})\n\n{result.markdown}"
            save_to_file(markdown, output_all_crawl_md_dir / f"{idx}.md")

        try:
            # Convert the HTML to Markdown with docling and save it to all_docling_markdown
            docling_md_path = output_all_docling_markdown_dir / f"{idx}.md"
            if not docling_md_path.exists():
                docling_md = docling_html2md(html_file_path)
                # Prepend the source URL as the first line of the markdown
                docling_md = f"[{result.url}]({result.url})\n\n" + docling_md
                save_to_file(docling_md, docling_md_path)
        except Exception as e:
            print(f"Error converting HTML to Markdown using docling: {e}")
            print(f"html_file_path {html_file_path}")

        # Convert the HTML to Markdown with pandoc (skip if the file already exists)
        md_file_path = output_all_pandoc_markdown_dir / f"{idx}.md"
        if not md_file_path.exists():
            pandoc_html2md(html_file_path, md_file_path)
            # Prepend the source URL to the markdown file that was just written
            with open(md_file_path, 'r', encoding='utf-8') as f:
                md_content = f.read()
            md_content = f"[{result.url}]({result.url})\n\n" + md_content
            save_to_file(md_content, md_file_path)
    return result


def conver_all_md_to_docx(dir_path: Path):
    # List every sub-folder under the given directory
    folders = [f for f in dir_path.iterdir() if f.is_dir()]
    # print(folders)
    for folder in folders:
        # Skip folders that contain no md files
        md_files = [f for f in folder.iterdir() if f.suffix == '.md']
        if not md_files:
            continue

        # Concatenate every md file in the folder into one string
        md_files_to_one_file_content = ""
        for md_file in md_files:
            with open(md_file, 'r', encoding='utf-8') as f:
                md_content = f.read()
            md_files_to_one_file_content += md_content + "\n\n"
        all_md_save_path = save_to_file(md_files_to_one_file_content, folder / "all.md")

        # pandoc -f markdown -t docx -o 0.docx "K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil\all_paper\0.html"
        # Convert the merged markdown to docx with pandoc and save it in the folder
        pandoc_exe = r'K:\code\upwork\zhang_crawl_bio\venv\Library\bin\pandoc.exe'
        pandoc_cmd = f'{pandoc_exe} -f markdown -t docx -o "{folder / "all.docx"}" "{all_md_save_path}"'
        print(pandoc_cmd)
        try:
            subprocess.run(pandoc_cmd, shell=True, check=True)
            print(f"Conversion succeeded! {all_md_save_path}")
        except subprocess.CalledProcessError as e:
            print(f"Conversion failed! {all_md_save_path}")
            print(e)
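
# The sub-folders iterated above are expected to be the output directories
# created by load_result() (all_crawl_md, all_docling_markdown,
# all_pandoc_markdown); each of them ends up with a merged all.md plus an
# all.docx rendered by pandoc. Folders without .md files (e.g. all_page_html)
# are skipped.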


async def all_search_key_main():
    # Iterate over every sub-folder under GOOGLE_SEARCH_DIR
    search_key_dirs = [f for f in GOOGLE_SEARCH_DIR.iterdir() if f.is_dir()]
    # print(search_key_dirs)
    for dir_path in search_key_dirs:
        await multi_crawl_one_dir_html(dir_path)
        await load_result(dir_path)
        conver_all_md_to_docx(dir_path)


async def main():
    dir_path = Path(r"K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil")
    # await multi_crawl_one_dir_html(dir_path)
    # await load_result(dir_path)
    # conver_all_md_to_docx(dir_path)
    await all_search_key_main()


if __name__ == "__main__":
    asyncio.run(main())