import asyncio
import os
import pickle
import re
import subprocess
import sys
from pathlib import Path
from typing import List

import markdown
from dotenv import load_dotenv
from crawl4ai import *
from docling.document_converter import DocumentConverter
from docx import Document
from docx.shared import Pt
from docx.oxml import OxmlElement
from docx.oxml.ns import qn, nsdecls

from search_keyward import test_dir_links_not_local
from mylib.base import ensure_output_dir, load_from_pickle, save_to_file
from config.settings import *

__location__ = os.path.dirname(os.path.abspath(__file__))
__output__ = os.path.join(__location__, "output")

# Append parent directory to system path
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)

load_dotenv()


async def crawl_parallel(urls: List[str], max_concurrent: int = 3):
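    """Crawl the given URLs with one shared AsyncWebCrawler, max_concurrent at a time.

    Each URL in a batch gets its own session_id, and exceptions are collected
    alongside successful results because gather() runs with return_exceptions=True.
    """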
    # Minimal browser config
    browser_config = BrowserConfig(
        headless=True,
        verbose=False,
        extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
    )
    crawl_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)

    # Create the crawler instance
    crawler = AsyncWebCrawler(config=browser_config)
    await crawler.start()

    results = []
    try:
        for i in range(0, len(urls), max_concurrent):
            batch = urls[i : i + max_concurrent]
            tasks = []
            for j, url in enumerate(batch):
                # Unique session_id per concurrent sub-task
                session_id = f"parallel_session_{i + j}"
                task = crawler.arun(url=url, config=crawl_config, session_id=session_id)
                tasks.append(task)
            # Gather results
            batch_results = await asyncio.gather(*tasks, return_exceptions=True)
            results.extend(batch_results)
    finally:
        await crawler.close()
    return results


async def multi_crawl_one_dir_html(dir_path: Path):
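    """Crawl the non-local links found for ``dir_path`` and pickle the results.

    Skips the crawl when ``crawl_results.pkl`` already exists; returns the pickle path.
    """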
    links = await test_dir_links_not_local(dir_path)
    urls = [link['href'] for link in links]

    # Save results using pickle
    output_file = dir_path / "crawl_results.pkl"
    if output_file.exists():
        print(f"{output_file} already exists. Skipping...")
        return output_file

    # Perform parallel crawling
    results = await crawl_parallel(urls, max_concurrent=10)

    with open(output_file, "wb") as f:
        pickle.dump(results, f)

    print(f"Crawling results saved to {output_file}")
    return output_file


def docling_html2md(html_file_path: Path):
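    """Convert a local HTML file to Markdown text with docling's DocumentConverter."""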
    source = html_file_path
    converter = DocumentConverter()
    result = converter.convert(source)

    # Export to Markdown
    markdown_content = result.document.export_to_markdown()
    # print(markdown_content)
    return markdown_content


def pandoc_html2md(html_file_path: Path, output_file_path: Path):
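    """Convert an HTML file to Markdown by invoking the project's pandoc executable."""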
    # pandoc -f html -t markdown -o 0.md "K:\code\upwork\zhang_crawl_bio\output\google_search\Acalypha menavody essential oil\all_page_html\1.html"
    pandoc_exe = r'K:\code\upwork\zhang_crawl_bio\venv\Library\bin\pandoc.exe'
    # Run the conversion command
    cmd = [
        pandoc_exe,
        "-f", "html",
        "-t", "markdown",
        "-o", str(output_file_path),
        str(html_file_path),
    ]
    try:
        subprocess.run(cmd, check=True)
        print("Conversion succeeded!")
    except subprocess.CalledProcessError as e:
        print("Conversion failed!")
        print(e)
    except Exception as e:
        print("An unexpected error occurred!")
        print(e)


async def load_result(dir_path: Path):
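    """Load the pickled crawl results for ``dir_path`` and write per-page outputs.

    For each result this saves the raw HTML, the crawl4ai markdown, a docling
    conversion, and a pandoc conversion, each prefixed with its source URL.
    """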
    results: List[CrawlResult] = load_from_pickle(dir_path / "crawl_results.pkl")
    print(f"len {len(results)}")

    output_all_page_html_dir = dir_path / "all_page_html"
    ensure_output_dir(output_all_page_html_dir)
    output_all_crawl_md_dir = dir_path / "all_crawl_md"
    ensure_output_dir(output_all_crawl_md_dir)
    output_all_docling_markdown_dir = dir_path / "all_docling_markdown"
    ensure_output_dir(output_all_docling_markdown_dir)
    output_all_pandoc_markdown_dir = dir_path / "all_pandoc_markdown"
    ensure_output_dir(output_all_pandoc_markdown_dir)

    converter = DocumentConverter()

    for idx, result in enumerate(results):
        print(f"{idx} {result.url}")

        # Save the HTML file
        html_file_path = output_all_page_html_dir / f"{idx}.html"
        if not html_file_path.exists():
            save_to_file(result.html, html_file_path)

        # If crawl4ai produced markdown content, save it
        if result.markdown:
            # Add the url link as the first line before writing the markdown
            markdown = f"[{result.url}]({result.url})\n\n{result.markdown}"
            save_to_file(markdown, output_all_crawl_md_dir / f"{idx}.md")

        try:
            # Use docling to convert the HTML to Markdown and save it to the all_docling_markdown directory
            docling_md_path = output_all_docling_markdown_dir / f"{idx}.md"
            if not docling_md_path.exists():
                docling_md = docling_html2md(html_file_path)
                # Add the url link as the first line of the md
                docling_md = f"[{result.url}]({result.url})\n\n" + docling_md
                save_to_file(docling_md, docling_md_path)
        except Exception as e:
            print(f"Error converting HTML to Markdown using docling: {e}")
            print(f"html_file_path {html_file_path}")

        md_file_path = output_all_pandoc_markdown_dir / f"{idx}.md"
        if not md_file_path.exists():
            pandoc_html2md(html_file_path, md_file_path)
            # Add the url link as the first line of the md file that was just written
            with open(md_file_path, 'r', encoding='utf-8') as f:
                md_content = f.read()
            md_content = f"[{result.url}]({result.url})\n\n" + md_content
            save_to_file(md_content, md_file_path)

    return result


def conver_all_md_to_docx(dir_path: Path):
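    """Merge the .md files in every sub-folder of ``dir_path`` into all.md, then convert it to all.docx with pandoc."""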
    # List all sub-folders under the current path
    folders = [f for f in dir_path.iterdir() if f.is_dir()]
    # print(folders)
    for folder in folders:
        # Skip folders that contain no md files
        md_files = [f for f in folder.iterdir() if f.suffix == '.md']
        if not md_files:
            continue

        md_files_to_one_file_content = ""
        for md_file in md_files:
            with open(md_file, 'r', encoding='utf-8') as f:
                md_content = f.read()
            md_files_to_one_file_content += md_content + "\n\n"
        all_md_save_path = save_to_file(md_files_to_one_file_content, folder / "all.md")

        # pandoc -f markdown -t docx -o 0.docx "K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil\all_paper\0.html"
        # Convert to docx with pandoc and save it in the folder
        pandoc_exe = r'K:\code\upwork\zhang_crawl_bio\venv\Library\bin\pandoc.exe'
        pandoc_cmd = f'{pandoc_exe} -f markdown -t docx -o "{folder / "all.docx"}" "{all_md_save_path}"'
        print(pandoc_cmd)
        try:
            subprocess.run(pandoc_cmd, shell=True, check=True)
            print(f"Conversion succeeded! {all_md_save_path}")
        except subprocess.CalledProcessError as e:
            print(f"Conversion failed! {all_md_save_path}")
            print(e)


async def all_search_key_main():
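    """Run crawl, export, and docx conversion for every search-keyword folder under GOOGLE_SEARCH_DIR."""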
    # Get all sub-folders under GOOGLE_SEARCH_DIR
    search_key_dirs = [f for f in GOOGLE_SEARCH_DIR.iterdir() if f.is_dir()]
    # print(search_key_dirs)
    for dir_path in search_key_dirs:
        await multi_crawl_one_dir_html(dir_path)
        await load_result(dir_path)
        conver_all_md_to_docx(dir_path)


async def main():
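    """Entry point: runs the full pipeline via all_search_key_main(); the commented lines support single-folder runs."""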
    dir_path = Path(r"K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil")
    # await multi_crawl_one_dir_html(dir_path)
    # await load_result(dir_path)
    # conver_all_md_to_docx(dir_path)
    await all_search_key_main()


if __name__ == "__main__":
    asyncio.run(main())