import asyncio
import os
import pickle
import re
import subprocess
import sys
from pathlib import Path
from typing import List

import markdown
from dotenv import load_dotenv
from crawl4ai import *
from docling.document_converter import DocumentConverter
from docx import Document
from docx.shared import Pt
from docx.oxml import OxmlElement
from docx.oxml.ns import qn, nsdecls

from search_keyward import test_dir_links_not_local
from mylib.base import ensure_output_dir, load_from_pickle, save_to_file
from config.settings import *

__location__ = os.path.dirname(os.path.abspath(__file__))
__output__ = os.path.join(__location__, "output")

# Append parent directory to system path
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)

load_dotenv()


async def crawl_parallel(urls: List[str], max_concurrent: int = 3):
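    """Crawl the given URLs with one shared AsyncWebCrawler, max_concurrent at a time.

    Each URL in a batch gets its own session_id, and exceptions are collected
    alongside successful results because gather() runs with return_exceptions=True.
    """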
    # Minimal browser config
    browser_config = BrowserConfig(
        headless=True,
        verbose=False,
        extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
    )
    crawl_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)

    # Create the crawler instance
    crawler = AsyncWebCrawler(config=browser_config)
    await crawler.start()

    results = []
    try:
        for i in range(0, len(urls), max_concurrent):
            batch = urls[i : i + max_concurrent]
            tasks = []
            for j, url in enumerate(batch):
                # Unique session_id per concurrent sub-task
                session_id = f"parallel_session_{i + j}"
                task = crawler.arun(url=url, config=crawl_config, session_id=session_id)
                tasks.append(task)
            # Gather results
            batch_results = await asyncio.gather(*tasks, return_exceptions=True)
            results.extend(batch_results)
    finally:
        await crawler.close()
    return results


async def multi_crawl_one_dir_html(dir_path: Path):
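    """Crawl the non-local links found for ``dir_path`` and pickle the results.

    Skips the crawl when ``crawl_results.pkl`` already exists; returns the pickle path.
    """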
    links = await test_dir_links_not_local(dir_path)
    urls = [link['href'] for link in links]

    # Save results using pickle
    output_file = dir_path / "crawl_results.pkl"
    if output_file.exists():
        print(f"{output_file} already exists. Skipping...")
        return output_file

    # Perform parallel crawling
    results = await crawl_parallel(urls, max_concurrent=10)

    with open(output_file, "wb") as f:
        pickle.dump(results, f)

    print(f"Crawling results saved to {output_file}")
    return output_file


def docling_html2md(html_file_path: Path):
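    """Convert a local HTML file to Markdown text with docling's DocumentConverter."""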
    source = html_file_path
    converter = DocumentConverter()
    result = converter.convert(source)

    # Export to Markdown
    markdown_content = result.document.export_to_markdown()
    # print(markdown_content)
    return markdown_content


def pandoc_html2md(html_file_path: Path, output_file_path: Path):
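    """Convert an HTML file to Markdown by invoking the project's pandoc executable."""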
    # pandoc -f html -t markdown -o 0.md "K:\code\upwork\zhang_crawl_bio\output\google_search\Acalypha menavody essential oil\all_page_html\1.html"
    pandoc_exe = r'K:\code\upwork\zhang_crawl_bio\venv\Library\bin\pandoc.exe'
    # Run the conversion command
    cmd = [
        pandoc_exe,
        "-f", "html",
        "-t", "markdown",
        "-o", str(output_file_path),
        str(html_file_path),
    ]
    try:
        subprocess.run(cmd, check=True)
        print("Conversion succeeded!")
    except subprocess.CalledProcessError as e:
        print("Conversion failed!")
        print(e)
    except Exception as e:
        print("An unexpected error occurred!")
        print(e)


async def load_result(dir_path: Path):
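    """Load the pickled crawl results for ``dir_path`` and write per-page outputs.

    For each result this saves the raw HTML, the crawl4ai markdown, a docling
    conversion, and a pandoc conversion, each prefixed with its source URL.
    """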
    results: List[CrawlResult] = load_from_pickle(dir_path / "crawl_results.pkl")
    print(f"len {len(results)}")

    output_all_page_html_dir = dir_path / "all_page_html"
    ensure_output_dir(output_all_page_html_dir)
    output_all_crawl_md_dir = dir_path / "all_crawl_md"
    ensure_output_dir(output_all_crawl_md_dir)
    output_all_docling_markdown_dir = dir_path / "all_docling_markdown"
    ensure_output_dir(output_all_docling_markdown_dir)
    output_all_pandoc_markdown_dir = dir_path / "all_pandoc_markdown"
    ensure_output_dir(output_all_pandoc_markdown_dir)

    converter = DocumentConverter()

    for idx, result in enumerate(results):
        print(f"{idx} {result.url}")

        # Save the HTML file
        html_file_path = output_all_page_html_dir / f"{idx}.html"
        if not html_file_path.exists():
            save_to_file(result.html, html_file_path)

        # If crawl4ai produced markdown content, save it
        if result.markdown:
            # Add the url link as the first line before writing the markdown
            markdown = f"[{result.url}]({result.url})\n\n{result.markdown}"
            save_to_file(markdown, output_all_crawl_md_dir / f"{idx}.md")

        try:
            # Use docling to convert the HTML to Markdown and save it to the all_docling_markdown directory
            docling_md_path = output_all_docling_markdown_dir / f"{idx}.md"
            if not docling_md_path.exists():
                docling_md = docling_html2md(html_file_path)
                # Add the url link as the first line of the md
                docling_md = f"[{result.url}]({result.url})\n\n" + docling_md
                save_to_file(docling_md, docling_md_path)
        except Exception as e:
            print(f"Error converting HTML to Markdown using docling: {e}")
            print(f"html_file_path {html_file_path}")

        md_file_path = output_all_pandoc_markdown_dir / f"{idx}.md"
        if not md_file_path.exists():
            pandoc_html2md(html_file_path, md_file_path)
            # Add the url link as the first line of the md file that was just written
            with open(md_file_path, 'r', encoding='utf-8') as f:
                md_content = f.read()
            md_content = f"[{result.url}]({result.url})\n\n" + md_content
            save_to_file(md_content, md_file_path)

    return result


def conver_all_md_to_docx(dir_path: Path):
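    """Merge the .md files in every sub-folder of ``dir_path`` into all.md, then convert it to all.docx with pandoc."""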
    # List all sub-folders under the current path
    folders = [f for f in dir_path.iterdir() if f.is_dir()]
    # print(folders)
    for folder in folders:
        # Skip folders that contain no md files
        md_files = [f for f in folder.iterdir() if f.suffix == '.md']
        if not md_files:
            continue

        md_files_to_one_file_content = ""
        for md_file in md_files:
            with open(md_file, 'r', encoding='utf-8') as f:
                md_content = f.read()
            md_files_to_one_file_content += md_content + "\n\n"
        all_md_save_path = save_to_file(md_files_to_one_file_content, folder / "all.md")

        # pandoc -f markdown -t docx -o 0.docx "K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil\all_paper\0.html"
        # Convert to docx with pandoc and save it in the folder
        pandoc_exe = r'K:\code\upwork\zhang_crawl_bio\venv\Library\bin\pandoc.exe'
        pandoc_cmd = f'{pandoc_exe} -f markdown -t docx -o "{folder / "all.docx"}" "{all_md_save_path}"'
        print(pandoc_cmd)
        try:
            subprocess.run(pandoc_cmd, shell=True, check=True)
            print(f"Conversion succeeded! {all_md_save_path}")
        except subprocess.CalledProcessError as e:
            print(f"Conversion failed! {all_md_save_path}")
            print(e)


async def all_search_key_main():
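    """Run crawl, export, and docx conversion for every search-keyword folder under GOOGLE_SEARCH_DIR."""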
    # Get all sub-folders under GOOGLE_SEARCH_DIR
    search_key_dirs = [f for f in GOOGLE_SEARCH_DIR.iterdir() if f.is_dir()]
    # print(search_key_dirs)
    for dir_path in search_key_dirs:
        await multi_crawl_one_dir_html(dir_path)
        await load_result(dir_path)
        conver_all_md_to_docx(dir_path)


async def main():
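    """Entry point: runs the full pipeline via all_search_key_main(); the commented lines support single-folder runs."""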
    dir_path = Path(r"K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil")
    # await multi_crawl_one_dir_html(dir_path)
    # await load_result(dir_path)
    # conver_all_md_to_docx(dir_path)
    await all_search_key_main()


if __name__ == "__main__":
    asyncio.run(main())