pandoc failed to convert. Cause to be determined.

mrh committed 10 months ago
commit 4875fb3dc9
3 changed files with 38 additions and 17 deletions
  1. crawl_multi.py (+33 -16)
  2. mytest/news_paper_t.py (+2 -1)
  3. search_keyward.py (+3 -0)

+ 33 - 16
crawl_multi.py

@@ -4,6 +4,7 @@ import re
 import subprocess
 import sys
 import pickle
+from dotenv import load_dotenv
 from crawl4ai import *
 from search_keyward import test_dir_links_not_local
 from mylib.base import ensure_output_dir, load_from_pickle, save_to_file
@@ -30,7 +31,7 @@ parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.append(parent_dir)
 
 from typing import List
-
+load_dotenv()
 async def crawl_parallel(urls: List[str], max_concurrent: int = 3):
     # Minimal browser config
     browser_config = BrowserConfig(
@@ -71,11 +72,14 @@ async def multi_crawl_one_dir_html(dir_path:Path):
     urls = [link['href'] for link in links]
     
     
+    # Pickle path for the crawl results; skip this directory if already crawled
+    output_file = dir_path / "crawl_results.pkl"
+    if output_file.exists():
+        print(f"{output_file} already exists. Skipping...")
+        return output_file
     # Perform parallel crawling
     results = await crawl_parallel(urls, max_concurrent=10)
     
-    # Save results using pickle
-    output_file = os.path.join(dir_path, "crawl_results.pkl")
     with open(output_file, "wb") as f:
         pickle.dump(results, f)
     
@@ -93,7 +97,7 @@ def docling_html2md(html_file_path: Path):
     return markdown_content
 
 def pandoc_html2md(html_file_path: Path, output_file_path: Path):
-    # pandoc -f html -t docx -o 0.docx "K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil\all_paper\0.html"
+    # pandoc -f html -t markdown -o 0.md "K:\code\upwork\zhang_crawl_bio\output\google_search\Acalypha menavody essential oil\all_page_html\1.html"
     pandoc_exe = r'K:\code\upwork\zhang_crawl_bio\venv\Library\bin\pandoc.exe'
     # Run the conversion command
     cmd = [
@@ -131,7 +135,8 @@ async def load_result(dir_path:Path):
         print(f"{idx} {result.url}")
         # Save the HTML file
         html_file_path = output_all_page_html_dir / f"{idx}.html"
-        save_to_file(result.html, html_file_path)
+        if not html_file_path.exists():
+            save_to_file(result.html, html_file_path)
         
         # If markdown content exists, convert it to DOCX and save
         if result.markdown:
@@ -141,21 +146,23 @@ async def load_result(dir_path:Path):
         try:
             # Use docling to convert the HTML to Markdown and save it to the all_docling_markdown directory
             docling_md_path = output_all_docling_markdown_dir / f"{idx}.md"
-            docling_md = docling_html2md(html_file_path)
-            # Prepend the url link as the first line of the md
-            docling_md = f"[{result.url}]({result.url})\n\n" + docling_md
-            save_to_file(docling_md, docling_md_path)
+            if not docling_md_path.exists():
+                docling_md = docling_html2md(html_file_path)
+                # Prepend the url link as the first line of the md
+                docling_md = f"[{result.url}]({result.url})\n\n" + docling_md
+                save_to_file(docling_md, docling_md_path)
         except Exception as e:
             print(f"Error converting HTML to Markdown using docling: {e}")
             print(f"html_file_path {html_file_path}")
             
         md_file_path = output_all_pandoc_markdown_dir / f"{idx}.md"
-        pandoc_html2md(html_file_path, output_all_pandoc_markdown_dir / f"{idx}.md")
-        # Prepend the url link to the first line of the md file just written
-        with open(md_file_path, 'r', encoding='utf-8') as f:
-            md_content = f.read()
-            md_content = f"[{result.url}]({result.url})\n\n" + md_content
-            save_to_file(md_content, md_file_path)
+        if not md_file_path.exists():
+            pandoc_html2md(html_file_path, md_file_path)
+            # Prepend the url link to the first line of the md file just written
+            with open(md_file_path, 'r', encoding='utf-8') as f:
+                md_content = f.read()
+                md_content = f"[{result.url}]({result.url})\n\n" + md_content
+                save_to_file(md_content, md_file_path)
         
     return result
 
@@ -186,11 +193,21 @@ def conver_all_md_to_docx(dir_path:Path):
             print(f"转换失败!{all_md_save_path}")
             print(e)
 
+async def all_search_key_main():
+    # Get all subdirectories under GOOGLE_SEARCH_DIR
+    search_key_dirs = [f for f in GOOGLE_SEARCH_DIR.iterdir() if f.is_dir()]
+    # print(search_key_dirs)
+    for dir_path in search_key_dirs:
+        await multi_crawl_one_dir_html(dir_path)
+        await load_result(dir_path)
+        conver_all_md_to_docx(dir_path)
+
 async def main():
     dir_path = Path(r"K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil")
     # await multi_crawl_one_dir_html(dir_path)
     # await load_result(dir_path)
-    conver_all_md_to_docx(dir_path)
+    # conver_all_md_to_docx(dir_path)
+    await all_search_key_main()
     
 
 if __name__ == "__main__":
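
A quick way to investigate the "cause to be determined" from the commit message is to stop discarding pandoc's output: capture the exit code and stderr when invoking it. A minimal sketch, reusing the pandoc.exe path from pandoc_html2md above; the run_pandoc_verbose helper name is hypothetical:

import subprocess
from pathlib import Path

PANDOC_EXE = r'K:\code\upwork\zhang_crawl_bio\venv\Library\bin\pandoc.exe'

def run_pandoc_verbose(html_file_path: Path, output_file_path: Path) -> bool:
    # Same html -> markdown conversion as pandoc_html2md, but keep
    # stdout/stderr so a failure comes with pandoc's own error message.
    cmd = [PANDOC_EXE, '-f', 'html', '-t', 'markdown',
           '-o', str(output_file_path), str(html_file_path)]
    proc = subprocess.run(cmd, capture_output=True, text=True)
    if proc.returncode != 0:
        print(f"pandoc failed (exit {proc.returncode}): {proc.stderr.strip()}")
        return False
    return True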

+ 2 - 1
mytest/news_paper_t.py

@@ -3,7 +3,7 @@ import os
 
 def main():
     # Path to the local HTML file
-    file_path = r"K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil\all_paper\0.html"
+    file_path = r"K:\code\upwork\zhang_crawl_bio\output\google_search\Acalypha martiana essential oil\all_page_html\3.html"
     
     # Read the contents of the local HTML file
     with open(file_path, 'r', encoding='utf-8') as file:
@@ -20,6 +20,7 @@ def main():
 
     # Print the article title and additional data
     print(first_article.title)
+    print(first_article.authors)
     print(first_article.additional_data)
     print(f"summary {first_article.summary}")
     print(f"url {first_article.url}")

+ 3 - 0
search_keyward.py

@@ -161,6 +161,9 @@ async def test_sigle_html_links(save_html_path=None):
     return links_not_local
 
 async def test_dir_links_not_local(dir_path:Path):
+    '''
+    Collect all html files in the directory and parse them into a filtered list of domain links; base_domain does not include local paths
+    '''
     html_files = [f for f in dir_path.iterdir() if f.suffix == '.html']
     all_links = []
     for html_file in html_files:
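
To illustrate the filter the docstring describes, hrefs whose URL has no network host (relative paths, anchors, file:// URLs) can be treated as local and dropped, keeping only links with a real domain. A minimal sketch with urllib.parse; the is_local_href helper is hypothetical, not the actual implementation of test_dir_links_not_local:

from urllib.parse import urlparse

def is_local_href(href: str) -> bool:
    # Local if there is no network host: relative paths, anchors, file:// URLs
    parsed = urlparse(href)
    return parsed.scheme in ('', 'file') or parsed.netloc == ''

hrefs = [
    "https://example.com/paper.pdf",  # external domain: kept
    "/assets/style.css",              # relative path: dropped
    "file:///K:/code/local.html",     # local file: dropped
]
links_not_local = [h for h in hrefs if not is_local_href(h)]
print(links_not_local)  # ['https://example.com/paper.pdf']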