@@ -4,6 +4,7 @@ import re
 import subprocess
 import sys
 import pickle
+from dotenv import load_dotenv
 from crawl4ai import *
 from search_keyward import test_dir_links_not_local
 from mylib.base import ensure_output_dir, load_from_pickle, save_to_file
@@ -30,7 +31,7 @@ parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.append(parent_dir)

 from typing import List
-
+load_dotenv()
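+# Assumption (not shown in this diff): later steps read API keys/config from
+# os.environ, which load_dotenv() populates from a local .env file.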
 async def crawl_parallel(urls: List[str], max_concurrent: int = 3):
     # Minimal browser config
     browser_config = BrowserConfig(
@@ -71,11 +72,14 @@ async def multi_crawl_one_dir_html(dir_path:Path):
     urls = [link['href'] for link in links]

+    # Save results using pickle
+    output_file = dir_path / "crawl_results.pkl"
+    if output_file.exists():
+        print(f"{output_file} already exists. Skipping...")
+        return output_file
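+    # Checking before crawling makes re-runs idempotent: a directory that
+    # already has crawl_results.pkl is skipped instead of re-crawled.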
     # Perform parallel crawling
     results = await crawl_parallel(urls, max_concurrent=10)

-    # Save results using pickle
-    output_file = os.path.join(dir_path, "crawl_results.pkl")
     with open(output_file, "wb") as f:
         pickle.dump(results, f)
@@ -93,7 +97,7 @@ def docling_html2md(html_file_path: Path):
     return markdown_content


 def pandoc_html2md(html_file_path: Path, output_file_path: Path):
-    # pandoc -f html -t docx -o 0.docx "K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil\all_paper\0.html"
+    # pandoc -f html -t markdown -o 0.md "K:\code\upwork\zhang_crawl_bio\output\google_search\Acalypha menavody essential oil\all_page_html\1.html"
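+    # This helper shells out to an external pandoc binary; the hard-coded path
+    # below assumes the project's Windows venv layout.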
     pandoc_exe = r'K:\code\upwork\zhang_crawl_bio\venv\Library\bin\pandoc.exe'
     # Run the conversion command
     cmd = [
@@ -131,7 +135,8 @@ async def load_result(dir_path:Path):
         print(f"{idx} {result.url}")
         # Save the HTML file
         html_file_path = output_all_page_html_dir / f"{idx}.html"
-        save_to_file(result.html, html_file_path)
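+        # Keep existing snapshots: only write the HTML if it isn't on disk yet.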
+        if not html_file_path.exists():
+            save_to_file(result.html, html_file_path)

         # If markdown content exists, convert it to DOCX and save it
         if result.markdown:
@@ -141,21 +146,23 @@ async def load_result(dir_path:Path):
             try:
                 # Use docling to convert the HTML to Markdown and save it under all_docling_markdown
                 docling_md_path = output_all_docling_markdown_dir / f"{idx}.md"
-                docling_md = docling_html2md(html_file_path)
-                # Prepend the url link as the first line of the md
-                docling_md = f"[{result.url}]({result.url})\n\n" + docling_md
-                save_to_file(docling_md, docling_md_path)
+                if not docling_md_path.exists():
+                    docling_md = docling_html2md(html_file_path)
+                    # Prepend the url link as the first line of the md
+                    docling_md = f"[{result.url}]({result.url})\n\n" + docling_md
+                    save_to_file(docling_md, docling_md_path)
             except Exception as e:
                 print(f"Error converting HTML to Markdown using docling: {e}")
                 print(f"html_file_path {html_file_path}")

             md_file_path = output_all_pandoc_markdown_dir / f"{idx}.md"
-            pandoc_html2md(html_file_path, output_all_pandoc_markdown_dir / f"{idx}.md")
-            # Prepend the url link to the first line of the md file just written
-            with open(md_file_path, 'r', encoding='utf-8') as f:
-                md_content = f.read()
-            md_content = f"[{result.url}]({result.url})\n\n" + md_content
-            save_to_file(md_content, md_file_path)
+            if not md_file_path.exists():
+                pandoc_html2md(html_file_path, md_file_path)
+                # Prepend the url link to the first line of the md file just written
+                with open(md_file_path, 'r', encoding='utf-8') as f:
+                    md_content = f.read()
+                md_content = f"[{result.url}]({result.url})\n\n" + md_content
+                save_to_file(md_content, md_file_path)
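+                # The existence guard also keeps the URL header from being
+                # prepended a second time when old results are re-processed.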

         return result
@@ -186,11 +193,21 @@ def conver_all_md_to_docx(dir_path:Path):
         print(f"Conversion failed! {all_md_save_path}")
         print(e)

+async def all_search_key_main():
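+    # Full pipeline for every search-keyword directory: crawl the collected
+    # links, unpack crawl results to HTML/Markdown, then convert the markdown
+    # to DOCX.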
+    # Get every subdirectory under GOOGLE_SEARCH_DIR
+    search_key_dirs = [f for f in GOOGLE_SEARCH_DIR.iterdir() if f.is_dir()]
+    # print(search_key_dirs)
+    for dir_path in search_key_dirs:
+        await multi_crawl_one_dir_html(dir_path)
+        await load_result(dir_path)
+        conver_all_md_to_docx(dir_path)
+
 async def main():
     dir_path = Path(r"K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil")
     # await multi_crawl_one_dir_html(dir_path)
     # await load_result(dir_path)
-    conver_all_md_to_docx(dir_path)
+    # conver_all_md_to_docx(dir_path)
+    await all_search_key_main()


 if __name__ == "__main__":