
single search key: convert all formats (md, html, docx) using pandoc and docling

mrh 1 year ago
parent
commit
b356480683

+ 197 - 0
crawl_multi.py

@@ -0,0 +1,197 @@
+import asyncio
+import os
+import pickle
+import re
+import subprocess
+import sys
+from pathlib import Path
+from typing import List
+
+from crawl4ai import *
+from docling.document_converter import DocumentConverter
+
+from search_keyward import test_dir_links_not_local
+from mylib.base import ensure_output_dir, load_from_pickle, save_to_file
+from mylib.settings import *
+
+__location__ = os.path.dirname(os.path.abspath(__file__))
+__output__ = os.path.join(__location__, "output")
+
+# Append parent directory to system path
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+
+async def crawl_parallel(urls: List[str], max_concurrent: int = 3):
+    # Minimal browser config
+    browser_config = BrowserConfig(
+        headless=True,
+        verbose=False,
+        extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
+    )
+    crawl_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
+
+    # Create the crawler instance
+    crawler = AsyncWebCrawler(config=browser_config)
+    await crawler.start()
+
+    results = []
+
+    try:
+        for i in range(0, len(urls), max_concurrent):
+            batch = urls[i : i + max_concurrent]
+            tasks = []
+
+            for j, url in enumerate(batch):
+                # Unique session_id per concurrent sub-task
+                session_id = f"parallel_session_{i + j}"
+                task = crawler.arun(url=url, config=crawl_config, session_id=session_id)
+                tasks.append(task)
+
+            # Gather results
+            batch_results = await asyncio.gather(*tasks, return_exceptions=True)
+            results.extend(batch_results)
+
+    finally:
+        await crawler.close()
+
+    return results
+
+async def multi_crawl_one_dir_html(dir_path:Path):
+    links = await test_dir_links_not_local(dir_path)
+    urls = [link['href'] for link in links]
+
+    # Perform parallel crawling
+    results = await crawl_parallel(urls, max_concurrent=10)
+    
+    # Save results using pickle
+    output_file = os.path.join(dir_path, "crawl_results.pkl")
+    with open(output_file, "wb") as f:
+        pickle.dump(results, f)
+    
+    print(f"Crawling results saved to {output_file}")
+    return output_file
+
+def docling_html2md(html_file_path: Path):
+    source = html_file_path
+    converter = DocumentConverter()
+    result = converter.convert(source)
+    
+    # Export to Markdown
+    markdown_content = result.document.export_to_markdown()
+    # print(markdown_content)
+    return markdown_content
+
+def pandoc_html2md(html_file_path: Path, output_file_path: Path):
+    # pandoc -f html -t docx -o 0.docx "K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil\all_paper\0.html"
+    pandoc_exe = r'K:\code\upwork\zhang_crawl_bio\venv\Library\bin\pandoc.exe'
+    # Run the pandoc conversion command
+    cmd = [
+        pandoc_exe,
+        "-f", "html",
+        "-t", "markdown",
+        "-o", str(output_file_path),
+        str(html_file_path)
+    ]
+    try:
+        subprocess.run(cmd, check=True)
+        print("Conversion succeeded!")
+    except subprocess.CalledProcessError as e:
+        print("Conversion failed!")
+        print(e)
+    except Exception as e:
+        print("An unexpected error occurred!")
+        print(e)
+    
+async def load_result(dir_path:Path):
+    results:List[CrawlResult] = load_from_pickle(dir_path / "crawl_results.pkl")
+    print(f"len {len(results)}")
+    output_all_page_html_dir = dir_path / "all_page_html"
+    ensure_output_dir(output_all_page_html_dir)
+    output_all_crawl_md_dir = dir_path / "all_crawl_md"
+    ensure_output_dir(output_all_crawl_md_dir)
+    output_all_docling_markdown_dir = dir_path / "all_docling_markdown"
+    ensure_output_dir(output_all_docling_markdown_dir)
+    output_all_pandoc_markdown_dir = dir_path / "all_pandoc_markdown"
+    ensure_output_dir(output_all_pandoc_markdown_dir)
+    
+    converter = DocumentConverter()
+    
+    for idx, result in enumerate(results):
+        print(f"{idx} {result.url}")
+        # Save the HTML file
+        html_file_path = output_all_page_html_dir / f"{idx}.html"
+        save_to_file(result.html, html_file_path)
+        
+        # If crawl4ai produced markdown, prepend the source URL and save it as .md
+        if result.markdown:
+            # Put the URL on the first line before writing the markdown
+            markdown = f"[{result.url}]({result.url})\n\n{result.markdown}"
+            save_to_file(markdown, output_all_crawl_md_dir / f"{idx}.md")
+        try:
+            # Convert the HTML to Markdown with docling and save it under all_docling_markdown
+            docling_md_path = output_all_docling_markdown_dir / f"{idx}.md"
+            docling_md = docling_html2md(html_file_path)
+            # Add the source URL as the first line of the markdown
+            docling_md = f"[{result.url}]({result.url})\n\n" + docling_md
+            save_to_file(docling_md, docling_md_path)
+        except Exception as e:
+            print(f"Error converting HTML to Markdown using docling: {e}")
+            print(f"html_file_path {html_file_path}")
+            
+        md_file_path = output_all_pandoc_markdown_dir / f"{idx}.md"
+        pandoc_html2md(html_file_path, md_file_path)
+        # Prepend the source URL to the md file pandoc just wrote (skip it if pandoc failed)
+        if md_file_path.exists():
+            with open(md_file_path, 'r', encoding='utf-8') as f:
+                md_content = f.read()
+            md_content = f"[{result.url}]({result.url})\n\n" + md_content
+            save_to_file(md_content, md_file_path)
+
+    return results
+
+def convert_all_md_to_docx(dir_path:Path):
+    # List all sub-folders under the given path
+    folders = [f for f in dir_path.iterdir() if f.is_dir()]
+    # print(folders)
+    for folder in folders:
+        # Skip folders that contain no md files
+        md_files = [f for f in folder.iterdir() if f.suffix == '.md']
+        if not md_files:
+            continue
+        # Concatenate all md files into one document
+        md_files_to_one_file_content = ""
+        for md_file in md_files:
+            with open(md_file, 'r', encoding='utf-8') as f:
+                md_content = f.read()
+                md_files_to_one_file_content += md_content + "\n\n"
+        all_md_save_path = save_to_file(md_files_to_one_file_content, folder / "all.md")
+        # Example: pandoc -f markdown -t docx -o all.docx all.md
+        # Convert the combined markdown to docx with pandoc and save it in the folder
+        pandoc_exe = r'K:\code\upwork\zhang_crawl_bio\venv\Library\bin\pandoc.exe'
+        pandoc_cmd = [pandoc_exe, "-f", "markdown", "-t", "docx",
+                      "-o", str(folder / "all.docx"), str(all_md_save_path)]
+        print(pandoc_cmd)
+        try:
+            subprocess.run(pandoc_cmd, check=True)
+            print(f"Conversion succeeded! {all_md_save_path}")
+        except subprocess.CalledProcessError as e:
+            print(f"Conversion failed! {all_md_save_path}")
+            print(e)
+
+async def main():
+    dir_path = Path(r"K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil")
+    # await multi_crawl_one_dir_html(dir_path)
+    # await load_result(dir_path)
+    convert_all_md_to_docx(dir_path)
+    
+
+if __name__ == "__main__":
+    asyncio.run(main())
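
Both pandoc_html2md and convert_all_md_to_docx hard-code the pandoc binary at a Windows venv path. A small sketch of a more portable lookup; resolve_pandoc is a hypothetical helper, not part of this commit, and it assumes pandoc is either on PATH or at that venv location:

    import shutil
    from pathlib import Path

    def resolve_pandoc() -> str:
        # Prefer a pandoc found on PATH, fall back to the venv copy used above
        found = shutil.which("pandoc")
        if found:
            return found
        fallback = Path(r"K:\code\upwork\zhang_crawl_bio\venv\Library\bin\pandoc.exe")
        if fallback.exists():
            return str(fallback)
        raise FileNotFoundError("pandoc executable not found")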

+ 1 - 1
mylib/conf/9321.ini

@@ -5,7 +5,7 @@ tmp_path =
 [chromium_options]
 address = 127.0.0.1:9321
 browser_path = C:\Program Files\Google\Chrome\Application\chrome.exe
-arguments = ['--no-default-browser-check', '--disable-suggestions-ui', '--no-first-run', '--disable-infobars', '--disable-popup-blocking', '--hide-crash-restore-bubble', '--disable-features=PrivacySandboxSettings4', '--user-data-dir=K:\\code\\upwork\\zhang_crawl_bio\\output\\user_data_dir2', '--blink-settings=imagesEnabled=false', '--mute-audio', '--incognito', '--inprivate']
+arguments = ['--no-default-browser-check', '--disable-suggestions-ui', '--no-first-run', '--disable-infobars', '--disable-popup-blocking', '--hide-crash-restore-bubble', '--disable-features=PrivacySandboxSettings4', '--user-data-dir=K:\\code\\upwork\\zhang_crawl_bio\\output\\user_data_dir2', '--blink-settings=imagesEnabled=false', '--mute-audio', ]
 extensions = []
 prefs = {'profile.default_content_settings.popups': 0, 'profile.default_content_setting_values': {'notifications': 2}}
 flags = {}

+ 0 - 0
mylib/crawl4ai_html_to_doc.py


+ 56 - 12
mylib/drission_page.py

@@ -3,6 +3,7 @@ from DrissionPage import Chromium, ChromiumOptions, ChromiumPage
 from pathlib import Path
 from mylib.settings import OUTPUT_DIR, WORK_DIR, CONFIG_DIR
 from mylib.random_ua import get_random_user_agent
+from fake_useragent import UserAgent
 
 BROWSER_PATH=r"C:\Program Files\Google\Chrome\Application\chrome.exe"
 
@@ -10,31 +11,74 @@ def genarate_chrome_ini(address="localhost:9321"):
     port = address.split(':')[1]
     chrome_options = ChromiumOptions().set_browser_path(BROWSER_PATH)
     chrome_options.set_address(address)
-    chrome_options.set_user_data_path(str(OUTPUT_DIR / 'user_data_dir2'))
+    chrome_options.set_user_data_path(str(OUTPUT_DIR / f'user_data_dir_{port}'))
     chrome_options.no_imgs(True).mute(True)
-    chrome_options.incognito(True)
-    chrome_options.save(CONFIG_DIR / f'{port}.ini')
+    # chrome_options.incognito(True)
+    path = chrome_options.save(CONFIG_DIR / f'{port}.ini')
+    return path
+def load_chrome_from_ini(path=CONFIG_DIR / '9321.ini', headless=False):
+    chrome_options = ChromiumOptions(ini_path=path)
+    page = ChromiumPage(chrome_options)
+    return page
+
+def fake_ua():
+
+    # Create a UserAgent object
+    ua = UserAgent()
+
+    # Generate User-Agent strings for the supported browsers
+    chrome_ua = ua.chrome  # Chrome browser
+    firefox_ua = ua.firefox  # Firefox browser
+    safari_ua = ua.safari  # Safari browser
+    edge_ua = ua.edge  # Chromium Edge browser
 
-def load_chrome_from_ini(path=CONFIG_DIR / '9321.ini'):
-    # chrome_options = ChromiumOptions(ini_path=path)
-    # chrome_options.incognito(False)
+    # Print the generated User-Agent strings
+    print("Chrome User-Agent:", chrome_ua)
+    print("Firefox User-Agent:", firefox_ua)
+    print("Safari User-Agent:", safari_ua)
+    print("Edge User-Agent:", edge_ua)
+    return chrome_ua
+
+def load_random_ua_chrome(headless=False):
     chrome_options = ChromiumOptions()
     chrome_options.auto_port(True)
-    chrome_options.set_user_agent(get_random_user_agent())
+    chrome_options.no_imgs(False)
+    chrome_options.set_user_agent(fake_ua())
+    chrome_options.arguments.append("--lang=en")
+    chrome_options.headless(headless)
     page = ChromiumPage(chrome_options)
     # page.set.auto_handle_alert(True)
     return page
-    
-def main():
+
+def test_random_ua_chrome():
+    page = load_random_ua_chrome()
+    tab = page.latest_tab
+    keyword = "Acalypha rivularis essential oil"
+    url = f"https://www.google.com/search?q={keyword}"
+    # url = f"https://www.google.com/"
+    # url = "https://bot.sannysoft.com/"
+    tab.get(url)
+    print(tab.url)
+    if page.browser._chromium_options.is_headless:
+        tab.get_screenshot('./1.png')
+    # page.quit()
+
+def test_normal_chrome():
     # genarate_chrome_ini()
     page = load_chrome_from_ini()
     tab = page.latest_tab
     keyword = "Acalypha rivularis essential oil"
-    # url = f"https://www.google.com/search?q={keyword}"
-    url = "https://bot.sannysoft.com/"
-
+    url = f"https://www.google.com/search?q={keyword}"
+    # url = "https://bot.sannysoft.com/"
+    # recaptcha bot-detection test page
+    # url = "https://patrickhlauke.github.io/recaptcha/"
     tab.get(url)
+    # tab.get_screenshot('./1.png')
     # page.quit()
+
+def main():
+    test_random_ua_chrome()
+    # test_normal_chrome()
     
 if __name__ == "__main__":
     main()
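
fake_ua() generates four brand-specific strings but always returns the Chrome one. If a random brand per launch is wanted instead, fake_useragent's random property covers that; a small sketch of a possible variant (a suggestion, not part of this commit):

    from fake_useragent import UserAgent

    def fake_ua_random() -> str:
        # Hypothetical variant: return a random UA across all brands
        ua = UserAgent()
        return ua.random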

+ 0 - 0
mylib/filtered_user_agents.txt


+ 9 - 10
mylib/search_manager.py

@@ -18,10 +18,9 @@ from mylib.base import (
 from database.search_model import SearchDatabaseManager,SearchResult
 from database.excel_import import ExcelDatabaseManager,KeywordModel
 from database.sqlite_engine import create_db_and_tables, drop_table
-from mylib.drission_page import load_chrome_from_ini
+from mylib.drission_page import load_chrome_from_ini,load_random_ua_chrome
 from lxml import html
 from mylib.settings import GOOGLE_SEARCH_DIR
-page = load_chrome_from_ini()
 
 class SearchManager:
     def __init__(self, page: ChromiumPage):
@@ -47,10 +46,12 @@ class SearchManager:
             print(f"Using existing result for {keyword} {start}")
             if existing_result:
                 return existing_result
-                
-        # Run the search
         url = f"https://www.google.com/search?q={keyword}"
-        self.tab.get(url)
+        if start == 0:  
+            # Run the search
+            self.tab.get(url)
+        else:
+            self.go_to_next_page()
         # Save the HTML file
         html_path = self.save_page(keyword, start)
         # Check whether this is the last page
@@ -184,21 +185,19 @@ class SearchManager:
     def restart_browser(self,):
         """重启浏览器"""
         self.page.quit()
-        self.page = load_chrome_from_ini()
+        self.page = load_random_ua_chrome()
         self.tab = self.page.latest_tab
     
 def test_one():
-    global page
     # Initialize the browser
-    self = SearchManager(page)
+    self = SearchManager(load_random_ua_chrome())
     key_model_list = self.excel_db_manager.get_keywords_by_status()
     key_model = key_model_list.pop(0)
     print(key_model)
     self.walk_search_one_keywords(key_model)
 def test_all():
-    global page
     # Initialize the browser
-    self = SearchManager(page)
+    self = SearchManager(load_random_ua_chrome())
     key_model_list = self.excel_db_manager.get_keywords_by_status()
     all_count = self.excel_db_manager.get_keywords_count()
     print("遍历所有搜索词, len = ", len(key_model_list))

+ 41 - 0
mytest/docling_t.py

@@ -0,0 +1,41 @@
+import os
+from docling.document_converter import DocumentConverter
+from docx import Document
+from bs4 import BeautifulSoup
+
+def html_to_docx(html_content, output_docx_path):
+    # Create a new DOCX document
+    doc = Document()
+    
+    # Parse the HTML content with BeautifulSoup
+    soup = BeautifulSoup(html_content, 'html.parser')
+    
+    # Add the parsed HTML elements to the DOCX document
+    for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'li']):
+        if element.name == 'p':
+            doc.add_paragraph(element.get_text())
+        elif element.name.startswith('h'):
+            level = int(element.name[1])
+            doc.add_heading(element.get_text(), level=level)
+        elif element.name in ['ul', 'ol']:
+            for li in element.find_all('li'):
+                doc.add_paragraph(li.get_text(), style='ListBullet' if element.name == 'ul' else 'ListNumber')
+    
+    # Save the DOCX document
+    doc.save(output_docx_path)
+
+def main():
+    # Local HTML file path
+    file_path = r"K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil\all_paper\0.html"
+    # source = "https://arxiv.org/pdf/2408.09869"  # document per local path or URL
+    source = file_path
+    converter = DocumentConverter()
+    result = converter.convert(source)
+    
+    # Export to Markdown
+    markdown_content = result.document.export_to_markdown()
+    print(markdown_content)
+    
+
+if __name__ == "__main__":
+    main()
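
If the exported Markdown should be kept rather than only printed, one more step is enough; the .md path alongside the source HTML is an illustrative choice, not part of this commit:

    # Save the exported Markdown next to the source HTML
    output_md_path = file_path.replace(".html", ".md")
    with open(output_md_path, "w", encoding="utf-8") as f:
        f.write(markdown_content)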

+ 92 - 0
mytest/get_suport_ua.py

@@ -0,0 +1,92 @@
+# Supported operating systems and browser brands
+supported_systems = ["Windows", "Linux", "Mac"]
+supported_browsers = {
+    "Chrome": 91,  # 假设 Chrome 的最低支持版本是 91
+    "Firefox": 89,  # 假设 Firefox 的最低支持版本是 89
+    "Safari": 14,  # 假设 Safari 的最低支持版本是 14
+    "Edg": 91,  # 假设 Edge 的最低支持版本是 91
+}
+
+# Input and output file paths
+input_file = "K:/code/upwork/zhang_crawl_bio/mylib/user_agents.txt"
+output_file = "K:/code/upwork/zhang_crawl_bio/mylib/filtered_user_agents.txt"
+
+def extract_version(ua, browser):
+    """
+    Extract the major browser version number from a user-agent string.
+    """
+    # Token that precedes the version number for each supported brand
+    version_markers = {
+        "Chrome": "Chrome/",
+        "Firefox": "Firefox/",
+        "Safari": "Version/",  # Safari reports its version after "Version/"
+        "Edg": "Edg/",
+    }
+    marker = version_markers.get(browser)
+    if marker is None:
+        return None
+    try:
+        start = ua.find(marker)
+        if start == -1:
+            return None
+        start += len(marker)
+        # Collect digits until the first non-digit character
+        version_str = ""
+        for char in ua[start:]:
+            if char.isdigit():
+                version_str += char
+            else:
+                break
+        return int(version_str) if version_str else None
+    except Exception as e:
+        print(f"Error extracting version from UA: {ua}, Error: {e}")
+    return None
+
+# Read and filter the user agents
+with open(input_file, "r") as infile, open(output_file, "w") as outfile:
+    for line in infile:
+        line = line.strip()
+        # Check whether the line contains a supported system and browser brand
+        if any(system in line for system in supported_systems):
+            for browser, min_version in supported_browsers.items():
+                if browser in line:
+                    version = extract_version(line, browser)
+                    if version is not None and version >= min_version:
+                        outfile.write(line + "\n")
+                        break
+
+print(f"过滤后的用户代理已保存到 {output_file}")

+ 28 - 0
mytest/news_paper_t.py

@@ -0,0 +1,28 @@
+from newspaper import Article
+import os
+
+def main():
+    # Local HTML file path
+    file_path = r"K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil\all_paper\0.html"
+    
+    # Read the local HTML file content
+    with open(file_path, 'r', encoding='utf-8') as file:
+        html_content = file.read()
+
+    # Create an Article object
+    first_article = Article(url='', language='en')
+
+    # Set the HTML content manually
+    first_article.set_html(html_content)
+
+    # Parse the article
+    first_article.parse()
+
+    # Print the article title and additional data
+    print(first_article.title)
+    print(first_article.additional_data)
+    print(f"summary {first_article.summary}")
+    print(f"url {first_article.url}")
+
+if __name__ == "__main__":
+    main()
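
Note that newspaper only fills in summary (and keywords) after nlp() has been called, which also needs nltk's punkt data; without it, the summary printed above stays empty. A minimal sketch of that extra step, assuming the newspaper3k and nltk packages are installed:

    # After parse(), run NLP so summary/keywords get populated
    # (one-time setup: python -c "import nltk; nltk.download('punkt')")
    first_article.nlp()
    print(f"summary {first_article.summary}")
    print(f"keywords {first_article.keywords}")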

+ 44 - 0
mytest/pandoc_t.py

@@ -0,0 +1,44 @@
+from pathlib import Path
+import subprocess
+
+from mylib.base import ensure_output_dir
+def pandoc_html2docx(html_file_path: Path, output_file_path: Path):
+    # pandoc -f html -t docx -o 0.docx "K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil\all_paper\0.html"
+    pandoc_exe = r'K:\code\upwork\zhang_crawl_bio\venv\Library\bin\pandoc.exe'
+    # Run the pandoc conversion command
+    cmd = [
+        pandoc_exe,
+        "-f", "html",
+        "-t", "docx",
+        "-o", str(output_file_path),
+        str(html_file_path)
+    ]
+    try:
+        subprocess.run(cmd, check=True)
+        print("Conversion succeeded!")
+    except subprocess.CalledProcessError as e:
+        print("Conversion failed!")
+        print(e)
+    except Exception as e:
+        print("An unexpected error occurred!")
+        print(e)
+
+def main():
+    dir_path = Path(r"K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil")    
+    # '''
+    # pandoc -f html -t docx -o 0.docx "K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil\all_paper\0.html"
+    # '''
+    all_html_dir = dir_path / "all_page_html"
+    all_page_pandoc_docx_dir = dir_path / "all_page_pandoc_docx"
+    ensure_output_dir(all_page_pandoc_docx_dir)
+    # Iterate over all files in the html folder
+    for file in all_html_dir.iterdir():
+        # Only process files with an .html suffix
+        if file.is_file() and file.suffix == ".html":
+            # Build the output file path
+            output_file_path = all_page_pandoc_docx_dir / (file.stem + ".docx")
+            # Call pandoc_html2docx to do the conversion
+            pandoc_html2docx(file, output_file_path)
+
+if __name__ == "__main__":
+    main()
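
If the generated .docx files need consistent styling, pandoc can apply a template document via its --reference-doc option. A small sketch of extending the command; reference.docx is a hypothetical styling template, not part of this commit:

    cmd = [
        pandoc_exe,
        "-f", "html",
        "-t", "docx",
        "--reference-doc", "reference.docx",  # hypothetical styling template
        "-o", str(output_file_path),
        str(html_file_path),
    ]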

+ 16 - 0
mytest/test_fake_ua.py

@@ -0,0 +1,16 @@
+from fake_useragent import UserAgent
+
+# Create a UserAgent object
+ua = UserAgent()
+
+# Generate User-Agent strings for the supported browsers
+chrome_ua = ua.chrome  # Chrome browser
+firefox_ua = ua.firefox  # Firefox browser
+safari_ua = ua.safari  # Safari browser
+edge_ua = ua.edge  # Chromium Edge browser
+
+# Print the generated User-Agent strings
+print("Chrome User-Agent:", chrome_ua)
+print("Firefox User-Agent:", firefox_ua)
+print("Safari User-Agent:", safari_ua)
+print("Edge User-Agent:", edge_ua)

+ 32 - 2
search_keyward.py

@@ -24,7 +24,7 @@ async def google_search(url:str, config=None)->CrawlResult:
     async with AsyncWebCrawler(config=browser_config) as crawler:
         result = await crawler.arun(
             url=url,
-            cache_mode=CacheMode.DISABLED,
+            cache_mode=CacheMode.ENABLED,
             user_agent='random',
             config=run_config,
 
@@ -141,11 +141,41 @@ async def test_single_search():
     # print(res)
     # Pretty-print
     # print(json.dumps(res, indent=4))
+async def test_html_to_doc():
+    save_dir = Path(r"K:\code\upwork\zhang_crawl_bio\output\google_search")
+    # Get all sub-folders
+    folders = [f for f in save_dir.iterdir() if f.is_dir()]
+    print(folders)
+    for folder in folders:
+        # Get all html files in the folder
+        html_files = [f for f in folder.iterdir() if f.suffix == '.html']
 
+async def test_single_html_links(save_html_path=None):
+    if not save_html_path:
+        save_html_path = Path(r"K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil\0.html")
+    url = f"file://{save_html_path}"
+    result: CrawlResult = await google_search(url)
+    not_google_external_links = filter_links(result.links)
+    links_not_local = filter_local_domain(not_google_external_links)
+    # print(links_not_local)
+    return links_not_local
 
+async def test_dir_links_not_local(dir_path:Path):
+    html_files = [f for f in dir_path.iterdir() if f.suffix == '.html']
+    all_links = []
+    for html_file in html_files:
+        print(f"Processing {html_file}")
+        links = await test_single_html_links(html_file)
+        print(f"Found {len(links)} links in {html_file}")
+        all_links.extend(links)
+    print(f"Found {len(all_links)} links in total")
+    # Write the collected links to a file
+    save_to_file(all_links, dir_path / "links.json.txt")
+    return all_links
 async def main():
     # await search_all()
-    await test_single_search()
+    # await test_single_search()
+    await test_dir_links_not_local(Path(r"K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil"))
     
 if __name__ == "__main__":
     asyncio.run(main())