@@ -3,6 +3,7 @@ from crawl4ai import *
from pathlib import Path
import json
from lxml import html  # use the lxml.html module
+from sqlmodel import Session, select
from mylib.base import replace_space, save_to_file, save_all_result, OUTPUT_DIR, save_to_pickle, ensure_output_dir, save_base64_to_file, browser_config
from mylib.drission_page import load_chrome_from_ini
async def google_search(search_key:str, start:int=0, config=None)->CrawlResult:
@@ -44,30 +45,50 @@ def is_search_result_empty(html_content:str):
    return len(card_sections) != 0


-async def search_all(search_key:str,start=0, pages_num:int=250):
-    search_key = replace_space(search_key)
-    save_dir = OUTPUT_DIR / search_key / 'pkl'
+async def get_keywords_from_db():
+    """Fetch all unfinished keywords from the database."""
+    from database.sql_model import Keyword, engine
+    with Session(engine) as session:
+        statement = select(Keyword).where(Keyword.done == False)
+        keywords = session.exec(statement).all()
+        return [keyword.key_word for keyword in keywords]
+
+def is_already_processed(keyword: str) -> bool:
+    """Check whether a keyword has already been processed."""
+    save_dir = OUTPUT_DIR / replace_space(keyword) / 'pkl'
+    return save_dir.exists() and any(save_dir.glob("*.pickle"))
+
+async def process_keyword(keyword: str, start=0, pages_num=250):
+    """Process a single keyword."""
+    keyword = replace_space(keyword)
+    save_dir = OUTPUT_DIR / keyword / 'pkl'
    ensure_output_dir(save_dir)
+
+    # If the keyword was already processed, return its save directory immediately
+    if is_already_processed(keyword):
+        print(f"Keyword {keyword} already processed, skipping search")
+        return save_dir
+
+    # Not processed yet, run the search
    for i in range(start, pages_num, 10):
-        result:CrawlResult = await google_search(search_key, i)
+        result: CrawlResult = await google_search(keyword, i)
        if is_search_result_empty(result.html):
-            # run_config = CrawlerRunConfig(
-            #     screenshot=True,  # Grab a screenshot as base64
-            #     screenshot_wait_for=1.0,  # Wait 1s before capturing
-            #     pdf=True,  # Also produce a PDF
-            #     image_description_min_word_threshold=5,  # If analyzing alt text
-            #     image_score_threshold=3,  # Filter out low-score images
-            # )
-            # result = await google_search(search_key, i, config=run_config)
-            # save_path = save_base64_to_file(result.screenshot, save_dir / f"{search_key}-{i}.png")
            print(f"No more result pages found at {result.url}, exiting")
-            # print(f"screenshot saved to {save_path}")
            break

-        linkes = filter_links(result.links)
-        print(f"start: {i}, links: {linkes} \n len: {len(linkes)}")
+        links = filter_links(result.links)
+        print(f"start: {i}, links: {links} \n len: {len(links)}")
        save_to_pickle(result, save_dir / f"result-{i}.pickle")
    return save_dir
+
+async def search_all():
+    """Process all unfinished keywords."""
+    keywords = await get_keywords_from_db()
+    for keyword in keywords:
+        if is_already_processed(keyword):
+            print(f"Keyword {keyword} already processed, skipping")
+            continue
+        await process_keyword(keyword)
async def test_single_search():
    result = await google_search("Acalypha malabarica essential oil", start=50)
    print(f"result clean html:\n {result.cleaned_html}")
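The query in get_keywords_from_db assumes a Keyword table exposing done and key_word columns plus a shared engine. A minimal sketch of what database/sql_model.py might look like, with field names inferred from the calls in this diff (the engine URL is a placeholder, not part of the diff):

# Hypothetical database/sql_model.py, inferred from the usage above.
from typing import Optional
from sqlmodel import Field, SQLModel, create_engine

class Keyword(SQLModel, table=True):
    id: Optional[int] = Field(default=None, primary_key=True)
    key_word: str                      # the search phrase queried above
    done: bool = Field(default=False)  # set once the keyword has been crawled

# Placeholder DSN; the real connection string is not shown in this diff.
engine = create_engine("sqlite:///keywords.db")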
@@ -79,8 +100,8 @@ async def test_single_search():

async def main():
-    # await search_all("Acalypha malabarica essential oil",start=20)
-    await test_single_search()
+    await search_all()
+    # await test_single_search()

if __name__ == "__main__":
-    asyncio.run(main())
+    asyncio.run(main())
|