@@ -3,6 +3,7 @@ from crawl4ai import *
from pathlib import Path
import json
from lxml import html  # use the lxml.html module
+from sqlmodel import Session, select
from mylib.base import replace_space, save_to_file, save_all_result, OUTPUT_DIR, save_to_pickle, ensure_output_dir, save_base64_to_file, browser_config
from mylib.drission_page import load_chrome_from_ini
async def google_search(search_key:str, start:int=0, config=None)->CrawlResult:
@@ -44,30 +45,50 @@ def is_search_result_empty(html_content:str):
    return len(card_sections) != 0


-async def search_all(search_key:str,start=0, pages_num:int=250):
-    search_key = replace_space(search_key)
-    save_dir = OUTPUT_DIR / search_key / 'pkl'
+async def get_keywords_from_db():
+    """Fetch all unfinished keywords from the database."""
+    from database.sql_model import Keyword, engine
+    with Session(engine) as session:
+        statement = select(Keyword).where(Keyword.done == False)
+        keywords = session.exec(statement).all()
+        return [keyword.key_word for keyword in keywords]
+
+def is_already_processed(keyword: str) -> bool:
+    """Check whether a keyword has already been processed."""
+    save_dir = OUTPUT_DIR / replace_space(keyword) / 'pkl'
+    return save_dir.exists() and any(save_dir.glob("*.pickle"))
+
+async def process_keyword(keyword: str, start=0, pages_num=250):
+    """Process a single keyword."""
+    keyword = replace_space(keyword)
+    save_dir = OUTPUT_DIR / keyword / 'pkl'
    ensure_output_dir(save_dir)
+
+    # If the keyword was already processed, return its save directory immediately
+    if is_already_processed(keyword):
+        print(f"Keyword {keyword} already processed, skipping search")
+        return save_dir
+
+    # Not processed yet, run the search
    for i in range(start, pages_num, 10):
-        result:CrawlResult = await google_search(search_key, i)
+        result: CrawlResult = await google_search(keyword, i)
        if is_search_result_empty(result.html):
-            # run_config = CrawlerRunConfig(
-            #     screenshot=True,  # Grab a screenshot as base64
-            #     screenshot_wait_for=1.0,  # Wait 1s before capturing
-            #     pdf=True,  # Also produce a PDF
-            #     image_description_min_word_threshold=5,  # If analyzing alt text
-            #     image_score_threshold=3,  # Filter out low-score images
-            # )
-            # result = await google_search(search_key, i, config=run_config)
-            # save_path = save_base64_to_file(result.screenshot, save_dir / f"{search_key}-{i}.png")
            print(f"No more result pages found at {result.url}, exiting")
-            # print(f"screenshot saved to {save_path}")
            break

-        linkes = filter_links(result.links)
-        print(f"start: {i}, links: {linkes} \n len: {len(linkes)}")
+        links = filter_links(result.links)
+        print(f"start: {i}, links: {links} \n len: {len(links)}")
        save_to_pickle(result, save_dir / f"result-{i}.pickle")
    return save_dir
+
+async def search_all():
+    """Process all unfinished keywords."""
+    keywords = await get_keywords_from_db()
+    for keyword in keywords:
+        if is_already_processed(keyword):
+            print(f"Keyword {keyword} already processed, skipping")
+            continue
+        await process_keyword(keyword)
async def test_single_search():
    result = await google_search("Acalypha malabarica essential oil", start=50)
    print(f"result clean html:\n {result.cleaned_html}")
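The query in get_keywords_from_db assumes a Keyword table exposing done and key_word columns plus a shared engine. A minimal sketch of what database/sql_model.py might look like, with field names inferred from the calls in this diff (the engine URL is a placeholder, not part of the diff):

# Hypothetical database/sql_model.py, inferred from the usage above.
from typing import Optional
from sqlmodel import Field, SQLModel, create_engine

class Keyword(SQLModel, table=True):
    id: Optional[int] = Field(default=None, primary_key=True)
    key_word: str                      # the search phrase queried above
    done: bool = Field(default=False)  # set once the keyword has been crawled

# Placeholder DSN; the real connection string is not shown in this diff.
engine = create_engine("sqlite:///keywords.db")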
@@ -79,8 +100,8 @@ async def test_single_search():

async def main():
-    # await search_all("Acalypha malabarica essential oil",start=20)
-    await test_single_search()
+    await search_all()
+    # await test_single_search()

if __name__ == "__main__":
-    asyncio.run(main())
+    asyncio.run(main())
|