@@ -9,7 +9,17 @@ from mylib.logu import get_logger
 from config.settings import PROXIES
 from crawl4ai import BrowserConfig
 from pydantic import BaseModel
+from mylib.drission_page import load_random_ua_chrome, load_chrome_from_ini
+from utils.proxy_pool import get_random_proxy
+from worker.search_engine.search_result_db import SearchResultItem
 
 logger = get_logger('crawl_worker')
+class SearchBrowserConfig(BaseModel):
+    # def load_chrome_from_ini(path=CONFIG_DIR / '9321.ini', headless=False, proxy=None, browser_path=None, no_imgs=True):
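+    # fields mirror the load_chrome_from_ini keyword arguments above, so model_dump() can be splatted straight into the call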
+    headless: Optional[bool] = False
+    proxy: Optional[str] = None
+    browser_path: Optional[str] = None
+    no_imgs: Optional[bool] = True
 class CrawlBrowserConfig(BaseModel):
     headless: bool = True
@@ -18,7 +28,7 @@ class CrawlBrowserConfig(BaseModel):
     proxy: str | None = None
 
 class CrawlTaskConfig(BaseModel):
-    browser_config: Optional[CrawlBrowserConfig] = None
+    browser_config: Optional[SearchBrowserConfig] = None
     overwrite: Optional[bool] = False
     dry_run: Optional[bool] = False
     proxy_pool_url: Optional[str] = None
@@ -31,12 +41,12 @@ class CrawlTaskConfig(BaseModel):
 )
 def crawl_all_unprocessed_pages_task(config: dict|CrawlTaskConfig):
     """Async task: submit crawl tasks for all unprocessed page URLs"""
-    from worker.search_engine.search_result_db import SearchResultManager
+    from worker.search_engine.search_result_db import db_manager
 
     config = CrawlTaskConfig(**config)
     try:
-        manager = SearchResultManager()
-        page_ids = manager.get_pages_with_unprocessed_urls()
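+        # use the shared db_manager instance instead of constructing a SearchResultManager per call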
+        page_ids = db_manager.get_pages_with_unprocessed_urls()
 
         if not page_ids:
             logger.info("No unprocessed pages to crawl")
@@ -98,24 +108,34 @@ def crawl_page_urls_task(page_id: int, config: dict):
 
     async def _execute_crawl():
         try:
-            # Convert to BrowserConfig
-            browser_config = BrowserConfig(**(crawl_task_config.browser_config or {}))
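+            # launch Chrome through DrissionPage; a proxy drawn from the pool overrides the config default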
+            search_browser_config = crawl_task_config.browser_config or SearchBrowserConfig()
+            if crawl_task_config.proxy_pool_url:
+                proxy = await asyncio.to_thread(get_random_proxy, crawl_task_config.proxy_pool_url)
+                search_browser_config.proxy = proxy
+                logger.info(f"Using proxy pool: {crawl_task_config.proxy_pool_url} --> {proxy}")
+            else:
+                logger.info("Using proxy: following system settings")
+            page = load_chrome_from_ini(**search_browser_config.model_dump())
 
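+            # attach crawl4ai to the DrissionPage-managed browser over its CDP endpoint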
+            crawl_browser_config = BrowserConfig(
+                headless=search_browser_config.headless,
+                use_managed_browser=True,
+                cdp_url=page.browser._driver._websocket_url
+            )
             logger.info(f"{'(test mode) ' if crawl_task_config.dry_run else ''}Start extracting search results page: {page_id}")
             crawler = URLCrawler()
-            if crawl_task_config.proxy_pool_url:
-                browser_config.proxy = await crawler.get_random_proxy(crawl_task_config.proxy_pool_url)
-                logger.info(f"BrowserConfig proxy: {browser_config.proxy}")
-            save_dir, res = await crawler.crawl_page_urls(page_id, browser_config, crawl_task_config.overwrite, crawl_task_config.dry_run)
+            save_dir, list_res = await crawler.crawl_page_urls(page_id, crawl_browser_config, crawl_task_config.overwrite, crawl_task_config.dry_run)
             files = []
-            if res:
-                for crawl_ret in res:
+            # logger.info(f"list_res {list_res} ")
+            if list_res:
+                for crawl_ret in list_res:
                     if not isinstance(crawl_ret, dict):
                         continue
-                    search_result_model = crawl_ret.get("search_result_model")
+                    search_result_model: SearchResultItem = crawl_ret.get("search_result_model")
                     if search_result_model:
-                        save_file_path = crawl_ret.get("html_path")
-                        files.append(save_file_path)
+                        files.append(search_result_model.html_path)
             ret = {"page_id": page_id, "status": "completed", "save_dir": save_dir, "files": files}
             logger.info(f"Save directory: {save_dir}")
             logger.info(f"Extracted {len(files)} results: {files}")
@@ -124,7 +144,10 @@ def crawl_page_urls_task(page_id: int, config: dict):
         except Exception as e:
             logger.error(f"URL crawl task failed for page ID {page_id}: {str(e)}")
             raise
-
+        finally:
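+            # quit the browser only if it was actually launched (setup may fail before page exists)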
+            if locals().get("page"):
+                page.quit()
     loop = asyncio.new_event_loop()
     asyncio.set_event_loop(loop)
     try: