@@ -2,9 +2,10 @@ import asyncio
 import pickle
 from pathlib import Path
 import random
-from typing import List
+from typing import List, Optional
 import httpx
 import ssl
+from pydantic import BaseModel, Field
 from sqlmodel import select, Session
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, CrawlResult
 from worker.search_engine.search_result_db import SearchResultManager, KeywordTask, SearchPageResult, SearchResultItem
@@ -12,6 +13,12 @@ from mylib.base import ensure_output_dir, save_to_file,load_from_pickle
 from mylib.logu import logger
 from utils.proxy_pool import get_random_proxy
 
+class CrawlerResult(BaseModel):
+    err: Optional[int] = 1
+    search_result_model: Optional[SearchResultItem] = None
+    crawl_result: Optional[CrawlResult] = None
+    message: Optional[str] = None
+
 class URLCrawler:
     def __init__(self, max_concurrent: int = 3):
         self.max_concurrent = max_concurrent
@@ -49,15 +56,16 @@ class URLCrawler:
-    async def crawl_url(self, url: str, item_id: int, output_dir: Path, browser_config: BrowserConfig = None, overwrite: bool = False) -> CrawlResult:
+    async def crawl_url(self, url: str, item_id: int, output_dir: Path, browser_config: BrowserConfig = None, overwrite: bool = False) -> CrawlerResult:
         """Crawl a single URL and save results with item_id as filename"""
         # Check if we should skip this URL
+        item = None
         with Session(self.db_manager.engine) as session:
             item = session.exec(
                 select(SearchResultItem)
                 .where(SearchResultItem.id == item_id)
             ).first()
 
-            if item and item.html_path and not overwrite:
-                logger.info(f"Skipping {url} (item_id: {item_id}) - already has html_path: {item.html_path}")
-                return {"search_result_model": item, "crawl_result": None, 'message': 'already has html_path'}
+            if item and item.save_path and not overwrite:
+                logger.info(f"Skipping {url} (item_id: {item_id}) - already has save_path: {item.save_path}")
+                return CrawlerResult(err=0, message='already has save_path', search_result_model=item, crawl_result=None)
 
         if not browser_config:
             browser_config = BrowserConfig(
@@ -80,26 +88,30 @@ class URLCrawler:
                 content_type = response.headers.get('content-type', '').lower()
                 if 'pdf' in content_type:
                     pdf_path = output_dir / f"{item_id}.pdf"
-                    logger.info(f"crwal id {item_id} content_type {content_type} {pdf_path}")
-                    if pdf_path:
-                        if await self.download_pdf(url, pdf_path):
-                            # Update database with PDF path
-                            with Session(self.db_manager.engine) as session:
-                                item = session.exec(
-                                    select(SearchResultItem)
-                                    .where(SearchResultItem.id == item_id)
-                                ).first()
-                                if item:
-                                    item.html_path = str(pdf_path)
-                                    session.add(item)
-                                    session.commit()
-                            return {"search_result_model": item, "crawl_result": None, 'message': response.headers.get('content-type')}
         except Exception as e:
             logger.warning(f"Failed to check headers for id: {item_id} , {url} {str(e)}")
-            # return {"search_result_model": None, "crawl_result": None, 'message': str(e)}
-            # if 'html' not in content_type:
-            # logger.info(f"Skipping {url} (item_id: {item_id}) - not html, conent_type {content_type}")
-            # return {"search_result_model": None, "crawl_result": None,'message': f'not html, content_type {content_type}'}
+        logger.info(f"crawl id {item_id} content_type {content_type} {pdf_path}")
+        search_result_model = item or SearchResultItem(id=item_id)
+        search_result_model.content_type = content_type
+        if pdf_path:
+            try:
+                if await self.download_pdf(url, pdf_path):
+                    search_result_model.save_path = str(pdf_path)
+                else:
+                    logger.warning(f"Failed to download PDF for id: {item_id}, {url}")
+                    return CrawlerResult(err=1, message='failed to download pdf', search_result_model=None, crawl_result=None)
+                self.db_manager.add_or_update_search_result_item(search_result_model)
+                logger.info(f"{item_id} download_pdf success {pdf_path}")
+                # A PDF must be returned here: when crawl4ai hits a file content type it downloads the file
+                # to its own default path, so we would not get the auto-downloaded PDF unless we also monitored whether that download succeeded.
+                return CrawlerResult(err=0, message='success', search_result_model=search_result_model, crawl_result=None)
+            except Exception as e:
+                logger.warning(f"Failed to download PDF for id: {item_id}, {url} {str(e)}")
+                return CrawlerResult(err=1, message=str(e), search_result_model=None, crawl_result=None)
+
+        if 'html' not in content_type:
+            logger.info(f"Skipping {url} (item_id: {item_id}) - not html, content_type {content_type}")
+            return CrawlerResult(err=2, message='not html', search_result_model=search_result_model, crawl_result=None)
         logger.info(f"crawler.arun start {item_id} content-type: {content_type}, {url} ")
         logger.info(f"browser_config use_managed_browser {browser_config.use_managed_browser} , cdp_url: {browser_config.cdp_url}, headless: {browser_config.headless}")
         # If not PDF or header check failed, try regular crawl
@@ -110,22 +122,7 @@ class URLCrawler:
             crawl_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
             result:CrawlResult = await crawler.arun(url=url, config=crawl_config)
             logger.info(f"{item_id} crawler.arun result.success: {result.success} {result.status_code}")
-            # If crawl failed but URL contains 'download', try PDF again
-            if not result.success and 'download' in url.lower():
-                pdf_path = output_dir / f"{item_id}.pdf"
-                if await self.download_pdf(url, pdf_path):
-                    # Update database with PDF path
-                    with Session(self.db_manager.engine) as session:
-                        item = session.exec(
-                            select(SearchResultItem)
-                            .where(SearchResultItem.id == item_id)
-                        ).first()
-                        if item:
-                            item.html_path = str(pdf_path)
-                            session.add(item)
-                            session.commit()
-                            session.refresh(item)
-                    return {"search_result_model": item, "crawl_result": result}
+
 
             # Save results
             ensure_output_dir(output_dir)
@@ -136,32 +133,24 @@ class URLCrawler:
                 pickle.dump(result, f)
 
             # Save HTML and Markdown if available
-            html_path = None
+            save_path = None
             if result.html:
-                html_path = output_dir / f"{item_id}.html"
-                save_to_file(result.html, html_path)
+                save_path = output_dir / f"{item_id}.html"
+                save_to_file(result.html, save_path)
+                search_result_model.save_path = str(save_path)
 
             if result.markdown:
                 md_path = output_dir / f"{item_id}.md"
                 save_to_file(result.markdown, md_path)
-
+                search_result_model.markdown_path = str(md_path)
             # Update database with HTML path
-            if html_path:
-                with Session(self.db_manager.engine) as session:
-                    item = session.exec(
-                        select(SearchResultItem)
-                        .where(SearchResultItem.id == item_id)
-                    ).first()
-                    if item:
-                        item.html_path = str(html_path)
-                        session.add(item)
-                        session.commit()
-                        session.refresh(item)
+            if save_path:
+                self.db_manager.add_or_update_search_result_item(search_result_model)
             logger.info(f"{item_id} crawler.arun result.success: {item}")
-            return {"search_result_model": item, "crawl_result": result}
+            return CrawlerResult(err=0, message='success', search_result_model=search_result_model, crawl_result=result)
         except Exception as e:
             logger.error(f"Failed to crawl id: {item_id} , {url} {str(e)}")
-            return {"search_result_model": item, "crawl_result": None, 'message': str(e)}
+            return CrawlerResult(err=1, message=str(e), search_result_model=search_result_model, crawl_result=None)
         finally:
             await crawler.close()
 
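Note: crawl_url now returns a CrawlerResult model instead of a plain dict, so callers switch from key lookups to attribute access. Below is a minimal sketch of the calling side under that assumption; the crawl_and_report helper is hypothetical, and the err codes follow the values used in the diff above (0 success or skip, 1 failure, 2 non-HTML).

# Sketch only: assumes it lives alongside URLCrawler, so Path, logger, and URLCrawler are already in scope.
async def crawl_and_report(crawler: URLCrawler, url: str, item_id: int, output_dir: Path) -> None:
    res = await crawler.crawl_url(url, item_id, output_dir)
    if res.err == 0:
        # Success or skip: save_path (when set) points at the stored HTML or PDF.
        logger.info(f"{item_id} done, save_path={res.search_result_model.save_path}")
    elif res.err == 2:
        # Non-HTML, non-PDF content: nothing was crawled or saved.
        logger.info(f"{item_id} skipped: {res.message}")
    else:
        # err == 1 covers PDF-download and crawl failures; message carries the reason.
        logger.warning(f"{item_id} failed: {res.message}")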