|
|
@@ -14,6 +14,7 @@ from typing import List
|
|
|
import httpx
|
|
|
import ssl
|
|
|
from sqlmodel import select, Session
|
|
|
+from DrissionPage import Chromium, ChromiumOptions, ChromiumPage
|
|
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, CrawlResult
|
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
|
|
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy,JsonXPathExtractionStrategy,ExtractionStrategy
|
|
|
@@ -25,17 +26,17 @@ from config.settings import OUTPUT_DIR,TEMP_PAGE_DIR
|
|
|
from utils.drission_page import load_chrome_from_ini,ChromeOptions
|
|
|
from utils.file import save_to_file,check_exists,s3,read_file,upload_to_s3,upload_file_to_s3
|
|
|
from config.settings import CFG
|
|
|
+from abc import ABC, abstractmethod
|
|
|
logger = get_logger('browser')
|
|
|
|
|
|
|
|
|
-class CrawlerBase():
|
|
|
- def __init__(self, chrome_options:ChromeOptions):
|
|
|
- self.chrome_options = chrome_options
|
|
|
class AbstractCrawlerBase(ABC):
    """Shared base for crawlers that drive a managed browser page.

    Holds the crawl4ai browser settings plus a lazily created page
    object; concrete subclasses supply the page via _initialize_page().
    """

    def __init__(self):
        # Settings later handed to crawl4ai.  We attach to an already
        # running browser over CDP, hence the managed-browser flag.
        self.browser_config = dict(headless=False, use_managed_browser=True)
        # Concrete page object; populated lazily by _initialize_page().
        self.page = None
|
|
|
|
|
|
def get_or_new_tab(self, init_url:str='chrome://version'):
|
|
|
if not self.page:
|
|
|
@@ -68,14 +69,9 @@ class CrawlerBase():
|
|
|
content = read_file(s3_uri)
|
|
|
return save_to_file(content, temp_file)
|
|
|
|
|
|
- def run_browser(self):
|
|
|
- page = load_chrome_from_ini(
|
|
|
- self.chrome_options
|
|
|
- )
|
|
|
- return page
|
|
|
async def excra_strategy_raw_html(self, raw_html:str, schema:dict, strategy:ExtractionStrategy=JsonXPathExtractionStrategy):
|
|
|
browser_config = BrowserConfig(
|
|
|
- headless=self.chrome_options.headless,
|
|
|
+ headless=False,
|
|
|
use_managed_browser=True,
|
|
|
cdp_url=self.page.browser._driver._websocket_url
|
|
|
)
|
|
|
@@ -90,6 +86,7 @@ class CrawlerBase():
|
|
|
)
|
|
|
)
|
|
|
return result
|
|
|
+
|
|
|
def download_img(self,url:str,save_dir:str=TEMP_PAGE_DIR, page:str=None,as_img_base64:bool=True, img_path:str=''):
|
|
|
# ('success', '{abs_current_path}\\notice.svg')
|
|
|
p = page or self.page
|
|
|
@@ -109,6 +106,39 @@ class CrawlerBase():
|
|
|
Path(path).unlink()
|
|
|
return status,img_path
|
|
|
return status,path
|
|
|
+
|
|
|
def get(self, url: str):
    """Open *url* in the managed page, creating the page on demand.

    After navigation, record the browser's CDP websocket endpoint in
    ``browser_config`` so crawl4ai can attach to the same browser.
    """
    if not self.page:
        self._initialize_page()
    self.page.get(url)
    cdp_endpoint = self.page.browser._driver._websocket_url
    self.browser_config["cdp_url"] = cdp_endpoint
|
|
|
+
|
|
|
async def aget(self, url: str):
    """Async variant of ``get``.

    Runs the blocking ``page.get`` call in a worker thread so the event
    loop stays responsive, then records the CDP websocket endpoint in
    ``browser_config``.
    """
    if not self.page:
        self._initialize_page()
    await asyncio.to_thread(self.page.get, url)
    self.browser_config["cdp_url"] = self.page.browser._driver._websocket_url
|
|
|
@abstractmethod
def _initialize_page(self):
    """Create and assign ``self.page``; every subclass must implement this."""
|
|
|
+
|
|
|
+
|
|
|
class CrawlerBase(AbstractCrawlerBase):
    """Crawler whose Chrome session is launched from ini-backed options."""

    def __init__(self, chrome_options: ChromeOptions):
        super().__init__()
        self.chrome_options = chrome_options
        # Keep crawl4ai's headless flag in sync with the Chrome options.
        # The base class unconditionally defaults it to False, which would
        # silently drop the caller's headless setting (the pre-refactor
        # code propagated chrome_options.headless here).
        self.browser_config["headless"] = chrome_options.headless
        self._initialize_page()

    def _initialize_page(self):
        # Start (or attach to) Chrome as described by the ini-backed options.
        self.page = load_chrome_from_ini(self.chrome_options)
|
|
|
+
|
|
|
async def run(self, url:str):
|
|
|
page = load_chrome_from_ini(
|
|
|
self.chrome_options
|
|
|
@@ -130,24 +160,32 @@ class CrawlerBase():
|
|
|
finally:
|
|
|
page.quit()
|
|
|
return result
|
|
|
-
|
|
|
- def get(self, url:str):
|
|
|
- if not self.page:
|
|
|
- self.page = load_chrome_from_ini(
|
|
|
- self.chrome_options
|
|
|
- )
|
|
|
- self.page.get(url)
|
|
|
- self.browser_config.update({
|
|
|
- "cdp_url": self.page.browser._driver._websocket_url
|
|
|
- })
|
|
|
- # logger.info(f"get {url}, browser_config: {self.browser_config}")
|
|
|
|
|
|
- async def aget(self, url:str):
|
|
|
- if not self.page:
|
|
|
- self.page = load_chrome_from_ini(
|
|
|
- self.chrome_options
|
|
|
- )
|
|
|
- await asyncio.to_thread(self.page.get, url)
|
|
|
- self.browser_config.update({
|
|
|
- "cdp_url": self.page.browser._driver._websocket_url
|
|
|
- })
|
|
|
+
|
|
|
class AsinCrawlerBase(AbstractCrawlerBase):
    """Crawler bound to an externally created DrissionPage ChromiumPage,
    specialised for asinseed.com."""

    HOME_PAGE = 'https://www.asinseed.com/en/'

    def __init__(self, driver: ChromiumPage):
        super().__init__()
        self.page = driver

    def _initialize_page(self):
        # The page is injected through the constructor, so the lazy
        # initialisation hook from the base class is a no-op here.
        pass

    def get_browser_download_dir(self):
        """Return the download directory configured on the underlying browser."""
        return self.page.browser._chromium_options.download_path

    def get_home_page(self):
        """Navigate the managed page to the asinseed home page."""
        self.page.get(self.HOME_PAGE)

    @classmethod
    def create_browser(cls, address='127.0.0.1:16800', user_data_dir='', browser_path=''):
        """Alternate constructor: connect to (or launch) a Chromium instance.

        NOTE: the debug port in *address* must not be below 10000,
        otherwise environment issues may cause errors.
        """
        options = ChromiumOptions(read_file=False)
        options.set_address(address)
        if user_data_dir:
            options.set_user_data_path(user_data_dir)
        if browser_path:
            options.set_browser_path(browser_path)
        return cls(ChromiumPage(addr_or_opts=options))
|