Kaynağa Gözat

抽象初始化crawler

mrh 7 ay önce
ebeveyn
işleme
370b5fdbe6
1 değiştirilmiş dosya ile 69 ekleme ve 31 silme
  1 changed file with 69 additions and 31 deletions
      src/browser/crawl_base.py

+ 69 - 31
src/browser/crawl_base.py

@@ -14,6 +14,7 @@ from typing import List
 import httpx
 import ssl
 from sqlmodel import select, Session
+from DrissionPage import Chromium, ChromiumOptions, ChromiumPage
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, CrawlResult
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
 from crawl4ai.extraction_strategy import JsonCssExtractionStrategy,JsonXPathExtractionStrategy,ExtractionStrategy
@@ -25,17 +26,17 @@ from config.settings import OUTPUT_DIR,TEMP_PAGE_DIR
 from utils.drission_page import load_chrome_from_ini,ChromeOptions
 from utils.file import save_to_file,check_exists,s3,read_file,upload_to_s3,upload_file_to_s3
 from config.settings import CFG
+from abc import ABC, abstractmethod
 logger = get_logger('browser')
 
 
-class CrawlerBase():
-    def __init__(self, chrome_options:ChromeOptions):
-        self.chrome_options = chrome_options
class AbstractCrawlerBase(ABC):
    """Shared base for browser crawlers; subclasses supply page creation."""

    def __init__(self):
        """Set up the default browser configuration; the page itself is created lazily."""
        # Base settings later consumed by crawl4ai's BrowserConfig.
        config = {}
        config["headless"] = False
        config["use_managed_browser"] = True
        self.browser_config = config
        # Concrete driver/page object; populated by _initialize_page() or a subclass.
        self.page = None
     
     def get_or_new_tab(self, init_url:str='chrome://version'):
         if not self.page:
@@ -68,14 +69,9 @@ class CrawlerBase():
         content = read_file(s3_uri)
         return save_to_file(content, temp_file)
 
-    def run_browser(self):
-        page = load_chrome_from_ini(
-            self.chrome_options
-        )
-        return page
     async def excra_strategy_raw_html(self, raw_html:str, schema:dict, strategy:ExtractionStrategy=JsonXPathExtractionStrategy):
         browser_config = BrowserConfig(
-            headless=self.chrome_options.headless,
+            headless=False,
             use_managed_browser=True,
             cdp_url=self.page.browser._driver._websocket_url
         )
@@ -90,6 +86,7 @@ class CrawlerBase():
                 )
             )
             return result
+    
     def download_img(self,url:str,save_dir:str=TEMP_PAGE_DIR, page:str=None,as_img_base64:bool=True, img_path:str=''):
         # ('success', '{abs_current_path}\\notice.svg')
         p = page or self.page
@@ -109,6 +106,39 @@ class CrawlerBase():
                 Path(path).unlink()
                 return status,img_path
         return status,path
+    
+    def get(self, url:str):
+        if not self.page:
+            self._initialize_page()
+        self.page.get(url)
+        self.browser_config.update({
+            "cdp_url": self.page.browser._driver._websocket_url 
+        })
+        # logger.info(f"get {url}, browser_config: {self.browser_config}")
+
+    async def aget(self, url:str):
+        if not self.page:
+            self._initialize_page()
+        await asyncio.to_thread(self.page.get, url)
+        self.browser_config.update({
+            "cdp_url": self.page.browser._driver._websocket_url 
+        })
+    
+    @abstractmethod
+    def _initialize_page(self):
+        """Abstract hook that must create/assign ``self.page``; every concrete subclass must implement it."""
+        pass
+
+
class CrawlerBase(AbstractCrawlerBase):
    """Crawler that owns its browser, launched from ini-driven ChromeOptions."""

    def __init__(self, chrome_options: ChromeOptions):
        """Store the chrome options and eagerly start the browser page."""
        super().__init__()
        # Options must be stored before _initialize_page() reads them.
        self.chrome_options = chrome_options
        # Eager start: unlike the lazy base-class path, the page exists
        # immediately after construction.
        self._initialize_page()
+    
+    def _initialize_page(self):
+        self.page = load_chrome_from_ini(self.chrome_options)
+    
     async def run(self, url:str):
         page = load_chrome_from_ini(
             self.chrome_options
@@ -130,24 +160,32 @@ class CrawlerBase():
         finally:
             page.quit()
         return result
-    
-    def get(self, url:str):
-        if not self.page:
-            self.page = load_chrome_from_ini(
-                self.chrome_options 
-            )
-        self.page.get(url)
-        self.browser_config.update({
-            "cdp_url": self.page.browser._driver._websocket_url 
-        })
-        # logger.info(f"get {url}, browser_config: {self.browser_config}")
 
-    async def aget(self, url:str):
-        if not self.page:
-            self.page = load_chrome_from_ini(
-                self.chrome_options
-            )
-        await asyncio.to_thread(self.page.get, url)
-        self.browser_config.update({
-            "cdp_url": self.page.browser._driver._websocket_url 
-        })
+
class AsinCrawlerBase(AbstractCrawlerBase):
    """Crawler that attaches to an externally created DrissionPage browser."""

    # Landing page of the asinseed service.
    HOME_PAGE = 'https://www.asinseed.com/en/'

    def __init__(self, driver: ChromiumPage):
        """Adopt an already-running ChromiumPage as this crawler's page."""
        super().__init__()
        self.page = driver

    def _initialize_page(self):
        """No-op: the page is injected through the constructor.

        NOTE(review): if ``self.page`` ever becomes falsy, the base class's
        ``get``/``aget`` call this hook and still end up with no page — confirm
        that situation cannot occur for this subclass.
        """
        pass

    def get_browser_download_dir(self):
        """Return the download directory configured on the underlying browser."""
        # NOTE(review): relies on DrissionPage private attribute
        # ``_chromium_options`` — may break across library versions.
        return self.page.browser._chromium_options.download_path

    def get_home_page(self):
        """Open the service home page in the managed tab."""
        self.page.get(self.HOME_PAGE)

    @classmethod
    def create_browser(cls, address='127.0.0.1:16800', user_data_dir='', browser_path=''):
        """Launch/attach a Chromium instance and wrap it in this crawler class.

        Args:
            address: debugger address (``host:port``) to attach to.
            user_data_dir: optional Chrome profile directory.
            browser_path: optional explicit browser executable path.

        Returns:
            A new instance of ``cls`` bound to the created ChromiumPage.
        """
        options = ChromiumOptions(read_file=False)
        # Original note (translated): the port must not be below 10000,
        # otherwise environment issues may cause errors.
        options.set_address(address)
        if user_data_dir:
            options.set_user_data_path(user_data_dir)
        if browser_path:
            options.set_browser_path(browser_path)
        return cls(ChromiumPage(addr_or_opts=options))