Przeglądaj źródła

抽象初始化crawler

mrh 7 miesięcy temu
rodzic
commit
370b5fdbe6
1 zmienionych plików z 69 dodań i 31 usunięć
  1. 69 31
      src/browser/crawl_base.py

+ 69 - 31
src/browser/crawl_base.py

@@ -14,6 +14,7 @@ from typing import List
 import httpx
 import httpx
 import ssl
 import ssl
 from sqlmodel import select, Session
 from sqlmodel import select, Session
+from DrissionPage import Chromium, ChromiumOptions, ChromiumPage
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, CrawlResult
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, CrawlResult
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
 from crawl4ai.extraction_strategy import JsonCssExtractionStrategy,JsonXPathExtractionStrategy,ExtractionStrategy
 from crawl4ai.extraction_strategy import JsonCssExtractionStrategy,JsonXPathExtractionStrategy,ExtractionStrategy
@@ -25,17 +26,17 @@ from config.settings import OUTPUT_DIR,TEMP_PAGE_DIR
 from utils.drission_page import load_chrome_from_ini,ChromeOptions
 from utils.drission_page import load_chrome_from_ini,ChromeOptions
 from utils.file import save_to_file,check_exists,s3,read_file,upload_to_s3,upload_file_to_s3
 from utils.file import save_to_file,check_exists,s3,read_file,upload_to_s3,upload_file_to_s3
 from config.settings import CFG
 from config.settings import CFG
+from abc import ABC, abstractmethod
 logger = get_logger('browser')
 logger = get_logger('browser')
 
 
 
 
-class CrawlerBase():
-    def __init__(self, chrome_options:ChromeOptions):
-        self.chrome_options = chrome_options
+class AbstractCrawlerBase(ABC):
def __init__(self):
    """Initialize shared crawler state.

    Starts with no page attached; concrete subclasses create one via
    ``_initialize_page``.  ``browser_config`` holds the crawl4ai
    BrowserConfig keyword arguments shared by all crawlers.
    """
    # dict(...) call instead of a literal; same keys, same values.
    self.browser_config = dict(headless=False, use_managed_browser=True)
    self.page = None
     
     
     def get_or_new_tab(self, init_url:str='chrome://version'):
     def get_or_new_tab(self, init_url:str='chrome://version'):
         if not self.page:
         if not self.page:
@@ -68,14 +69,9 @@ class CrawlerBase():
         content = read_file(s3_uri)
         content = read_file(s3_uri)
         return save_to_file(content, temp_file)
         return save_to_file(content, temp_file)
 
 
-    def run_browser(self):
-        page = load_chrome_from_ini(
-            self.chrome_options
-        )
-        return page
     async def excra_strategy_raw_html(self, raw_html:str, schema:dict, strategy:ExtractionStrategy=JsonXPathExtractionStrategy):
     async def excra_strategy_raw_html(self, raw_html:str, schema:dict, strategy:ExtractionStrategy=JsonXPathExtractionStrategy):
         browser_config = BrowserConfig(
         browser_config = BrowserConfig(
-            headless=self.chrome_options.headless,
+            headless=False,
             use_managed_browser=True,
             use_managed_browser=True,
             cdp_url=self.page.browser._driver._websocket_url
             cdp_url=self.page.browser._driver._websocket_url
         )
         )
@@ -90,6 +86,7 @@ class CrawlerBase():
                 )
                 )
             )
             )
             return result
             return result
+    
     def download_img(self,url:str,save_dir:str=TEMP_PAGE_DIR, page:str=None,as_img_base64:bool=True, img_path:str=''):
     def download_img(self,url:str,save_dir:str=TEMP_PAGE_DIR, page:str=None,as_img_base64:bool=True, img_path:str=''):
         # ('success', '{abs_current_path}\\notice.svg')
         # ('success', '{abs_current_path}\\notice.svg')
         p = page or self.page
         p = page or self.page
@@ -109,6 +106,39 @@ class CrawlerBase():
                 Path(path).unlink()
                 Path(path).unlink()
                 return status,img_path
                 return status,img_path
         return status,path
         return status,path
+    
def get(self, url: str):
    """Navigate the managed page to *url*, creating the page lazily.

    After navigation, the browser's CDP websocket endpoint is recorded in
    ``browser_config`` so crawl4ai can attach to this same browser.
    """
    if not self.page:
        self._initialize_page()
    page = self.page
    page.get(url)
    # Keep crawl4ai attachable to this exact browser instance via CDP.
    self.browser_config["cdp_url"] = page.browser._driver._websocket_url
+
async def aget(self, url: str):
    """Async variant of :meth:`get`.

    The blocking DrissionPage navigation runs in a worker thread via
    ``asyncio.to_thread`` so the event loop is not stalled.
    """
    if not self.page:
        self._initialize_page()
    await asyncio.to_thread(self.page.get, url)
    # Record the CDP endpoint for crawl4ai, same as the sync path.
    self.browser_config["cdp_url"] = self.page.browser._driver._websocket_url
+    
@abstractmethod
def _initialize_page(self):
    """Create and assign ``self.page``; every concrete subclass must implement this."""
+
+
+class CrawlerBase(AbstractCrawlerBase):
def __init__(self, chrome_options: ChromeOptions):
    """Concrete crawler driving a Chrome instance loaded from ini-backed options."""
    super().__init__()
    # _initialize_page reads self.chrome_options, so store it before calling.
    self.chrome_options = chrome_options
    self._initialize_page()
+    
def _initialize_page(self):
    """Launch/attach Chrome as described by ``self.chrome_options``."""
    self.page = load_chrome_from_ini(self.chrome_options)
+    
     async def run(self, url:str):
     async def run(self, url:str):
         page = load_chrome_from_ini(
         page = load_chrome_from_ini(
             self.chrome_options
             self.chrome_options
@@ -130,24 +160,32 @@ class CrawlerBase():
         finally:
         finally:
             page.quit()
             page.quit()
         return result
         return result
-    
-    def get(self, url:str):
-        if not self.page:
-            self.page = load_chrome_from_ini(
-                self.chrome_options 
-            )
-        self.page.get(url)
-        self.browser_config.update({
-            "cdp_url": self.page.browser._driver._websocket_url 
-        })
-        # logger.info(f"get {url}, browser_config: {self.browser_config}")
 
 
-    async def aget(self, url:str):
-        if not self.page:
-            self.page = load_chrome_from_ini(
-                self.chrome_options
-            )
-        await asyncio.to_thread(self.page.get, url)
-        self.browser_config.update({
-            "cdp_url": self.page.browser._driver._websocket_url 
-        })
+
class AsinCrawlerBase(AbstractCrawlerBase):
    """Crawler bound to asinseed.com, driven by an externally supplied ChromiumPage."""

    HOME_PAGE = 'https://www.asinseed.com/en/'

    def __init__(self, driver: ChromiumPage):
        """Wrap an already-created DrissionPage ``ChromiumPage`` as the crawler page."""
        super().__init__()
        self.page = driver

    def _initialize_page(self):
        # The page is injected through __init__, so there is nothing to create here.
        # NOTE(review): if self.page is ever falsy, get()/aget() will invoke this
        # no-op and then fail on self.page.get — confirm that is acceptable.
        pass

    def get_browser_download_dir(self):
        """Return the download directory configured on the underlying browser."""
        return self.page.browser._chromium_options.download_path

    def get_home_page(self):
        """Open the asinseed.com home page in the managed tab."""
        self.page.get(self.HOME_PAGE)

    @classmethod
    def create_browser(cls, address='127.0.0.1:16800', user_data_dir='', browser_path=''):
        """Alternate constructor: build a ChromiumPage at *address* and wrap it.

        The debug port must not be below 10000, otherwise environment issues
        may cause errors (translated from the original note).
        """
        opts = ChromiumOptions(read_file=False)
        opts.set_address(address)
        if user_data_dir:
            opts.set_user_data_path(user_data_dir)
        if browser_path:
            opts.set_browser_path(browser_path)
        return cls(ChromiumPage(addr_or_opts=opts))