浏览代码

有些 result items 存在 Cloudflare 人机验证,尝试跳过这些页面转换

mrh 10 月之前
父节点
当前提交
61c7a90974

+ 2 - 1
mylib/drission_page.py

@@ -19,7 +19,7 @@ def genarate_chrome_ini(address="localhost:9321"):
     # chrome_options.incognito(True)
     path = chrome_options.save(CONFIG_DIR / f'{port}.ini')
     return path
-def load_chrome_from_ini(path=CONFIG_DIR / '9321.ini', headless=False, proxy=None, browser_path=None):
+def load_chrome_from_ini(path=CONFIG_DIR / '9321.ini', headless=False, proxy=None, browser_path=None, no_imgs=True):
     chrome_options = ChromiumOptions(ini_path=path)
     if browser_path:
         chrome_options.set_browser_path(browser_path)
@@ -29,6 +29,7 @@ def load_chrome_from_ini(path=CONFIG_DIR / '9321.ini', headless=False, proxy=Non
     elif 'HTTP_PROXY' in os.environ:
         chrome_options.set_proxy(os.environ['HTTP_PROXY'])
     chrome_options.auto_port(True)
+    chrome_options.no_imgs(no_imgs)
     logger.info(f"proxy {proxy}")
     page = ChromiumPage(chrome_options)
     return page

+ 10 - 2
tests/mytest/scrapling_t.py

@@ -54,10 +54,18 @@ def find_search_div():
         # if 'search' in textarea.html_content.lower():
         #     print("找到 search 关键字的 textarea")
         # print("textarea.path:", textarea.path)
+def find_verify_page():
+    path = Path(r'G:\code\upwork\zhang_crawl_bio\output\results\Acantholimon erythraeum essential oil\crawled_urls\4801.html')
+    content = path.read_text(encoding='utf-8')
+    page = Adaptor(content)
+    body = Adaptor(page.body)
+    print("body:", body.get_all_text())
+    print("body.tag:", "真人" in body.get_all_text())
 def main():
     # google_search_demo()
-    res = find_search_div()
-    print("res:", res)
+    # res = find_search_div()
+    # print("res:", res)
+    find_verify_page()
     # file = Path(r'G:\code\upwork\zhang_crawl_bio\output\debug\查询不到搜索框-两个textarea.html')
     # html_content = file.read_text(encoding='utf-8')
     # page = Adaptor(html_content)

+ 192 - 0
utils/chrome_driver/cv_common.py

@@ -0,0 +1,192 @@
+import asyncio
+import time
+import cv2
+from cv import detect
+from DrissionPage import ChromiumPage
+from DrissionPage.common import Actions,Keys
+
+from sqlmodel import SQLModel, Field, Column, JSON
+from sqlalchemy.types import LargeBinary
+from typing import List, Optional,Coroutine
+
+class ImageMatchResult(SQLModel, table=True):
+    id: Optional[int] = Field(default=None, primary_key=True)
+    screen_img_data: bytes = Field(sa_column=Column(LargeBinary))
+    template_img_path: str
+    template_img_h: int
+    template_img_w: int
+    threshold: float
+    match_min_val: float
+    match_max_val: float
+    min_location: list = Field(sa_column=Column(JSON))
+    max_location: list = Field(sa_column=Column(JSON))
+    max_location_center: List[int] = Field(sa_column=Column(JSON))
+
+class CVPage:
+    def __init__(self, tab:ChromiumPage=None,) -> None:
+        self.tab =  tab
+
+    @classmethod
+    def match_img_in_screen(cls, screen, target_path) -> ImageMatchResult:
+        screen_img = detect.read_image(screen)
+        template_img = detect.read_image(target_path)
+
+        # 获取模板图像的宽度和高度
+        w, h = template_img.shape[1], template_img.shape[0]
+
+        # 模板匹配
+        result = cv2.matchTemplate(screen_img, template_img, cv2.TM_CCOEFF_NORMED)
+        min_val, max_val, min_location, max_location = cv2.minMaxLoc(result)
+        match_result = ImageMatchResult(
+            screen_img_path=screen_img,
+            template_img_path=target_path,
+            template_img_h=h,
+            template_img_w=w,
+            match_min_val=min_val,
+            match_max_val=max_val,
+            min_location=min_location,
+            max_location=max_location,
+            max_location_center= [max_location[0] + w // 2, max_location[1] + h // 2]
+        )
+        return match_result
+    
+    @classmethod
+    def match_img_in_screen_region(cls, screen, target_path, region=None) -> ImageMatchResult:
+        screen_img: cv2.typing.MatLike = detect.read_image(screen)
+        template_img: cv2.typing.MatLike = detect.read_image(target_path)
+
+        # 获取屏幕图像的尺寸
+        screen_h, screen_w = screen_img.shape[:2]
+
+        # 如果指定了region,计算实际的匹配区域
+        if region:
+            x1, y1, x2, y2 = region
+            x1, y1 = int(x1 * screen_w), int(y1 * screen_h)
+            x2, y2 = int(x2 * screen_w), int(y2 * screen_h)
+            roi = screen_img[y1:y2, x1:x2]
+        else:
+            roi = screen_img
+
+        # 获取模板图像的宽度和高度
+        w, h = template_img.shape[1], template_img.shape[0]
+        # cv2.imwrite('test1.png', roi)
+        # 模板匹配
+        result = cv2.matchTemplate(roi, template_img, cv2.TM_CCOEFF_NORMED)
+        min_val, max_val, min_location, max_location = cv2.minMaxLoc(result)
+
+        match_result = ImageMatchResult(
+            screen_img_path=screen if type(screen) == str else None,
+            template_img_path=target_path,
+            template_img_h=h,
+            template_img_w=w,
+            match_min_val=min_val,
+            match_max_val=max_val,
+            min_location=min_location,
+            max_location=max_location,
+            max_location_center=[max_location[0] + w // 2, max_location[1] + h // 2]
+        )
+        return match_result
+    
+    def show_match_result(screen, match_result:ImageMatchResult, threshold=0.8, imshow=True):
+        # 读取源图像和目标图像
+        source_img: cv2.typing.MatLike = detect.read_image(screen)
+        
+        w = match_result.template_img_w   
+        h = match_result.template_img_h     
+        # 获取匹配结果
+        max_val = match_result.match_max_val
+        max_loc = match_result.max_location
+        
+        if max_val >= threshold:  # 如果最大相似度大于或等于阈值
+            top_left = max_loc
+            bottom_right = (top_left[0] + w, top_left[1] + h)
+            label = f"Similarity: {max_val:.2f}"
+            cv2.putText(source_img, label, (top_left[0], top_left[1] - 10),
+                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)
+            # 绘制矩形框
+            cv2.rectangle(source_img, top_left, bottom_right, (0, 255, 0), 2)
+        #     print(f"匹配位置: {top_left,bottom_right}, 相似度: {max_val}")
+
+        # else:
+        #     print("没有找到相似度足够高的匹配.")
+            
+        if imshow:
+            # 创建一个窗口来显示结果
+            cv2.namedWindow('Detected', cv2.WINDOW_NORMAL)
+            cv2.imshow('Detected', source_img)
+            
+            # 等待按键
+            cv2.waitKey(0)
+            cv2.destroyAllWindows()
+        else:
+            return source_img
+    
+    def wait_for_image_match(self, target_path, threshold=0.9, timeout=30, interval=0.3, raise_error=False):  
+        """  
+        等待直到图片匹配度超过给定阈值或超时。  
+  
+        :param target_path: 目标图片的路径  
+        :param threshold: 匹配度阈值,默认为0.8  
+        :param timeout: 等待超时时间(秒),默认为30秒  
+        :param interval: 每次检查之间的间隔时间(秒),默认为0.5秒  
+        :return: 如果找到匹配的图片,则返回ImageMatchResult,否则返回None  
+        """  
+        end_time = time.time() + timeout  
+        while time.time() < end_time:  
+            match_result = self.match_img_in_screen(target_path)  
+            if match_result.match_max_val >= threshold:  
+                return match_result  
+            time.sleep(interval)  
+        if raise_error:  
+            raise Exception(f'超时未找到匹配的图片 {target_path} ,阈值为{threshold}')
+        return None  # 超时未找到符合条件的匹配 
+
+    async def wait_for_image_match_async(self, screen, target_path, threshold=0.9, timeout=30, interval=0.3, raise_error=False)->ImageMatchResult:
+            """
+            异步等待直到图片匹配度超过给定阈值或超时。
+            """
+            end_time = time.time() + timeout
+            while time.time() < end_time:
+                if isinstance(screen, Coroutine):
+                    print(screen)
+                    screen = await screen()
+                match_result = self.match_img_in_screen(screen, target_path)
+                if match_result.match_max_val >= threshold:
+                    return match_result
+                await asyncio.sleep(interval)
+            if raise_error:
+                raise Exception(f'超时未找到匹配的图片 {target_path} ,阈值为{threshold}')
+            return None  # 超时未找到符合条件的匹配    
+    def wait_for_image_disappear(self, target_path, threshold=0.9, timeout=30, interval=0.3, raise_error=False):  
+        """  
+        等待直到目标图片从屏幕上消失或超时。  
+  
+        :param target_path: 目标图片的路径  
+        :param threshold: 匹配度阈值,当匹配度低于此值时认为图片已消失,默认为0.3  
+        :param timeout: 等待超时时间(秒),默认为30秒  
+        :param interval: 每次检查之间的间隔时间(秒),默认为0.5秒  
+        :return: 如果图片消失,则返回True,否则返回False  
+        """  
+        end_time = time.time() + timeout  
+        while time.time() < end_time:  
+            match_result = self.match_img_in_screen(target_path)  
+            if match_result.match_max_val < threshold:  
+                return True  # 图片已消失  
+            time.sleep(interval)  
+        if raise_error:  
+            raise Exception(f'超时未找到匹配的图片 {target_path} ,阈值为{threshold}')
+        return None  # 超时未找到符合条件的匹配 
+
+    async def wait_for_image_disappear_async(self, screen, target_path, threshold=0.9, timeout=30, interval=0.3, raise_error=False):
+            """
+            异步等待直到目标图片从屏幕上消失或超时。
+            """
+            end_time = time.time() + timeout
+            while time.time() < end_time:
+                match_result = self.match_img_in_screen(screen, target_path)
+                if match_result.match_max_val < threshold:
+                    return True  # 图片已消失
+                await asyncio.sleep(interval)
+            if raise_error:
+                raise Exception(f'超时未找到匹配的图片 {target_path} ,阈值为{threshold}')
+            return False  # 超时未找到符合条件的匹配

+ 454 - 0
utils/chrome_driver/pyauto_windows.py

@@ -0,0 +1,454 @@
+from datetime import datetime
+from paddleocr import PaddleOCR, draw_ocr
+from rapidfuzz import fuzz
+import json
+import time
+from typing import List, Optional,Coroutine, Dict, Tuple,Any
+import win32con
+import win32clipboard
+import win32gui
+from pathlib import Path
+from sqlmodel import SQLModel, Field, Column, JSON
+from DrissionPage import ChromiumPage
+from ai.driver.cv_common import ImageMatchResult
+from ai.driver import backend_win32com,bro_page_pyautogui,browser_win32,send_input,cv_common
+from ai.conf_ai.config import load_chrome_from_ini,get_logger,RESOURCE,get_browser,BRO_INI_FILE
+from ai.gpt_node.async_wraps import async_wrapper,thread_safe
+logger = get_logger(f'ai/gpt_node-driver-{BRO_INI_FILE}')
+
+class WindowsInfo(SQLModel, table=False):
+    id: Optional[int] = Field(default=None, primary_key=True)
+    win_rect: Optional[list] = Field(default=[],sa_column=Column(JSON))
+    win_size: Optional[list] = Field(default=[],sa_column=Column(JSON))
+    hwnd: Optional[int] = Field(default=None)
+    window_title:Optional[str] = Field(default='')
+    pid: Optional[int] = Field(default=None)
+    tab_id: Optional[str] = Field(default='')
+
+
+class OCRMatch(SQLModel, table=False):
+    top_left: Tuple[float, float] = Field(default=(0.0, 0.0))
+    top_right: Tuple[float, float] = Field(default=(0.0, 0.0))
+    bottom_right: Tuple[float, float] = Field(default=(0.0, 0.0))
+    bottom_left: Tuple[float, float] = Field(default=(0.0, 0.0))
+    find_txt: str = Field(default="")
+    find_txt_similarity: int = Field(default=0)
+    ocr_txt: str = Field(default="")
+    ocr_confidence: float = Field(default=0.0)
+    win_info: WindowsInfo = Field(default=None)
+    
+    def center(self):
+        return (self.top_left[0] + self.bottom_right[0]) / 2, (self.top_left[1] + self.bottom_right[1]) / 2
+
+    def is_match(self, similarity:int=90):
+        return self.find_txt_similarity > similarity
+
+class GptImgMatch(SQLModel, table=False):
+    img_path:Optional[str] = Field(default=None)
+    init_pos:Optional[list] = Field(default=[0,0],sa_column=Column(JSON))
+    match_res:Optional[ImageMatchResult] = Field(default=None)
+    
+    def img_match(self, img, thread:float=0.92) ->ImageMatchResult|None:
+        self.match_res:ImageMatchResult = cv_common.CVPage.match_img_in_screen(img, self.img_path)
+        self.init_pos = self.match_res.max_location
+        if self.is_match(thread):
+            return self.match_res
+    
+    def is_match(self, thread:float=0.92):
+        if not self.match_res:
+            return False
+        return self.match_res.match_max_val > thread
+
+    def abs_pos(self, x,y, win_rect:list=None):
+        win_x,win_y,_ , _ = win_rect
+        return win_x + x, win_y + y
+    
+    def click(self, pos:list=[], hwnd=None):
+        if not pos:
+            if self.match_res:
+                pos = self.match_res.max_location_center
+            else:
+                raise Exception("未识别到目标,请先识别坐标以点击")
+        backend_win32com.VirtualKeyboard(hwnd).mouse_move_press(*pos)
+        logger.info(f"{Path(self.img_path).name} {pos}")
+
+class BaseOCRMatch(SQLModel, table=False):
+    copy: Optional[OCRMatch] = OCRMatch(find_txt='copy')
+    paste: Optional[OCRMatch] = OCRMatch(find_txt='paste')
+    select_all: Optional[OCRMatch] = OCRMatch(find_txt='select all')
+
+
+# 浏览器窗口标签图标
+class CvWinBroModel(SQLModel, table=False):
+    id: Optional[int] = Field(default=None, primary_key=True)
+    chatgpt: Optional[GptImgMatch] =  GptImgMatch(img_path=str(RESOURCE / 'openai' / 'tab-icon.png'))
+    claude: Optional[GptImgMatch] =  GptImgMatch(img_path=str(RESOURCE / 'claude' / 'tab-icon.png'))
+    refresh_btn: Optional[GptImgMatch] =  GptImgMatch(img_path=str(RESOURCE / 'windows' / 'refrsh-btn.png'))
+    refresh_btn2: Optional[GptImgMatch] =  GptImgMatch(img_path=str(RESOURCE / 'windows' / 'refrsh-btn2.png'))
+    paste: Optional[GptImgMatch] =  GptImgMatch(img_path=str(RESOURCE / 'windows' / 'paste.png'))
+    
+    
+class DriverBase:
+    name = 'About Version'
+    url = ''
+    # 浏览器对象,根据主进程启动 Chrome 得到浏览器主页面。仅有一个主进程,其他子窗口或标签都是子进程 tab 页面。
+    page:ChromiumPage = None
+    # 根据 PID 获得所有的窗口句柄
+    all_windows_info:Dict[str,WindowsInfo] = {}
+    paddle_ocr = PaddleOCR(use_angle_cls=True, lang="en",)
+    _lock = False
+    
+    def __init__(self, ini_file=BRO_INI_FILE) -> None:
+        self.tab = None
+        self.bromodel = CvWinBroModel()
+        self.windows_info:WindowsInfo = None
+        self.vb:backend_win32com.VirtualKeyboard = None
+        self.ocr_model:BaseOCRMatch = None
+        self.ini_file = ini_file
+
+    @thread_safe
+    def init(self):
+        if not DriverBase.page:
+            DriverBase.page: ChromiumPage = get_browser(self.ini_file)
+            # logger.info(f"tab.rect {tab.title} tab.rect {tab.tab_id} tab.window_location {tab.rect.window_location} tab.size {tab.rect.size}")
+            # DriverBase.page.get('chrome://version/')
+            DriverBase.page.wait.doc_loaded(timeout=10)
+            self.init_all_windows_info()
+            return DriverBase.page
+            # 获取所有窗口句柄和PID
+            results = backend_win32com.enum_chrome_windows_by_pid(DriverBase.page.process_id)
+            while results:
+                hwnd, pid, title = results.pop()
+                logger.info(f"{DriverBase.name} init {hwnd, pid, title} {DriverBase.name in title}")
+                if main_windows is None and DriverBase.name in title:
+                    main_windows = (hwnd, pid, title)
+                else:
+                    logger.info(f"close other {hwnd, pid, title}")
+                    win32gui.PostMessage(hwnd, win32con.WM_CLOSE, 0, 0)
+            hwnd, pid, title = main_windows
+            rect, client_rect, window_title = backend_win32com.get_window_info(hwnd)
+            _, _, win_w, win_h = client_rect
+            window_info = WindowsInfo(pid=pid, hwnd=hwnd, win_rect=list(rect), win_size=[win_w, win_h], window_title=window_title, tab_id=DriverBase.page.tab_id)
+            DriverBase.all_windows_info.update({DriverBase.page.tab_id: window_info})
+            logger.info(f"init page browser_version {DriverBase.page.browser_version} get_tabs {DriverBase.page.get_tabs()}")
+    
+    def init_all_windows_info(self, process_id=None, cache=False):
+        if cache:
+            return self.all_windows_info
+        results = backend_win32com.enum_chrome_windows_by_pid(DriverBase.page.process_id)
+        tabs = DriverBase.page.get_tabs()
+        for hwnd, pid, title in results:
+            rect, client_rect, window_title = backend_win32com.get_window_info(hwnd)
+            _, _, win_w, win_h = client_rect
+            tab_id = None
+            window_info = WindowsInfo(pid=pid, hwnd=hwnd, win_rect=list(rect), win_size=[win_w, win_h], window_title=window_title, tab_id=tab_id)
+            DriverBase.all_windows_info.update({hwnd: window_info})
+        return DriverBase.all_windows_info
+    
+    def update_all_windows_info(self, hwnd, window_info):
+        DriverBase.all_windows_info.update({hwnd: window_info})
+        return DriverBase.all_windows_info
+    
+    
+    def new_windows(self, url:str=''):
+        while DriverBase._lock:
+            time.sleep(0.1)
+        DriverBase._lock = True
+        try:
+            self.tab = DriverBase.page.new_tab(url=url,new_window=True)
+            # 获取所有窗口句柄和PID
+            results = backend_win32com.enum_chrome_windows_by_pid(DriverBase.page.process_id)
+            hwnd, pid, title = results.pop(0)
+            self.windows_info = self.get_windows_info(hwnd, tab=self.tab)
+            self.update_all_windows_info(hwnd, self.windows_info)
+            self.vb = backend_win32com.VirtualKeyboard(hwnd)
+            logger.info(f"{self.name} 打开新窗口 {self.windows_info.model_dump()}")
+            logger.debug(f"self.all_windows_info {self.all_windows_info}")
+            return self.windows_info
+        except Exception as e:
+            logger.exception(f"new_windows {e}")
+        finally:
+            DriverBase._lock = False
+    
+    @thread_safe
+    def find_or_new_windows(self, title:str, url:str='') -> list[WindowsInfo] | WindowsInfo:
+        res = []
+        win = None
+        for windows_info in DriverBase.all_windows_info.values():
+            if title.lower() in windows_info.window_title.lower():
+                if not windows_info.tab_id:
+                    win = windows_info
+        if win:
+            self.tab = self.find_shifted_tab(win)
+            win.tab_id = self.tab.tab_id
+            backend_win32com.show_win(win.hwnd)
+            self.update_all_windows_info(win.hwnd, win)
+            logger.info(f"{self.name} 已存在窗口 {win.model_dump()}")
+        else:
+            win = self.new_windows(url=url)
+        return win
+    
+    @thread_safe
+    def find_shifted_tab(self, windows_info: WindowsInfo):
+        # Get initial tab positions
+        initial_tabs = DriverBase.page.get_tabs()
+        initial_positions = {tab.title: tab.rect.window_location for tab in initial_tabs}
+        
+        # Log initial positions
+        # for tab in initial_tabs:
+        #     logger.info(f"Initial: tab.title {tab.title} tab.rect.window_location {tab.rect.window_location}")
+        
+        # Move window to the right
+        x, y, _, _ = windows_info.win_rect
+        win32gui.SetWindowPos(windows_info.hwnd, win32con.HWND_TOP, x + 100, y, 0, 0, win32con.SWP_NOSIZE)
+        
+        # Get new tab positions
+        shifted_tabs = DriverBase.page.get_tabs()
+        
+        # Find the tab that has shifted
+        shifted_tab = None
+        for tab in shifted_tabs:
+            new_position = tab.rect.window_location
+            initial_position = initial_positions.get(tab.title)
+            
+            if initial_position and new_position != initial_position:
+                shifted_tab = tab
+                break
+        
+        # Move window back to original position
+        win32gui.SetWindowPos(windows_info.hwnd, win32con.HWND_TOP, x, y, 0, 0, win32con.SWP_NOSIZE)
+        
+        return shifted_tab        
+    
+    def close_windows(self):
+        if not self.windows_info:
+            return
+        win32gui.PostMessage(self.windows_info.hwnd, win32con.WM_CLOSE, 0, 0)
+        self.all_windows_info.pop(self.windows_info.hwnd)
+        self.tab = None
+        self.windows_info = None        
+    
+    def close_no_tab_id():
+        keys_to_remove = []
+        
+        for key in list(DriverBase.all_windows_info.keys()):
+            windows_info = DriverBase.all_windows_info[key]
+            if not windows_info.tab_id:
+                logger.info(f"close windows {windows_info.model_dump()}")
+                win32gui.PostMessage(windows_info.hwnd, win32con.WM_CLOSE, 0, 0)
+                time.sleep(0.1)
+                keys_to_remove.append(key)
+        
+        for key in keys_to_remove:
+            DriverBase.all_windows_info.pop(key)    
+            
+    def get_windows_info(self, hwnd:int, tab:ChromiumPage=None):
+        pid = None if not self.page else self.page.process_id
+        tab_id = None if not tab else tab.tab_id
+        rect, client_rect, window_title = backend_win32com.get_window_info(hwnd)
+        _, _, win_w, win_h = client_rect
+        windows_info = WindowsInfo(pid=pid, hwnd=hwnd, win_rect=list(rect), win_size=[win_w, win_h], window_title=window_title,tab_id=tab_id)
+        return windows_info
+    
+    def find(self, locator: GptImgMatch, screenshot=None, thread:float=0.98, region=None, raise_error=False) -> ImageMatchResult:
+        if screenshot is None or not screenshot.any():
+            screenshot = self.screenshot(region=region)
+        result = locator.img_match(screenshot, thread=thread)
+        if result:
+            log = f"识别到目标 {Path(locator.img_path).name}"
+            logger.debug(f"{log}")
+        else:
+            log = f"未识别到目标 {Path(locator.img_path).name} {locator}"
+            if raise_error:
+                raise Exception(log)
+            logger.debug(f"{log}")
+        return result
+
+    @async_wrapper
+    def click(self, locator: GptImgMatch, screenshot=None, thread:float=0.92, raise_error=False, button: str = "L"):
+        if not locator.match_res:
+            locator.match_res = self.find(locator,screenshot,thread)
+            if not locator.match_res:
+                logger.error(f"Failed to find {locator.img_path}")
+                return False
+        if locator.is_match(thread=thread):
+            logger.info(f"{locator}")
+            self.vb.mouse_move_press(*locator.match_res.max_location_center, button=button)
+            return True
+        if raise_error:
+            raise Exception(f"Failed to find {locator}")
+    
+    def get_click_right_windows(self, pos:list, hwnd=None, wait_pop_time:float=0.3):
+        if not hwnd:
+            hwnd = self.windows_info.hwnd
+            vb = backend_win32com.VirtualKeyboard(hwnd)
+        else:
+            vb = self.vb
+        logger.info(f"hwnd {hwnd} pos {pos}")
+        before_click_windows = backend_win32com.get_child_windows(hwnd)
+        vb.mouse_move_press(*pos, button= 'R')
+        time.sleep(wait_pop_time)
+        after_click_windows = backend_win32com.get_child_windows(hwnd)
+        new_windows = [hwnd for hwnd in after_click_windows if hwnd not in before_click_windows]
+        if not new_windows:
+            logger.error(f"右键找到新窗口 click")
+            return
+        pop_hwnd = new_windows[0]
+        return pop_hwnd
+    
+    @async_wrapper
+    def click_paste(self, txt_area_locator: GptImgMatch, hwnd=None, thread:float=0.92, raise_error=False):
+        if not hwnd:
+            hwnd = self.windows_info.hwnd
+        before_click_windows = backend_win32com.get_child_windows(hwnd)
+        self.click(txt_area_locator, thread=thread, raise_error=True, button="R")
+        time.sleep(0.3)
+        after_click_windows = backend_win32com.get_child_windows(hwnd)
+        new_windows = [hwnd for hwnd in after_click_windows if hwnd not in before_click_windows]
+        if not new_windows:
+            logger.error(f"右键找到新窗口 click {txt_area_locator} ")
+            return
+        pop_hwnd = new_windows[0]
+        screenshot = backend_win32com.back_end_screenshot(pop_hwnd)
+        child_windows = backend_win32com.get_window_info(pop_hwnd)
+        logger.info(f"child_windows {child_windows}")
+        paste = self.bromodel.paste.img_match(screenshot, thread=thread)
+        logger.info(f"paste img_match {paste}")
+        vb = backend_win32com.VirtualKeyboard(pop_hwnd)
+        vb.mouse_move_press(*self.bromodel.paste.match_res.max_location_center)
+        time.sleep(0.3)
+    
+    def select_all(self, txt_area_pos:list, hwnd=None):
+        pop_hwnd = self.get_click_right_windows(txt_area_pos, hwnd=hwnd)
+        vb = backend_win32com.VirtualKeyboard(pop_hwnd)
+        self.ocr_model.select_all = self.ocr_find_txt(self.ocr_model.select_all, pop_hwnd)
+        vb.mouse_move_press(*self.ocr_model.select_all.center(), button= 'R')
+        
+
+    def paste_str(self, text: str, txt_area_locator: GptImgMatch, hwnd=None):
+        logger.info(f"send_str {text}")
+        win32clipboard.OpenClipboard()
+        win32clipboard.EmptyClipboard()
+        win32clipboard.SetClipboardText(text)
+        win32clipboard.CloseClipboard()
+        try:
+            self.click_paste(txt_area_locator)
+        except Exception as e:
+            logger.error(f"send_str {text} error {e}")
+        # Clear clipboard after pasting
+        win32clipboard.OpenClipboard()
+        win32clipboard.EmptyClipboard()
+        win32clipboard.CloseClipboard()
+    
+    @async_wrapper
+    def wait_for(self, locator: GptImgMatch, timeout: float = 10, interval: float = 0.5, thread: float = 0.98) -> ImageMatchResult:
+        start_time = time.time()
+        while time.time() - start_time < timeout:
+            result = self.find(locator, thread=thread, raise_error=False)
+            if result:
+                return result
+            time.sleep(interval)
+        
+        # If the timeout is reached without finding the image
+        log = f"Timeout reached. Unable to find {Path(locator.img_path).name} within {timeout} seconds"
+        logger.warning(log)
+        return None
+    
+    def find_or_new_tab(self, url:str=''):
+        if not url:
+            url = self.url
+        for tab_id in self.page.get_tabs():
+            tab:ChromiumPage = self.page.get_tab(tab_id)
+            if self.name in tab.url:
+                return tab
+        return self.page.new_tab(url=self.url,new_window=True)
+    
+    
+    
+    @async_wrapper
+    def click_refresh(self):
+        screenshot = self.screenshot()
+        return (self.click(self.bromodel.refresh_btn, screenshot=screenshot) or
+                self.click(self.bromodel.refresh_btn2, screenshot=screenshot))
+        # return self.click(self.bromodel.refresh_btn)
+    
+    def get_tabs_icon(self, hwnd_list:List[int]):
+        for hwnd in hwnd_list:
+            self.find(self.bromodel.chatgpt,screen=backend_win32com.back_end_screenshot(hwnd))
+            if self.bromodel.chatgpt.is_match():
+                return 
+    def screenshot(self, hwnd=None, filename:str=None, region=None):
+        if not hwnd:
+            hwnd = self.windows_info.hwnd
+        if region:
+            return backend_win32com.back_end_screenshot_region(hwnd,filename, region)
+        return backend_win32com.back_end_screenshot(hwnd,filename)
+
+    def get_all_windows_info(self):
+        results = backend_win32com.enum_chrome_windows_by_pid(self.page.process_id)
+        ret = {}
+        hwnd_to_tab_ids: Dict[int, List[str]] = {}
+
+        for hwnd, pid, title in results:
+            rect, client_rect, window_title = backend_win32com.get_window_info(hwnd)
+            _, _, win_w, win_h = client_rect
+            # logger.info(f"window_location {rect}")
+            # logger.info(f"window_size {client_rect}")
+            tab = self.find_tab_title(window_title)
+            logger.info(f"window_title {window_title} hwnd {hwnd} client_rect {client_rect}")
+            logger.info(f"tab title {tab.title}")
+            window_info = WindowsInfo(pid=pid, hwnd=hwnd, win_rect=list(rect), win_size=[win_w, win_h], window_title=window_title,tab_id=tab.tab_id)
+            ret.update({tab.tab_id:window_info})
+        return ret
+
+    def find_tab_title(self, window_title:str):
+        for tab in self.page.get_tabs():
+            logger.debug(f"{tab.title.lower()}")
+            if tab.title.lower() in window_title.lower():
+                return tab
+    
+    def ocr(self, screen)->List[OCRMatch]:
+        '''
+        例如`ch`, `en`, `fr`, `german`, `korean`, `japan`
+        need to run only once to download and load model into memory
+        '''
+        result = self.paddle_ocr.ocr(screen, cls=True)
+        lines_model = []
+        for idx in range(len(result)):
+            res = result[idx]
+            for line in res:
+                coordinates, (text, confidence) = line
+                ocr_match = OCRMatch(
+                    top_left=tuple(coordinates[0]),
+                    top_right=tuple(coordinates[1]),
+                    bottom_right=tuple(coordinates[2]),
+                    bottom_left=tuple(coordinates[3]),
+                    ocr_txt=text,
+                    ocr_confidence=confidence
+                )
+                lines_model.append(ocr_match)
+        return lines_model
+    
+    def ocr_find_txt(self, ocr_model: OCRMatch, hwnd=None, ocr_result=None, threshold: float = 0.90, similarity_threshold: float = 80):
+        if not ocr_result:
+            screen = self.screenshot(hwnd)
+            ocr_result = self.ocr(screen)
+        assert ocr_model is not None
+        logger.debug(f"ocr_model {ocr_model}")
+        for ocr_match in ocr_result:
+            if not ocr_match.ocr_confidence > threshold:
+                continue
+            similarity = fuzz.ratio(ocr_model.find_txt.lower(), ocr_match.ocr_txt.lower())
+            # logger.info(f"ocr_match.ocr_txt {ocr_match.ocr_txt} - ocr_model.find_txt {ocr_model.find_txt} - similarity {similarity}")
+            
+            if similarity > similarity_threshold:
+                ocr_match.find_txt_similarity = similarity
+                ocr_match.find_txt = ocr_model.find_txt
+                return ocr_match
+
+        return None
+    
+    @async_wrapper
+    def quit():
+        if DriverBase.page:
+            DriverBase.page.quit()

+ 14 - 0
utils/proxy_pool.py

@@ -0,0 +1,14 @@
+import random
+
+
+def get_random_proxy() -> str:
+    """Get random proxy from proxy pool"""
+    proxies = [
+    'http://127.0.0.1:9360',
+    'http://127.0.0.1:9362',
+    'http://127.0.0.1:9364',
+    'http://127.0.0.1:9366',
+    'http://127.0.0.1:9368',
+    'http://127.0.0.1:1881',
+    ]
+    return random.choice(proxies)

+ 1 - 1
worker/celery/app.py

@@ -7,7 +7,7 @@ app = Celery(
         'worker.celery.tasks',  # 取消注释原始任务模块
         'worker.celery.crawl_tasks',
         'worker.celery.async_tasks',
-        'worker.celery.pandoc_tasks'  # 注册新的 Pandoc 任务模块
+        'worker.celery.html_convert_tasks'  # 注册新的 Pandoc 任务模块
     ]
 )
 app.config_from_object(celeryconfig)

+ 38 - 104
worker/celery/html_convert_tasks.py

@@ -1,142 +1,76 @@
 from celery import current_app
-from worker.html_convert.pandoc import PandocConverter
+from worker.html_convert.pandoc import process_single_example, process_all_results
 from mylib.logu import get_logger
-from sqlmodel import Session
-from worker.search_engine.search_result_db import SearchResultManager, SearchResultItem
-from worker.html_convert.models import HtmlConvertResult
+from worker.search_engine.search_result_db import SearchResultItem, SearchResultManager
+from sqlmodel import Session, select
 
 logger = get_logger('pandoc_tasks')
 
 @current_app.task(name='html_convert_tasks_worker.convert_single_result')
-def convert_single_result_task(result_id: int, font_name: str = "宋体", include_toc: bool = False, skip_existing: bool = True):
+def convert_single_result_task(result_id: int):
     """
     Celery task to convert a single SearchResultItem using Pandoc.
     
     Args:
         result_id (int): The ID of the SearchResultItem to process.
-        font_name (str): Font name for DOCX output.
-        include_toc (bool): Whether to include a table of contents.
-        skip_existing (bool): Skip conversion if DOCX already exists.
     
     Returns:
         dict: Task result status.
     """
     try:
         logger.info(f"Starting Pandoc conversion for SearchResultItem ID: {result_id}")
-        converter = PandocConverter(font_name=font_name, include_toc=include_toc)
-        success = converter.process_single_result(result_id, skip_existing=skip_existing)
-        
-        if success:
-            logger.info(f"Pandoc conversion completed for SearchResultItem ID: {result_id}")
-            return {"result_id": result_id, "status": "completed"}
-        else:
-            logger.error(f"Pandoc conversion failed for SearchResultItem ID: {result_id}")
-            return {"result_id": result_id, "status": "failed"}
+        process_single_example(result_id)
+        logger.info(f"Pandoc conversion completed for SearchResultItem ID: {result_id}")
+        return {"result_id": result_id, "status": "completed"}
     except Exception as e:
         logger.exception(f"Error during Pandoc conversion for SearchResultItem ID: {result_id}: {str(e)}")
         return {"result_id": result_id, "status": "failed"}
 
 @current_app.task(name='html_convert_tasks.convert_all_results')
-def convert_all_results_task(font_name: str = "宋体", include_toc: bool = False, skip_existing: bool = True):
+def convert_all_results_task():
     """
     Celery task to convert all SearchResultItems using Pandoc.
     
-    Args:
-        font_name (str): Font name for DOCX output.
-        include_toc (bool): Whether to include a table of contents.
-        skip_existing (bool): Skip conversion if DOCX already exists.
-    
     Returns:
         dict: Task result status.
     """
     try:
         logger.info("Starting Pandoc conversion for all SearchResultItems")
-        db_manager = SearchResultManager()
-        with Session(db_manager.engine) as session:
-            # Fetch all SearchResultItem IDs with explicit ordering
-            result_ids = session.exec(
-                session.query(SearchResultItem.id).order_by(SearchResultItem.id)
-            ).all()
-            logger.info(f"Total results to process: {len(result_ids)}")
-        
-        converter = PandocConverter(font_name=font_name, include_toc=include_toc)
-        success_count = 0
-        
-        for result_id in result_ids:
-            success = converter.process_single_result(result_id, skip_existing=skip_existing)
-            if success:
-                success_count += 1
-        
-        logger.info(f"Pandoc conversion completed for {success_count}/{len(result_ids)} SearchResultItems")
-        return {"total_results": len(result_ids), "success_count": success_count, "status": "completed"}
+        process_all_results()
+        logger.info("Pandoc conversion completed for all SearchResultItems")
+        return {"status": "completed"}
     except Exception as e:
         logger.exception(f"Error during bulk Pandoc conversion: {str(e)}")
         return {"status": "failed", "error": str(e)}
 
-def main():
-    """
-    Client example to demonstrate how to use the Pandoc conversion tasks.
-    This can be run directly from the command line for testing purposes.
-    """
-    import sys
-    
-    if len(sys.argv) < 2:
-        print("Usage: python pandoc_tasks.py [single|all] [options]")
-        print("Options for 'single':")
-        print("  --id <result_id>       Specify the SearchResultItem ID to process")
-        print("  --font <font_name>     Specify font name for DOCX output (default: 宋体)")
-        print("  --toc                  Include table of contents in DOCX output")
-        print("  --overwrite            Overwrite existing DOCX files")
-        print("\nOptions for 'all':")
-        print("  --font <font_name>     Specify font name for DOCX output (default: 宋体)")
-        print("  --toc                  Include table of contents in DOCX output")
-        print("  --overwrite            Overwrite existing DOCX files")
-        sys.exit(1)
-    
-    mode = sys.argv[1]
-    
-    def parse_common_args(args):
-        font_name = "宋体"
-        include_toc = True
-        skip_existing = True
-        
-        for i, arg in enumerate(args):
-            if arg == "--font" and i + 1 < len(args):
-                font_name = args[i + 1]
-            elif arg == "--toc":
-                include_toc = True
-            elif arg == "--overwrite":
-                skip_existing = False
def test_task_process_all_results():
    """Submit a convert_single_result_task for every stored ``.html`` result.

    Despite the ``test_`` prefix this is a manual driver, not a pytest test:
    it enumerates all SearchResultItem rows in id order and enqueues one
    Celery conversion task per row whose saved file is an ``.html`` page.
    """
    db_manager = SearchResultManager()
    with Session(db_manager.engine) as session:
        # Fetch (id, html_path) pairs with explicit, stable ordering.
        result_ids = session.exec(
            select(SearchResultItem.id, SearchResultItem.html_path).order_by(SearchResultItem.id)
        ).all()
        logger.info(f"Total results: {len(result_ids)}")
        logger.info(f"First 5 result IDs: {result_ids[:5]}")

        for result_id, html_path in result_ids:
            try:
                # Bug fix: guard against NULL html_path — the original fell
                # into the except branch with an AttributeError on .endswith.
                if html_path and html_path.endswith('.html'):
                    logger.info(f"Submitting task for SearchResultItem ID: {result_id}")
                    convert_single_result_task.delay(result_id)
            except Exception as e:
                logger.error(f"Error processing result {result_id}: {e}")
+
def clear_existing_tasks():
    """Discard every task still waiting in the Celery queues (best effort)."""
    try:
        discarded_count = current_app.control.discard_all()
    except Exception as e:
        logger.error(f"清除任务失败: {str(e)}")
    else:
        logger.info(f"已清除 {discarded_count} 个待处理任务")
+
def main():
    """CLI entry point: enqueue a conversion task for every stored result."""
    test_task_process_all_results()
    # clear_existing_tasks()  # uncomment to purge pending tasks instead
 
 if __name__ == "__main__":
     main()

+ 2 - 0
worker/celery/tasks.py

@@ -12,6 +12,8 @@ def get_random_proxy():
         'http://127.0.0.1:9360',
         'http://127.0.0.1:9362',
         'http://127.0.0.1:9364',
+        'http://127.0.0.1:9366',
+        'http://127.0.0.1:9368',
         'http://127.0.0.1:1881',
     ]
     return random.choice(proxies)

+ 4 - 2
worker/html_convert/pandoc.py

@@ -107,7 +107,7 @@ class PandocConverter:
             if not html_convert:
                 logger.error(f"No HtmlConvertResult found for SearchResultItem {result_id}")
                 return False
-            
+            logger.info(f"pandoc start html_convert id {html_convert.id}  result_id {result_id}")
             # Initialize success flags
             docling_success = False
             filtered_success = False
@@ -177,7 +177,9 @@ class PandocConverter:
 def process_single_example(result_id: int):
     # Process a single result example
     docling_converter = DoclingConverter()
-    docling_converter.process_conversion_by_id(result_id)
+    search_result_item = docling_converter.get_search_result_item(result_id)
+    if search_result_item.html_path.endswith('.html'):
+        docling_converter.process_conversion_by_id(result_id)
     
     crawl_filter = CrawlFilter()
     crawl_filter.process_filter_by_id(result_id)

+ 2 - 1
worker/readme.md

@@ -20,5 +20,6 @@ $env:PC_NAME="w2"; celery -A worker.celery.app worker --hostname=$env:PC_NAME@%h
 $env:PC_NAME="w3"; celery -A worker.celery.app worker --hostname=$env:PC_NAME@%h
 $env:PC_NAME="w4"; celery -A worker.celery.app worker --hostname=$env:PC_NAME@%h
 $env:PC_NAME="w5"; celery -A worker.celery.app worker --hostname=$env:PC_NAME@%h
-
+$env:PC_NAME="w6"; celery -A worker.celery.app worker --hostname=$env:PC_NAME@%h
+$env:PC_NAME="w7"; celery -A worker.celery.app worker --hostname=$env:PC_NAME@%h
 ```

+ 36 - 0
worker/search_engine/search_result_db.py

@@ -1,6 +1,7 @@
 from datetime import datetime
 from typing import Optional, List
 from sqlmodel import SQLModel, Field, Relationship, create_engine, Session, select, delete, func
+from sqlalchemy.orm import relationship
 from sqlalchemy import UniqueConstraint
 from sqlalchemy.sql import text
 from pathlib import Path
@@ -49,6 +50,17 @@ class SearchResultItem(SQLModel, table=True):
     keyword_task: Optional[KeywordTask] = Relationship(back_populates="items")
     search_page: Optional[SearchPageResult] = Relationship(back_populates="items")
 
class VerificationItem(SQLModel, table=True):
    """Per-result marker that a saved result page needs verification handling.

    NOTE(review): rows appear to be added for pages whose body text contains
    a human-verification keyword (see valid_google_search.py) — presumably
    Cloudflare-style "verify you are human" pages; confirm with crawler owner.
    """
    # Each SearchResultItem may be listed at most once.
    __table_args__ = (UniqueConstraint("result_item_id", name="uq_verification_item"),)
    
    id: Optional[int] = Field(default=None, primary_key=True)
    # Foreign key to the flagged SearchResultItem row.
    result_item_id: int = Field(foreign_key="searchresultitem.id")
    # Eagerly-joined ORM link back to the flagged item.
    search_result_item: Optional[SearchResultItem] = Relationship(
        sa_relationship=relationship("SearchResultItem", lazy="joined")
    )
    # False until the page has been re-checked/handled downstream.
    verified: bool = Field(default=False)
    # Naive local timestamp of when the row was inserted.
    created_at: datetime = Field(default_factory=datetime.now)
+
 class SearchResultManager:
     def __init__(self, db_url: str = DB_URL):
         self.engine = create_engine(db_url)
@@ -191,3 +203,27 @@ class SearchResultManager:
     def is_task_completed(self, keyword: str) -> bool:
         task = self.get_keyword_task(keyword)
         return task.is_completed if task else False
+    
+    def get_all_search_result_items(self) -> List[SearchResultItem]:
+        """
+        获取数据库中所有的 SearchResultItem。
+        """
+        with Session(self.engine) as session:
+            return session.exec(select(SearchResultItem)).all()
+    
+    def add_to_verification(self, result_item_id: int):
+        """
+        将 SearchResultItem 添加到 VerificationItem 表中,避免重复添加。
+        """
+        with Session(self.engine) as session:
+            exists = session.exec(
+                select(VerificationItem)
+                .where(VerificationItem.result_item_id == result_item_id)
+            ).first()
+            if not exists:
+                verification_item = VerificationItem(result_item_id=result_item_id)
+                session.add(verification_item)
+                session.commit()
+                session.refresh(verification_item)
+                return verification_item
+            return exists

+ 91 - 0
worker/search_engine/valid_google_search.py

@@ -0,0 +1,91 @@
+import time
+import re
+import logging
+from pathlib import Path
+from typing import Dict, Optional
+from DrissionPage import ChromiumPage
+from pydantic import BaseModel
+from scrapling import Adaptor
+from sqlmodel import Session
+from mylib.logu import logger
+from mylib.base import save_to_file
+from config.settings import GOOGLE_SEARCH_DIR
+from mylib.drission_page import load_chrome_from_ini
+from worker.search_engine.search_result_db import SearchResultManager, SearchResultItem, KeywordTask, VerificationItem
+from worker.search_engine.smart_selector import get_search_ele
+from DrissionPage.common import Keys
+from utils.proxy_pool import get_random_proxy
+from mylib.base import ensure_output_dir, save_to_file
+from scrapling import Adaptor
+
class ValidSearchResult:
    """Scan saved search-result HTML files for human-verification pages.

    Pages whose text contains the given keyword (default "真人", which
    appears in Cloudflare-style "verify you are human" challenge pages)
    are recorded in the VerificationItem table so the conversion pipeline
    can skip or re-crawl them.
    """

    def __init__(self):
        self.db_manager = SearchResultManager()

    def find_first_item_with_keyword(self, keyword: str = "真人") -> Optional[SearchResultItem]:
        """Return the first SearchResultItem whose saved HTML contains *keyword*.

        Iterates every SearchResultItem, reads each existing ``.html`` file,
        and returns the first whose raw file content contains the keyword;
        returns ``None`` when nothing matches. Per-file read errors are
        logged and skipped.

        NOTE(review): this matches against the raw HTML (scripts and markup
        included), while populate_verification_table matches only the
        rendered body text — confirm whether both should use the same test.
        """
        items = self.db_manager.get_all_search_result_items()

        for item in items:
            if item.html_path and Path(item.html_path).exists() and item.html_path.endswith(".html"):
                try:
                    with open(item.html_path, 'r', encoding='utf-8') as file:
                        content = file.read()
                        if keyword in content:
                            logger.info(f"找到包含关键词 '{keyword}' 的结果: {item}")
                            return item
                except Exception as e:
                    logger.error(f"读取文件 {item.html_path} 时出错: {e}")

        logger.info(f"未找到包含关键词 '{keyword}' 的结果")
        return None

    def populate_verification_table(self, keyword: str = "真人"):
        """Record every result whose body text contains *keyword* in VerificationItem.

        Duplicates are avoided by SearchResultManager.add_to_verification;
        per-file parse/read failures are logged and skipped.
        """
        items = self.db_manager.get_all_search_result_items()

        for item in items:
            if item.html_path and Path(item.html_path).exists() and item.html_path.endswith(".html"):
                if item.id % 100 == 0:
                    logger.info(f"处理第 {item.id} 个结果")
                try:
                    with open(item.html_path, 'r', encoding='utf-8') as file:
                        content = file.read()
                        page = Adaptor(content)
                        body = Adaptor(page.body)
                        # Match against visible body text only, so keywords
                        # buried in scripts/markup do not false-positive.
                        if keyword in body.get_all_text():
                            logger.info(f"将包含关键词 '{keyword}' 的结果 {item.id} 添加到 VerificationItem 表")
                            self.db_manager.add_to_verification(item.id)
                except Exception as e:
                    logger.error(f"处理文件 {item.html_path} 时出错: {e}")

    def try_get_url(self, browser_config: Optional[dict] = None):
        """Open the first keyword-matching result URL in a fresh browser.

        Args:
            browser_config: Extra keyword arguments for load_chrome_from_ini.
                A random proxy is always injected. Bug fix: the original
                signature used a mutable default (``dict={}``) and mutated it
                with ``.update(...)``, sharing state across calls; we now copy
                the caller's dict and never touch the default.

        Returns:
            The matching SearchResultItem, or ``None`` when nothing matched.
        """
        config = dict(browser_config or {})
        config['proxy'] = get_random_proxy()
        logger.info(f"browser_config: {config}")
        # config is always non-empty here (it carries the proxy), so the
        # original's no-config fallback branch was unreachable.
        page = load_chrome_from_ini(**config)
        result_item = self.find_first_item_with_keyword()
        if result_item:
            page.get(result_item.url)
            logger.info(f"访问 URL: {result_item.url}")
        else:
            logger.warning("未找到包含关键词的结果")
        # page.quit()
        return result_item
+        
+
def main():
    """Entry point: flag stored result pages that hit a verification page."""
    checker = ValidSearchResult()
    checker.populate_verification_table()
    # checker.try_get_url()
+
+if __name__ == "__main__":
+    main()