10 månader sedan · 61c7a90974
--- a/mylib/drission_page.py
+++ b/mylib/drission_page.py
@@ -19,7 +19,7 @@ def genarate_chrome_ini(address="localhost:9321"):
 
				     # chrome_options.incognito(True)
			
 
				     path = chrome_options.save(CONFIG_DIR / f'{port}.ini')
			
 
				     return path
			
 
				-def load_chrome_from_ini(path=CONFIG_DIR / '9321.ini', headless=False, proxy=None, browser_path=None):
			
 
				+def load_chrome_from_ini(path=CONFIG_DIR / '9321.ini', headless=False, proxy=None, browser_path=None, no_imgs=True):
			
 
				     chrome_options = ChromiumOptions(ini_path=path)
			
 
				     if browser_path:
			
 
				         chrome_options.set_browser_path(browser_path)
			
@@ -29,6 +29,7 @@ def load_chrome_from_ini(path=CONFIG_DIR / '9321.ini', headless=False, proxy=Non
 
				     elif 'HTTP_PROXY' in os.environ:
			
 
				         chrome_options.set_proxy(os.environ['HTTP_PROXY'])
			
 
				     chrome_options.auto_port(True)
			
 
				+    chrome_options.no_imgs(no_imgs)
			
 
				     logger.info(f"proxy {proxy}")
			
 
				     page = ChromiumPage(chrome_options)
			
 
				     return page
			
--- a/tests/mytest/scrapling_t.py
+++ b/tests/mytest/scrapling_t.py
@@ -54,10 +54,18 @@ def find_search_div():
 
				         # if 'search' in textarea.html_content.lower():
			
 
				         #     print("找到 search 关键字的 textarea")
			
 
				         # print("textarea.path:", textarea.path)
			
 
				+def find_verify_page():
			
 
				+    path = Path(r'G:\code\upwork\zhang_crawl_bio\output\results\Acantholimon erythraeum essential oil\crawled_urls\4801.html')
			
 
				+    content = path.read_text(encoding='utf-8')
			
 
				+    page = Adaptor(content)
			
 
				+    body = Adaptor(page.body)
			
 
				+    print("body:", body.get_all_text())
			
 
				+    print("body.tag:", "真人" in body.get_all_text())
			
 
				 def main():
			
 
				     # google_search_demo()
			
 
				-    res = find_search_div()
			
 
				-    print("res:", res)
			
 
				+    # res = find_search_div()
			
 
				+    # print("res:", res)
			
 
				+    find_verify_page()
			
 
				     # file = Path(r'G:\code\upwork\zhang_crawl_bio\output\debug\查询不到搜索框-两个textarea.html')
			
 
				     # html_content = file.read_text(encoding='utf-8')
			
 
				     # page = Adaptor(html_content)
			
--- a/utils/chrome_driver/cv_common.py
+++ b/utils/chrome_driver/cv_common.py
@@ -0,0 +1,192 @@
 
				+import asyncio
			
 
				+import time
			
 
				+import cv2
			
 
				+from cv import detect
			
 
				+from DrissionPage import ChromiumPage
			
 
				+from DrissionPage.common import Actions,Keys
			
 
				+
			
 
				+from sqlmodel import SQLModel, Field, Column, JSON
			
 
				+from sqlalchemy.types import LargeBinary
			
 
				+from typing import List, Optional,Coroutine
			
 
				+
			
 
class ImageMatchResult(SQLModel, table=True):
    """Table-backed record of one template-matching run (scores and locations).

    NOTE(review): CVPage in this module builds instances with a
    ``screen_img_path`` keyword, which is not a field declared here, and never
    supplies ``screen_img_data`` or ``threshold`` — confirm which side is
    out of date.
    """
    # Primary key.
    id: Optional[int] = Field(default=None, primary_key=True)
    # Raw bytes of the searched image, stored in a LargeBinary column.
    screen_img_data: bytes = Field(sa_column=Column(LargeBinary))
    # Path of the template image that was searched for.
    template_img_path: str
    # Template height and width in pixels (taken from the template's shape).
    template_img_h: int
    template_img_w: int
    # Score threshold associated with this match.
    threshold: float
    # Minimum and maximum scores reported by cv2.minMaxLoc.
    match_min_val: float
    match_max_val: float
    # Locations of the minimum and maximum scores, stored as JSON.
    min_location: list = Field(sa_column=Column(JSON))
    max_location: list = Field(sa_column=Column(JSON))
    # max_location shifted by half the template size, i.e. the centre of the best match.
    max_location_center: List[int] = Field(sa_column=Column(JSON))
			
 
				+
			
 
				+class CVPage:
			
 
				+    def __init__(self, tab:ChromiumPage=None,) -> None:
			
 
				+        self.tab =  tab
			
 
				+
			
 
				+    @classmethod
			
 
				+    def match_img_in_screen(cls, screen, target_path) -> ImageMatchResult:
			
 
				+        screen_img = detect.read_image(screen)
			
 
				+        template_img = detect.read_image(target_path)
			
 
				+
			
 
				+        # 获取模板图像的宽度和高度
			
 
				+        w, h = template_img.shape[1], template_img.shape[0]
			
 
				+
			
 
				+        # 模板匹配
			
 
				+        result = cv2.matchTemplate(screen_img, template_img, cv2.TM_CCOEFF_NORMED)
			
 
				+        min_val, max_val, min_location, max_location = cv2.minMaxLoc(result)
			
 
				+        match_result = ImageMatchResult(
			
 
				+            screen_img_path=screen_img,
			
 
				+            template_img_path=target_path,
			
 
				+            template_img_h=h,
			
 
				+            template_img_w=w,
			
 
				+            match_min_val=min_val,
			
 
				+            match_max_val=max_val,
			
 
				+            min_location=min_location,
			
 
				+            max_location=max_location,
			
 
				+            max_location_center= [max_location[0] + w // 2, max_location[1] + h // 2]
			
 
				+        )
			
 
				+        return match_result
			
 
				+    
			
 
				+    @classmethod
			
 
				+    def match_img_in_screen_region(cls, screen, target_path, region=None) -> ImageMatchResult:
			
 
				+        screen_img: cv2.typing.MatLike = detect.read_image(screen)
			
 
				+        template_img: cv2.typing.MatLike = detect.read_image(target_path)
			
 
				+
			
 
				+        # 获取屏幕图像的尺寸
			
 
				+        screen_h, screen_w = screen_img.shape[:2]
			
 
				+
			
 
				+        # 如果指定了region，计算实际的匹配区域
			
 
				+        if region:
			
 
				+            x1, y1, x2, y2 = region
			
 
				+            x1, y1 = int(x1 * screen_w), int(y1 * screen_h)
			
 
				+            x2, y2 = int(x2 * screen_w), int(y2 * screen_h)
			
 
				+            roi = screen_img[y1:y2, x1:x2]
			
 
				+        else:
			
 
				+            roi = screen_img
			
 
				+
			
 
				+        # 获取模板图像的宽度和高度
			
 
				+        w, h = template_img.shape[1], template_img.shape[0]
			
 
				+        # cv2.imwrite('test1.png', roi)
			
 
				+        # 模板匹配
			
 
				+        result = cv2.matchTemplate(roi, template_img, cv2.TM_CCOEFF_NORMED)
			
 
				+        min_val, max_val, min_location, max_location = cv2.minMaxLoc(result)
			
 
				+
			
 
				+        match_result = ImageMatchResult(
			
 
				+            screen_img_path=screen if type(screen) == str else None,
			
 
				+            template_img_path=target_path,
			
 
				+            template_img_h=h,
			
 
				+            template_img_w=w,
			
 
				+            match_min_val=min_val,
			
 
				+            match_max_val=max_val,
			
 
				+            min_location=min_location,
			
 
				+            max_location=max_location,
			
 
				+            max_location_center=[max_location[0] + w // 2, max_location[1] + h // 2]
			
 
				+        )
			
 
				+        return match_result
			
 
				+    
			
 
				+    def show_match_result(screen, match_result:ImageMatchResult, threshold=0.8, imshow=True):
			
 
				+        # 读取源图像和目标图像
			
 
				+        source_img: cv2.typing.MatLike = detect.read_image(screen)
			
 
				+        
			
 
				+        w = match_result.template_img_w   
			
 
				+        h = match_result.template_img_h     
			
 
				+        # 获取匹配结果
			
 
				+        max_val = match_result.match_max_val
			
 
				+        max_loc = match_result.max_location
			
 
				+        
			
 
				+        if max_val >= threshold:  # 如果最大相似度大于或等于阈值
			
 
				+            top_left = max_loc
			
 
				+            bottom_right = (top_left[0] + w, top_left[1] + h)
			
 
				+            label = f"Similarity: {max_val:.2f}"
			
 
				+            cv2.putText(source_img, label, (top_left[0], top_left[1] - 10),
			
 
				+                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)
			
 
				+            # 绘制矩形框
			
 
				+            cv2.rectangle(source_img, top_left, bottom_right, (0, 255, 0), 2)
			
 
				+        #     print(f"匹配位置: {top_left,bottom_right}, 相似度: {max_val}")
			
 
				+
			
 
				+        # else:
			
 
				+        #     print("没有找到相似度足够高的匹配.")
			
 
				+            
			
 
				+        if imshow:
			
 
				+            # 创建一个窗口来显示结果
			
 
				+            cv2.namedWindow('Detected', cv2.WINDOW_NORMAL)
			
 
				+            cv2.imshow('Detected', source_img)
			
 
				+            
			
 
				+            # 等待按键
			
 
				+            cv2.waitKey(0)
			
 
				+            cv2.destroyAllWindows()
			
 
				+        else:
			
 
				+            return source_img
			
 
				+    
			
 
				+    def wait_for_image_match(self, target_path, threshold=0.9, timeout=30, interval=0.3, raise_error=False):  
			
 
				+        """  
			
 
				+        等待直到图片匹配度超过给定阈值或超时。  
			
 
				+  
			
 
				+        :param target_path: 目标图片的路径  
			
 
				+        :param threshold: 匹配度阈值，默认为0.8  
			
 
				+        :param timeout: 等待超时时间（秒），默认为30秒  
			
 
				+        :param interval: 每次检查之间的间隔时间（秒），默认为0.5秒  
			
 
				+        :return: 如果找到匹配的图片，则返回ImageMatchResult，否则返回None  
			
 
				+        """  
			
 
				+        end_time = time.time() + timeout  
			
 
				+        while time.time() < end_time:  
			
 
				+            match_result = self.match_img_in_screen(target_path)  
			
 
				+            if match_result.match_max_val >= threshold:  
			
 
				+                return match_result  
			
 
				+            time.sleep(interval)  
			
 
				+        if raise_error:  
			
 
				+            raise Exception(f'超时未找到匹配的图片 {target_path} ，阈值为{threshold}')
			
 
				+        return None  # 超时未找到符合条件的匹配 
			
 
				+
			
 
				+    async def wait_for_image_match_async(self, screen, target_path, threshold=0.9, timeout=30, interval=0.3, raise_error=False)->ImageMatchResult:
			
 
				+            """
			
 
				+            异步等待直到图片匹配度超过给定阈值或超时。
			
 
				+            """
			
 
				+            end_time = time.time() + timeout
			
 
				+            while time.time() < end_time:
			
 
				+                if isinstance(screen, Coroutine):
			
 
				+                    print(screen)
			
 
				+                    screen = await screen()
			
 
				+                match_result = self.match_img_in_screen(screen, target_path)
			
 
				+                if match_result.match_max_val >= threshold:
			
 
				+                    return match_result
			
 
				+                await asyncio.sleep(interval)
			
 
				+            if raise_error:
			
 
				+                raise Exception(f'超时未找到匹配的图片 {target_path} ，阈值为{threshold}')
			
 
				+            return None  # 超时未找到符合条件的匹配    
			
 
				+    def wait_for_image_disappear(self, target_path, threshold=0.9, timeout=30, interval=0.3, raise_error=False):  
			
 
				+        """  
			
 
				+        等待直到目标图片从屏幕上消失或超时。  
			
 
				+  
			
 
				+        :param target_path: 目标图片的路径  
			
 
				+        :param threshold: 匹配度阈值，当匹配度低于此值时认为图片已消失，默认为0.3  
			
 
				+        :param timeout: 等待超时时间（秒），默认为30秒  
			
 
				+        :param interval: 每次检查之间的间隔时间（秒），默认为0.5秒  
			
 
				+        :return: 如果图片消失，则返回True，否则返回False  
			
 
				+        """  
			
 
				+        end_time = time.time() + timeout  
			
 
				+        while time.time() < end_time:  
			
 
				+            match_result = self.match_img_in_screen(target_path)  
			
 
				+            if match_result.match_max_val < threshold:  
			
 
				+                return True  # 图片已消失  
			
 
				+            time.sleep(interval)  
			
 
				+        if raise_error:  
			
 
				+            raise Exception(f'超时未找到匹配的图片 {target_path} ，阈值为{threshold}')
			
 
				+        return None  # 超时未找到符合条件的匹配 
			
 
				+
			
 
				+    async def wait_for_image_disappear_async(self, screen, target_path, threshold=0.9, timeout=30, interval=0.3, raise_error=False):
			
 
				+            """
			
 
				+            异步等待直到目标图片从屏幕上消失或超时。
			
 
				+            """
			
 
				+            end_time = time.time() + timeout
			
 
				+            while time.time() < end_time:
			
 
				+                match_result = self.match_img_in_screen(screen, target_path)
			
 
				+                if match_result.match_max_val < threshold:
			
 
				+                    return True  # 图片已消失
			
 
				+                await asyncio.sleep(interval)
			
 
				+            if raise_error:
			
 
				+                raise Exception(f'超时未找到匹配的图片 {target_path} ，阈值为{threshold}')
			
 
				+            return False  # 超时未找到符合条件的匹配
			
--- a/utils/chrome_driver/pyauto_windows.py
+++ b/utils/chrome_driver/pyauto_windows.py
@@ -0,0 +1,454 @@
 
				+from datetime import datetime
			
 
				+from paddleocr import PaddleOCR, draw_ocr
			
 
				+from rapidfuzz import fuzz
			
 
				+import json
			
 
				+import time
			
 
				+from typing import List, Optional,Coroutine, Dict, Tuple,Any
			
 
				+import win32con
			
 
				+import win32clipboard
			
 
				+import win32gui
			
 
				+from pathlib import Path
			
 
				+from sqlmodel import SQLModel, Field, Column, JSON
			
 
				+from DrissionPage import ChromiumPage
			
 
				+from ai.driver.cv_common import ImageMatchResult
			
 
				+from ai.driver import backend_win32com,bro_page_pyautogui,browser_win32,send_input,cv_common
			
 
				+from ai.conf_ai.config import load_chrome_from_ini,get_logger,RESOURCE,get_browser,BRO_INI_FILE
			
 
				+from ai.gpt_node.async_wraps import async_wrapper,thread_safe
			
 
				+logger = get_logger(f'ai/gpt_node-driver-{BRO_INI_FILE}')
			
 
				+
			
 
class WindowsInfo(SQLModel, table=False):
    """Description of one native browser window (not mapped to a table)."""
    id: Optional[int] = Field(default=None, primary_key=True)
    # Window rectangle, stored as a list built from backend_win32com.get_window_info.
    win_rect: Optional[list] = Field(default=[],sa_column=Column(JSON))
    # [width, height] taken from the window's client rectangle.
    win_size: Optional[list] = Field(default=[],sa_column=Column(JSON))
    # Native window handle.
    hwnd: Optional[int] = Field(default=None)
    window_title:Optional[str] = Field(default='')
    # Process id of the browser that owns the window.
    pid: Optional[int] = Field(default=None)
    # Id of the browser tab shown in this window; empty/None when not yet resolved.
    tab_id: Optional[str] = Field(default='')
			
 
				+
			
 
				+
			
 
class OCRMatch(SQLModel, table=False):
    """One OCR text hit: the four corners of its box plus the matching scores."""
    # Corner points of the detected text box.
    top_left: Tuple[float, float] = Field(default=(0.0, 0.0))
    top_right: Tuple[float, float] = Field(default=(0.0, 0.0))
    bottom_right: Tuple[float, float] = Field(default=(0.0, 0.0))
    bottom_left: Tuple[float, float] = Field(default=(0.0, 0.0))
    # Text that was searched for.
    find_txt: str = Field(default="")
    # Similarity between find_txt and ocr_txt; presumably a 0-100 score — TODO confirm.
    find_txt_similarity: int = Field(default=0)
    # Text actually recognised by OCR and the confidence reported for it.
    ocr_txt: str = Field(default="")
    ocr_confidence: float = Field(default=0.0)
    # Window the text was found in.
    win_info: WindowsInfo = Field(default=None)
    
    def center(self):
        """Return the (x, y) midpoint between the top-left and bottom-right corners."""
        return (self.top_left[0] + self.bottom_right[0]) / 2, (self.top_left[1] + self.bottom_right[1]) / 2

    def is_match(self, similarity:int=90):
        """Return True when the stored similarity is strictly greater than ``similarity``."""
        return self.find_txt_similarity > similarity
			
 
				+
			
 
				+class GptImgMatch(SQLModel, table=False):
			
 
				+    img_path:Optional[str] = Field(default=None)
			
 
				+    init_pos:Optional[list] = Field(default=[0,0],sa_column=Column(JSON))
			
 
				+    match_res:Optional[ImageMatchResult] = Field(default=None)
			
 
				+    
			
 
				+    def img_match(self, img, thread:float=0.92) ->ImageMatchResult|None:
			
 
				+        self.match_res:ImageMatchResult = cv_common.CVPage.match_img_in_screen(img, self.img_path)
			
 
				+        self.init_pos = self.match_res.max_location
			
 
				+        if self.is_match(thread):
			
 
				+            return self.match_res
			
 
				+    
			
 
				+    def is_match(self, thread:float=0.92):
			
 
				+        if not self.match_res:
			
 
				+            return False
			
 
				+        return self.match_res.match_max_val > thread
			
 
				+
			
 
				+    def abs_pos(self, x,y, win_rect:list=None):
			
 
				+        win_x,win_y,_ , _ = win_rect
			
 
				+        return win_x + x, win_y + y
			
 
				+    
			
 
				+    def click(self, pos:list=[], hwnd=None):
			
 
				+        if not pos:
			
 
				+            if self.match_res:
			
 
				+                pos = self.match_res.max_location_center
			
 
				+            else:
			
 
				+                raise Exception("未识别到目标，请先识别坐标以点击")
			
 
				+        backend_win32com.VirtualKeyboard(hwnd).mouse_move_press(*pos)
			
 
				+        logger.info(f"{Path(self.img_path).name} {pos}")
			
 
				+
			
 
class BaseOCRMatch(SQLModel, table=False):
    """OCR lookups for context-menu entries, each pre-seeded with the text to find."""
    copy: Optional[OCRMatch] = OCRMatch(find_txt='copy')
    paste: Optional[OCRMatch] = OCRMatch(find_txt='paste')
    select_all: Optional[OCRMatch] = OCRMatch(find_txt='select all')
			
 
				+
			
 
				+
			
 
				+# 浏览器窗口标签图标
			
 
class CvWinBroModel(SQLModel, table=False):
    """Image templates for browser-window elements, loaded from the RESOURCE directory."""
    id: Optional[int] = Field(default=None, primary_key=True)
    # Tab icons of the supported sites.
    chatgpt: Optional[GptImgMatch] =  GptImgMatch(img_path=str(RESOURCE / 'openai' / 'tab-icon.png'))
    claude: Optional[GptImgMatch] =  GptImgMatch(img_path=str(RESOURCE / 'claude' / 'tab-icon.png'))
    # Two alternative templates for the refresh button.
    refresh_btn: Optional[GptImgMatch] =  GptImgMatch(img_path=str(RESOURCE / 'windows' / 'refrsh-btn.png'))
    refresh_btn2: Optional[GptImgMatch] =  GptImgMatch(img_path=str(RESOURCE / 'windows' / 'refrsh-btn2.png'))
    # "Paste" entry of the right-click context menu.
    paste: Optional[GptImgMatch] =  GptImgMatch(img_path=str(RESOURCE / 'windows' / 'paste.png'))
			
 
				+    
			
 
				+    
			
 
				+class DriverBase:
			
 
				+    name = 'About Version'
			
 
				+    url = ''
			
 
				+    # 浏览器对象，根据主进程启动 Chrome 得到浏览器主页面。仅有一个主进程，其他子窗口或标签都是子进程 tab 页面。
			
 
				+    page:ChromiumPage = None
			
 
				+    # 根据 PID 获得所有的窗口句柄
			
 
				+    all_windows_info:Dict[str,WindowsInfo] = {}
			
 
				+    paddle_ocr = PaddleOCR(use_angle_cls=True, lang="en",)
			
 
				+    _lock = False
			
 
				+    
			
 
    def __init__(self, ini_file=BRO_INI_FILE) -> None:
        """Create a driver with no window attached yet.

        :param ini_file: browser configuration file later used by ``init``
        """
        self.ini_file = ini_file
        # Image templates used to locate browser UI elements.
        self.bromodel = CvWinBroModel()
        # Everything below is filled in once a window is opened or found.
        self.tab = None
        self.windows_info: WindowsInfo = None
        self.vb: backend_win32com.VirtualKeyboard = None
        self.ocr_model: BaseOCRMatch = None
			
 
				+
			
 
    @thread_safe
    def init(self):
        """Start the shared browser on first use and return its main page.

        The browser page is stored on the class, so it is created once and
        shared by every driver instance. On creation the page is given up to
        10 seconds to finish loading and the window registry is populated.

        :return: the shared browser page
        """
        if not DriverBase.page:
            DriverBase.page = get_browser(self.ini_file)
            DriverBase.page.wait.doc_loaded(timeout=10)
            self.init_all_windows_info()
        return DriverBase.page
			
 
				+    
			
 
    def init_all_windows_info(self, process_id=None, cache=False):
        """Record every window of the browser process in the shared registry.

        :param process_id: process whose windows are enumerated; defaults to
            the shared browser page's process
        :param cache: when true, return the registry as it is without
            enumerating windows again
        :return: the registry, keyed by window handle
        """
        if cache:
            return self.all_windows_info
        if process_id is None:
            process_id = DriverBase.page.process_id
        for hwnd, pid, _title in backend_win32com.enum_chrome_windows_by_pid(process_id):
            rect, client_rect, window_title = backend_win32com.get_window_info(hwnd)
            _, _, win_w, win_h = client_rect
            # tab_id stays unset here; it is filled in once the window's tab is identified.
            DriverBase.all_windows_info[hwnd] = WindowsInfo(
                pid=pid, hwnd=hwnd, win_rect=list(rect), win_size=[win_w, win_h],
                window_title=window_title, tab_id=None,
            )
        return DriverBase.all_windows_info
			
 
				+    
			
 
    def update_all_windows_info(self, hwnd, window_info):
        """Store ``window_info`` under ``hwnd`` in the shared registry and return the registry."""
        registry = DriverBase.all_windows_info
        registry[hwnd] = window_info
        return registry
			
 
				+    
			
 
				+    
			
 
    def new_windows(self, url:str=''):
        """Open ``url`` in a new browser window and attach this driver to it.

        On success ``self.tab``, ``self.windows_info`` and ``self.vb`` are set
        and the window is added to the shared registry.

        :param url: address to open in the new window
        :return: the new window's WindowsInfo, or None if any step raised
            (the exception is logged and swallowed)
        """
        # Flag-based mutual exclusion: wait until no other caller holds the flag.
        # NOTE(review): the check and the assignment are separate steps, so two
        # threads could both get past the loop — confirm whether a real lock is needed.
        while DriverBase._lock:
            time.sleep(0.1)
        DriverBase._lock = True
        try:
            self.tab = DriverBase.page.new_tab(url=url,new_window=True)
            # Get all window handles and PIDs
            results = backend_win32com.enum_chrome_windows_by_pid(DriverBase.page.process_id)
            # assumes the first enumerated window is the one just opened — TODO confirm
            hwnd, pid, title = results.pop(0)
            self.windows_info = self.get_windows_info(hwnd, tab=self.tab)
            self.update_all_windows_info(hwnd, self.windows_info)
            self.vb = backend_win32com.VirtualKeyboard(hwnd)
            logger.info(f"{self.name} 打开新窗口 {self.windows_info.model_dump()}")
            logger.debug(f"self.all_windows_info {self.all_windows_info}")
            return self.windows_info
        except Exception as e:
            logger.exception(f"new_windows {e}")
        finally:
            # Always release the flag, including on failure.
            DriverBase._lock = False
			
 
				+    
			
 
    @thread_safe
    def find_or_new_windows(self, title:str, url:str='') -> list[WindowsInfo] | WindowsInfo:
        """Attach to a registered window whose title contains ``title``, else open a new one.

        Only windows that have no tab id yet are candidates; if several match,
        the last one in the registry is used. When the tab of the chosen window
        cannot be identified, a new window is opened instead.

        :param title: case-insensitive substring of the window title
        :param url: address to open if a new window is needed
        :return: WindowsInfo of the attached or newly opened window
        """
        wanted = title.lower()
        win = None
        for windows_info in DriverBase.all_windows_info.values():
            if wanted in windows_info.window_title.lower() and not windows_info.tab_id:
                win = windows_info

        if win:
            tab = self.find_shifted_tab(win)
            if tab is not None:
                self.tab = tab
                win.tab_id = tab.tab_id
                # Keep the same per-window state that new_windows() sets up, so
                # screenshot() and click() work on a reused window as well.
                self.windows_info = win
                self.vb = backend_win32com.VirtualKeyboard(win.hwnd)
                backend_win32com.show_win(win.hwnd)
                self.update_all_windows_info(win.hwnd, win)
                logger.info(f"{self.name} 已存在窗口 {win.model_dump()}")
                return win
            logger.warning(f"{self.name} could not identify the tab of window {win.hwnd}; opening a new window")

        return self.new_windows(url=url)
			
 
				+    
			
 
    @thread_safe
    def find_shifted_tab(self, windows_info: WindowsInfo):
        """Identify which browser tab belongs to the given native window.

        The window is nudged 100 pixels to the right; the tab whose reported
        window location changes is the one hosted by that window. The window is
        moved back afterwards, even if inspecting the tabs fails.

        :param windows_info: window to probe (uses ``hwnd`` and ``win_rect``)
        :return: the tab that moved, or None if no tab's location changed
        """
        # Key by tab id rather than title: several tabs can share one title.
        before = {tab.tab_id: tab.rect.window_location for tab in DriverBase.page.get_tabs()}

        x, y, _, _ = windows_info.win_rect
        win32gui.SetWindowPos(windows_info.hwnd, win32con.HWND_TOP, x + 100, y, 0, 0, win32con.SWP_NOSIZE)
        try:
            for tab in DriverBase.page.get_tabs():
                previous = before.get(tab.tab_id)
                if previous is not None and tab.rect.window_location != previous:
                    return tab
            return None
        finally:
            # Put the window back where it was.
            win32gui.SetWindowPos(windows_info.hwnd, win32con.HWND_TOP, x, y, 0, 0, win32con.SWP_NOSIZE)
			
 
				+    
			
 
    def close_windows(self):
        """Close this driver's window and forget it; does nothing if none is attached."""
        if not self.windows_info:
            return
        hwnd = self.windows_info.hwnd
        win32gui.PostMessage(hwnd, win32con.WM_CLOSE, 0, 0)
        # Tolerate a handle that was never registered so the state below is still reset.
        self.all_windows_info.pop(hwnd, None)
        self.tab = None
        self.windows_info = None
			
 
				+    
			
 
    @staticmethod
    def close_no_tab_id():
        """Close every registered window that has no tab id and drop it from the registry."""
        # Iterate over a snapshot because entries are removed while looping.
        for key, windows_info in list(DriverBase.all_windows_info.items()):
            if windows_info.tab_id:
                continue
            logger.info(f"close windows {windows_info.model_dump()}")
            win32gui.PostMessage(windows_info.hwnd, win32con.WM_CLOSE, 0, 0)
            time.sleep(0.1)
            del DriverBase.all_windows_info[key]
			
 
				+            
			
 
    def get_windows_info(self, hwnd:int, tab:ChromiumPage=None):
        """Build a WindowsInfo for the native window ``hwnd``.

        :param hwnd: native window handle
        :param tab: tab shown in that window, if known; supplies ``tab_id``
        :return: WindowsInfo with geometry, title, browser pid and tab id
        """
        pid = self.page.process_id if self.page else None
        tab_id = tab.tab_id if tab else None
        rect, client_rect, window_title = backend_win32com.get_window_info(hwnd)
        # Width and height are the last two items of the client rectangle.
        win_w, win_h = client_rect[2], client_rect[3]
        return WindowsInfo(
            pid=pid,
            hwnd=hwnd,
            win_rect=list(rect),
            win_size=[win_w, win_h],
            window_title=window_title,
            tab_id=tab_id,
        )
			
 
				+    
			
 
    def find(self, locator: GptImgMatch, screenshot=None, thread:float=0.98, region=None, raise_error=False) -> ImageMatchResult:
        """Look for ``locator``'s template on a screenshot of this driver's window.

        :param locator: template to search for
        :param screenshot: image to search; a new one is taken when it is None
            or contains only zeros
        :param thread: score the match must exceed
        :param region: region passed to ``screenshot`` when a capture is needed
        :param raise_error: raise instead of returning a falsy result
        :return: the match result, or None when the score is too low
        :raises Exception: when nothing is found and ``raise_error`` is true
        """
        needs_capture = screenshot is None or not screenshot.any()
        if needs_capture:
            screenshot = self.screenshot(region=region)

        result = locator.img_match(screenshot, thread=thread)
        if result:
            logger.debug(f"识别到目标 {Path(locator.img_path).name}")
            return result

        log = f"未识别到目标 {Path(locator.img_path).name} {locator}"
        if raise_error:
            raise Exception(log)
        logger.debug(log)
        return result
			
 
				+
			
 
    @async_wrapper
    def click(self, locator: GptImgMatch, screenshot=None, thread:float=0.92, raise_error=False, button: str = "L"):
        """Click the centre of ``locator``'s match inside this driver's window.

        A result already stored on ``locator`` is reused; the template is only
        searched for when none is stored.
        NOTE(review): a stored result may stem from an older screenshot —
        confirm that reusing it is intended.

        :param locator: template to click
        :param screenshot: image to search when a search is needed
        :param thread: score the match must exceed
        :param raise_error: raise instead of returning False on failure
        :param button: mouse button passed to the virtual keyboard
        :return: True if the click was sent, otherwise False
        :raises Exception: on failure when ``raise_error`` is true
        """
        if not locator.match_res:
            locator.match_res = self.find(locator, screenshot, thread)
            if not locator.match_res:
                logger.error(f"Failed to find {locator.img_path}")
                if raise_error:
                    raise Exception(f"Failed to find {locator}")
                return False
        if locator.is_match(thread=thread):
            logger.info(f"{locator}")
            self.vb.mouse_move_press(*locator.match_res.max_location_center, button=button)
            return True
        if raise_error:
            raise Exception(f"Failed to find {locator}")
        return False
			
 
				+    
			
 
    def get_click_right_windows(self, pos:list, hwnd=None, wait_pop_time:float=0.3):
        """Right-click at ``pos`` and return the child window that appears.

        :param pos: ``[x, y]`` to right-click
        :param hwnd: window to click in; defaults to this driver's window
        :param wait_pop_time: seconds to wait for the popup after clicking
        :return: handle of the first newly appeared child window, or None
        """
        if not hwnd:
            hwnd = self.windows_info.hwnd
        # Bind the keyboard to the window actually being clicked; self.vb
        # belongs to this driver's own window and would miss an explicit hwnd.
        vb = backend_win32com.VirtualKeyboard(hwnd)
        logger.info(f"hwnd {hwnd} pos {pos}")

        before_click_windows = backend_win32com.get_child_windows(hwnd)
        vb.mouse_move_press(*pos, button= 'R')
        time.sleep(wait_pop_time)
        after_click_windows = backend_win32com.get_child_windows(hwnd)

        new_windows = [child for child in after_click_windows if child not in before_click_windows]
        if not new_windows:
            # Reached when no new child window appeared after the right-click.
            logger.error(f"右键找到新窗口 click")
            return None
        return new_windows[0]
			
 
				+    
			
 
    @async_wrapper
    def click_paste(self, txt_area_locator: GptImgMatch, hwnd=None, thread:float=0.92, raise_error=False):
        """Right-click a text area and click "paste" in the context menu that opens.

        :param txt_area_locator: template of the text area to right-click
        :param hwnd: window whose child windows are watched for the popup;
            defaults to this driver's window
        :param thread: score the templates must exceed
        :param raise_error: raise when the paste entry cannot be found in the popup
        :return: None
        :raises Exception: if the text area cannot be clicked, or if the paste
            entry is not found and ``raise_error`` is true
        """
        if not hwnd:
            hwnd = self.windows_info.hwnd
        before_click_windows = backend_win32com.get_child_windows(hwnd)
        self.click(txt_area_locator, thread=thread, raise_error=True, button="R")
        time.sleep(0.3)
        after_click_windows = backend_win32com.get_child_windows(hwnd)

        new_windows = [child for child in after_click_windows if child not in before_click_windows]
        if not new_windows:
            # Reached when no context-menu window appeared after the right-click.
            logger.error(f"右键找到新窗口 click {txt_area_locator} ")
            return
        pop_hwnd = new_windows[0]

        screenshot = backend_win32com.back_end_screenshot(pop_hwnd)
        child_windows = backend_win32com.get_window_info(pop_hwnd)
        logger.info(f"child_windows {child_windows}")

        paste = self.bromodel.paste.img_match(screenshot, thread=thread)
        logger.info(f"paste img_match {paste}")
        if not paste:
            # Do not click a below-threshold guess inside the menu.
            message = f"paste entry not found in context menu {pop_hwnd}"
            logger.error(message)
            if raise_error:
                raise Exception(message)
            return

        vb = backend_win32com.VirtualKeyboard(pop_hwnd)
        vb.mouse_move_press(*paste.max_location_center)
        time.sleep(0.3)
			
 
				+    
			
 
    def select_all(self, txt_area_pos:list, hwnd=None):
        """Right-click at ``txt_area_pos`` and click the "select all" entry of the popup.

        ``self.ocr_model`` must have been assigned beforehand; the constructor
        leaves it as None.

        :param txt_area_pos: position to right-click
        :param hwnd: window to click in, forwarded to ``get_click_right_windows``
        """
        # May be None when no popup window appeared.
        pop_hwnd = self.get_click_right_windows(txt_area_pos, hwnd=hwnd)
        vb = backend_win32com.VirtualKeyboard(pop_hwnd)
        # presumably locates the "select all" text inside the popup via OCR — ocr_find_txt is not visible here, TODO confirm
        self.ocr_model.select_all = self.ocr_find_txt(self.ocr_model.select_all, pop_hwnd)
        # NOTE(review): the menu entry is clicked with the right button — confirm this is intended.
        vb.mouse_move_press(*self.ocr_model.select_all.center(), button= 'R')
			
 
				+        
			
 
				+
			
 
    def paste_str(self, text: str, txt_area_locator: GptImgMatch, hwnd=None):
        """Paste ``text`` into a text area by way of the system clipboard.

        The text is placed on the clipboard, pasted through the text area's
        context menu, and the clipboard is emptied again afterwards. Errors
        raised while pasting are logged, not propagated.
        NOTE(review): click_paste is wrapped by async_wrapper; if it does not
        finish before returning, the clipboard could be cleared too early — confirm.

        :param text: text to paste
        :param txt_area_locator: template of the target text area
        :param hwnd: window to operate on; defaults to this driver's window
        """
        logger.info(f"send_str {text}")
        win32clipboard.OpenClipboard()
        try:
            win32clipboard.EmptyClipboard()
            # Store as Unicode so characters outside the ANSI code page survive.
            win32clipboard.SetClipboardText(text, win32con.CF_UNICODETEXT)
        finally:
            # An open clipboard blocks other programs, so always close it.
            win32clipboard.CloseClipboard()

        try:
            self.click_paste(txt_area_locator, hwnd=hwnd)
        except Exception as e:
            logger.error(f"send_str {text} error {e}")
        finally:
            # Clear clipboard after pasting
            win32clipboard.OpenClipboard()
            try:
                win32clipboard.EmptyClipboard()
            finally:
                win32clipboard.CloseClipboard()
			
 
				+    
			
 
    @async_wrapper
    def wait_for(self, locator: GptImgMatch, timeout: float = 10, interval: float = 0.5, thread: float = 0.98) -> ImageMatchResult:
        """Poll this driver's window until ``locator`` is found or ``timeout`` elapses.

        :param locator: template to wait for
        :param timeout: maximum time to wait in seconds
        :param interval: pause between attempts in seconds
        :param thread: score the match must exceed
        :return: the match result, or None after a timeout (a warning is logged)
        """
        began = time.time()
        while time.time() - began < timeout:
            found = self.find(locator, thread=thread, raise_error=False)
            if found:
                return found
            time.sleep(interval)

        logger.warning(f"Timeout reached. Unable to find {Path(locator.img_path).name} within {timeout} seconds")
        return None
			
 
				+    
			
 
    def find_or_new_tab(self, url:str=''):
        """Return an open tab whose URL contains ``self.name``, else open a new one.

        Args:
            url: URL to load when a new tab has to be opened. Falls back to
                ``self.url`` when empty.

        Returns:
            The first matching existing tab, or the tab returned by
            ``self.page.new_tab`` (opened in a new window).
        """
        target_url = url or self.url
        # NOTE(review): find_tab_title treats get_tabs() items as tab objects,
        # while this loop passes them to get_tab() - confirm which one the
        # installed DrissionPage version returns.
        for tab_id in self.page.get_tabs():
            tab:ChromiumPage = self.page.get_tab(tab_id)
            if self.name in tab.url:
                return tab
        # Honour the requested URL; the caller's argument used to be ignored
        # here in favour of self.url.
        return self.page.new_tab(url=target_url, new_window=True)
			
 
				+    
			
 
				+    
			
 
				+    
			
 
    @async_wrapper
    def click_refresh(self):
        """Click the refresh button, trying both known button templates.

        One screenshot is taken and shared by both attempts. The second
        template is only tried when clicking the first one returns a falsy
        value.

        Returns:
            The result of the first truthy ``click`` call, otherwise the
            result of the second attempt.
        """
        shot = self.screenshot()
        first_try = self.click(self.bromodel.refresh_btn, screenshot=shot)
        if first_try:
            return first_try
        return self.click(self.bromodel.refresh_btn2, screenshot=shot)
			
 
				+    
			
 
    def get_tabs_icon(self, hwnd_list:List[int]):
        """Find the first window whose screenshot matches the ``chatgpt`` template.

        Args:
            hwnd_list: Window handles to inspect, in order.

        Returns:
            The handle of the first matching window, or ``None`` when no
            window matches. (Previously a match was detected but nothing was
            returned, so callers could not tell which window matched.)
        """
        for hwnd in hwnd_list:
            self.find(self.bromodel.chatgpt, screen=backend_win32com.back_end_screenshot(hwnd))
            if self.bromodel.chatgpt.is_match():
                return hwnd
        return None
			
 
    def screenshot(self, hwnd=None, filename:str=None, region=None):
        """Take a screenshot of a window through the win32 backend.

        Args:
            hwnd: Window handle; ``self.windows_info.hwnd`` is used when falsy.
            filename: Passed through to the backend
                (presumably an output path - confirm against backend_win32com).
            region: When truthy, only this region is captured via
                ``back_end_screenshot_region``.

        Returns:
            Whatever the selected backend function returns.
        """
        target = hwnd if hwnd else self.windows_info.hwnd
        if not region:
            return backend_win32com.back_end_screenshot(target, filename)
        return backend_win32com.back_end_screenshot_region(target, filename, region)
			
 
				+
			
 
    def get_all_windows_info(self):
        """Collect a ``WindowsInfo`` for every browser window that maps to a tab.

        Windows are enumerated by the browser's process id; each one is
        matched to a tab through ``find_tab_title``. Windows without a
        matching tab are logged and skipped instead of raising.

        Returns:
            dict mapping ``tab.tab_id`` to the ``WindowsInfo`` of its window.
        """
        results = backend_win32com.enum_chrome_windows_by_pid(self.page.process_id)
        ret = {}

        for hwnd, pid, _title in results:
            rect, client_rect, window_title = backend_win32com.get_window_info(hwnd)
            # Only the last two client_rect values (width, height) are needed.
            _, _, win_w, win_h = client_rect
            logger.info(f"window_title {window_title} hwnd {hwnd} client_rect {client_rect}")
            tab = self.find_tab_title(window_title)
            if tab is None:
                # e.g. helper/popup windows that do not correspond to a tab
                logger.warning(f"no tab matches window_title {window_title} hwnd {hwnd}, skipped")
                continue
            logger.info(f"tab title {tab.title}")
            ret[tab.tab_id] = WindowsInfo(
                pid=pid,
                hwnd=hwnd,
                win_rect=list(rect),
                win_size=[win_w, win_h],
                window_title=window_title,
                tab_id=tab.tab_id,
            )
        return ret
			
 
				+
			
 
    def find_tab_title(self, window_title:str):
        """Return the first tab whose title occurs in ``window_title``.

        The comparison is case-insensitive. Tabs with an empty title are
        ignored, because an empty string is a substring of every window title
        and would otherwise match any window.

        Args:
            window_title: Title of the OS window to match against.

        Returns:
            The matching tab, or ``None`` when no tab matches.
        """
        wanted = window_title.lower()
        for tab in self.page.get_tabs():
            tab_title = tab.title.lower()
            logger.debug(f"{tab_title}")
            if tab_title and tab_title in wanted:
                return tab
        return None
			
 
				+    
			
 
    def ocr(self, screen)->List[OCRMatch]:
        """Run PaddleOCR on ``screen`` and return the recognised text lines.

        Args:
            screen: Image passed unchanged to ``self.paddle_ocr.ocr``.

        Returns:
            One ``OCRMatch`` per recognised line, carrying its four corner
            points, text and confidence. Empty when nothing was recognised.
        """
        result = self.paddle_ocr.ocr(screen, cls=True)
        lines_model = []
        for page in result or []:
            # PaddleOCR reports an image without any detected text as None.
            if not page:
                continue
            for coordinates, (text, confidence) in page:
                lines_model.append(
                    OCRMatch(
                        top_left=tuple(coordinates[0]),
                        top_right=tuple(coordinates[1]),
                        bottom_right=tuple(coordinates[2]),
                        bottom_left=tuple(coordinates[3]),
                        ocr_txt=text,
                        ocr_confidence=confidence,
                    )
                )
        return lines_model
			
 
				+    
			
 
    def ocr_find_txt(self, ocr_model: OCRMatch, hwnd=None, ocr_result=None, threshold: float = 0.90, similarity_threshold: float = 80):
        """Find the first OCR line that fuzzily matches ``ocr_model.find_txt``.

        Args:
            ocr_model: Template whose ``find_txt`` is the text to look for.
            hwnd: Window to screenshot when no ``ocr_result`` is supplied.
            ocr_result: Previously computed OCR lines. When missing or empty,
                a fresh screenshot is taken and OCR is run on it.
            threshold: Lines whose OCR confidence is not above this are ignored.
            similarity_threshold: ``fuzz.ratio`` score that must be exceeded
                (presumably on a 0-100 scale - confirm against the fuzz library in use).

        Returns:
            The first matching ``OCRMatch`` with ``find_txt`` and
            ``find_txt_similarity`` filled in, or ``None`` if nothing matches.

        Raises:
            ValueError: If ``ocr_model`` is ``None``.
        """
        # Validate before the (expensive) screenshot + OCR round trip, and
        # without relying on assert, which is stripped under -O.
        if ocr_model is None:
            raise ValueError("ocr_model must not be None")
        if not ocr_result:
            ocr_result = self.ocr(self.screenshot(hwnd))
        logger.debug(f"ocr_model {ocr_model}")

        wanted = ocr_model.find_txt.lower()
        for ocr_match in ocr_result:
            if not ocr_match.ocr_confidence > threshold:
                continue
            similarity = fuzz.ratio(wanted, ocr_match.ocr_txt.lower())
            if similarity > similarity_threshold:
                ocr_match.find_txt_similarity = similarity
                ocr_match.find_txt = ocr_model.find_txt
                return ocr_match

        return None
			
 
				+    
			
 
    @staticmethod
    @async_wrapper
    def quit():
        """Quit the shared ``DriverBase.page`` browser, if one is set.

        Declared as a static method because it takes no ``self``: without it
        a call on an instance would pass the instance as an unexpected
        positional argument. Calls through the class keep working as before.
        """
        if DriverBase.page:
            DriverBase.page.quit()
			
--- a/utils/proxy_pool.py
+++ b/utils/proxy_pool.py
@@ -0,0 +1,14 @@
 
				+import random
			
 
				+
			
 
				+
			
 
				+def get_random_proxy() -> str:
			
 
				+    """Get random proxy from proxy pool"""
			
 
				+    proxies = [
			
 
				+    'http://127.0.0.1:9360',
			
 
				+    'http://127.0.0.1:9362',
			
 
				+    'http://127.0.0.1:9364',
			
 
				+    'http://127.0.0.1:9366',
			
 
				+    'http://127.0.0.1:9368',
			
 
				+    'http://127.0.0.1:1881',
			
 
				+    ]
			
 
				+    return random.choice(proxies)
			
--- a/worker/celery/app.py
+++ b/worker/celery/app.py
@@ -7,7 +7,7 @@ app = Celery(
 
				         'worker.celery.tasks',  # 取消注释原始任务模块
			
 
				         'worker.celery.crawl_tasks',
			
 
				         'worker.celery.async_tasks',
			
 
				-        'worker.celery.pandoc_tasks'  # 注册新的 Pandoc 任务模块
			
 
				+        'worker.celery.html_convert_tasks'  # 注册新的 Pandoc 任务模块
			
 
				     ]
			
 
				 )
			
 
				 app.config_from_object(celeryconfig)
			
--- a/worker/celery/html_convert_tasks.py
+++ b/worker/celery/html_convert_tasks.py
@@ -1,142 +1,76 @@
 
				 from celery import current_app
			
 
				-from worker.html_convert.pandoc import PandocConverter
			
 
				+from worker.html_convert.pandoc import process_single_example, process_all_results
			
 
				 from mylib.logu import get_logger
			
 
				-from sqlmodel import Session
			
 
				-from worker.search_engine.search_result_db import SearchResultManager, SearchResultItem
			
 
				-from worker.html_convert.models import HtmlConvertResult
			
 
				+from worker.search_engine.search_result_db import SearchResultItem, SearchResultManager
			
 
				+from sqlmodel import Session, select
			
 
				 
			
 
				 logger = get_logger('pandoc_tasks')
			
 
				 
			
 
				 @current_app.task(name='html_convert_tasks_worker.convert_single_result')
			
 
				-def convert_single_result_task(result_id: int, font_name: str = "宋体", include_toc: bool = False, skip_existing: bool = True):
			
 
				+def convert_single_result_task(result_id: int):
			
 
				     """
			
 
				     Celery task to convert a single SearchResultItem using Pandoc.
			
 
				     
			
 
				     Args:
			
 
				         result_id (int): The ID of the SearchResultItem to process.
			
 
				-        font_name (str): Font name for DOCX output.
			
 
				-        include_toc (bool): Whether to include a table of contents.
			
 
				-        skip_existing (bool): Skip conversion if DOCX already exists.
			
 
				     
			
 
				     Returns:
			
 
				         dict: Task result status.
			
 
				     """
			
 
				     try:
			
 
				         logger.info(f"Starting Pandoc conversion for SearchResultItem ID: {result_id}")
			
 
				-        converter = PandocConverter(font_name=font_name, include_toc=include_toc)
			
 
				-        success = converter.process_single_result(result_id, skip_existing=skip_existing)
			
 
				-        
			
 
				-        if success:
			
 
				-            logger.info(f"Pandoc conversion completed for SearchResultItem ID: {result_id}")
			
 
				-            return {"result_id": result_id, "status": "completed"}
			
 
				-        else:
			
 
				-            logger.error(f"Pandoc conversion failed for SearchResultItem ID: {result_id}")
			
 
				-            return {"result_id": result_id, "status": "failed"}
			
 
				+        process_single_example(result_id)
			
 
				+        logger.info(f"Pandoc conversion completed for SearchResultItem ID: {result_id}")
			
 
				+        return {"result_id": result_id, "status": "completed"}
			
 
				     except Exception as e:
			
 
				         logger.exception(f"Error during Pandoc conversion for SearchResultItem ID: {result_id}: {str(e)}")
			
 
				         return {"result_id": result_id, "status": "failed"}
			
 
				 
			
 
				 @current_app.task(name='html_convert_tasks.convert_all_results')
			
 
				-def convert_all_results_task(font_name: str = "宋体", include_toc: bool = False, skip_existing: bool = True):
			
 
				+def convert_all_results_task():
			
 
				     """
			
 
				     Celery task to convert all SearchResultItems using Pandoc.
			
 
				     
			
 
				-    Args:
			
 
				-        font_name (str): Font name for DOCX output.
			
 
				-        include_toc (bool): Whether to include a table of contents.
			
 
				-        skip_existing (bool): Skip conversion if DOCX already exists.
			
 
				-    
			
 
				     Returns:
			
 
				         dict: Task result status.
			
 
				     """
			
 
				     try:
			
 
				         logger.info("Starting Pandoc conversion for all SearchResultItems")
			
 
				-        db_manager = SearchResultManager()
			
 
				-        with Session(db_manager.engine) as session:
			
 
				-            # Fetch all SearchResultItem IDs with explicit ordering
			
 
				-            result_ids = session.exec(
			
 
				-                session.query(SearchResultItem.id).order_by(SearchResultItem.id)
			
 
				-            ).all()
			
 
				-            logger.info(f"Total results to process: {len(result_ids)}")
			
 
				-        
			
 
				-        converter = PandocConverter(font_name=font_name, include_toc=include_toc)
			
 
				-        success_count = 0
			
 
				-        
			
 
				-        for result_id in result_ids:
			
 
				-            success = converter.process_single_result(result_id, skip_existing=skip_existing)
			
 
				-            if success:
			
 
				-                success_count += 1
			
 
				-        
			
 
				-        logger.info(f"Pandoc conversion completed for {success_count}/{len(result_ids)} SearchResultItems")
			
 
				-        return {"total_results": len(result_ids), "success_count": success_count, "status": "completed"}
			
 
				+        process_all_results()
			
 
				+        logger.info("Pandoc conversion completed for all SearchResultItems")
			
 
				+        return {"status": "completed"}
			
 
				     except Exception as e:
			
 
				         logger.exception(f"Error during bulk Pandoc conversion: {str(e)}")
			
 
				         return {"status": "failed", "error": str(e)}
			
 
				 
			
 
				-def main():
			
 
				-    """
			
 
				-    Client example to demonstrate how to use the Pandoc conversion tasks.
			
 
				-    This can be run directly from the command line for testing purposes.
			
 
				-    """
			
 
				-    import sys
			
 
				-    
			
 
				-    if len(sys.argv) < 2:
			
 
				-        print("Usage: python pandoc_tasks.py [single|all] [options]")
			
 
				-        print("Options for 'single':")
			
 
				-        print("  --id <result_id>       Specify the SearchResultItem ID to process")
			
 
				-        print("  --font <font_name>     Specify font name for DOCX output (default: 宋体)")
			
 
				-        print("  --toc                  Include table of contents in DOCX output")
			
 
				-        print("  --overwrite            Overwrite existing DOCX files")
			
 
				-        print("\nOptions for 'all':")
			
 
				-        print("  --font <font_name>     Specify font name for DOCX output (default: 宋体)")
			
 
				-        print("  --toc                  Include table of contents in DOCX output")
			
 
				-        print("  --overwrite            Overwrite existing DOCX files")
			
 
				-        sys.exit(1)
			
 
				-    
			
 
				-    mode = sys.argv[1]
			
 
				-    
			
 
				-    def parse_common_args(args):
			
 
				-        font_name = "宋体"
			
 
				-        include_toc = True
			
 
				-        skip_existing = True
			
 
				-        
			
 
				-        for i, arg in enumerate(args):
			
 
				-            if arg == "--font" and i + 1 < len(args):
			
 
				-                font_name = args[i + 1]
			
 
				-            elif arg == "--toc":
			
 
				-                include_toc = True
			
 
				-            elif arg == "--overwrite":
			
 
				-                skip_existing = False
			
 
				+def test_task_process_all_results():
			
 
				+    # Process all results in the database
			
 
				+    db_manager = SearchResultManager()
			
 
				+    with Session(db_manager.engine) as session:
			
 
				+        # Fetch all IDs with explicit ordering
			
 
				+        result_ids = session.exec(select(SearchResultItem.id, SearchResultItem.html_path).order_by(SearchResultItem.id)).all()
			
 
				+        logger.info(f"Total results: {len(result_ids)}")
			
 
				+        logger.info(f"First 5 result IDs: {result_ids[:5]}")
			
 
				         
			
 
				-        return font_name, include_toc, skip_existing
			
 
				-    
			
 
				-    if mode == "single":
			
 
				-        if "--id" not in sys.argv:
			
 
				-            print("Error: '--id' argument is required for single result processing.")
			
 
				-            sys.exit(1)
			
 
				-        
			
 
				-        result_id_index = sys.argv.index("--id") + 1
			
 
				-        if result_id_index >= len(sys.argv):
			
 
				-            print("Error: Missing SearchResultItem ID after '--id'.")
			
 
				-            sys.exit(1)
			
 
				-        
			
 
				-        result_id = int(sys.argv[result_id_index])
			
 
				-        font_name, include_toc, skip_existing = parse_common_args(sys.argv)
			
 
				-        
			
 
				-        # Call the single result conversion task
			
 
				-        result = convert_single_result_task.delay(result_id, font_name, include_toc, skip_existing)
			
 
				-        print(f"Conversion task submitted for SearchResultItem ID {result_id}. Task ID: {result.id}")
			
 
				-    
			
 
				-    elif mode == "all":
			
 
				-        font_name, include_toc, skip_existing = parse_common_args(sys.argv)
			
 
				-        
			
 
				-        # Call the bulk conversion task
			
 
				-        result = convert_all_results_task.delay(font_name, include_toc, skip_existing)
			
 
				-        print(f"Bulk conversion task submitted. Task ID: {result.id}")
			
 
				-    
			
 
				-    else:
			
 
				-        print(f"Unknown mode: {mode}")
			
 
				-        sys.exit(1)
			
 
				+        for result_id, html_path in result_ids:
			
 
				+            try:
			
 
				+                if html_path.endswith('.html'):
			
 
				+                    logger.info(f"Submitting task for SearchResultItem ID: {result_id}")
			
 
				+                    convert_single_result_task.delay(result_id)
			
 
				+            except Exception as e:
			
 
				+                logger.error(f"Error processing result {result_id}: {e}")
			
 
				+
			
 
				+def clear_existing_tasks():
			
 
				+    """清除所有待处理的任务"""
			
 
				+    try:
			
 
				+        discarded_count = current_app.control.discard_all()
			
 
				+        logger.info(f"已清除 {discarded_count} 个待处理任务")
			
 
				+    except Exception as e:
			
 
				+        logger.error(f"清除任务失败: {str(e)}")
			
 
				+
			
 
				+def main():
			
 
				+    test_task_process_all_results()
			
 
				+    # clear_existing_tasks()
			
 
				 
			
 
				 if __name__ == "__main__":
			
 
				     main()
			
--- a/worker/celery/tasks.py
+++ b/worker/celery/tasks.py
@@ -12,6 +12,8 @@ def get_random_proxy():
 
				         'http://127.0.0.1:9360',
			
 
				         'http://127.0.0.1:9362',
			
 
				         'http://127.0.0.1:9364',
			
 
				+        'http://127.0.0.1:9366',
			
 
				+        'http://127.0.0.1:9368',
			
 
				         'http://127.0.0.1:1881',
			
 
				     ]
			
 
				     return random.choice(proxies)
			
--- a/worker/html_convert/pandoc.py
+++ b/worker/html_convert/pandoc.py
@@ -107,7 +107,7 @@ class PandocConverter:
 
				             if not html_convert:
			
 
				                 logger.error(f"No HtmlConvertResult found for SearchResultItem {result_id}")
			
 
				                 return False
			
 
				-            
			
 
				+            logger.info(f"pandoc start html_convert id {html_convert.id}  result_id {result_id}")
			
 
				             # Initialize success flags
			
 
				             docling_success = False
			
 
				             filtered_success = False
			
@@ -177,7 +177,9 @@ class PandocConverter:
 
				 def process_single_example(result_id: int):
			
 
				     # Process a single result example
			
 
				     docling_converter = DoclingConverter()
			
 
				-    docling_converter.process_conversion_by_id(result_id)
			
 
				+    search_result_item = docling_converter.get_search_result_item(result_id)
			
 
				+    if search_result_item.html_path.endswith('.html'):
			
 
				+        docling_converter.process_conversion_by_id(result_id)
			
 
				     
			
 
				     crawl_filter = CrawlFilter()
			
 
				     crawl_filter.process_filter_by_id(result_id)
			
--- a/worker/readme.md
+++ b/worker/readme.md
@@ -20,5 +20,6 @@ $env:PC_NAME="w2"; celery -A worker.celery.app worker --hostname=$env:PC_NAME@%h
 
				 $env:PC_NAME="w3"; celery -A worker.celery.app worker --hostname=$env:PC_NAME@%h
			
 
				 $env:PC_NAME="w4"; celery -A worker.celery.app worker --hostname=$env:PC_NAME@%h
			
 
				 $env:PC_NAME="w5"; celery -A worker.celery.app worker --hostname=$env:PC_NAME@%h
			
 
				-
			
 
				+$env:PC_NAME="w6"; celery -A worker.celery.app worker --hostname=$env:PC_NAME@%h
			
 
				+$env:PC_NAME="w7"; celery -A worker.celery.app worker --hostname=$env:PC_NAME@%h
			
 
				 ```
			
--- a/worker/search_engine/search_result_db.py
+++ b/worker/search_engine/search_result_db.py
@@ -1,6 +1,7 @@
 
				 from datetime import datetime
			
 
				 from typing import Optional, List
			
 
				 from sqlmodel import SQLModel, Field, Relationship, create_engine, Session, select, delete, func
			
 
				+from sqlalchemy.orm import relationship
			
 
				 from sqlalchemy import UniqueConstraint
			
 
				 from sqlalchemy.sql import text
			
 
				 from pathlib import Path
			
@@ -49,6 +50,17 @@ class SearchResultItem(SQLModel, table=True):
 
				     keyword_task: Optional[KeywordTask] = Relationship(back_populates="items")
			
 
				     search_page: Optional[SearchPageResult] = Relationship(back_populates="items")
			
 
				 
			
 
				+class VerificationItem(SQLModel, table=True):
			
 
				+    __table_args__ = (UniqueConstraint("result_item_id", name="uq_verification_item"),)
			
 
				+    
			
 
				+    id: Optional[int] = Field(default=None, primary_key=True)
			
 
				+    result_item_id: int = Field(foreign_key="searchresultitem.id")
			
 
				+    search_result_item: Optional[SearchResultItem] = Relationship(
			
 
				+        sa_relationship=relationship("SearchResultItem", lazy="joined")
			
 
				+    )
			
 
				+    verified: bool = Field(default=False)
			
 
				+    created_at: datetime = Field(default_factory=datetime.now)
			
 
				+
			
 
				 class SearchResultManager:
			
 
				     def __init__(self, db_url: str = DB_URL):
			
 
				         self.engine = create_engine(db_url)
			
@@ -191,3 +203,27 @@ class SearchResultManager:
 
				     def is_task_completed(self, keyword: str) -> bool:
			
 
				         task = self.get_keyword_task(keyword)
			
 
				         return task.is_completed if task else False
			
 
				+    
			
 
    def get_all_search_result_items(self) -> List[SearchResultItem]:
        """
        Return every SearchResultItem stored in the database.
        """
        statement = select(SearchResultItem)
        with Session(self.engine) as session:
            items = session.exec(statement).all()
            return items
			
 
				+    
			
 
    def add_to_verification(self, result_item_id: int):
        """
        Ensure a VerificationItem exists for the given SearchResultItem id.

        Looks the row up first so the same result is not added twice.

        Returns:
            The existing VerificationItem, or the newly inserted one.
        """
        lookup = select(VerificationItem).where(
            VerificationItem.result_item_id == result_item_id
        )
        with Session(self.engine) as session:
            existing = session.exec(lookup).first()
            if existing:
                return existing
            created = VerificationItem(result_item_id=result_item_id)
            session.add(created)
            session.commit()
            # Reload so the generated columns are populated on the returned object.
            session.refresh(created)
            return created
			
--- a/worker/search_engine/valid_google_search.py
+++ b/worker/search_engine/valid_google_search.py
@@ -0,0 +1,91 @@
 
				+import time
			
 
				+import re
			
 
				+import logging
			
 
				+from pathlib import Path
			
 
				+from typing import Dict, Optional
			
 
				+from DrissionPage import ChromiumPage
			
 
				+from pydantic import BaseModel
			
 
				+from scrapling import Adaptor
			
 
				+from sqlmodel import Session
			
 
				+from mylib.logu import logger
			
 
				+from mylib.base import save_to_file
			
 
				+from config.settings import GOOGLE_SEARCH_DIR
			
 
				+from mylib.drission_page import load_chrome_from_ini
			
 
				+from worker.search_engine.search_result_db import SearchResultManager, SearchResultItem, KeywordTask, VerificationItem
			
 
				+from worker.search_engine.smart_selector import get_search_ele
			
 
				+from DrissionPage.common import Keys
			
 
				+from utils.proxy_pool import get_random_proxy
			
 
				+from mylib.base import ensure_output_dir, save_to_file
			
 
				+from scrapling import Adaptor
			
 
				+
			
 
				+class ValidSearchResult:
			
 
				+    def __init__(self):
			
 
				+        self.db_manager = SearchResultManager()
			
 
				+
			
 
				+    def find_first_item_with_keyword(self, keyword: str = "真人") -> Optional[SearchResultItem]:
			
 
				+        """
			
 
				+        获取数据库中所有 SearchResultItem，检查每个 html_path 文件内容，
			
 
				+        如果包含指定关键词，则返回第一个匹配的 SearchResultItem。
			
 
				+        """
			
 
				+        # 获取所有 SearchResultItem
			
 
				+        items = self.db_manager.get_all_search_result_items()
			
 
				+        
			
 
				+        for item in items:
			
 
				+            if item.html_path and Path(item.html_path).exists() and item.html_path.endswith(".html"):
			
 
				+                try:
			
 
				+                    # 读取 HTML 文件内容
			
 
				+                    with open(item.html_path, 'r', encoding='utf-8') as file:
			
 
				+                        content = file.read()
			
 
				+                        # 检查是否包含关键词
			
 
				+                        if keyword in content:
			
 
				+                            logger.info(f"找到包含关键词 '{keyword}' 的结果: {item}")
			
 
				+                            return item
			
 
				+                except Exception as e:
			
 
				+                    logger.error(f"读取文件 {item.html_path} 时出错: {e}")
			
 
				+        
			
 
				+        logger.info(f"未找到包含关键词 '{keyword}' 的结果")
			
 
				+        return None
			
 
				+
			
 
				+    def populate_verification_table(self, keyword: str = "真人"):
			
 
				+        """
			
 
				+        遍历所有 SearchResultItem，将包含关键词的结果存入 VerificationItem 表。
			
 
				+        """
			
 
				+        items = self.db_manager.get_all_search_result_items()
			
 
				+        
			
 
				+        for item in items:
			
 
				+            if item.html_path and Path(item.html_path).exists() and item.html_path.endswith(".html"):
			
 
				+                if item.id % 100 == 0:
			
 
				+                    logger.info(f"处理第 {item.id} 个结果")
			
 
				+                try:
			
 
				+                    with open(item.html_path, 'r', encoding='utf-8') as file:
			
 
				+                        content = file.read()
			
 
				+                        page = Adaptor(content)
			
 
				+                        body = Adaptor(page.body)
			
 
				+                        if keyword in body.get_all_text():
			
 
				+                            logger.info(f"将包含关键词 '{keyword}' 的结果 {item.id} 添加到 VerificationItem 表")
			
 
				+                            self.db_manager.add_to_verification(item.id)
			
 
				+                except Exception as e:
			
 
				+                    logger.error(f"处理文件 {item.html_path} 时出错: {e}")
			
 
				+
			
 
				+
			
 
				+    def try_get_url(self, browser_config: dict={}):
			
 
				+        browser_config.update({'proxy': get_random_proxy()})
			
 
				+        logger.info(f"browser_config: {browser_config}")
			
 
				+        page = load_chrome_from_ini(**browser_config) if browser_config else load_chrome_from_ini()
			
 
				+        result_item = self.find_first_item_with_keyword()
			
 
				+        if result_item:
			
 
				+            page.get(result_item.url)
			
 
				+            logger.info(f"访问 URL: {result_item.url}")
			
 
				+        else:
			
 
				+            logger.warning("未找到包含关键词的结果")
			
 
				+        # page.quit()
			
 
				+        return result_item
			
 
				+        
			
 
				+
			
 
				+def main():
			
 
				+    vsr = ValidSearchResult()
			
 
				+    vsr.populate_verification_table()
			
 
				+    # vsr.try_get_url()
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    main()