Sfoglia il codice sorgente

dp 搜索尚不完善。重新用回 Camoufox ,关闭广告过滤后,又不被频繁检测了,新增 smart search 搜索框解决找不到搜索框问题。

mrh 1 anno fa
parent
commit
2088effb41

+ 2 - 1
.gitignore

@@ -4,4 +4,5 @@ __pycache__
 .vscode
 .aider*
 .env
-.pytest_cache
+.pytest_cache
+download

+ 281 - 0
tests/mytest/scrapin_smart_find.py

@@ -0,0 +1,281 @@
+from pathlib import Path
+
+from scrapling import Adaptor
+
+class SmartElementSelector:
+    def __init__(self, rules):
+        self.rules = rules
+        
+    def is_hidden(self, element):
+        """综合可见性检测"""
+        # 内联样式检测
+        if 'style' in element.attrib:
+            style = element.attrib['style'].lower()
+            if 'display:none' in style or 'visibility:hidden' in style:
+                return True
+        
+        # 隐藏属性检测
+        if element.attrib.get('hidden') in ['', 'hidden']:
+            return True
+        
+        # 常见隐藏类名检测
+        if 'class' in element.attrib:
+            class_names = element.attrib['class'].lower().split()
+            if {'hidden', 'd-none', 'invisible'} & set(class_names):
+                return True
+            
+        return False
+
+    def check_ancestor(self, element, rule):
+        """递归检查祖先节点"""
+        target_tag = rule.get('tag', '').lower()
+        attr_name = rule.get('name', '').lower()
+        attr_value = rule.get('value', '').lower()
+        
+        current = element.parent
+        while current:
+            # 标签匹配检测
+            if current.tag.lower() == target_tag:
+                # 属性值检测
+                current_value = current.attrib.get(attr_name, '').lower()
+                if rule['match_type'] == 'contains':
+                    if attr_value in current_value:
+                        return True
+                elif rule['match_type'] == 'exact':
+                    if current_value == attr_value:
+                        return True
+            current = current.parent
+        return False
+
+    def calculate_score(self, element):
+        """动态权重评分系统"""
+        if self.is_hidden(element):
+            return -1  # 直接排除隐藏元素
+        
+        score = 0
+        for rule in self.rules:
+            # 标签类型检测
+            if rule['type'] == 'tag':
+                if element.tag.lower() == rule['value'].lower():
+                    score += rule['weight']
+                    
+            # 属性包含检测
+            elif rule['type'] == 'attribute_contains':
+                attr_value = element.attrib.get(rule['name'], '').lower()
+                if rule['value'].lower() in attr_value:
+                    score += rule['weight']
+                    
+            # 祖先节点检测
+            elif rule['type'] == 'ancestor':
+                if self.check_ancestor(element, rule):
+                    score += rule['weight']
+                    
+            # 可见性奖励分
+            elif rule['type'] == 'visible':
+                if not self.is_hidden(element):
+                    score += rule['weight']
+                    
+        return score
+
+
+class LocatorGenerator:
+    @staticmethod
+    def generate_locators(element):
+        """生成多种定位策略并按优先级排序"""
+        locators = []
+        
+        # 1. 优先使用唯一标识属性
+        if element_id := element.attrib.get('id'):
+            locators.append(('id', f'#{element_id}'))
+        
+        # 2. 使用稳定的name属性
+        if name := element.attrib.get('name'):
+            locators.append(('name', f'[name="{name}"]'))
+        
+        # 3. 组合关键ARIA属性
+        aria_attrs = []
+        if aria_label := element.attrib.get('aria-label'):
+            aria_attrs.append(f'[aria-label="{aria_label}"]')
+        if role := element.attrib.get('role'):
+            aria_attrs.append(f'[role="{role}"]')
+        if aria_attrs:
+            locators.append(('aria-combo', ''.join(aria_attrs)))
+        
+        # 4. 智能类名处理(过滤动态部分)
+        if classes := element.attrib.get('class', '').split():
+            static_classes = [c for c in classes if len(c) > 3 and not c.isnumeric()]
+            if static_classes:
+                class_selector = '.' + '.'.join(static_classes)
+                locators.append(('class', class_selector))
+        
+        # 5. 生成相对XPath(基于邻近特征)
+        xpath_parts = []
+        if element.tag:
+            xpath_parts.append(f'//{element.tag}')
+        for attr in ['name', 'role', 'placeholder']:
+            if value := element.attrib.get(attr):
+                xpath_parts.append(f'[@{attr}="{value}"]')
+                break
+        if xpath_parts:
+            locators.append(('xpath', ''.join(xpath_parts)))
+        
+        # 按优先级排序:id > name > aria > class > xpath
+        priority_order = ['id', 'name', 'aria-combo', 'class', 'xpath']
+        return sorted(locators, key=lambda x: priority_order.index(x[0]) if x[0] in priority_order else len(priority_order))
+
+
+class EnhancedAdaptor:
+    def __init__(self, html_content):
+        self.html_content = html_content
+        # 这里假设 Adaptor 已经实现了基本的 HTML 解析和元素查找功能
+        self.page = Adaptor(html_content)
+    
+    def find_all(self, tag_name):
+        """查找所有指定标签的元素"""
+        return self.page.find_all(tag_name)
+    
+    def get_locator_strategy(self, element):
+        """获取推荐定位策略"""
+        locators = LocatorGenerator.generate_locators(element)
+        
+        # 选择第一个非空定位器
+        for loc_type, selector in locators:
+            if loc_type == 'id':
+                return {'strategy': 'id', 'selector': selector[1:]}
+            if loc_type in ('name', 'aria-combo', 'class'):
+                return {'strategy': 'css', 'selector': selector}
+            if loc_type == 'xpath':
+                return {'strategy': 'xpath', 'selector': selector}
+        
+        # 默认返回相对XPath
+        return {'strategy': 'xpath', 'selector': element.xpath}
+    
+    def verify_locator(self, locator_info):
+        """验证定位器有效性"""
+        results = {
+            'is_unique': False,
+            'alternatives': []
+        }
+        
+        try:
+            elements = self.find_all(locator_info['selector'], strategy=locator_info['strategy'])
+            if len(elements) == 1:
+                results['is_unique'] = True
+            else:
+                # 生成备选方案
+                results['alternatives'] = self.generate_fallback_locators(elements[0])
+        except:
+            pass
+        return results
+    
+    def generate_fallback_locators(self, element):
+        """生成备选定位器"""
+        fallbacks = []
+        # 添加更多备选策略
+        if element.attrib.get('name'):
+            fallbacks.append({'strategy': 'css', 'selector': f'[name="{element.attrib["name"]}"]'})
+        if element.attrib.get('class'):
+            fallbacks.append({'strategy': 'css', 'selector': f'.{element.attrib["class"].split()[0]}'})
+        return fallbacks
+
+
+def get_search_rule():
+    # 规则配置(可根据实际需求扩展)
+    return [
+        # 基础特征
+        {"type": "tag", "value": "textarea", "weight": 20},
+        
+        # 关键属性检测
+        {"type": "attribute_contains", "name": "title", "value": "search", "weight": 25},
+        {"type": "attribute_contains", "name": "aria-label", "value": "search", "weight": 25},
+        {"type": "attribute_contains", "name": "role", "value": "search", "weight": 30},
+        
+        # 层级关系检测
+        {
+            "type": "ancestor",
+            "tag": "form",
+            "name": "action",
+            "value": "/search",
+            "match_type": "exact",
+            "weight": 40
+        },
+        {
+            "type": "ancestor",
+            "tag": "form",
+            "name": "role",
+            "value": "search",
+            "match_type": "exact",
+            "weight": 35
+        },
+        
+        # 可见性奖励
+        {"type": "visible", "weight": 20}
+    ]
+
+
+def find_target_element(html_content, rules, base_all='textarea'):
+    selector = SmartElementSelector(rules)
+    page = EnhancedAdaptor(html_content)
+    
+    # 获取所有候选元素
+    candidates = page.find_all(base_all)  # 可以扩展为多标签搜索
+    
+    # 计算评分并过滤
+    scored_elements = []
+    for el in candidates:
+        score = selector.calculate_score(el)
+        if score > 0:
+            scored_elements.append((el, score))
+    
+    # 按评分排序
+    scored_elements.sort(key=lambda x: x[1], reverse=True)
+    
+    # 返回最高分元素(带评分验证)
+    if scored_elements:
+        top_score = scored_elements[0][1]
+        # 过滤掉明显低分的候选项
+        finalists = [el for el, s in scored_elements if s >= top_score * 0.8]
+        
+        # 如果有多个高分候选,优先选择更靠近表单的元素
+        if len(finalists) > 1:
+            # 通过DOM深度进行二次排序
+            finalists.sort(key=lambda el: len(el.path))
+        # print(f"finalists {finalists}")
+        best_element = finalists[0]
+        locator_info = page.get_locator_strategy(best_element)
+        # print(f"locator_info {locator_info}")
+        # {'strategy': 'id', 'selector': 'APjFqb'}
+        locator_info['xpath'] = f'//{base_all}[@{locator_info["strategy"]}="{locator_info["selector"]}]'
+        return locator_info
+        # 添加置信度验证
+        verification_results = page.verify_locator(locator_info)
+        if verification_results['is_unique']:
+            return {
+                'element': best_element,
+                'locator': locator_info,
+                'confidence': 'high'
+            }
+        return {
+            'element': best_element,
+            'locator': locator_info,
+            'confidence': 'medium',
+            'fallbacks': verification_results['alternatives']
+        }
+    return None
+
+def get_search_ele(html_content: str, base_all='textarea'):
+    rules = get_search_rule()
+    return find_target_element(html_content, rules, base_all=base_all)
+def search_demo():
+    # 使用示例
+    file = Path(r'G:\code\upwork\zhang_crawl_bio\output\debug\查询不到搜索框-两个textarea.html')
+    file = Path(r'G:\code\upwork\zhang_crawl_bio\output\debug\查询搜索框结果页面.html')
+    # file = Path(r'G:\code\upwork\zhang_crawl_bio\output\debug\智能选择A标签测试.html')
+
+    html_content = file.read_text(encoding='utf-8')
+    rules = get_search_rule()
+    target_element = find_target_element(html_content, rules)
+    print(target_element)
+
+# Run the demo only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    search_demo()

+ 19 - 29
tests/mytest/scrapling_t.py

@@ -37,41 +37,31 @@ def analyze_html(html_content: str) -> dict:
         print(f"save_path: {save_path}")
 
 
-def google_search_demo():
-    file = Path(r'K:\code\upwork\zhang_crawl_bio\output\google_search\Acalypha manniana essential oil\10.html')
-    # file = Path(r'K:\code\upwork\zhang_crawl_bio\output\analyze\search_result.html')
-    html_content = file.read_text(encoding='utf-8')
-    page = Adaptor(html_content)
-    search_div = page.xpath('//div[@id="search"]')
-    print("search_div:", search_div)
-    if search_div:
-        print("找到 search div:")
-        result_list = search_div.xpath('//*[@data-rpos]')
-        # 从 result_list 中过滤掉任何没有 href 属性的所有子元素,xpath 语法筛选
-        result_list = [item for item in result_list if item.xpath('.//cite')]
-        result_list = [item for item in result_list if not item.xpath('.//*[@data-initq]')]
-        print("result_list:", result_list)
-        print("实际 7 ,result_list len:", len(result_list))
-        for result in result_list:
-            result.attrib
-            print("result.attrib:", result.attrib)
-        # print(search_div[0].html_content)  # 打印 div 中的文本内容
-    else:
-        print("未找到 search div")
-
 def find_search_div():
-    file = Path(r'G:\code\upwork\zhang_crawl_bio\output\results\Acampe carinata essential oil\error_20250125_043533.html.html')
+    file = Path(r'G:\code\upwork\zhang_crawl_bio\output\debug\查询不到搜索框-两个textarea.html')
     html_content = file.read_text(encoding='utf-8')
     page = Adaptor(html_content)
-    textarea_list = page.find_all('textarea')
+    # textarea_list = page.xpath('//body//form[@action="/search"]//textarea')
+    textarea_list = page.xpath('//body//form[@action="/search"]//input')
     print("textarea_list:", textarea_list)
     for textarea in textarea_list:
-        print("textarea.generate_full_css_selector:", textarea.generate_full_css_selector)
-        print("textarea.generate_full_xpath_selector:", textarea.generate_full_xpath_selector)
-        print("textarea.generate_xpath_selector:", textarea.generate_xpath_selector)
+        textarea.attrib
+        print("------------------")
+        print("textarea.tag:", textarea.tag)
+        print("textarea.attrib:", textarea.attrib)
+        # print("textarea.text:", textarea.text)
+        # print("textarea.html_content:", textarea.html_content)
+        # if 'search' in textarea.html_content.lower():
+        #     print("找到 search 关键字的 textarea")
+        # print("textarea.path:", textarea.path)
 def main():
     # google_search_demo()
-    find_search_div()
-
+    res = find_search_div()
+    print("res:", res)
+    # file = Path(r'G:\code\upwork\zhang_crawl_bio\output\debug\查询不到搜索框-两个textarea.html')
+    # html_content = file.read_text(encoding='utf-8')
+    # page = Adaptor(html_content)
+    # textarea = page.xpath_first(res)
+    # print("textarea:", textarea.html_content)
 if __name__ == "__main__":
     main()

+ 5 - 1
worker/search_engine/camoufox_broswer.py

@@ -14,10 +14,10 @@ import os
 import datetime
 from typing import Optional, Dict, Type, Protocol, Union,ClassVar
 import logging
+from camoufox import DefaultAddons
 from pydantic import BaseModel
 from config.settings import OUTPUT_DIR, WORK_DIR
 from mylib.logu import logger
-
 # ------------------- Core Implementation -------------------
 class BrowserConfig(BaseModel):
     """浏览器基础配置模型"""
@@ -26,6 +26,10 @@ class BrowserConfig(BaseModel):
     proxy: Optional[Dict] = {'server': 'http://localhost:1881'}
     humanize: bool=True
     constrains: ClassVar[Screen] = Screen(max_width=1920, max_height=1200)
+    # C:\Users\mg\AppData\Local\camoufox\camoufox\Cache\browser\features
+    # addons: Optional[list]=[str(WORK_DIR / r"download\addon")]
+    exclude_addons:Optional[list]=[DefaultAddons.UBO]
+
 
 class BrowserCore():
     """浏览器核心功能实现(支持上下文管理器)"""

+ 3 - 4
worker/search_engine/drission_google_search.py

@@ -70,15 +70,15 @@ class GoogleSearchHandlerDrission:
         
         has_next = True
         search_result_item_count = 0
-        
+        finitsh_flag = False
         while has_next:
             result_ele = self._process_single_page(keyword)
             search_result_item_count += len(result_ele.results) if result_ele.results else 0
             
             if search_result_item_count > max_result_items:
                 logger.info(f"关键词 {keyword} 单页结果数量超过 {max_result_items} ,跳过处理下一页")
+                finitsh_flag = True
                 break
-                
             if result_ele.next_page_url:
                 self.page.scroll.to_bottom()
                 time.sleep(3)
@@ -87,12 +87,11 @@ class GoogleSearchHandlerDrission:
                     next_btn.click()
                     logger.info(f"跳转到下一页: {self.page.url}")
                 else:
+                    finitsh_flag = True
                     break
             else:
                 break
                 
-        key_model = self.db_manager.mark_task_completed(keyword)
-        logger.info(f"完成关键词处理: {keyword}")
         return key_model
 
     def goto_home_page(self):

+ 5 - 2
worker/search_engine/google_search.py

@@ -14,6 +14,7 @@ from mylib.logu import logger
 from mylib.base import save_to_file
 from config.settings import OUTPUT_DIR
 from worker.search_engine.search_result_db import SearchResultManager, SearchResultItem, KeywordTask
+from worker.search_engine.smart_selector import get_search_ele
 async def async_input(prompt):
         loop = asyncio.get_event_loop()
         return await loop.run_in_executor(None, input, prompt)
@@ -113,8 +114,10 @@ class GoogleSearchHandler():
         return bool(Adaptor(html_content).xpath_first('//div[@id="search"]'))
     async def search(self, query: str) -> dict:
         await self.goto_home_page()
-        textarea = self.page.locator('xpath=//body//textarea')
-        logger.info(f"textarea {textarea}")
+        search_ele_dict = get_search_ele(await self.page.content())
+        if not search_ele_dict:
+            raise Exception("未找到搜索框")
+        textarea = self.page.locator(search_ele_dict['xpath'])
         await textarea.fill(query, timeout=10000)  # 使用 textarea.fill() 而不是 page.fill()
         await textarea.press('Enter')  # 使用 press 方法模拟按下 Enter 键
         await self.page.wait_for_load_state(state='load', timeout=10000)

+ 281 - 0
worker/search_engine/smart_selector.py

@@ -0,0 +1,281 @@
+from pathlib import Path
+
+from scrapling import Adaptor
+
+class SmartElementSelector:
+    def __init__(self, rules):
+        self.rules = rules
+        
+    def is_hidden(self, element):
+        """综合可见性检测"""
+        # 内联样式检测
+        if 'style' in element.attrib:
+            style = element.attrib['style'].lower()
+            if 'display:none' in style or 'visibility:hidden' in style:
+                return True
+        
+        # 隐藏属性检测
+        if element.attrib.get('hidden') in ['', 'hidden']:
+            return True
+        
+        # 常见隐藏类名检测
+        if 'class' in element.attrib:
+            class_names = element.attrib['class'].lower().split()
+            if {'hidden', 'd-none', 'invisible'} & set(class_names):
+                return True
+            
+        return False
+
+    def check_ancestor(self, element, rule):
+        """递归检查祖先节点"""
+        target_tag = rule.get('tag', '').lower()
+        attr_name = rule.get('name', '').lower()
+        attr_value = rule.get('value', '').lower()
+        
+        current = element.parent
+        while current:
+            # 标签匹配检测
+            if current.tag.lower() == target_tag:
+                # 属性值检测
+                current_value = current.attrib.get(attr_name, '').lower()
+                if rule['match_type'] == 'contains':
+                    if attr_value in current_value:
+                        return True
+                elif rule['match_type'] == 'exact':
+                    if current_value == attr_value:
+                        return True
+            current = current.parent
+        return False
+
+    def calculate_score(self, element):
+        """动态权重评分系统"""
+        if self.is_hidden(element):
+            return -1  # 直接排除隐藏元素
+        
+        score = 0
+        for rule in self.rules:
+            # 标签类型检测
+            if rule['type'] == 'tag':
+                if element.tag.lower() == rule['value'].lower():
+                    score += rule['weight']
+                    
+            # 属性包含检测
+            elif rule['type'] == 'attribute_contains':
+                attr_value = element.attrib.get(rule['name'], '').lower()
+                if rule['value'].lower() in attr_value:
+                    score += rule['weight']
+                    
+            # 祖先节点检测
+            elif rule['type'] == 'ancestor':
+                if self.check_ancestor(element, rule):
+                    score += rule['weight']
+                    
+            # 可见性奖励分
+            elif rule['type'] == 'visible':
+                if not self.is_hidden(element):
+                    score += rule['weight']
+                    
+        return score
+
+
+class LocatorGenerator:
+    @staticmethod
+    def generate_locators(element):
+        """生成多种定位策略并按优先级排序"""
+        locators = []
+        
+        # 1. 优先使用唯一标识属性
+        if element_id := element.attrib.get('id'):
+            locators.append(('id', f'#{element_id}'))
+        
+        # 2. 使用稳定的name属性
+        if name := element.attrib.get('name'):
+            locators.append(('name', f'[name="{name}"]'))
+        
+        # 3. 组合关键ARIA属性
+        aria_attrs = []
+        if aria_label := element.attrib.get('aria-label'):
+            aria_attrs.append(f'[aria-label="{aria_label}"]')
+        if role := element.attrib.get('role'):
+            aria_attrs.append(f'[role="{role}"]')
+        if aria_attrs:
+            locators.append(('aria-combo', ''.join(aria_attrs)))
+        
+        # 4. 智能类名处理(过滤动态部分)
+        if classes := element.attrib.get('class', '').split():
+            static_classes = [c for c in classes if len(c) > 3 and not c.isnumeric()]
+            if static_classes:
+                class_selector = '.' + '.'.join(static_classes)
+                locators.append(('class', class_selector))
+        
+        # 5. 生成相对XPath(基于邻近特征)
+        xpath_parts = []
+        if element.tag:
+            xpath_parts.append(f'//{element.tag}')
+        for attr in ['name', 'role', 'placeholder']:
+            if value := element.attrib.get(attr):
+                xpath_parts.append(f'[@{attr}="{value}"]')
+                break
+        if xpath_parts:
+            locators.append(('xpath', ''.join(xpath_parts)))
+        
+        # 按优先级排序:id > name > aria > class > xpath
+        priority_order = ['id', 'name', 'aria-combo', 'class', 'xpath']
+        return sorted(locators, key=lambda x: priority_order.index(x[0]) if x[0] in priority_order else len(priority_order))
+
+
+class EnhancedAdaptor:
+    def __init__(self, html_content):
+        self.html_content = html_content
+        # 这里假设 Adaptor 已经实现了基本的 HTML 解析和元素查找功能
+        self.page = Adaptor(html_content)
+    
+    def find_all(self, tag_name):
+        """查找所有指定标签的元素"""
+        return self.page.find_all(tag_name)
+    
+    def get_locator_strategy(self, element):
+        """获取推荐定位策略"""
+        locators = LocatorGenerator.generate_locators(element)
+        
+        # 选择第一个非空定位器
+        for loc_type, selector in locators:
+            if loc_type == 'id':
+                return {'strategy': 'id', 'selector': selector[1:]}
+            if loc_type in ('name', 'aria-combo', 'class'):
+                return {'strategy': 'css', 'selector': selector}
+            if loc_type == 'xpath':
+                return {'strategy': 'xpath', 'selector': selector}
+        
+        # 默认返回相对XPath
+        return {'strategy': 'xpath', 'selector': element.xpath}
+    
+    def verify_locator(self, locator_info):
+        """验证定位器有效性"""
+        results = {
+            'is_unique': False,
+            'alternatives': []
+        }
+        
+        try:
+            elements = self.find_all(locator_info['selector'], strategy=locator_info['strategy'])
+            if len(elements) == 1:
+                results['is_unique'] = True
+            else:
+                # 生成备选方案
+                results['alternatives'] = self.generate_fallback_locators(elements[0])
+        except:
+            pass
+        return results
+    
+    def generate_fallback_locators(self, element):
+        """生成备选定位器"""
+        fallbacks = []
+        # 添加更多备选策略
+        if element.attrib.get('name'):
+            fallbacks.append({'strategy': 'css', 'selector': f'[name="{element.attrib["name"]}"]'})
+        if element.attrib.get('class'):
+            fallbacks.append({'strategy': 'css', 'selector': f'.{element.attrib["class"].split()[0]}'})
+        return fallbacks
+
+
+def get_search_rule():
+    # 规则配置(可根据实际需求扩展)
+    return [
+        # 基础特征
+        {"type": "tag", "value": "textarea", "weight": 20},
+        
+        # 关键属性检测
+        {"type": "attribute_contains", "name": "title", "value": "search", "weight": 25},
+        {"type": "attribute_contains", "name": "aria-label", "value": "search", "weight": 25},
+        {"type": "attribute_contains", "name": "role", "value": "search", "weight": 30},
+        
+        # 层级关系检测
+        {
+            "type": "ancestor",
+            "tag": "form",
+            "name": "action",
+            "value": "/search",
+            "match_type": "exact",
+            "weight": 40
+        },
+        {
+            "type": "ancestor",
+            "tag": "form",
+            "name": "role",
+            "value": "search",
+            "match_type": "exact",
+            "weight": 35
+        },
+        
+        # 可见性奖励
+        {"type": "visible", "weight": 20}
+    ]
+
+
+def find_target_element(html_content, rules, base_all='textarea'):
+    selector = SmartElementSelector(rules)
+    page = EnhancedAdaptor(html_content)
+    
+    # 获取所有候选元素
+    candidates = page.find_all(base_all)  # 可以扩展为多标签搜索
+    
+    # 计算评分并过滤
+    scored_elements = []
+    for el in candidates:
+        score = selector.calculate_score(el)
+        if score > 0:
+            scored_elements.append((el, score))
+    
+    # 按评分排序
+    scored_elements.sort(key=lambda x: x[1], reverse=True)
+    
+    # 返回最高分元素(带评分验证)
+    if scored_elements:
+        top_score = scored_elements[0][1]
+        # 过滤掉明显低分的候选项
+        finalists = [el for el, s in scored_elements if s >= top_score * 0.8]
+        
+        # 如果有多个高分候选,优先选择更靠近表单的元素
+        if len(finalists) > 1:
+            # 通过DOM深度进行二次排序
+            finalists.sort(key=lambda el: len(el.path))
+        # print(f"finalists {finalists}")
+        best_element = finalists[0]
+        locator_info = page.get_locator_strategy(best_element)
+        # print(f"locator_info {locator_info}")
+        # {'strategy': 'id', 'selector': 'APjFqb'}
+        locator_info['xpath'] = f'//{base_all}[@{locator_info["strategy"]}="{locator_info["selector"]}"]'
+        return locator_info
+        # 添加置信度验证
+        verification_results = page.verify_locator(locator_info)
+        if verification_results['is_unique']:
+            return {
+                'element': best_element,
+                'locator': locator_info,
+                'confidence': 'high'
+            }
+        return {
+            'element': best_element,
+            'locator': locator_info,
+            'confidence': 'medium',
+            'fallbacks': verification_results['alternatives']
+        }
+    return None
+
+def get_search_ele(html_content: str, base_all='textarea'):
+    rules = get_search_rule()
+    return find_target_element(html_content, rules, base_all=base_all)
+def search_demo():
+    # 使用示例
+    file = Path(r'G:\code\upwork\zhang_crawl_bio\output\debug\查询不到搜索框-两个textarea.html')
+    file = Path(r'G:\code\upwork\zhang_crawl_bio\output\debug\查询搜索框结果页面.html')
+    # file = Path(r'G:\code\upwork\zhang_crawl_bio\output\debug\智能选择A标签测试.html')
+
+    html_content = file.read_text(encoding='utf-8')
+    rules = get_search_rule()
+    target_element = find_target_element(html_content, rules)
+    print(target_element)
+
+# Run the demo only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    search_demo()