# smart_selector.py
import logging
from pathlib import Path

from scrapling import Adaptor
  3. class SmartElementSelector:
  4. def __init__(self, rules):
  5. self.rules = rules
  6. def is_hidden(self, element):
  7. """综合可见性检测"""
  8. # 内联样式检测
  9. if 'style' in element.attrib:
  10. style = element.attrib['style'].lower()
  11. if 'display:none' in style or 'visibility:hidden' in style:
  12. return True
  13. # 隐藏属性检测
  14. if element.attrib.get('hidden') in ['', 'hidden']:
  15. return True
  16. # 常见隐藏类名检测
  17. if 'class' in element.attrib:
  18. class_names = element.attrib['class'].lower().split()
  19. if {'hidden', 'd-none', 'invisible'} & set(class_names):
  20. return True
  21. return False
  22. def check_ancestor(self, element, rule):
  23. """递归检查祖先节点"""
  24. target_tag = rule.get('tag', '').lower()
  25. attr_name = rule.get('name', '').lower()
  26. attr_value = rule.get('value', '').lower()
  27. current = element.parent
  28. while current:
  29. # 标签匹配检测
  30. if current.tag.lower() == target_tag:
  31. # 属性值检测
  32. current_value = current.attrib.get(attr_name, '').lower()
  33. if rule['match_type'] == 'contains':
  34. if attr_value in current_value:
  35. return True
  36. elif rule['match_type'] == 'exact':
  37. if current_value == attr_value:
  38. return True
  39. current = current.parent
  40. return False
  41. def calculate_score(self, element):
  42. """动态权重评分系统"""
  43. if self.is_hidden(element):
  44. return -1 # 直接排除隐藏元素
  45. score = 0
  46. for rule in self.rules:
  47. # 标签类型检测
  48. if rule['type'] == 'tag':
  49. if element.tag.lower() == rule['value'].lower():
  50. score += rule['weight']
  51. # 属性包含检测
  52. elif rule['type'] == 'attribute_contains':
  53. attr_value = element.attrib.get(rule['name'], '').lower()
  54. if rule['value'].lower() in attr_value:
  55. score += rule['weight']
  56. # 祖先节点检测
  57. elif rule['type'] == 'ancestor':
  58. if self.check_ancestor(element, rule):
  59. score += rule['weight']
  60. # 可见性奖励分
  61. elif rule['type'] == 'visible':
  62. if not self.is_hidden(element):
  63. score += rule['weight']
  64. return score
  65. class LocatorGenerator:
  66. @staticmethod
  67. def generate_locators(element):
  68. """生成多种定位策略并按优先级排序"""
  69. locators = []
  70. # 1. 优先使用唯一标识属性
  71. if element_id := element.attrib.get('id'):
  72. locators.append(('id', f'#{element_id}'))
  73. # 2. 使用稳定的name属性
  74. if name := element.attrib.get('name'):
  75. locators.append(('name', f'[name="{name}"]'))
  76. # 3. 组合关键ARIA属性
  77. aria_attrs = []
  78. if aria_label := element.attrib.get('aria-label'):
  79. aria_attrs.append(f'[aria-label="{aria_label}"]')
  80. if role := element.attrib.get('role'):
  81. aria_attrs.append(f'[role="{role}"]')
  82. if aria_attrs:
  83. locators.append(('aria-combo', ''.join(aria_attrs)))
  84. # 4. 智能类名处理(过滤动态部分)
  85. if classes := element.attrib.get('class', '').split():
  86. static_classes = [c for c in classes if len(c) > 3 and not c.isnumeric()]
  87. if static_classes:
  88. class_selector = '.' + '.'.join(static_classes)
  89. locators.append(('class', class_selector))
  90. # 5. 生成相对XPath(基于邻近特征)
  91. xpath_parts = []
  92. if element.tag:
  93. xpath_parts.append(f'//{element.tag}')
  94. for attr in ['name', 'role', 'placeholder']:
  95. if value := element.attrib.get(attr):
  96. xpath_parts.append(f'[@{attr}="{value}"]')
  97. break
  98. if xpath_parts:
  99. locators.append(('xpath', ''.join(xpath_parts)))
  100. # 按优先级排序:id > name > aria > class > xpath
  101. priority_order = ['id', 'name', 'aria-combo', 'class', 'xpath']
  102. return sorted(locators, key=lambda x: priority_order.index(x[0]) if x[0] in priority_order else len(priority_order))
  103. class EnhancedAdaptor:
  104. def __init__(self, html_content):
  105. self.html_content = html_content
  106. # 这里假设 Adaptor 已经实现了基本的 HTML 解析和元素查找功能
  107. self.page = Adaptor(html_content)
  108. def find_all(self, tag_name):
  109. """查找所有指定标签的元素"""
  110. return self.page.find_all(tag_name)
  111. def get_locator_strategy(self, element):
  112. """获取推荐定位策略"""
  113. locators = LocatorGenerator.generate_locators(element)
  114. # 选择第一个非空定位器
  115. for loc_type, selector in locators:
  116. if loc_type == 'id':
  117. return {'strategy': 'id', 'selector': selector[1:]}
  118. if loc_type in ('name', 'aria-combo', 'class'):
  119. return {'strategy': 'css', 'selector': selector}
  120. if loc_type == 'xpath':
  121. return {'strategy': 'xpath', 'selector': selector}
  122. # 默认返回相对XPath
  123. return {'strategy': 'xpath', 'selector': element.xpath}
  124. def verify_locator(self, locator_info):
  125. """验证定位器有效性"""
  126. results = {
  127. 'is_unique': False,
  128. 'alternatives': []
  129. }
  130. try:
  131. elements = self.find_all(locator_info['selector'], strategy=locator_info['strategy'])
  132. if len(elements) == 1:
  133. results['is_unique'] = True
  134. else:
  135. # 生成备选方案
  136. results['alternatives'] = self.generate_fallback_locators(elements[0])
  137. except:
  138. pass
  139. return results
  140. def generate_fallback_locators(self, element):
  141. """生成备选定位器"""
  142. fallbacks = []
  143. # 添加更多备选策略
  144. if element.attrib.get('name'):
  145. fallbacks.append({'strategy': 'css', 'selector': f'[name="{element.attrib["name"]}"]'})
  146. if element.attrib.get('class'):
  147. fallbacks.append({'strategy': 'css', 'selector': f'.{element.attrib["class"].split()[0]}'})
  148. return fallbacks
  149. def get_search_rule():
  150. # 规则配置(可根据实际需求扩展)
  151. return [
  152. # 基础特征
  153. {"type": "tag", "value": "textarea", "weight": 20},
  154. # 关键属性检测
  155. {"type": "attribute_contains", "name": "title", "value": "search", "weight": 25},
  156. {"type": "attribute_contains", "name": "aria-label", "value": "search", "weight": 25},
  157. {"type": "attribute_contains", "name": "role", "value": "search", "weight": 30},
  158. # 层级关系检测
  159. {
  160. "type": "ancestor",
  161. "tag": "form",
  162. "name": "action",
  163. "value": "/search",
  164. "match_type": "exact",
  165. "weight": 40
  166. },
  167. {
  168. "type": "ancestor",
  169. "tag": "form",
  170. "name": "role",
  171. "value": "search",
  172. "match_type": "exact",
  173. "weight": 35
  174. },
  175. # 可见性奖励
  176. {"type": "visible", "weight": 20}
  177. ]
  178. def find_target_element(html_content, rules, base_all='textarea'):
  179. selector = SmartElementSelector(rules)
  180. page = EnhancedAdaptor(html_content)
  181. # 获取所有候选元素
  182. candidates = page.find_all(base_all) # 可以扩展为多标签搜索
  183. # 计算评分并过滤
  184. scored_elements = []
  185. for el in candidates:
  186. score = selector.calculate_score(el)
  187. if score > 0:
  188. scored_elements.append((el, score))
  189. # 按评分排序
  190. scored_elements.sort(key=lambda x: x[1], reverse=True)
  191. # 返回最高分元素(带评分验证)
  192. if scored_elements:
  193. top_score = scored_elements[0][1]
  194. # 过滤掉明显低分的候选项
  195. finalists = [el for el, s in scored_elements if s >= top_score * 0.8]
  196. # 如果有多个高分候选,优先选择更靠近表单的元素
  197. if len(finalists) > 1:
  198. # 通过DOM深度进行二次排序
  199. finalists.sort(key=lambda el: len(el.path))
  200. # print(f"finalists {finalists}")
  201. best_element = finalists[0]
  202. locator_info = page.get_locator_strategy(best_element)
  203. # print(f"locator_info {locator_info}")
  204. # {'strategy': 'id', 'selector': 'APjFqb'}
  205. locator_info['xpath'] = f'//{base_all}[@{locator_info["strategy"]}="{locator_info["selector"]}"]'
  206. return locator_info
  207. # 添加置信度验证
  208. verification_results = page.verify_locator(locator_info)
  209. if verification_results['is_unique']:
  210. return {
  211. 'element': best_element,
  212. 'locator': locator_info,
  213. 'confidence': 'high'
  214. }
  215. return {
  216. 'element': best_element,
  217. 'locator': locator_info,
  218. 'confidence': 'medium',
  219. 'fallbacks': verification_results['alternatives']
  220. }
  221. return None
  222. def get_search_ele(html_content: str, base_all='textarea'):
  223. rules = get_search_rule()
  224. return find_target_element(html_content, rules, base_all=base_all)
  225. def search_demo():
  226. # 使用示例
  227. file = Path(r'G:\code\upwork\zhang_crawl_bio\output\debug\查询不到搜索框-两个textarea.html')
  228. file = Path(r'G:\code\upwork\zhang_crawl_bio\output\debug\查询搜索框结果页面.html')
  229. # file = Path(r'G:\code\upwork\zhang_crawl_bio\output\debug\智能选择A标签测试.html')
  230. html_content = file.read_text(encoding='utf-8')
  231. rules = get_search_rule()
  232. target_element = find_target_element(html_content, rules)
  233. print(target_element)
  234. if __name__ == "__main__":
  235. search_demo()