@@ -1,16 +1,17 @@
-from pathlib import Path
 import time
 import re
-from typing import Optional
+import logging
+from pathlib import Path
 from DrissionPage import ChromiumPage
-from DrissionPage.common import Keys
 from pydantic import BaseModel
 from scrapling import Adaptor
 from mylib.logu import logger
 from mylib.base import save_to_file
-from worker.search_engine.search_result_db import SearchResultManager, SearchResultItem, KeywordTask
 from config.settings import OUTPUT_DIR
 from mylib.drission_page import load_chrome_from_ini
+from worker.search_engine.search_result_db import SearchResultManager, SearchResultItem, KeywordTask
+from worker.search_engine.smart_selector import get_search_ele
+from DrissionPage.common import Keys

 class SearchResultEle(BaseModel):
     search_div: bool | None = None
@@ -24,7 +25,7 @@ class GoogleSearchHandlerDrission:
         self.db_manager = SearchResultManager()
         self.save_dir = OUTPUT_DIR / 'results'

-    def save_current_page(self, keyword: str, filename: str = time.strftime("%Y%m%d_%H%M%S")) -> Path:
+    def save_current_page(self, keyword: str, filename: str | None = None) -> Path:
+        # Compute the timestamp per call; a strftime() default argument would be frozen at import time
+        filename = filename or time.strftime("%Y%m%d_%H%M%S")
         html_dir = self.save_dir / keyword
         html_dir.mkdir(parents=True, exist_ok=True)
         html_path = save_to_file(self.page.html, html_dir / f"{filename}.html")
@@ -58,61 +59,78 @@ class GoogleSearchHandlerDrission:

         return result_ele

-    def process_keyword(self, keyword: str, max_result_items: int = 200, skip_existing: bool = False) -> KeywordTask:
-        if skip_existing:
-            key_model = self.db_manager.get_keyword_task(keyword)
-            if key_model:
-                logger.info(f"Keyword task already completed, skipping: {keyword}")
-                return key_model
+    def check_keyword(self, keyword: str, skip_existing: bool) -> tuple[bool, KeywordTask]:
+        key_model = self.db_manager.get_keyword_task(keyword)
+        if skip_existing and key_model and key_model.is_completed:
+            logger.info(f"Keyword task already completed, skipping: id={key_model.id} {keyword}")
+            return True, key_model
+
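+        # Task exists but is stale or incomplete: recreate it from scratch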
+        if key_model:
+            self.db_manager.delete_keyword_task(keyword)
+        key_model = self.db_manager.create_keyword_task(keyword)
+        return False, key_model

-        self.db_manager.create_keyword_task(keyword)
+    def process_keyword(self, keyword: str, max_result_items: int = 200, skip_existing: bool = False) -> KeywordTask:
+        exist, key_model = self.check_keyword(keyword, skip_existing)
+        if exist:
+            return key_model
         self.search(keyword)
-
-        has_next = True
         search_result_item_count = 0
-        finitsh_flag = False
-        while has_next:
+        should_complete = False
+
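+        # Paginate until we reach max_result_items or run out of next-page links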
+        while True:
             result_ele = self._process_single_page(keyword)
+            if not result_ele.search_div:
+                break
+
             search_result_item_count += len(result_ele.results) if result_ele.results else 0
-
-            if search_result_item_count > max_result_items:
-                logger.info(f"Keyword {keyword}: page result count exceeded {max_result_items}, skipping next page")
-                finitsh_flag = True
+            if search_result_item_count >= max_result_items or not result_ele.next_page_url:
+                should_complete = True
                 break
-            if result_ele.next_page_url:
+
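+            # Click-through can fail (consent page, DOM change); any error ends pagination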
+            try:
                 self.page.scroll.to_bottom()
                 time.sleep(3)
                 next_btn = self.page.ele('#pnnext')
                 if next_btn:
                     next_btn.click()
                     logger.info(f"Navigating to next page: {self.page.url}")
+                    self.page.wait.load_start()
                 else:
-                    finitsh_flag = True
                     break
-            else:
+            except Exception as e:
+                logger.warning(f"Failed to go to next page: {str(e)}")
                 break
-
+
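+        # Mark the task completed only when pagination ended normally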
+        if should_complete:
+            key_model = self.db_manager.mark_task_completed(keyword)
+            logger.info(f"Keyword processing completed normally: {keyword}")
+        else:
+            logger.warning(f"Keyword processing was interrupted: {keyword}")
         return key_model

     def goto_home_page(self):
         url = "https://www.google.com"
         if self.page.url != url:
             self.page.get(url)
+            self.page.wait.load_start()
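+        # Google redirects to a /sorry/ URL when it suspects automated traffic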
         if 'sorry/' in self.page.url:
-            raise Exception("CAPTCHA triggered, manual intervention required")
+            raise Exception(f"CAPTCHA triggered, retrying with a new identity... {self.page.url}")

     def search(self, query: str):
         self.goto_home_page()
-        search_box = self.page.ele('textarea')
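+        # Locate the search box via the smart selector rather than a hard-coded <textarea> lookup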
+        search_ele_dict = get_search_ele(self.page.html)
+        if not search_ele_dict:
+            raise Exception("Search box not found")
+        search_box = self.page.ele(f"xpath:{search_ele_dict['xpath']}")
         search_box.input(query)
         self.page.actions.type(Keys.ENTER)
+        self.page.wait.load_start()

     def get_current_page_num(self) -> int:
         if '/search?q=' in self.page.url:
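+            # Google paginates with &start=N at 10 results per page; no start param means page 1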
             match = re.search(r'&start=(\d+)', self.page.url)
             return int(match.group(1)) // 10 + 1 if match else 1
+        return 1  # Default to page 1 when not on a results URL

     def get_search_result_ele(self, html_content: str) -> SearchResultEle:
         res = SearchResultEle(
             search_div=None,
             next_page_url=None,
@@ -126,12 +144,11 @@ class GoogleSearchHandlerDrission:
         next_page_url = body.xpath_first('//a[@id="pnnext"]/@href')
         res.search_div = bool(search_div)
         res.next_page_url = f"https://www.google.com{next_page_url}" if next_page_url else None
-
         if not search_div:
             return res

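+        # Result blocks currently carry a data-snc attribute; this selector may break if Google changes its markup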
         result_list = search_div.xpath('//*[@data-snc]')
-        logger.info(f"Results on current page: {len(result_list)}")
+        logger.info(f"Results on current page: {len(result_list)}, next_page_url: {next_page_url}")

         for result_item in result_list:
             if len(result_item.children) < 2:
@@ -152,18 +169,39 @@ class GoogleSearchHandlerDrission:
             res.results.append(result)
         return res

-def search_keyword_drission(keyword: str, max_result_items: int = 1, skip_existing: bool = False):
-    # page = load_chrome_from_ini(proxy='http://localhost:1881')
-    page = load_chrome_from_ini(proxy='http://localhost:1881')
+def search_keyword_drission(keyword: str, max_result_items: int = 200, skip_existing: bool = True) -> dict:
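+    # Uniform result envelope: error flag, message, and payload (task dict or saved-HTML path)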
+    ret = {'error': 0, 'msg': '', 'data': None}
+    logger.info(f"keyword {keyword} max_result_items: {max_result_items} skip_existing: {skip_existing}")
+    search_handler = GoogleSearchHandlerDrission(None)
+    exist, keyword_model = search_handler.check_keyword(keyword, skip_existing)
+    if exist and keyword_model.is_completed:
+        ret['data'] = keyword_model.model_dump()
+        return ret
+
+    page = load_chrome_from_ini(proxy='http://127.0.0.1:1881')
     try:
-        handler = GoogleSearchHandlerDrission(page)
-        return handler.process_keyword(keyword, max_result_items, skip_existing)
+        search_handler = GoogleSearchHandlerDrission(page)
+        kw = search_handler.process_keyword(keyword, max_result_items=max_result_items, skip_existing=skip_existing)
+        if not kw:
+            ret['error'] = 1
+            html_path = search_handler.save_current_page(keyword, filename=f"warning_{time.strftime('%Y%m%d_%H%M%S')}")
+            logger.warning(f"Keyword task not completed: {keyword} html_path: {html_path}")
+            ret['msg'] = f"Keyword task not completed: {keyword}"
+            ret['data'] = html_path
+            return ret
+        ret['data'] = kw.model_dump()
+        return ret
     except Exception as e:
-        logger.exception(f"Keyword processing failed: {keyword}")
-        raise
+        html_path = search_handler.save_current_page(keyword, filename=f"error_{time.strftime('%Y%m%d_%H%M%S')}")
+        logger.exception(f"Failed: {str(e)} html_path: {html_path}")
+        ret['error'] = 1
+        ret['msg'] = f"Failed: {str(e)}"
+        ret['data'] = html_path
+        return ret
+    finally:
+        # Release the browser on every path, not only on error
+        page.quit()

 def main():
-    search_keyword_drission("drission")
+    search_keyword_drission("drission", max_result_items=15)

 if __name__ == "__main__":
-    main()
+    main()
|