@@ -37,13 +37,15 @@ class GoogleSearchHandler():
         html_path = save_to_file(await self.page.content(), html_dir / f"{filename}.html")
         logger.info(f"save_to_file {html_path}")
         return html_path
+
     async def _process_single_page(self, keyword: str) -> SearchResultEle:
         content = await self.page.content()
         result_ele = self.get_search_result_ele(content)

         if not result_ele.search_div:
             logger.warning(f"Search result container not found, possibly a verification page. keyword: {keyword}")
-            return False, result_ele
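+            # NOTE: callers now detect the verification case by checking result_ele.search_div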
+            return result_ele

         html_path = await self.save_current_page(keyword, filename=f"{result_ele.current_page}")
         page_result = self.db_manager.save_page_results(
@@ -65,36 +67,50 @@ class GoogleSearchHandler():
         return result_ele

     async def process_keyword(self, keyword: str, max_result_items: int = 200, skip_existing: bool = False):
-        if skip_existing:
-            key_model = self.db_manager.get_keyword_task(keyword)
-            if key_model:
-                logger.info(f"Keyword task already completed, skipping: {keyword}")
-                return key_model
+        key_model = self.db_manager.get_keyword_task(keyword)
+        if skip_existing and key_model:
+            logger.info(f"Keyword task already completed, skipping: {keyword}")
+            return key_model

-        self.db_manager.create_keyword_task(keyword)
+        # Delete old data and recreate the task
+        if key_model:
+            self.db_manager.delete_keyword_task(keyword)
+        key_model = self.db_manager.create_keyword_task(keyword)

         await self.search(keyword)
-        has_next = True
-        result_ele = None
         search_result_item_count = 0
-        while has_next:
+        should_complete = False  # whether the completion condition was met
+
+        while True:
             result_ele = await self._process_single_page(keyword)
+            # Handle abnormal cases such as verification pages
+            if not result_ele.search_div:
+                break
+
             search_result_item_count += len(result_ele.results) if result_ele.results else 0
-            if search_result_item_count > max_result_items:
-                logger.info(f"Keyword {keyword}: result count exceeded {max_result_items}, not processing further pages")
+            # Mark completion when the max result count is reached or there is no next page
+            if search_result_item_count >= max_result_items or not result_ele.next_page_url:
+                should_complete = True
                 break
-            if result_ele.next_page_url:
+
+            try:
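+                # Assumes Google's "Next" link keeps the id 'pnnext'; a missing link or timeout aborts pagination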
                 await self.page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
                 await asyncio.sleep(3)
                 await self.page.click("//a[@id='pnnext']", timeout=10000)
                 logger.info(f"self.page.url {self.page.url}")
-                logger.debug(f"goto next_page_url {result_ele.next_page_url}")
                 await self.page.wait_for_load_state('load', timeout=10000)
-            else:
+            except Exception as e:
+                logger.warning(f"Pagination failed: {str(e)}")
                 break
-
-        key_model = self.db_manager.mark_task_completed(keyword)
-        logger.info(f"Finished processing keyword: {keyword}")
+
+        # Only mark the task completed on a normal finish
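+        # Interrupted tasks are not marked completed and remain pending in the database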
+        if should_complete:
+            key_model = self.db_manager.mark_task_completed(keyword)
+            logger.info(f"Keyword processing completed normally: {keyword}")
+        else:
+            logger.warning(f"Keyword processing was interrupted: {keyword}")
         return key_model

     async def goto_home_page(self):
@@ -110,16 +126,18 @@ class GoogleSearchHandler():
             raise Exception("User chose to exit; terminating program.")

         raise Exception(f"Human verification encountered; retrying with a new identity. {self.page.url}")
+
     def find_search_div(self, html_content: str) -> bool:
         return bool(Adaptor(html_content).xpath_first('//div[@id="search"]'))
+
     async def search(self, query: str) -> str:
         await self.goto_home_page()
         search_ele_dict = get_search_ele(await self.page.content())
         if not search_ele_dict:
             raise Exception("Search box not found")
         textarea = self.page.locator(search_ele_dict['xpath'])
-        await textarea.fill(query, timeout=10000)  # use textarea.fill() instead of page.fill()
-        await textarea.press('Enter')  # use press to simulate the Enter key
+        await textarea.fill(query, timeout=10000)
+        await textarea.press('Enter')
         await self.page.wait_for_load_state(state='load', timeout=10000)
         return await self.page.content()

@@ -146,7 +164,7 @@ class GoogleSearchHandler():
             return res

         result_list = search_div.xpath('//*[@data-snc]')
-        logger.info(f"Result count on current page: {len(result_list)}")
+        logger.info(f"Result count on current page: {len(result_list)}, next_page_url: {next_page_url}")

         for result_item in result_list:
             if len(result_item.children) < 2:
@@ -167,11 +185,12 @@ class GoogleSearchHandler():
             res.results.append(result)
         return res

-async def search_keyword(keyword, max_result_items=200, skip_existing=True, config: BrowserConfig = BrowserConfig()):
+async def search_keyword(keyword, max_result_items=200, skip_existing=False, config: BrowserConfig = BrowserConfig()):
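+    # skip_existing now defaults to False: a direct call re-crawls and replaces stored data by default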
     ret = {'error': 0, 'msg': '', 'data': None}
     config_dict = config.model_dump()
     logger.info(f"BrowserConfig {config_dict}")
-    # config_dict['config'] = {'navigator.cookieEnabled': False}
+    logger.info(f"keyword {keyword} max_result_items: {max_result_items} skip_existing: {skip_existing}")
     async with AsyncCamoufox(**config_dict) as browser:
         try:

@@ -193,6 +212,7 @@ async def search_keyword(keyword, max_result_items=200, skip_existing=True, conf
         ret['msg'] = f"Failed: {str(e)}"
         ret['data'] = html_path
     return ret
+
 async def aio_main(config: BrowserConfig = BrowserConfig()):
     try:
         async with BrowserCore(config) as core:
@@ -200,20 +220,13 @@ async def aio_main(config: BrowserConfig = BrowserConfig()):
             await search_handler.goto_home_page()
             keywords = [
                 'Acampe carinata essential oil',
-                # 'Acampe cephalotes essential oil',
-                # 'Acampe hulae essential oil',
-                # 'Acampe rigida essential oil',
-                # 'Acamptopappus shockleyi essential oil'
             ]
-            # for keyword in keywords:
-            #     await search_handler.process_keyword(keyword)
             while True:
                 await asyncio.sleep(1)

     except Exception as e:
         logger.error(f"Failed: {str(e)}")

-
 def analyze():
     html_file = Path(r"K:\code\upwork\zhang_crawl_bio\output\results\Acampe_rigida_essential_oil\page_1.html")
     class TestPage:
@@ -223,9 +236,7 @@ def analyze():
     logger.info(f"{res.model_dump_json(indent=4,)}")

 def main():
-    # analyze()
     asyncio.run(aio_main())
-    # asyncio.run(search_keyword('Acampe carinata essential oil', config=BrowserConfig(), skip_existing=False))

 if __name__ == "__main__":
     main()