camoufox_t.py 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113
  1. import time
  2. from camoufox import Camoufox
  3. from camoufox.server import launch_server
  4. from camoufox.async_api import AsyncCamoufox
  5. import asyncio
  6. import signal
  7. from worker.search_engine.camoufox_broswer import BrowserConfig
  8. from worker.search_engine.google_search import GoogleSearchHandler
  9. from mylib.logu import get_logger
  10. logger = get_logger('test')  # Module-wide logger obtained from mylib.logu under the name 'test'.
  11. page = None  # Module-level page handle; rebound by aio_main() through `global page`.
  12. async def aio_main():
  13. global page
  14. async with AsyncCamoufox(
  15. headless=False,
  16. geoip=True,
  17. proxy={
  18. 'server': 'http://localhost:1881',
  19. }
  20. ) as browser:
  21. page = await browser.new_page()
  22. await page.goto("https://www.browserscan.net")
  23. while not page.is_closed():
  24. # print("page.is_closed() = ", page.is_closed())
  25. await asyncio.sleep(1) # 每隔一秒检查一次
  26. print("Browser has been closed, exiting...")
  27. # 清理操作
  28. await browser.close()
  29. def luanch_browser():
  30. playwright = Camoufox(
  31. geoip=True,
  32. proxy={
  33. 'server': 'http://localhost:1881',
  34. # 'username': 'username',
  35. # 'password': 'password'
  36. }
  37. ).start()
  38. browser = playwright.chromium.launch(headless=False)
  39. print(f"playwright.chromium.executable_path {playwright.chromium.executable_path}")
  40. # 创建一个新的浏览器上下文
  41. context = browser.new_context()
  42. page = context.new_page()
  43. page.goto("https://www.browserscan.net")
  44. def connet_ws():
  45. with sync_playwright() as p:
  46. ws_file = Path(r'K:\code\upwork\zhang_crawl_bio\output\ws.txt')
  47. content = ws_file.read_text()
  48. ws_url_pattern = r'ws://[^\s]+'
  49. match = re.search(ws_url_pattern, content)
  50. ws_url = match.group(0)
  51. browser = p.firefox.connect(ws_url)
  52. page = browser.new_page()
  53. print(page.url)
  54. return
  55. def server_brower():
  56. launch_server(
  57. headless=False,
  58. geoip=True,
  59. proxy={
  60. 'server': 'http://localhost:1881',
  61. 'username': 'username',
  62. 'password': 'password'
  63. }
  64. )
  65. return
  66. async def search_keyword(keyword, max_result_items=200, skip_existing=True, config: BrowserConfig = BrowserConfig()):
  67. ret = {'error': 0, 'msg': '', 'data': None}
  68. config_dict = config.model_dump()
  69. logger.info(f"BrowserConfig {config_dict}")
  70. # config_dict['config'] = {'navigator.cookieEnabled': False}
  71. async with AsyncCamoufox(**config_dict) as browser:
  72. try:
  73. self = GoogleSearchHandler(await browser.new_page())
  74. await self.page.goto(r'G:\code\upwork\zhang_crawl_bio\output\debug\查询不到搜索框-两个textarea.html', wait_until=None, timeout=10000)
  75. textarea = self.page.locator('xpath=//body/textarea')
  76. logger.info(f"textarea {textarea}")
  77. await textarea.fill('', timeout=10000) # 使用 textarea.fill() 而不是 page.fill()
  78. await textarea.press('Enter') # 使用 press 方法模拟按下 Enter 键
  79. await self.page.wait_for_load_state(state='load', timeout=10000)
  80. while True:
  81. await asyncio.sleep(1)
  82. # kw = await search_handler.process_keyword(keyword, max_result_items=max_result_items, skip_existing=skip_existing)
  83. # if not kw:
  84. # ret['error'] = 1
  85. # html_path = await search_handler.save_current_page(keyword, filename=f"warning_{time.strftime('%Y%m%d_%H%M%S')}")
  86. # logger.warning(f"关键词任务未完成: {keyword} html_path: {html_path}")
  87. # ret['msg'] = f"关键词任务未完成: {keyword}"
  88. # ret['data'] = html_path
  89. # return ret
  90. # ret['data'] = kw.model_dump()
  91. # return ret
  92. except Exception as e:
  93. # html_path = await self.save_current_page(keyword, filename=f"error_{time.strftime('%Y%m%d_%H%M%S')}")
  94. logger.exception(f"失败: {str(e)} html_path:")
  95. ret['error'] = 1
  96. ret['msg'] = f"失败: {str(e)}"
  97. return ret
  98. def main():
  99. asyncio.run(search_keyword('123'))
  100. if __name__ == "__main__":
  101. main()