# crawl_t_copy.py

import asyncio
import os
import pickle
import random
import signal
import ssl
import sys
import time
from pathlib import Path
from typing import List

import aiofiles
import httpx
from camoufox import Camoufox
from camoufox.server import launch_server
from camoufox.async_api import AsyncCamoufox
from sqlmodel import select, Session
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, CrawlResult

# Aliased so it is not shadowed by crawl4ai's BrowserConfig imported above.
from worker.search_engine.camoufox_broswer import BrowserConfig as CamoufoxBrowserConfig
from worker.search_engine.google_search import GoogleSearchHandler
from worker.search_engine.search_result_db import SearchResultManager, KeywordTask, SearchPageResult, SearchResultItem
from mylib.logu import get_logger
from mylib.base import ensure_output_dir, save_to_file, load_from_pickle
from mylib.drission_page import load_random_ua_chrome, load_chrome_from_ini, test_normal_chrome
from utils.proxy_pool import get_random_proxy

logger = get_logger('test')

page = None
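

# Launch Chrome through DrissionPage (load_chrome_from_ini, routed via the local
# proxy) and reuse it as crawl4ai's browser by connecting over its CDP websocket.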
async def task():
    page = load_chrome_from_ini(proxy='http://localhost:1881', auto_port=False)
    # Log the DevTools address, the /json target list, and the CDP websocket URL.
    logger.info(f"{page.browser._chromium_options._address}")
    logger.info(f"{page.browser._driver.get(f'http://{page.browser._chromium_options._address}/json').json()}")
    logger.info(f"{page.browser._driver._websocket_url}")

    item_id = 1
    url = 'https://greg.app/acalypha-marissima-overview/'
    # url = 'https://baidu.com'
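
    # Attach crawl4ai to the browser launched above instead of letting it spawn
    # its own: use_managed_browser plus cdp_url point it at the existing CDP
    # websocket, so the crawl runs inside the proxied Chrome session.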
    browser_config = BrowserConfig(
        headless=False,
        # verbose=False,
        # extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
        debugging_port=9321,
        use_managed_browser=True,
        cdp_url=page.browser._driver._websocket_url,
    )
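
    # Run a single crawl with caching disabled and dump the extracted markdown.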
    async with AsyncWebCrawler(config=browser_config) as crawler:
        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.DISABLED,
        )
        result = await crawler.arun(url=url, config=crawler_config)
        print(result.markdown)
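
    # Equivalent manual lifecycle (start/arun/close), kept for reference: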
    # crawler = AsyncWebCrawler(config=browser_config)
    # await crawler.start()
    # crawl_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
    # result: CrawlResult = await crawler.arun(url=url, config=crawl_config)
    # logger.info(f"{item_id} crawler.arun result.success: {result.success} {result.status_code}")
    # print(result.markdown)
    # await crawler.close()


def main():
    asyncio.run(task())


if __name__ == "__main__":
    main()