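# Scratch/test script: attach crawl4ai's AsyncWebCrawler to an already-running
# Chrome instance managed by DrissionPage (started via load_chrome_from_ini),
# connecting over its CDP websocket, then fetch one page and print the markdown.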
import asyncio
import aiofiles
import os
import sys
import time
from camoufox import Camoufox
from camoufox.server import launch_server
from camoufox.async_api import AsyncCamoufox
import signal
from worker.search_engine.camoufox_broswer import BrowserConfig
from worker.search_engine.google_search import GoogleSearchHandler
from mylib.logu import get_logger
import pickle
from pathlib import Path
import random
from typing import List
import httpx
import ssl
from sqlmodel import select, Session
# Note: crawl4ai's BrowserConfig shadows the camoufox_broswer import above;
# the crawl4ai class is the one used in task() below.
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, CrawlResult
from worker.search_engine.search_result_db import SearchResultManager, KeywordTask, SearchPageResult, SearchResultItem
from mylib.base import ensure_output_dir, save_to_file, load_from_pickle
from utils.proxy_pool import get_random_proxy
from mylib.drission_page import load_random_ua_chrome, load_chrome_from_ini, test_normal_chrome
logger = get_logger('test')

# Module-level placeholder; task() below rebinds its own local `page`.
page = None
async def task():
    # Launch (or attach to) Chrome through DrissionPage's ini-based config,
    # routed through a local proxy, and log its CDP debug address / websocket URL.
    page = load_chrome_from_ini(proxy='http://localhost:1881', auto_port=False)
    logger.info(f"{page.browser._chromium_options._address}")
    logger.info(f"{page.browser._driver.get(f'http://{page.browser._chromium_options._address}/json').json()}")
    logger.info(f"{page.browser._driver._websocket_url}")

    item_id = 1
    url = 'https://greg.app/acalypha-marissima-overview/'
    # url = 'https://baidu.com'

    # Point crawl4ai at the already-running browser via its CDP websocket URL
    # instead of letting it launch its own instance.
    browser_config = BrowserConfig(
        headless=False,
        # verbose=False,
        # extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
        debugging_port=9321,
        use_managed_browser=True,
        cdp_url=page.browser._driver._websocket_url
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.DISABLED,
        )
        result = await crawler.arun(url=url, config=crawler_config)
        print(result.markdown)

    # Alternative: manage the crawler lifecycle manually instead of using the context manager.
    # crawler = AsyncWebCrawler(config=browser_config)
    # await crawler.start()
    # crawl_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
    # result: CrawlResult = await crawler.arun(url=url, config=crawl_config)
    # logger.info(f"{item_id} crawler.arun result.success: {result.success} {result.status_code}")
    # print(result.markdown)
    # await crawler.close()
def main():
    asyncio.run(task())


if __name__ == "__main__":
    main()