Browse source code

Test fingerprint browsers and camoufox examples

mrh 1 year ago
parent
commit
9138dec48e

+ 0 - 0
.clinerules-code


+ 6 - 1
.env

@@ -1,3 +1,8 @@
 PYTHONPATH=.
 HTTP_PROXY=http://127.0.0.1:1881
-HTTPS_PROXY=http://127.0.0.1:1881
+HTTPS_PROXY=http://127.0.0.1:1881
+
+OPENAI_API_KEY=sk-NscqaCD1PfVm7soEF3C3E6297bE14d7fB595Be8f17F39aFf
+# OPENAI_API_KEY=sk-gEuP5g8IrS15vwsI578e6532Ab914f519dEcA49a69C4277a
+OPENAI_BASE_URL=https://aiapi.magong.site/v1
+

+ 1 - 1
config/settings.py

@@ -1,5 +1,5 @@
 from pathlib import Path
 WORK_DIR = Path(__file__).parent.parent.absolute()
 OUTPUT_DIR = WORK_DIR / "output"
-CONFIG_DIR = WORK_DIR / "mylib" / "conf"
+CONFIG_DIR = WORK_DIR / "config" / "conf"
 GOOGLE_SEARCH_DIR = OUTPUT_DIR / "google_search"

+ 32 - 3
readme.md

@@ -16,19 +16,34 @@ https://github.com/PyFilesystem/pyfilesystem2
 6.6k ⭐  Comprehensive collection of crawler frameworks
 https://github.com/BruceDone/awesome-crawler
 
-
+Comprehensive collection of crawler tools; search the linked page for "爬" (crawl)
+https://github.com/GitHubDaily/GitHubDaily/blob/cb618c17a72fc5a62248e5ac863d46fe0164487b/README.md?plain=1#L190
 
 
 330 ⭐  Awesome collection of web-page parsers and data-extraction tools
 https://github.com/kimtth/awesome-azure-openai-llm/blob/9b16663bb4e38bc8760f3f274b92dfcca0ada34a/section/app.md
 Keywords: https://github.com/search?q=Trafilatura+awesome++language%3AMarkdown&type=code&l=Markdown
 
+
+
+
 34.9k ⭐ markitdown
 https://github.com/microsoft/markitdown
 
 22k ⭐ firecrawl: AI scraping that returns clean, structured pages
 https://github.com/mendableai/firecrawl
 
+17.3k ⭐  Python: uses AI to scrape page data automatically, parse it into markdown, and extract custom-defined fields
+It can also generate scraper code, so a fixed page can be re-scraped by running that code
+https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/README.md
+
+7.5K ⭐  No-code: click an element with the mouse to extract all similar data into a table, JSON, or an API endpoint
+https://github.com/getmaxun/maxun
+
+241 ⭐  Describe what you want in natural language inside your code and the AI extracts the matching data; it can also convert pages to markdown and automate typing, pressing Enter, and submitting forms
+Requires an LLM API key
+https://github.com/dendrite-systems/dendrite-python-sdk
+
 readerLM-v2
 
 3.1k ⭐ Web-page parser
@@ -40,8 +55,22 @@ https://trafilatura.readthedocs.io/en/latest/evaluation.html#results-2022-05-18
 339 ⭐ Article extractor; a paper plus an evaluation benchmark
 https://github.com/scrapinghub/article-extraction-benchmark
 
-1.4k ⭐  Lightning-fast parser, 240x faster than bs4
+1.4k ⭐  Lightning-fast parser, 240x faster than bs4;
+supports similar-element search to speed up lookups, plus smart navigation for quick jumps to parent, child, and sibling elements;
+if an element's attributes change, it can intelligently re-identify the changed element
 https://github.com/D4Vinci/Scrapling
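
A minimal sketch of the similar-element search and smart navigation described above, based on Scrapling's Adaptor API (the HTML and selectors here are invented for illustration):

```python
from scrapling import Adaptor

html = '<div><p class="price">$10</p><p class="price">$20</p><span>note</span></div>'
page = Adaptor(html)

first_price = page.css_first('.price')  # first element matching the selector
print(first_price.find_similar())       # other elements with a similar structure
print(first_price.parent)               # smart navigation: jump to the parent...
print(first_price.siblings)             # ...or to sibling elements
```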
+
+### Anti-bot-detection browsers
+365 ⭐  Patches stock Playwright so that automation is not detected
+https://github.com/rebrowser/rebrowser-playwright-python
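
A minimal sketch, assuming rebrowser-playwright installs as a drop-in replacement where only the import path changes versus stock Playwright:

```python
# Assumes: pip install rebrowser-playwright
from rebrowser_playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)
    page = browser.new_page()
    page.goto('https://bot.sannysoft.com/')  # one of the detection test pages listed below
    page.screenshot(path='detection_check.png')
    browser.close()
```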
+
+0.98k ⭐  Anti-bot-detection browser
+https://github.com/daijro/camoufox
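
A minimal sketch of the Camoufox sync API (Firefox-based); the tests/mytest files added in this commit show fuller usage, including the remote-server mode:

```python
from camoufox.sync_api import Camoufox

# geoip=True matches locale/timezone fingerprints to the exit IP
with Camoufox(headless=False, geoip=True) as browser:
    page = browser.new_page()
    page.goto('https://www.browserscan.net')
```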
+
+
 Bot-detection websites, for testing
 https://www.browserscan.net/bot-detection
-"https://bot.sannysoft.com/"
+"https://bot.sannysoft.com/"
+
+Check whether your proxy IP and browser fingerprint pass as genuine
+https://www.browserscan.net

+ 50 - 0
tests/mytest/camoufox_connect_server.py

@@ -0,0 +1,50 @@
+import time
+from camoufox import Camoufox
+from camoufox.server import launch_server
+
+def launch_browser():
+    # Camoufox's start() returns a Firefox-based Playwright Browser
+    browser = Camoufox(
+        geoip=True,
+        proxy={
+            'server': 'http://localhost:1881',
+            # 'username': 'username',
+            # 'password': 'password'
+        }
+    ).start()
+    # Create a new browser context
+    context = browser.new_context()
+    page = context.new_page()
+    page.goto("https://www.browserscan.net")
+
+def server_browser():
+    # Launch a Camoufox websocket server that remote Playwright clients can connect to
+    launch_server(
+        headless=False,
+        geoip=True,
+        proxy={
+            'server': 'http://localhost:1881',
+            'username': 'username',
+            'password': 'password'
+        }
+    )
+
+def connect_server():
+    from playwright.sync_api import sync_playwright
+
+    with sync_playwright() as p:
+        # Example endpoint; launch_server() prints the actual ws:// address to connect to
+        browser = p.firefox.connect('ws://localhost:12696/aacc88d1a82b1805f826a43b576cba0d')
+        page = browser.new_page()
+        page.goto('https://www.browserscan.net')
+        print(page.title())
+        time.sleep(15)
+        page.screenshot(path='./1.png')
+
+def main():
+    connect_server()
+    # server_browser()
+
+if __name__ == "__main__":
+    main()

+ 59 - 0
tests/mytest/camoufox_t.py

@@ -0,0 +1,59 @@
+from camoufox import Camoufox
+from camoufox.server import launch_server
+from camoufox.async_api import AsyncCamoufox
+import asyncio
+import signal
+
+page = None
+async def aio_main():
+    global page
+    async with AsyncCamoufox(
+        headless=False,
+        geoip=True,
+        proxy={
+            'server': 'http://localhost:1881',
+        }
+    ) as browser:
+        page = await browser.new_page()
+        await page.goto("https://www.browserscan.net")
+        
+        while not page.is_closed():
+            # print("page.is_closed() = ", page.is_closed())
+            await asyncio.sleep(1)  # check once per second
+        
+        print("Browser has been closed, exiting...")
+        # cleanup (the async context manager also closes the browser on exit)
+        await browser.close()
+
+def launch_browser():
+    # Camoufox's start() returns a Firefox-based Playwright Browser
+    browser = Camoufox(
+        geoip=True,
+        proxy={
+            'server': 'http://localhost:1881',
+            # 'username': 'username',
+            # 'password': 'password'
+        }
+    ).start()
+    # Create a new browser context
+    context = browser.new_context()
+    page = context.new_page()
+    page.goto("https://www.browserscan.net")
+
+def server_browser():
+    # Launch a Camoufox websocket server that remote Playwright clients can connect to
+    launch_server(
+        headless=False,
+        geoip=True,
+        proxy={
+            'server': 'http://localhost:1881',
+            'username': 'username',
+            'password': 'password'
+        }
+    )
+
+def main():
+    asyncio.run(aio_main())
+
+if __name__ == "__main__":
+    main()

+ 10 - 9
tests/mytest/googlesearch_t.py

@@ -1,11 +1,12 @@
 # Get the first 20 hits for: "Breaking Code" WordPress blog
 import os
-from dotenv import load_dotenv
-load_dotenv()
-print(f"http_proxy: {os.getenv('http_proxy')}")
-print(f"https_proxy: {os.getenv('https_proxy')}")
-from googlesearch import search
-import googlesearch
-print(googlesearch.get_random_user_agent())
-for url in search('"Breaking Code" WordPress blog', stop=2):
-    print(url)
+# from dotenv import load_dotenv
+# load_dotenv()
+from mylib.drission_page import load_chrome_from_ini
+
+def main():
+    page = load_chrome_from_ini()
+    page.get("https://www.google.com/")
+
+if __name__ == "__main__":
+    main()

+ 71 - 2
tests/mytest/playwright_t.py

@@ -1,5 +1,9 @@
 import asyncio
 from playwright.async_api import async_playwright,CDPSession,Page,Locator
+from playwright.sync_api import sync_playwright
+import configparser
+from config.settings import CONFIG_DIR
+
 # Launch Chrome by pointing at its executable path
 def start_chrome():
     import subprocess
@@ -17,12 +21,77 @@ async def get_win_pos_by_cdp(port=9220, host='localhost'):
             context = contexts[0]  
             print(f"context.pages {context.pages}")
             page = context.pages[1]
+
+
+def start_not_exit():
+    '''Reference snippet: attach to an already-running browser over CDP
+    self.playwright = await async_playwright().start()
+    self.page:Page = await self.get_page_from_cdp(self.playwright)
+
+    browser = await playwright.chromium.connect_over_cdp(self.model.browser_addr)
+    if not browser.contexts:
+        context = await browser.new_context()
+    else:
+        context = browser.contexts[0]
+    page = await self.find_page(context, page_index=page_index, url_prefix=url_prefix)
+    return page
+    '''
+    # Start Playwright
+    playwright = sync_playwright().start()
+
+    # Launch a browser (e.g. Chromium)
+    browser = playwright.chromium.launch(headless=False)
+
+    # Create a new browser context
+    context = browser.new_context()
+
+    # Create a new page
+    page = context.new_page()
+
+    # Navigate to a page
+    page.goto('https://example.com')
+
+    # Further interaction with the page can go here
+    print(page.title())
+
+    # Keep the browser open instead of exiting;
+    # you can keep interacting with the page here
+
+    # When you want to close the browser, call this manually:
+    # browser.close()
+
+    # Stop Playwright
+    # playwright.stop()
+
+def load_init():
+    # Create a ConfigParser object
+    config = configparser.ConfigParser()
+
+    # Read the config file
+    config.read(CONFIG_DIR / '9321.ini')
+
+    # Read specific path values
+    download_path = config.get('paths', 'download_path')
+    tmp_path = config.get('paths', 'tmp_path')
+
+    # Read the Chromium options
+    address = config.get('chromium_options', 'address')
+    browser_path = config.get('chromium_options', 'browser_path')
+
+    # Print the results
+    print(f"Download Path: {download_path}")
+    print(f"Tmp Path: {tmp_path}")
+    print(f"Address: {address}")
+    print(f"Browser Path: {browser_path}")
+
 async def main():
-    start_chrome()
+    
+    # start_chrome()
     '''Chrome must be launched with a user_data_dir that matches the browser version:
"C:\Program Files\Google\Chrome\Application\chrome.exe" --remote-debugging-port=9222 --user-data-dir=K:\\code\\upwork\\zhang_crawl_bio\\output\\user_data_dir2
     '''
     # await get_win_pos_by_cdp()
 
 if __name__ == "__main__":
-    asyncio.run(main())
+    start_not_exit()
+    # load_init()
+    # asyncio.run(main())

+ 48 - 0
tests/mytest/scrapegraph_t.py

@@ -0,0 +1,48 @@
+""" 
+Basic example of a script-generation pipeline using ScrapeGraphAI's ScriptCreatorGraph
+https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/31087937bef20eadcb83e28688077eff13ed2780/examples/script_generator_graph/openai/script_generator_openai.py#L7
+"""
+import os
+import json
+from dotenv import load_dotenv
+from scrapegraphai.graphs import ScriptCreatorGraph
+from scrapegraphai.utils import prettify_exec_info
+
+load_dotenv()
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+
+graph_config = {
+    "llm": {
+        "api_key": os.getenv("OPENAI_API_KEY"),
+        "base_url": os.getenv("OPENAI_BASE_URL"),
+        "model": "openai/deepseek-chat",
+    },
+    "library": "beautifulsoup",
+    "verbose": True,
+    "headless": False,
+}
+
+# ************************************************
+# Create the ScriptCreatorGraph instance and run it
+# ************************************************
+
+script_creator_graph = ScriptCreatorGraph(
+    prompt="List me all the news with their description.",
+    # also accepts a string with the already downloaded HTML code
+    source="https://perinim.github.io/projects",
+    config=graph_config
+)
+
+result = script_creator_graph.run()
+print(json.dumps(result, indent=4))
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = script_creator_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))

+ 9 - 1
tests/mytest/scrapling_t.py

@@ -6,6 +6,9 @@ I only made this example to show how Scrapling features can be used to scrape a
 import requests
 
 from scrapling import Adaptor
+from pathlib import Path
+from mylib.base import save_to_file
+
 def stackoverflow_demo():
     response = requests.get('https://stackoverflow.com/questions/tagged/web-scraping?sort=MostVotes&filters=NoAcceptedAnswer&edited=true&pagesize=50&page=2')
     page = Adaptor(response.text, url=response.url)
@@ -25,7 +28,12 @@ def stackoverflow_demo():
             print(i, title.text, author.text)
 
 def google_search_demo():
-    search_key = "python"
+    file = Path(r'K:\code\upwork\zhang_crawl_bio\output\google_search\Acalypha manniana essential oil\10.html')
+    html_content = file.read_text(encoding='utf-8')
+    page = Adaptor(html_content)
+    page.has_class('quote')  # bool check for a CSS class (result unused here)
+    print(page.find_by_text('Medicinal plants from the genus Acalypha'))
+
 def main():
     google_search_demo()
 

+ 25 - 30
tests/mytest/t.py

@@ -1,38 +1,33 @@
 import asyncio
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
-from mylib.base import OUTPUT_DIR
+import requests
+from bs4 import BeautifulSoup
+import json
+from dotenv import load_dotenv
+from scrapegraphai.graphs import ScriptCreatorGraph
+from scrapegraphai.utils import prettify_exec_info
 
-async def main():
-    # 1) Reference your persistent data directory
-    browser_config = BrowserConfig(
-        headless=False,             # 'True' for automated runs
-        verbose=True,
-        use_managed_browser=False,  # Enables persistent browser strategy
-        use_persistent_context=True,
-        browser_type="chromium",
-        user_data_dir=OUTPUT_DIR / "user_data_dir2"
-    )
+load_dotenv()
+
+def t_main():
+    # Inline run of the scraper script generated by ScriptCreatorGraph
+    # (the raw generated string is kept, commented out, in main() below)
+    url = "https://perinim.github.io/projects"
+    response = requests.get(url)
+    soup = BeautifulSoup(response.content, 'html.parser')
 
-    # 2) Standard crawl config
-    search_key='Acalypha malabarica essential oil'
-    start=30
-    url = f"https://www.google.com/search?q={search_key}&start={start}"
-    print(f"search url: {url}")
-    crawl_config = CrawlerRunConfig(
-        wait_for="css:.logged-in-content",
-        url=url,
-    )
-    browser_config.url = url
-    async with AsyncWebCrawler(config=browser_config) as crawler:
+    news_list = []
 
-    # crawler = AsyncWebCrawler(config=browser_config)
-    # await crawler.start()
-        result = await crawler.arun(
-            url=url,
-            config=crawl_config,
-        )
-        # save_to_pickle(result, OUTPUT_DIR / f"{search_key}.pickle")
-        return result
+    for news in soup.find_all('div', class_='news-item'):
+        title = news.find('h2').text.strip()
+        description = news.find('p').text.strip()
+        news_list.append({
+            "title": title,
+            "description": description
+        })
 
+    print(json.dumps(news_list, indent=4))
+
+async def main():
+    t_main()
+    # s = '''python\nimport requests\nfrom bs4 import BeautifulSoup\nimport json\n\ndef main():\n    url = \"https://perinim.github.io/projects\"\n    response = requests.get(url)\n    soup = BeautifulSoup(response.content, 'html.parser')\n    \n    news_list = []\n    \n    for news in soup.find_all('div', class_='news-item'):\n        title = news.find('h2').text.strip()\n        description = news.find('p').text.strip()\n        news_list.append({\n            \"title\": title,\n            \"description\": description\n        })\n    \n    print(json.dumps(news_list, indent=4))\n\nif __name__ == \"__main__\":\n    main()\n'''
+    # print(s)
 if __name__ == "__main__":
     asyncio.run(main())