Browse source code

Test fingerprint browsers and camoufox examples

mrh 1 year ago
parent
commit
9138dec48e

+ 0 - 0
.clinerules-code


+ 6 - 1
.env

@@ -1,3 +1,8 @@
 PYTHONPATH=.
 HTTP_PROXY=http://127.0.0.1:1881
-HTTPS_PROXY=http://127.0.0.1:1881
+HTTPS_PROXY=http://127.0.0.1:1881
+
+OPENAI_API_KEY=sk-NscqaCD1PfVm7soEF3C3E6297bE14d7fB595Be8f17F39aFf
+# OPENAI_API_KEY=sk-gEuP5g8IrS15vwsI578e6532Ab914f519dEcA49a69C4277a
+OPENAI_BASE_URL=https://aiapi.magong.site/v1
+

+ 1 - 1
config/settings.py

@@ -1,5 +1,5 @@
 from pathlib import Path
 WORK_DIR = Path(__file__).parent.parent.absolute()
 OUTPUT_DIR = WORK_DIR / "output"
-CONFIG_DIR = WORK_DIR / "mylib" / "conf"
+CONFIG_DIR = WORK_DIR / "config" / "conf"
 GOOGLE_SEARCH_DIR = OUTPUT_DIR / "google_search"

+ 32 - 3
readme.md

@@ -16,19 +16,34 @@ https://github.com/PyFilesystem/pyfilesystem2
 6.6k ⭐  Comprehensive collection of crawler frameworks
 https://github.com/BruceDone/awesome-crawler
 
-
+Comprehensive collection of crawler tools; search the linked page for "爬" (crawl)
+https://github.com/GitHubDaily/GitHubDaily/blob/cb618c17a72fc5a62248e5ac863d46fe0164487b/README.md?plain=1#L190
 
 
 330 ⭐  Awesome collection of web-page parsers and data-extraction tools
 https://github.com/kimtth/awesome-azure-openai-llm/blob/9b16663bb4e38bc8760f3f274b92dfcca0ada34a/section/app.md
 Keywords: https://github.com/search?q=Trafilatura+awesome++language%3AMarkdown&type=code&l=Markdown
 
+
+
+
 34.9k ⭐ markitdown
 https://github.com/microsoft/markitdown
 
 22k ⭐ firecrawl: AI scraping that returns clean, structured pages
 https://github.com/mendableai/firecrawl
 
+17.3k ⭐  Python: uses AI to scrape page data automatically, parse it into markdown, and extract custom-defined fields
+It can also generate scraper code, so a fixed page can be re-scraped by running that code
+https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/README.md
+
+7.5K ⭐  No-code: click an element with the mouse to extract all similar data into a table, JSON, or an API endpoint
+https://github.com/getmaxun/maxun
+
+241 ⭐  Describe what you want in natural language inside your code and the AI extracts the matching data; it can also convert pages to markdown and automate typing, pressing Enter, and submitting forms
+Requires an LLM API key
+https://github.com/dendrite-systems/dendrite-python-sdk
+
 readerLM-v2
 
 3.1k ⭐ Web-page parser
@@ -40,8 +55,22 @@ https://trafilatura.readthedocs.io/en/latest/evaluation.html#results-2022-05-18
 339 ⭐ Article extractor; a paper plus an evaluation benchmark
 https://github.com/scrapinghub/article-extraction-benchmark
 
-1.4k ⭐  Lightning-fast parser, 240x faster than bs4
+1.4k ⭐  Lightning-fast parser, 240x faster than bs4;
+supports similar-element search to speed up lookups, plus smart navigation for quick jumps to parent, child, and sibling elements;
+if an element's attributes change, it can intelligently re-identify the changed element
 https://github.com/D4Vinci/Scrapling
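
A minimal sketch of the similar-element search and smart navigation described above, based on Scrapling's Adaptor API (the HTML and selectors here are invented for illustration):

```python
from scrapling import Adaptor

html = '<div><p class="price">$10</p><p class="price">$20</p><span>note</span></div>'
page = Adaptor(html)

first_price = page.css_first('.price')  # first element matching the selector
print(first_price.find_similar())       # other elements with a similar structure
print(first_price.parent)               # smart navigation: jump to the parent...
print(first_price.siblings)             # ...or to sibling elements
```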
+
+### Anti-bot-detection browsers
+365 ⭐  Patches stock Playwright so that automation is not detected
+https://github.com/rebrowser/rebrowser-playwright-python
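
A minimal sketch, assuming rebrowser-playwright installs as a drop-in replacement where only the import path changes versus stock Playwright:

```python
# Assumes: pip install rebrowser-playwright
from rebrowser_playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)
    page = browser.new_page()
    page.goto('https://bot.sannysoft.com/')  # one of the detection test pages listed below
    page.screenshot(path='detection_check.png')
    browser.close()
```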
+
+0.98k ⭐  Anti-bot-detection browser
+https://github.com/daijro/camoufox
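
A minimal sketch of the Camoufox sync API (Firefox-based); the tests/mytest files added in this commit show fuller usage, including the remote-server mode:

```python
from camoufox.sync_api import Camoufox

# geoip=True matches locale/timezone fingerprints to the exit IP
with Camoufox(headless=False, geoip=True) as browser:
    page = browser.new_page()
    page.goto('https://www.browserscan.net')
```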
+
+
 Bot-detection websites, for testing
 https://www.browserscan.net/bot-detection
-"https://bot.sannysoft.com/"
+"https://bot.sannysoft.com/"
+
+Check whether your proxy IP and browser fingerprint pass as genuine
+https://www.browserscan.net

+ 50 - 0
tests/mytest/camoufox_connect_server.py

@@ -0,0 +1,50 @@
+import time
+from camoufox import Camoufox
+from camoufox.server import launch_server
+
+def launch_browser():
+    # Camoufox's start() returns a Firefox-based Playwright Browser
+    browser = Camoufox(
+        geoip=True,
+        proxy={
+            'server': 'http://localhost:1881',
+            # 'username': 'username',
+            # 'password': 'password'
+        }
+    ).start()
+    # Create a new browser context
+    context = browser.new_context()
+    page = context.new_page()
+    page.goto("https://www.browserscan.net")
+
+def server_browser():
+    # Launch a Camoufox websocket server that remote Playwright clients can connect to
+    launch_server(
+        headless=False,
+        geoip=True,
+        proxy={
+            'server': 'http://localhost:1881',
+            'username': 'username',
+            'password': 'password'
+        }
+    )
+
+def connect_server():
+    from playwright.sync_api import sync_playwright
+
+    with sync_playwright() as p:
+        # Example endpoint; launch_server() prints the actual ws:// address to connect to
+        browser = p.firefox.connect('ws://localhost:12696/aacc88d1a82b1805f826a43b576cba0d')
+        page = browser.new_page()
+        page.goto('https://www.browserscan.net')
+        print(page.title())
+        time.sleep(15)
+        page.screenshot(path='./1.png')
+
+def main():
+    connect_server()
+    # server_browser()
+
+if __name__ == "__main__":
+    main()

+ 59 - 0
tests/mytest/camoufox_t.py

@@ -0,0 +1,59 @@
+from camoufox import Camoufox
+from camoufox.server import launch_server
+from camoufox.async_api import AsyncCamoufox
+import asyncio
+import signal
+
+page = None
+async def aio_main():
+    global page
+    async with AsyncCamoufox(
+        headless=False,
+        geoip=True,
+        proxy={
+            'server': 'http://localhost:1881',
+        }
+    ) as browser:
+        page = await browser.new_page()
+        await page.goto("https://www.browserscan.net")
+        
+        while not page.is_closed():
+            # print("page.is_closed() = ", page.is_closed())
+            await asyncio.sleep(1)  # check once per second
+        
+        print("Browser has been closed, exiting...")
+        # cleanup (the async context manager also closes the browser on exit)
+        await browser.close()
+
+def launch_browser():
+    # Camoufox's start() returns a Firefox-based Playwright Browser
+    browser = Camoufox(
+        geoip=True,
+        proxy={
+            'server': 'http://localhost:1881',
+            # 'username': 'username',
+            # 'password': 'password'
+        }
+    ).start()
+    # Create a new browser context
+    context = browser.new_context()
+    page = context.new_page()
+    page.goto("https://www.browserscan.net")
+
+def server_browser():
+    # Launch a Camoufox websocket server that remote Playwright clients can connect to
+    launch_server(
+        headless=False,
+        geoip=True,
+        proxy={
+            'server': 'http://localhost:1881',
+            'username': 'username',
+            'password': 'password'
+        }
+    )
+
+def main():
+    asyncio.run(aio_main())
+
+if __name__ == "__main__":
+    main()

+ 10 - 9
tests/mytest/googlesearch_t.py

@@ -1,11 +1,12 @@
 # Get the first 20 hits for: "Breaking Code" WordPress blog
 import os
-from dotenv import load_dotenv
-load_dotenv()
-print(f"http_proxy: {os.getenv('http_proxy')}")
-print(f"https_proxy: {os.getenv('https_proxy')}")
-from googlesearch import search
-import googlesearch
-print(googlesearch.get_random_user_agent())
-for url in search('"Breaking Code" WordPress blog', stop=2):
-    print(url)
+# from dotenv import load_dotenv
+# load_dotenv()
+from mylib.drission_page import load_chrome_from_ini
+
+def main():
+    page = load_chrome_from_ini()
+    page.get("https://www.google.com/")
+
+if __name__ == "__main__":
+    main()

+ 71 - 2
tests/mytest/playwright_t.py

@@ -1,5 +1,9 @@
 import asyncio
 from playwright.async_api import async_playwright,CDPSession,Page,Locator
+from playwright.sync_api import sync_playwright
+import configparser
+from config.settings import CONFIG_DIR
+
 # Launch Chrome by pointing at its executable path
 def start_chrome():
     import subprocess
@@ -17,12 +21,77 @@ async def get_win_pos_by_cdp(port=9220, host='localhost'):
             context = contexts[0]  
             print(f"context.pages {context.pages}")
             page = context.pages[1]
+
+
+def start_not_exit():
+    '''Reference snippet: attach to an already-running browser over CDP
+    self.playwright = await async_playwright().start()
+    self.page:Page = await self.get_page_from_cdp(self.playwright)
+
+    browser = await playwright.chromium.connect_over_cdp(self.model.browser_addr)
+    if not browser.contexts:
+        context = await browser.new_context()
+    else:
+        context = browser.contexts[0]
+    page = await self.find_page(context, page_index=page_index, url_prefix=url_prefix)
+    return page
+    '''
+    # Start Playwright
+    playwright = sync_playwright().start()
+
+    # Launch a browser (e.g. Chromium)
+    browser = playwright.chromium.launch(headless=False)
+
+    # Create a new browser context
+    context = browser.new_context()
+
+    # Create a new page
+    page = context.new_page()
+
+    # Navigate to a page
+    page.goto('https://example.com')
+
+    # Further interaction with the page can go here
+    print(page.title())
+
+    # Keep the browser open instead of exiting;
+    # you can keep interacting with the page here
+
+    # When you want to close the browser, call this manually:
+    # browser.close()
+
+    # Stop Playwright
+    # playwright.stop()
+
+def load_init():
+    # Create a ConfigParser object
+    config = configparser.ConfigParser()
+
+    # Read the config file
+    config.read(CONFIG_DIR / '9321.ini')
+
+    # Read specific path values
+    download_path = config.get('paths', 'download_path')
+    tmp_path = config.get('paths', 'tmp_path')
+
+    # Read the Chromium options
+    address = config.get('chromium_options', 'address')
+    browser_path = config.get('chromium_options', 'browser_path')
+
+    # Print the results
+    print(f"Download Path: {download_path}")
+    print(f"Tmp Path: {tmp_path}")
+    print(f"Address: {address}")
+    print(f"Browser Path: {browser_path}")
+
 async def main():
-    start_chrome()
+    
+    # start_chrome()
     '''Chrome must be launched with a user_data_dir that matches the browser version:
"C:\Program Files\Google\Chrome\Application\chrome.exe" --remote-debugging-port=9222 --user-data-dir=K:\\code\\upwork\\zhang_crawl_bio\\output\\user_data_dir2
     '''
     # await get_win_pos_by_cdp()
 
 if __name__ == "__main__":
-    asyncio.run(main())
+    start_not_exit()
+    # load_init()
+    # asyncio.run(main())

+ 48 - 0
tests/mytest/scrapegraph_t.py

@@ -0,0 +1,48 @@
+""" 
+Basic example of a script-generation pipeline using ScrapeGraphAI's ScriptCreatorGraph
+https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/31087937bef20eadcb83e28688077eff13ed2780/examples/script_generator_graph/openai/script_generator_openai.py#L7
+"""
+import os
+import json
+from dotenv import load_dotenv
+from scrapegraphai.graphs import ScriptCreatorGraph
+from scrapegraphai.utils import prettify_exec_info
+
+load_dotenv()
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+
+graph_config = {
+    "llm": {
+        "api_key": os.getenv("OPENAI_API_KEY"),
+        "base_url": os.getenv("OPENAI_BASE_URL"),
+        "model": "openai/deepseek-chat",
+    },
+    "library": "beautifulsoup",
+    "verbose": True,
+    "headless": False,
+}
+
+# ************************************************
+# Create the ScriptCreatorGraph instance and run it
+# ************************************************
+
+script_creator_graph = ScriptCreatorGraph(
+    prompt="List me all the news with their description.",
+    # also accepts a string with the already downloaded HTML code
+    source="https://perinim.github.io/projects",
+    config=graph_config
+)
+
+result = script_creator_graph.run()
+print(json.dumps(result, indent=4))
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = script_creator_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))

+ 9 - 1
tests/mytest/scrapling_t.py

@@ -6,6 +6,9 @@ I only made this example to show how Scrapling features can be used to scrape a
 import requests
 
 from scrapling import Adaptor
+from pathlib import Path
+from mylib.base import save_to_file
+
 def stackoverflow_demo():
     response = requests.get('https://stackoverflow.com/questions/tagged/web-scraping?sort=MostVotes&filters=NoAcceptedAnswer&edited=true&pagesize=50&page=2')
     page = Adaptor(response.text, url=response.url)
@@ -25,7 +28,12 @@ def stackoverflow_demo():
             print(i, title.text, author.text)
 
 def google_search_demo():
-    search_key = "python"
+    file = Path(r'K:\code\upwork\zhang_crawl_bio\output\google_search\Acalypha manniana essential oil\10.html')
+    html_content = file.read_text(encoding='utf-8')
+    page = Adaptor(html_content)
+    page.has_class('quote')  # bool check for a CSS class (result unused here)
+    print(page.find_by_text('Medicinal plants from the genus Acalypha'))
+
 def main():
     google_search_demo()
 

+ 25 - 30
tests/mytest/t.py

@@ -1,38 +1,33 @@
 import asyncio
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
-from mylib.base import OUTPUT_DIR
+import requests
+from bs4 import BeautifulSoup
+import json
+from dotenv import load_dotenv
+from scrapegraphai.graphs import ScriptCreatorGraph
+from scrapegraphai.utils import prettify_exec_info
 
-async def main():
-    # 1) Reference your persistent data directory
-    browser_config = BrowserConfig(
-        headless=False,             # 'True' for automated runs
-        verbose=True,
-        use_managed_browser=False,  # Enables persistent browser strategy
-        use_persistent_context=True,
-        browser_type="chromium",
-        user_data_dir=OUTPUT_DIR / "user_data_dir2"
-    )
+load_dotenv()
+
+def t_main():
+    # Inline run of the scraper script generated by ScriptCreatorGraph
+    # (the raw generated string is kept, commented out, in main() below)
+    url = "https://perinim.github.io/projects"
+    response = requests.get(url)
+    soup = BeautifulSoup(response.content, 'html.parser')
 
-    # 2) Standard crawl config
-    search_key='Acalypha malabarica essential oil'
-    start=30
-    url = f"https://www.google.com/search?q={search_key}&start={start}"
-    print(f"search url: {url}")
-    crawl_config = CrawlerRunConfig(
-        wait_for="css:.logged-in-content",
-        url=url,
-    )
-    browser_config.url = url
-    async with AsyncWebCrawler(config=browser_config) as crawler:
+    news_list = []
 
-    # crawler = AsyncWebCrawler(config=browser_config)
-    # await crawler.start()
-        result = await crawler.arun(
-            url=url,
-            config=crawl_config,
-        )
-        # save_to_pickle(result, OUTPUT_DIR / f"{search_key}.pickle")
-        return result
+    for news in soup.find_all('div', class_='news-item'):
+        title = news.find('h2').text.strip()
+        description = news.find('p').text.strip()
+        news_list.append({
+            "title": title,
+            "description": description
+        })
 
+    print(json.dumps(news_list, indent=4))
+
+async def main():
+    t_main()
+    # s = '''python\nimport requests\nfrom bs4 import BeautifulSoup\nimport json\n\ndef main():\n    url = \"https://perinim.github.io/projects\"\n    response = requests.get(url)\n    soup = BeautifulSoup(response.content, 'html.parser')\n    \n    news_list = []\n    \n    for news in soup.find_all('div', class_='news-item'):\n        title = news.find('h2').text.strip()\n        description = news.find('p').text.strip()\n        news_list.append({\n            \"title\": title,\n            \"description\": description\n        })\n    \n    print(json.dumps(news_list, indent=4))\n\nif __name__ == \"__main__\":\n    main()\n'''
+    # print(s)
 if __name__ == "__main__":
     asyncio.run(main())