瀏覽代碼

基本的启动项目

mrh 1 年之前
當前提交
3553e36dc3
共有 6 個文件被更改,包括 134 次插入和 0 次删除
  1. 3 0
      .gitignore
  2. 61 0
      conf/config.py
  3. 33 0
      conf/dp_configs.ini
  4. 27 0
      dp/page.py
  5. 0 0
      main.py
  6. 10 0
      readme.md

+ 3 - 0
.gitignore

@@ -0,0 +1,3 @@
+output
+__pycache__
+env

+ 61 - 0
conf/config.py

@@ -0,0 +1,61 @@
+import sys
+import os
+from DrissionPage import ChromiumOptions
+from loguru import logger
+HOST='localhost'
+PORT=9226
+
+WORK_DIR = os.path.dirname(os.path.dirname(__file__))
+CONF_DIR =  os.path.join(WORK_DIR, 'conf')
+OUTPUT = os.path.join(WORK_DIR, 'output')
+PAGE_OUTPUT = os.path.join(OUTPUT, 'page')
+if len(sys.argv)>=3 and sys.argv[1] == "-c":
+    INI_PATH = sys.argv[2]
+else:
+    INI_PATH = os.path.join(CONF_DIR, 'dp_configs.ini')
+
+if not os.path.exists(OUTPUT):
+    os.mkdir(OUTPUT)
+if not os.path.exists(PAGE_OUTPUT):
+    os.mkdir(PAGE_OUTPUT)
+
+logger.remove()
+# logger.add(sys.stderr, format='<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>')
+logger.add(sys.stderr, level="INFO", format='<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{file}</cyan>:<cyan>{line}</cyan> :<cyan>{function}</cyan> - {message}')
+logger.add(os.path.join(OUTPUT, "all.log"), level="DEBUG", format='<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{file}</cyan>:<cyan>{line}</cyan> :<cyan>{function}</cyan> - {message}')
+logger.debug(f"WORK_DIR {WORK_DIR}")
+logger.debug(f"INI_PATH {INI_PATH}")
+
+  
+def find_edge_path_in_registry():  
+    import winreg as reg  
+    path = None  
+    try:  
+        key = reg.OpenKey(reg.HKEY_LOCAL_MACHINE, r"SOFTWARE\Microsoft\Windows\CurrentVersion\App Paths\msedge.exe")  
+        path = reg.QueryValueEx(key, "")[0]  
+        reg.CloseKey(key)  
+    except WindowsError:  
+        pass  
+    return path  
+
+logger.debug(f"find_edge browser path: {find_edge_path_in_registry()}")
+USER_DATA = os.path.join(OUTPUT, "UserData")
+# BROWSER_PATH 值理论无需手动设置,因为该模块会自动在默认路径查找 Chrome ,因此下列判断可以删去
+if 'win' in sys.platform:
+    if not os.path.exists(INI_PATH):
+        from DrissionPage.common import configs_to_here
+        # path = r'C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe'
+        path = find_edge_path_in_registry()
+        # 生成默认配置文件
+        chrome_options = ChromiumOptions(False, None)
+        chrome_options.set_address(f"{HOST}:{PORT}")
+        USER_DATA += str(PORT)
+        chrome_options.set_browser_path(path)
+        chrome_options.set_user_data_path(USER_DATA)
+        chrome_options.save(INI_PATH)
+    else:
+        chrome_options = ChromiumOptions(True, ini_path=INI_PATH)
+        logger.debug(f"load init {INI_PATH}")
+        logger.debug(f"chrome_options.browser_path {chrome_options.browser_path}")
+elif sys.platform == 'linux':
+    chrome_options = ChromiumOptions(ini_path=INI_PATH)

+ 33 - 0
conf/dp_configs.ini

@@ -0,0 +1,33 @@
+[paths]
+download_path = 
+tmp_path = 
+
+[chromium_options]
+address = 127.0.0.1:9226
+browser_path = C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe
+arguments = ['--no-default-browser-check', '--disable-suggestions-ui', '--no-first-run', '--disable-infobars', '--disable-popup-blocking', '--hide-crash-restore-bubble', '--disable-features=PrivacySandboxSettings4', '--user-data-dir=I:\\code\\ai-yunying\\live-online-people\\output\\UserData9226']
+extensions = []
+prefs = {'profile.default_content_settings.popups': 0, 'profile.default_content_setting_values': {'notifications': 2}}
+flags = {}
+load_mode = normal
+user = Default
+auto_port = False
+system_user_path = False
+existing_only = False
+
+[session_options]
+headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8', 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'connection': 'keep-alive', 'accept-charset': 'GB2312,utf-8;q=0.7,*;q=0.7'}
+
+[timeouts]
+base = 10
+page_load = 30
+script = 30
+
+[proxies]
+http = 
+https = 
+
+[others]
+retry_times = 3
+retry_interval = 2
+

+ 27 - 0
dp/page.py

@@ -0,0 +1,27 @@
+import asyncio
+import os
+import re
+import time
+import os
+import sys
+sys.path.append(os.path.dirname(os.path.dirname(__file__)))
+
+from DrissionPage import ChromiumPage
+from DrissionPage import ChromiumOptions
+from conf.config import logger,PAGE_OUTPUT,INI_PATH,chrome_options,find_edge_path_in_registry
+from DrissionPage import ChromiumOptions
+from DrissionPage.common import Settings
+
+Settings.raise_when_ele_not_found=True
+
+        
+page = ChromiumPage(chrome_options)
+logger.debug(f"address {chrome_options.address}")
+logger.debug(f"start '{page._chromium_options._browser_path}'")
+# 设置 none 的时候 page.get() 不会等待加载完成,而是直接返回,page.ele 会阻塞,不过一旦找到元素也会立即返回
+# 因此设置为 none 是最高效率、最迅速的,甚至不用 page.stop_loading() 因为停止过程中也要花费时间,而是直接请求空页面 about:blank 断开所有连接
+page.set.load_mode.none()
+# page.set.NoneElement_value('没找到')
+page.get("edge://version/")
+page.new_tab("http://www.baidu.com")
+

+ 0 - 0
main.py


+ 10 - 0
readme.md

@@ -0,0 +1,10 @@
+
+```shell
+conda create -p .\env python=3.11
+conda activate .\env
+pip install portkey-ai
+pip install DrissionPage
+pip install loguru
+```
+## 查看本浏览器爬虫特征
+https://bot.sannysoft.com/