瀏覽代碼

完成Windows2系统的正常启动,不过浏览器路径启动还未测试

mrh 9 月之前
父節點
當前提交
cf02362e19
共有 8 個文件被更改,包括 293 次插入和 575 次删除
  1. 3 1
      .gitignore
  2. 1 1
      mylib/logu.py
  3. 269 459
      poetry.lock
  4. 6 105
      pyproject.toml
  5. 4 1
      readme.md
  6. 2 2
      run_manager.bat
  7. 5 5
      run_multi_proxy.bat
  8. 3 1
      使用说明.txt

+ 3 - 1
.gitignore

@@ -10,4 +10,6 @@ local_proxy_pool/
 *.rdb
 .env
 CF-Clearance-Scraper
-crawl_env
+crawl_env
+*.zip
+*.tar.gz

+ 1 - 1
mylib/logu.py

@@ -13,7 +13,7 @@ loguru.logger.remove()
 # logger.add(sys.stderr, format=FORMAT)
 # logger.add(LOG_FILE, format=FORMAT)
 if not os.path.exists(LOG_DIR):
-    os.mkdir(LOG_DIR)
+    os.makedirs(LOG_DIR)
 
 loggers = {} 
 FORMAT = '<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{file}:{line}</cyan> :<cyan>{function}</cyan> - {message}'

File diff suppressed because it is too large
+ 269 - 459
poetry.lock


+ 6 - 105
pyproject.toml

@@ -8,114 +8,15 @@ authors = [
 readme = "README.md"
 requires-python = "3.12"
 dependencies = [
-    "aider-install (==0.1.3)",
-    "aiofiles (==24.1.0)",
-    "aiohttp (==3.11.11)",
-    "aiosignal (==1.3.2)",
-    "alembic (==1.14.1)",
-    "amqp (==5.3.1)",
-    "annotated-types (==0.7.0)",
-    "anyio (==4.8.0)",
-    "async-timeout (==5.0.1)",
-    "attrs (==24.3.0)",
-    "automat (==24.8.1)",
-    "babel (==2.16.0)",
-    "banal (==1.0.6)",
-    "beautifulsoup4 (==4.13.2)",
-    "billiard (==4.2.1)",
-    "boto3 (==1.36.2)",
-    "botocore (==1.36.2)",
-    "brotli (==1.1.0)",
-    "browserforge (==1.2.1)",
-    "celery (==5.3.6)",
-    "certifi (==2024.12.14)",
-    "cffi (==1.17.1)",
-    "charset-normalizer (==3.4.1)",
-    "click (==8.1.8)",
-    "click-didyoumean (==0.3.1)",
-    "click-plugins (==1.1.1)",
-    "click-repl (==0.3.0)",
-    "colorama (==0.4.6)",
-    "constantly (==23.10.4)",
-    "courlan (==1.3.2)",
-    "cryptography (==44.0.0)",
-    "dataclasses-json (==0.6.7)",
-    "dateparser (==1.2.0)",
-    "deepsearch-glm (==1.0.0)",
-    "defusedxml (==0.7.1)",
-    "dill (==0.3.9)",
-    "distro (==1.9.0)",
-    "dnspython (==2.7.0)",
-    "docling (==2.18.0)",
-    "docling-core (==2.17.1)",
-    "docling-ibm-models (==3.3.0)",
-    "docling-parse (==3.2.0)",
-    "easyocr (==1.7.2)",
-    "et-xmlfile (==2.0.0)",
-    "eventlet (==0.39.0)",
-    "fake-useragent (==2.0.3)",
-    "filelock (==3.17.0)",
-    "filetype (==1.2.0)",
-    "flower (==2.0.1)",
-    "free-proxy (==1.1.3)",
-    "fsspec (==2024.12.0)",
-    "geoip2 (==4.8.1)",
-    "googlesearch-python (==1.2.5)",
-    "html2text (==2024.2.26)",
-    "htmldate (==1.9.3)",
-    "httpx-sse (==0.4.0)",
-    "huggingface-hub (==0.27.1)",
-    "humanize (==4.11.0)",
-    "hyperlink (==21.0.0)",
-    "idna (==3.10)",
-    "imageio (==2.37.0)",
-    "incremental (==24.7.2)",
-    "itemadapter (==0.10.0)",
-    "itemloaders (==1.3.2)",
-    "jinja2 (==3.1.5)",
-    "jmespath (==1.0.1)",
-    "jsonlines (==3.1.0)",
-    "jsonpatch (==1.33)",
-    "jsonpointer (==3.0.0)",
-    "jsonref (==1.1.0)",
-    "jsonschema (==4.23.0)",
-    "jsonschema-specifications (==2024.10.1)",
-    "justext (==3.0.1)",
-    "kombu (==5.4.2)",
-    "langchain (==0.3.14)",
-    "langchain-aws (==0.2.11)",
-    "langchain-community (==0.3.14)",
-    "langchain-core (==0.3.30)",
-    "langchain-mistralai (==0.2.4)",
-    "langchain-ollama (==0.2.2)",
-    "langchain-openai (==0.3.1)",
-    "langchain-text-splitters (==0.3.5)",
-    "langsmith (==0.2.11)",
-    "language-tags (==1.2.0)",
-    "latex2mathml (==3.77.0)",
-    "lazy-loader (==0.4)",
-    "loguru (==0.7.3)",
-    "lxml (==5.3.0)",
-    "lxml-html-clean (==0.4.1)",
-    "mako (==1.3.8)",
-    "markdown-it-py (==3.0.0)",
-    "marko (==2.1.2)",
-    "markupsafe (==3.0.2)",
-    "marshmallow (==3.25.1)",
-    "maxminddb (==2.6.3)",
-    "mdurl (==0.1.2)",
-    "ollama (==0.4.6)",
-    "openai (==1.59.9)",
-    "opencv-python-headless (==4.11.0.86)",
-    "openpyxl (==3.1.5)",
-    "orjson (==3.10.15)",
-    "packaging (==24.2)",
-    "pandas (==2.2.3)",
     "sqlalchemy (==2.0.37)",
-    "patchright (==1.49.1)",
     "scrapling (>=0.2.93,<0.3.0)",
     "sqlmodel (>=0.0.22,<0.0.23)",
-    "drissionpage (>=4.1.0.17,<5.0.0.0)"
+    "drissionpage (>=4.1.0.17,<5.0.0.0)",
+    "crawl4ai (>=0.4.248,<0.5.0)",
+    "redis (>=5.2.1,<6.0.0)",
+    "celery (>=5.4.0,<6.0.0)",
+    "flower (>=2.0.1,<3.0.0)",
+    "docling (>=2.21.0,<3.0.0)"
 ]
 
 

+ 4 - 1
readme.md

@@ -79,5 +79,8 @@ Remove-Item -Path crawl_env
 mkdir -p crawl_env
 tar -xzf crawl_env.tar.gz -C crawl_env
 Remove-Item -Path crawl_env.tar.gz
-7z a -tzip google_crawler.zip cmd_python.bat config database crawl_env download local_proxy_pool mylib script tests utils worker 使用说明.txt
+D:\Program\7-Zip\7z.exe a -tzip -mmt -mx3 google_crawler.zip *.bat config database crawl_env download mylib script tests utils worker 使用说明.txt -xr!*/__pycache__/*
+# 使用 tar 压缩可能更快(其实也没多快),Windows11 以上都支持 tar.gz 格式
+tar -czvf google_crawler.tar.gz *.bat config database crawl_env download mylib script tests utils worker 使用说明.txt --exclude='*/__pycache__'
+
 ```

+ 2 - 2
run_manager.bat

@@ -3,7 +3,7 @@
 start "" "download\Redis-x64-5.0.14.1\redis-server.exe"
 
 :: 启动 Celery Flower
-start "" celery -A worker.celery.app flower --persistent=True --db=".\output\flower_db"
+start "" crawl_env\python.exe -m celery -A worker.celery.app flower --persistent=True --db=".\output\flower_db"
 
 :: 启动 Celery Worker
-start "" celery -A worker.celery.app worker --hostname=w1@%h
+start "" crawl_env\python.exe -m celery -A worker.celery.app worker --hostname=w1@%h

+ 5 - 5
run_multi_proxy.bat

@@ -1,6 +1,6 @@
 @echo off
-start "" "download\proxy_pool\mihomo-windows-amd64-go120.exe" -f "download\proxy_pool\一分机场_9361.yaml"
-start "" "download\proxy_pool\mihomo-windows-amd64-go120.exe" -f "download\proxy_pool\一分机场_9363.yaml"
-start "" "download\proxy_pool\mihomo-windows-amd64-go120.exe" -f "download\proxy_pool\一分机场_9365.yaml"
-start "" "download\proxy_pool\mihomo-windows-amd64-go120.exe" -f "download\proxy_pool\一分机场_9367.yaml"
-start "" "download\proxy_pool\mihomo-windows-amd64-go120.exe" -f "download\proxy_pool\一分机场_9369.yaml"
+start "" "download\proxy_pool\mihomo-windows-amd64-go120.exe" -f "download\proxy_pool\yfjc_9361.yaml"
+start "" "download\proxy_pool\mihomo-windows-amd64-go120.exe" -f "download\proxy_pool\yfjc_9363.yaml"
+start "" "download\proxy_pool\mihomo-windows-amd64-go120.exe" -f "download\proxy_pool\yfjc_9365.yaml"
+start "" "download\proxy_pool\mihomo-windows-amd64-go120.exe" -f "download\proxy_pool\yfjc_9367.yaml"
+start "" "download\proxy_pool\mihomo-windows-amd64-go120.exe" -f "download\proxy_pool\yfjc_9369.yaml"

+ 3 - 1
使用说明.txt

@@ -55,7 +55,9 @@ download\Redis-x64-5.0.14.1\redis-server.exe
 
 celery -A worker.celery.app flower --persistent=True --db=".\output\flower_db"
 
-启动浏览器工作进程
+启动浏览器工作进程,任务提交后,会启动谷歌浏览器,浏览器默认路径 C:\Program Files\Google\Chrome\Application\chrome.exe
+没有安装浏览器需要先安装 https://www.google.com/chrome/ 
+或者使用自己的浏览器路径,在 config\conf\9321.ini 文件中,右键用记事本打开,修改 browser_path
 celery -A worker.celery.app worker --hostname=w1@%h
 
 python -m worker.celery.client "G:\code\upwork\zhang_crawl_bio\download\测试-精

Some files were not shown because too many files changed in this diff