
Packaging completed and runs normally. crawl4ai uses a custom browser. HTML conversion still has issues.

mrh, 9 months ago
parent
commit
0e3dcdc184

+ 53 - 5
poetry.lock

@@ -2143,6 +2143,25 @@ tokenizers = "*"
 extra-proxy = ["azure-identity (>=1.15.0,<2.0.0)", "azure-keyvault-secrets (>=4.8.0,<5.0.0)", "google-cloud-kms (>=2.21.3,<3.0.0)", "prisma (==0.11.0)", "resend (>=0.8.0,<0.9.0)"]
 proxy = ["PyJWT (>=2.8.0,<3.0.0)", "apscheduler (>=3.10.4,<4.0.0)", "backoff", "cryptography (>=43.0.1,<44.0.0)", "fastapi (>=0.115.5,<0.116.0)", "fastapi-sso (>=0.16.0,<0.17.0)", "gunicorn (>=22.0.0,<23.0.0)", "orjson (>=3.9.7,<4.0.0)", "pynacl (>=1.5.0,<2.0.0)", "python-multipart (>=0.0.18,<0.0.19)", "pyyaml (>=6.0.1,<7.0.0)", "rq", "uvicorn (>=0.29.0,<0.30.0)", "uvloop (>=0.21.0,<0.22.0)"]
 
+[[package]]
+name = "loguru"
+version = "0.7.3"
+description = "Python logging made (stupidly) simple"
+optional = false
+python-versions = "<4.0,>=3.5"
+groups = ["main"]
+files = [
+    {file = "loguru-0.7.3-py3-none-any.whl", hash = "sha256:31a33c10c8e1e10422bfd431aeb5d351c7cf7fa671e3c4df004162264b28220c"},
+    {file = "loguru-0.7.3.tar.gz", hash = "sha256:19480589e77d47b8d85b2c827ad95d49bf31b0dcde16593892eb51dd18706eb6"},
+]
+
+[package.dependencies]
+colorama = {version = ">=0.3.4", markers = "sys_platform == \"win32\""}
+win32-setctime = {version = ">=1.0.0", markers = "sys_platform == \"win32\""}
+
+[package.extras]
+dev = ["Sphinx (==8.1.3)", "build (==1.2.2)", "colorama (==0.4.5)", "colorama (==0.4.6)", "exceptiongroup (==1.1.3)", "freezegun (==1.1.0)", "freezegun (==1.5.0)", "mypy (==v0.910)", "mypy (==v0.971)", "mypy (==v1.13.0)", "mypy (==v1.4.1)", "myst-parser (==4.0.0)", "pre-commit (==4.0.1)", "pytest (==6.1.2)", "pytest (==8.3.2)", "pytest-cov (==2.12.1)", "pytest-cov (==5.0.0)", "pytest-cov (==6.0.0)", "pytest-mypy-plugins (==1.9.3)", "pytest-mypy-plugins (==3.1.0)", "sphinx-rtd-theme (==3.0.2)", "tox (==3.27.1)", "tox (==4.23.2)", "twine (==6.0.1)"]
+
 [[package]]
 name = "lxml"
 version = "5.3.1"
@@ -4136,6 +4155,18 @@ files = [
 [package.extras]
 cli = ["click (>=5.0)"]
 
+[[package]]
+name = "python-multipart"
+version = "0.0.20"
+description = "A streaming multipart parser for Python"
+optional = false
+python-versions = ">=3.8"
+groups = ["main"]
+files = [
+    {file = "python_multipart-0.0.20-py3-none-any.whl", hash = "sha256:8a62d3a8335e06589fe01f2a3e178cdcc632f3fbe0d492ad9ee0ec35aab1f104"},
+    {file = "python_multipart-0.0.20.tar.gz", hash = "sha256:8dd0cab45b8e23064ae09147625994d090fa46f5b0d1e13af944c331a7fa9d13"},
+]
+
 [[package]]
 name = "python-pptx"
 version = "1.0.2"
@@ -4830,14 +4861,15 @@ tqdm = "*"
 
 [[package]]
 name = "setuptools"
-version = "75.8.0"
+version = "75.8.2"
 description = "Easily download, build, install, upgrade, and uninstall Python packages"
 optional = false
 python-versions = ">=3.9"
 groups = ["main"]
+markers = "python_version >= \"3.12\""
 files = [
-    {file = "setuptools-75.8.0-py3-none-any.whl", hash = "sha256:e3982f444617239225d675215d51f6ba05f845d4eec313da4418fdbb56fb27e3"},
-    {file = "setuptools-75.8.0.tar.gz", hash = "sha256:c5afc8f407c626b8313a86e10311dd3f661c6cd9c09d4bf8c15c0e11f9f2b0e6"},
+    {file = "setuptools-75.8.2-py3-none-any.whl", hash = "sha256:558e47c15f1811c1fa7adbd0096669bf76c1d3f433f58324df69f3f5ecac4e8f"},
+    {file = "setuptools-75.8.2.tar.gz", hash = "sha256:4880473a969e5f23f2a2be3646b2dfd84af9028716d398e46192f84bc36900d2"},
 ]
 
 [package.extras]
@@ -5869,6 +5901,22 @@ docs = ["Sphinx (>=6.0)", "myst-parser (>=2.0.0)", "sphinx-rtd-theme (>=1.1.0)"]
 optional = ["python-socks", "wsaccel"]
 test = ["websockets"]
 
+[[package]]
+name = "win32-setctime"
+version = "1.2.0"
+description = "A small Python utility to set file creation time on Windows"
+optional = false
+python-versions = ">=3.5"
+groups = ["main"]
+markers = "sys_platform == \"win32\""
+files = [
+    {file = "win32_setctime-1.2.0-py3-none-any.whl", hash = "sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390"},
+    {file = "win32_setctime-1.2.0.tar.gz", hash = "sha256:ae1fdf948f5640aae05c511ade119313fb6a30d7eabe25fef9764dca5873c4c0"},
+]
+
+[package.extras]
+dev = ["black (>=19.3b0)", "pytest (>=4.6.2)"]
+
 [[package]]
 name = "xlsxwriter"
 version = "3.2.2"
@@ -6246,5 +6294,5 @@ cffi = ["cffi (>=1.11)"]
 
 [metadata]
 lock-version = "2.1"
-python-versions = "3.12"
-content-hash = "8e7e98071910481d87f2c698eade5544d8fa0c84c8b7b64120a243ccbea6dd11"
+python-versions = "3.12.*"
+content-hash = "8c3ba2aeebbc2e7e7a8d4d8defb2228336d503ee1163cc73402b237b750fe1d1"

+ 3 - 1
pyproject.toml

@@ -6,7 +6,7 @@ authors = [
     {name = "mrh",email = "cxy-magong@qq.com"}
 ]
 readme = "README.md"
-requires-python = "3.12"
+requires-python = "3.12.*"
 dependencies = [
     "sqlalchemy (==2.0.37)",
     "scrapling (>=0.2.93,<0.3.0)",
@@ -21,6 +21,8 @@ dependencies = [
     "cachetools (>=5.5.2,<6.0.0)",
     "uvicorn (>=0.34.0,<0.35.0)",
     "playwright (>=1.50.0,<2.0.0)",
+    "python-multipart (>=0.0.20,<0.0.21)",
+    "loguru (>=0.7.3,<0.8.0)",
 ]
 
 

+ 34 - 3
readme.md

@@ -72,15 +72,46 @@
    ```bash
    python main.py
    ```
+   
 # pack
 ```shell
-conda pack -n crawl_env -o crawl_env.tar.gz
-Remove-Item -Path crawl_env
+conda create -p venv python=3.12 -y
+conda activate G:\code\upwork\zhang_crawl_bio\venv
+poetry install --no-root
+scrapling install
+```
+
+
+Manually edit venv\Lib\site-packages\crawl4ai\async_crawler_strategy.py and change the following code:
+```python
+    class BrowserManager:
+        def __init__(self, browser_config: BrowserConfig, logger=None):
+            ...
+            if self.config.use_managed_browser:
+                self.managed_browser = ManagedBrowser(
+                    ...
+                    cdp_url=self.config.cdp_url,  # 👈 add this line
+                )
+```
+
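For context, a minimal sketch of how the patched cdp_url path is exercised, mirroring tests/mytest/crawl_t_copy.py from this commit (the websocket URL below is a placeholder for whatever the already-running browser exposes):

```python
import asyncio

from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig


async def main():
    # Attach to an externally launched browser instead of letting crawl4ai start its own.
    browser_config = BrowserConfig(
        headless=False,
        use_managed_browser=True,  # only effective once the manual patch above is applied
        cdp_url="ws://127.0.0.1:9321/devtools/browser/<id>",  # placeholder CDP websocket URL
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://example.com",
            config=CrawlerRunConfig(cache_mode=CacheMode.DISABLED),
        )
        print(result.markdown)


asyncio.run(main())
```
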
+```shell
+# If conda pack fails to package, you may need to run: conda install --force-reinstall setuptools
+conda pack -p .\venv -o crawl_env.tar.gz
+Remove-Item -Path crawl_env -r
 mkdir -p crawl_env
 tar -xzf crawl_env.tar.gz -C crawl_env
 Remove-Item -Path crawl_env.tar.gz
-D:\Program\7-Zip\7z.exe a -tzip -mmt -mx3 google_crawler.zip *.bat config database crawl_env download mylib script tests utils worker 使用说明.txt -xr!*/pycache/*
+D:\Program\7-Zip\7z.exe a -tzip -mmt -mx3 google_crawler.zip *.bat config database crawl_env download mylib utils worker 使用说明.txt '-xr!*/pycache/*' '-xr!*__pycache__*' ui\fontend\dist ui\backend '-xr!ui\backend\output\*' '-xr!ui\backend\config.yaml'
# Compressing with tar may be faster (though not by much); Windows 11 and later support the tar.gz format
 tar -czvf google_crawler.tar.gz *.bat config database crawl_env download mylib script tests utils worker 使用说明.txt --exclude='*/__pycache__'
 
+
+# For testing: exclude the listed files; the exclude switches must be quoted
+D:\Program\7-Zip\7z.exe a -tzip -mmt -mx3 test.zip *.bat config  ui\fontend\dist ui\backend\* -xr!ui\backend\output\*  '-xr!ui\backend\config.yaml'
+mkdir -p z7test
+tar -xzf test.zip -C z7test
+Remove-Item -Path z7test -r 
+Remove-Item -Path test.zip
 ```

+ 16 - 0
run.bat

@@ -0,0 +1,16 @@
+@echo off
+REM Set the path to the Python executable
+chcp 65001 > nul
+
+REM Force Python to use UTF-8 encoding for input and output
+set PYTHONIOENCODING=utf-8
+set PYTHON_PATH=%~dp0crawl_env\python.exe
+
+REM Change into the backend directory
+cd /d "%~dp0ui\backend"
+
+REM Run main.py with the specified Python executable
+"%PYTHON_PATH%" main.py
+
+REM Pause so the output can be inspected
+pause

File diff suppressed because it is too large
+ 13 - 5
tests/mytest/crawl_t.py


+ 64 - 0
tests/mytest/crawl_t_copy.py

@@ -0,0 +1,64 @@
+import asyncio
+import aiofiles
+import os
+import sys
+import time
+from camoufox import Camoufox
+from camoufox.server import launch_server
+from camoufox.async_api import AsyncCamoufox
+import asyncio
+import signal
+from worker.search_engine.camoufox_broswer import BrowserConfig
+from worker.search_engine.google_search import GoogleSearchHandler
+from mylib.logu import get_logger
+import asyncio
+import pickle
+from pathlib import Path
+import random
+from typing import List
+import httpx
+import ssl
+from sqlmodel import select, Session
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, CrawlResult
+from worker.search_engine.search_result_db import SearchResultManager, KeywordTask, SearchPageResult, SearchResultItem
+from mylib.base import ensure_output_dir, save_to_file,load_from_pickle
+from utils.proxy_pool import get_random_proxy
+from mylib.drission_page import load_random_ua_chrome, load_chrome_from_ini,test_normal_chrome
+logger = get_logger('test')
+page = None
+async def task():
+    page = load_chrome_from_ini(proxy='http://localhost:1881', auto_port=False)
+    logger.info(f"{page.browser._chromium_options._address}")
+    logger.info(f"{page.browser._driver.get(f'http://{page.browser._chromium_options._address}/json').json()}")
+    logger.info(f"{page.browser._driver._websocket_url}")
+    item_id = 1
+    url = 'https://greg.app/acalypha-marissima-overview/'
+    # url = 'https://baidu.com'
+    browser_config = BrowserConfig(
+        headless=False,
+        # verbose=False,
+        # extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
+        debugging_port=9321,
+        use_managed_browser=True,
+        cdp_url=page.browser._driver._websocket_url
+    )
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        crawler_config = CrawlerRunConfig(
+            cache_mode=CacheMode.DISABLED,                
+        )
+        result = await crawler.arun(url=url, config=crawler_config)
+        print(result.markdown)
+
+    # crawler = AsyncWebCrawler(config=browser_config)
+    # await crawler.start()
+    # crawl_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
+    # result:CrawlResult = await crawler.arun(url=url, config=crawl_config)
+    # logger.info(f"{item_id} crawler.arun result.success: {result.success} {result.status_code}")
+    # print(result.markdown)
+    # await crawler.close()
+
+def main():
+    asyncio.run(task())
+
+if __name__ == "__main__":
+    main()
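As an aside, the CDP websocket URL that the test above reads from DrissionPage's private `_websocket_url` attribute can also be obtained from the browser's remote-debugging HTTP endpoint; a minimal sketch, assuming a Chromium instance listening on the 9321 debugging port used elsewhere in this repo:

```python
import httpx


def get_cdp_ws_url(port: int = 9321) -> str:
    # /json/version is served by Chromium's remote-debugging HTTP endpoint.
    info = httpx.get(f"http://127.0.0.1:{port}/json/version").json()
    return info["webSocketDebuggerUrl"]


if __name__ == "__main__":
    print(get_cdp_ws_url())
```
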

+ 3 - 2
ui/backend/src/services/celery_worker.py

@@ -134,8 +134,9 @@ class CeleryWorker:
             "--host", config.worker_backend.host,
             "--port", str(config.worker_backend.port)
         ]
-
-        await process_manager.start_process("worker_backend", worker_backend_cmd, cwd=WORKER_DIR_BASE)
+        env = os.environ.copy()
+        env["PYTHONPATH"] = str(WORKER_DIR_BASE)
+        await process_manager.start_process("worker_backend", worker_backend_cmd, cwd=WORKER_DIR_BASE, env=env)
 
         # G:\code\upwork\zhang_crawl_bio\crawl_env\python.exe -m celery -A worker.celery.app flower --address=127.0.0.1 --persistent=True --db=".\output\flower_db"
         flower_cmd = [python_exe, '-m', 'celery', '-A', 'worker.celery.app', 'flower', '--address=127.0.0.1', '--persistent=True', f'--db={flower_db}']
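The env handling above matters because the `env=` argument to `asyncio.create_subprocess_exec` replaces the child's entire environment rather than merging into it, so the code starts from a copy of `os.environ` and only adds PYTHONPATH. A minimal self-contained sketch of the pattern (the path is a placeholder):

```python
import asyncio
import os
import sys


async def demo():
    env = os.environ.copy()                   # keep PATH, SystemRoot, etc.
    env["PYTHONPATH"] = r"G:\path\to\worker"  # placeholder: the one variable to add
    proc = await asyncio.create_subprocess_exec(
        sys.executable, "-c", "import os; print(os.environ.get('PYTHONPATH'))",
        env=env,
    )
    await proc.wait()


asyncio.run(demo())
```
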

+ 2 - 2
ui/backend/utils/config.py

@@ -21,7 +21,7 @@ class Sub(AutoLoadModel):
     url: Optional[str] = None
     start_port: Optional[int] = 9660
     redis_url: Optional[str] = 'redis://localhost:6379/8'
-    file: Optional[str] = None
+    file: Optional[str] = str(REPO_BASE_DIR / r'download\proxy_pool\6137e542.yaml')
     temp_dir: Optional[str] = str(PROXY_POLL_DIR / "temp")  
     auto_start: Optional[bool] = True 
     proxies: Optional[Dict[Union[int,str], Proxy]] = {}
@@ -51,7 +51,7 @@ class Config(BaseModel):
     mimo_exe: Optional[str] = str(PROXY_POLL_DIR / r"mihomo-windows-amd64-go120.exe")
     pandoc_exe: Optional[str] = str(REPO_BASE_DIR / r"download\pandoc-3.6.3-windows-x86_64\pandoc.exe")
     worker_backend_py: Optional[str] = str(WORKER_DIR_BASE / r"worker\api\worker_server.py")
-    sqluri: Optional[str] = r'G:\code\upwork\zhang_crawl_bio\output\temp.db'
+    sqluri: Optional[str] = 'sqlite:///' + str(REPO_BASE_DIR / r'output\temp.db')
     browser: Optional[Browser] = Browser()
     backend: Optional[Backend] = Backend()
     worker_backend: Optional[Backend] = Field(default_factory=lambda: Backend(host="localhost", port=8003))
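The sqluri change above turns a bare Windows file path into a SQLAlchemy-style URL of the form `sqlite:///<absolute path>`. A minimal check, assuming SQLAlchemy is installed and the placeholder path's directory exists:

```python
from sqlalchemy import create_engine, text

# Same URI style as the config above (placeholder path; the output directory must exist).
sqluri = "sqlite:///" + r"G:\code\upwork\zhang_crawl_bio\output\temp.db"
engine = create_engine(sqluri)
with engine.connect() as conn:
    print(conn.execute(text("select 1")).scalar())
```
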

+ 5 - 4
ui/backend/utils/process_mgr.py

@@ -105,7 +105,8 @@ class ProcessManager:
         name: str,
         command: list,
         cwd:str = os.getcwd(),
-        log_dir: Path = LOG_DIR / "process_mgr"
+        log_dir: Path = LOG_DIR / "process_mgr",
+        env: Optional[Dict[str, str]] = None
     ) -> Optional[int]:
         """启动并管理后台进程"""
         async with self.lock:
@@ -118,14 +119,14 @@ class ProcessManager:
 
             try:
                 log_fd = open(log_file, "ab")
-                
                 process = await asyncio.create_subprocess_exec(
                     *command,
                     stdout=log_fd,
                     cwd=cwd,
                     stderr=subprocess.STDOUT,
                     stdin=subprocess.DEVNULL,
-                    start_new_session=True
+                    start_new_session=True,
+                    env=env
                 )
 
                 if platform.system() == 'Windows' and self.job_object:
@@ -139,7 +140,7 @@ class ProcessManager:
                     "pid": process.pid
                 }
 
-                logger.info(f"Started process {name} (PID: {process.pid})")
+                logger.info(f"Started process {name} (PID: {process.pid}), {' '.join(command)}")
                 return process.pid
 
             except Exception as e:

+ 4 - 0
ui/docs/gpt/build.md

@@ -0,0 +1,4 @@
+I want to add a launcher file to this project so that the whole Python project can be zipped up and run on any Windows machine by different users. My entry point, however, is ui\backend\main.py.
+To start from the backend directory, the Python search path has to be based on the directory that contains main.py, so it seems I cannot simply run: python ui\backend\main.py from the repo root.
+How would you help me create a launcher script in the repo root?
+
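One way to do what the note above asks, and essentially what the new run.bat does with `cd /d`: a hypothetical root-level launcher.py that makes ui\backend the working directory and the first import path before handing control to main.py:

```python
import os
import runpy
import sys

# Resolve ui\backend relative to this launcher so the unpacked archive can live anywhere.
BACKEND_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "ui", "backend")

os.chdir(BACKEND_DIR)            # relative paths behave as if started inside backend
sys.path.insert(0, BACKEND_DIR)  # imports resolve against main.py's own directory
runpy.run_path(os.path.join(BACKEND_DIR, "main.py"), run_name="__main__")
```
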

+ 1 - 4
worker/celery/search_client.py

@@ -97,10 +97,7 @@ def main(file_path: str, clear: bool = False):
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(
         description='Google search task client',
-        epilog="Examples:\r\n"
-               "  python -m worker.celery.client -f G:\download\测试-精油-2000.xlsx   # submit a task\r\n"
-               "  python -m worker.celery.client -c             # clear tasks\r\n"
-               "  python -m worker.celery.client -h             # show help",
+        epilog="Examples:\r\n",
         formatter_class=argparse.RawTextHelpFormatter
     )
     parser.add_argument('-f', '--file_path', help='path to a file containing keywords')

+ 6 - 4
worker/crawl_pages/crawl_urls.py

@@ -96,10 +96,12 @@ class URLCrawler:
                         return {"search_result_model": item, "crawl_result": None, 'message': response.headers.get('content-type')}
             except Exception as e:
                 logger.warning(f"Failed to check headers for id: {item_id} , {url} {str(e)}")
-                return {"search_result_model": None, "crawl_result": None, 'message': str(e)}
-        if 'html' not in content_type:
-            return {"search_result_model": None, "crawl_result": None,'message': f'not html, content_type {content_type}'}
+        #         return {"search_result_model": None, "crawl_result": None, 'message': str(e)}
+        # if 'html' not in content_type:
+        #     logger.info(f"Skipping {url} (item_id: {item_id}) - not html, content_type {content_type}")
+        #     return {"search_result_model": None, "crawl_result": None,'message': f'not html, content_type {content_type}'}
         logger.info(f"crawler.arun start {item_id} content-type: {content_type}, {url} ")    
+        logger.info(f"browser_config use_managed_browser {browser_config.use_managed_browser} , cdp_url: {browser_config.cdp_url}, headless: {browser_config.headless}")
         # If not PDF or header check failed, try regular crawl
         crawler = AsyncWebCrawler(config=browser_config)
         await crawler.start()
@@ -155,7 +157,7 @@ class URLCrawler:
                         session.add(item)
                         session.commit()
                         session.refresh(item)
-                
+            logger.info(f"{item_id} crawler.arun result.success: {item}")
             return {"search_result_model": item, "crawl_result": result}
         except Exception as e:
             logger.error(f"Failed to crawl id: {item_id} , {url} {str(e)}")

+ 3 - 1
worker/html_convert/pandoc.py

@@ -178,7 +178,9 @@ def process_single_example(result_id: int, skip_existing=True):
     # Process a single result example
     docling_converter = DoclingConverter()
     search_result_item = docling_converter.get_search_result_item(result_id)
-    if search_result_item.html_path.endswith('.html'):
+    if (search_result_item and 
+        search_result_item.html_path and 
+        search_result_item.html_path.endswith('.html')):
         docling_converter.process_conversion_by_id(result_id, skip_existing=skip_existing)
     
     crawl_filter = CrawlFilter()

+ 0 - 0
run_manager.bat → worker/run_manager.bat


+ 0 - 0
run_multi_proxy.bat → worker/run_multi_proxy.bat


+ 2 - 63
使用说明.txt

@@ -1,64 +1,3 @@
-Online documentation: https://evoi45c69f6.feishu.cn/wiki/RugdwwiU0iNbbfktsWhclKWgnmf
+Double-click run.bat
 
-# Batch start
-Double-click run_multi_proxy.bat to start the proxy pool in batch; this effectively runs multiple instances of the program and uses several nodes at once.
-
-Each node can be managed and switched at this address; the number after port is each node's management port, change it in turn to 9361, 9363, 9365, 9367, 9369:
-https://yacd.metacubex.one/?hostname=127.0.0.1&port=9363&secret=#/proxies
-
-Double-click run_manager.bat to start the manager programs in batch; they automatically listen for Google search tasks.
-
-# Run tasks
-Double-click cmd_python.bat to open a console where you can run Python programs by typing commands.
-
-- Submit a spreadsheet to run Google search tasks:
-python -m worker.celery.client -f "G:\code\upwork\zhang_crawl_bio\download\测试-精油-2000.xlsx"
-
-Change the file path to your own spreadsheet. The program imports the spreadsheet data, submits the search tasks automatically, queues each task, and saves the search results automatically.
-
-Each finished keyword is saved automatically; even if the task is resubmitted later, keywords that are already done are skipped.
-
-It does not matter if the machine shuts down or the program is closed by accident:
-queued search tasks resume from the unfinished keywords once run_manager.bat is started again,
-so there is no need to resubmit the spreadsheet; even if it is resubmitted, completed keywords are skipped.
-
-- Clear tasks:
-python -m worker.celery.client -c
-
-If many previously submitted tasks are still queued and you want the remaining ones to stop, or you want to add or remove keywords in the Excel sheet and re-import it,
-use the clear-tasks command.
-
-It only clears queued, unfinished search tasks; it never deletes local data or anything that has already completed.
-
-## Check task status
-
-Visit: http://localhost:5555/broker
-The Messages number shows how many keywords are still queued and unfinished.
-
-Top navigation bar: Tasks
-shows how each task ran: success, failure, and the path of the saved HTML page.
-
-The Workers tab shows the state of the current browser worker processes.
-
-# proxy
-
-download\proxy_pool\mihomo-windows-amd64-go120.exe -f download\proxy_pool\一分机场_9361.yaml
-download\proxy_pool\mihomo-windows-amd64-go120.exe -f download\proxy_pool\一分机场_9363.yaml
-download\proxy_pool\mihomo-windows-amd64-go120.exe -f download\proxy_pool\一分机场_9365.yaml
-download\proxy_pool\mihomo-windows-amd64-go120.exe -f download\proxy_pool\一分机场_9367.yaml
-download\proxy_pool\mihomo-windows-amd64-go120.exe -f download\proxy_pool\一分机场_9369.yaml
-
-https://yacd.metacubex.one/?hostname=127.0.0.1&port=9363&secret=#/proxies
-
-# worker
-download\Redis-x64-5.0.14.1\redis-server.exe
-
-celery -A worker.celery.app flower --persistent=True --db=".\output\flower_db"
-
-Start the browser worker process. After a task is submitted it launches Google Chrome; the default browser path is C:\Program Files\Google\Chrome\Application\chrome.exe
-If no browser is installed, install one first: https://www.google.com/chrome/
-Or use your own browser path: open config\conf\9321.ini with Notepad and change browser_path.
-celery -A worker.celery.app worker --hostname=w1@%h
-
-python -m worker.celery.client "G:\code\upwork\zhang_crawl_bio\download\测试-精油-2000.xlsx"
+Visit: http://localhost:5835/

Some files were not shown because too many files changed in this diff