|
|
@@ -1,3 +1,4 @@
|
|
|
+import os
|
|
|
from pathlib import Path
|
|
|
import subprocess
|
|
|
import sys
|
|
|
@@ -18,11 +19,17 @@ py_client: Optional[Dict[str,Any]] = {
|
|
|
'crawl': WORKER_DIR_BASE / r'worker\celery\crawl_client.py',
|
|
|
'convert': WORKER_DIR_BASE / r'worker\celery\html_convert_tasks.py'
|
|
|
}
|
|
|
+class SearchBrowserConfig(BaseModel):
|
|
|
+ # Field names and defaults mirror the keyword arguments of load_chrome_from_ini (headless, proxy, browser_path, no_imgs).
|
|
|
+ headless: Optional[bool] = False
|
|
|
+ proxy: Optional[str] = None
|
|
|
+ browser_path: Optional[str] = None
|
|
|
+ no_imgs: Optional[bool] = True
|
|
|
|
|
|
class SearchTaskConfig(BaseModel):
|
|
|
max_result_items: Optional[int] = 200
|
|
|
skip_existing: Optional[bool] = True
|
|
|
- browser_config: Optional[dict] = {}
|
|
|
+ browser_config: Optional[SearchBrowserConfig] = SearchBrowserConfig()
|
|
|
proxy_pool_url: Optional[str] = None
|
|
|
dry_run: Optional[bool] = True
|
|
|
|
|
|
@@ -58,6 +65,7 @@ class CeleryWorker:
|
|
|
def __init__(self, python_exe: str=sys.executable, config = config):
|
|
|
self.python_exe = python_exe
|
|
|
self.config = config
|
|
|
+ os.environ['DB_URL'] = self.config.sqluri
|
|
|
self.workers_model: Dict[str, WorkerModel] = {}
|
|
|
self.redis_url = f"redis://{config.redis.host}:{config.redis.port}/{config.redis.db}"
|
|
|
self.redis_client = redis.Redis(host=config.redis.host, port=config.redis.port, db=config.redis.db)
|
|
|
@@ -131,16 +139,16 @@ class CeleryWorker:
|
|
|
if not worker_model:
|
|
|
raise ValueError(f"Invalid worker name: {name}")
|
|
|
if in_cmd_windows:
|
|
|
+ logger.info(f"self.config.sqluri {self.config.sqluri}")
|
|
|
cmd = ['start','cmd', '/c' ]
|
|
|
sub_cmd = ' '.join(worker_model.cmd)
|
|
|
cmd.append(f'{sub_cmd}')
|
|
|
logger.info(f"run {' '.join(cmd)}")
|
|
|
process = subprocess.Popen(cmd, shell=True, cwd=WORKER_DIR_BASE)
|
|
|
- # 立即记录CMD进程的PID作为后备
|
|
|
- worker_model.pid = process.pid
|
|
|
# 等待flower返回真实worker PID
|
|
|
if not await self.wait_for_worker_online(name, timeout):
|
|
|
- logger.warning(f"Worker {name} 未能及时注册到flower,使用CMD进程PID {process.pid}")
|
|
|
+ # Fail fast instead of silently falling back to the CMD shell's PID when the worker never registers with flower.
|
|
|
+ raise ValueError(f"{name} 未能启动")
|
|
|
logger.info(f"start sucess {worker_model}")
|
|
|
else:
|
|
|
worker_model.pid = await process_manager.start_process(name, worker_model.cmd, cwd=WORKER_DIR_BASE)
|
|
|
@@ -240,6 +248,7 @@ class CeleryWorker:
|
|
|
return queue_lengths
|
|
|
def _prepare_search_task(self, data: Dict, select_proxy: Optional[str] = None):
|
|
|
task_model = SearchTaskInput(**data)
|
|
|
+ task_model.config.browser_config.browser_path = self.config.browser.exe_path
|
|
|
if select_proxy == 'pool':
|
|
|
task_model.config.proxy_pool_url = f"http://{self.config.backend.host}:{self.config.backend.port}/api/proxy/proxies-pool"
|
|
|
return task_model
|