mrh 9 месяцев назад
Родитель
Commit
f7aa3356f7

+ 1 - 1
config/settings.py

@@ -6,7 +6,7 @@ WORK_DIR = Path(__file__).parent.parent.absolute()
 OUTPUT_DIR = WORK_DIR / "output"
 CONFIG_DIR = WORK_DIR / "config" / "conf"
 GOOGLE_SEARCH_DIR = OUTPUT_DIR / 'results'
-PANDOC_EXE = pandoc_path = shutil.which('pandoc')
+PANDOC_EXE = os.environ.get('PANDOC_EXE') or shutil.which('pandoc')
 
 LOG_LEVEL='info'
 LOG_DIR = OUTPUT_DIR / "logs"

+ 1 - 0
ui/backend/config.yaml

@@ -5,6 +5,7 @@ browser:
   exe_path: G:\code\upwork\zhang_crawl_bio\download\GoogleChromePortable\GoogleChromePortable.exe
   no_imgs: true
 mimo_exe: g:\code\upwork\zhang_crawl_bio\download\proxy_pool\mihomo-windows-amd64-go120.exe
+pandoc_exe: G:\code\upwork\zhang_crawl_bio\download\pandoc-3.6.3-windows-x86_64\pandoc.exe
 redis:
   db: 1
   exe: G:\code\upwork\zhang_crawl_bio\download\Redis-x64-5.0.14.1\redis-server.exe

+ 6 - 2
ui/backend/routers/worker.py

@@ -138,8 +138,12 @@ async def status() -> ResponseStatus:
             celery_status["workers"]["data"]
         )
     
-    # 添加队列长度信息
-    celery_status["queue_lengths"] = queue_lengths
+    # 添加队列长度信息(确保包含convert队列)
+    celery_status["queue_lengths"] = {
+        "search": queue_lengths.get("search", 0),
+        "crawl": queue_lengths.get("crawl", 0),
+        "convert": queue_lengths.get("convert", 0)
+    }
     
     return ResponseStatus(
         endpoint=Endpoint(health=health),

+ 6 - 3
ui/backend/src/services/celery_worker.py

@@ -54,6 +54,7 @@ class CrawlTaskConfig(BaseModel):
 class ConvertTaskParams(BaseModel):
     result_ids: List[str] = Field(..., min_length=0)
     batch_size: Optional[int] = Field(0, gt=-1)
+    queue_name: Optional[str] = Field('convert_queue', description="任务队列名称")  # 新增队列参数
 
 class WorkerModel(BaseModel):
     name: str
@@ -75,6 +76,7 @@ class CeleryWorker:
         self.python_exe = python_exe
         self.config = config
         os.environ['DB_URL'] = self.config.sqluri
+        os.environ['PANDOC_EXE'] = self.config.pandoc_exe
         self.workers_model: Dict[str, WorkerModel] = {}
         self.redis_url = f"redis://{config.redis.host}:{config.redis.port}/{config.redis.db}"
         self.redis_client = redis.Redis(host=config.redis.host, port=config.redis.port, db=config.redis.db)
@@ -91,11 +93,11 @@ class CeleryWorker:
         while True:
             try:
                 worker_status = await self.check_worker_status()
+                workers_data = worker_status.get("workers", {}).get("data", [])
                 if worker_status["err"] != 0:
                     await asyncio.sleep(1)
                     continue
                 
-                workers_data = worker_status.get("workers", {}).get("data", [])
                 
                 logger.info(f"wait ... worker {len(workers_data)}")
                 # Check if our worker is in the list
@@ -111,7 +113,9 @@ class CeleryWorker:
                     return False
                 
                 await asyncio.sleep(1)
-            except Exception:
+            except Exception as e:
+                logger.error(f"Error waiting for worker {name}: {str(e)}")
+
                 await asyncio.sleep(1)
                 continue
 
@@ -150,7 +154,6 @@ class CeleryWorker:
         if not worker_model:
             raise ValueError(f"Invalid worker name: {name}")
         if in_cmd_windows:
-            logger.info(f"self.config.sqluri {self.config.sqluri}")
             cmd = ['start','cmd', '/c' ]
             sub_cmd = ' '.join(worker_model.cmd)
             cmd.append(f'{sub_cmd}')

+ 1 - 0
ui/backend/utils/config.py

@@ -49,6 +49,7 @@ class Config(BaseModel):
     sub: Optional[Sub] = Sub()
     select_proxy: Optional[str] = "system"
     mimo_exe: Optional[str] = str(PROXY_POLL_DIR / r"mihomo-windows-amd64-go120.exe")
+    pandoc_exe: Optional[str] = str(REPO_BASE_DIR / r"download\pandoc-3.6.3-windows-x86_64\pandoc.exe")
     worker_backend_py: Optional[str] = str(WORKER_DIR_BASE / r"worker\api\worker_server.py")
     sqluri: Optional[str] = r'G:\code\upwork\zhang_crawl_bio\output\temp.db'
     browser: Optional[Browser] = Browser()

+ 5 - 13
ui/fontend/src/components/WorkerCtrl.vue

@@ -26,17 +26,6 @@
                 :disabled="!workerStatus.search || loadingStates.search"
                 :loading="loadingStates.search"
                 @click="sendRequest('search', 'stop')">停止运行</el-button>
-              <el-button type="primary"
-                :disabled="!workerStatus.search || loadingStates.search"
-                @click="sendRequest('search', workerStatus.search_paused ? 'resume' : 'pause')">
-                {{ workerStatus.search_paused ? '恢复运行' : '暂停运行' }}
-              </el-button>
-              <el-button type="primary"
-                :disabled="!workerStatus.search || loadingStates.search"
-                @click="sendRequest('search', workerStatus.search_paused ? 'resume' : 'pause')">
-                {{ workerStatus.search_paused ? '恢复运行' : '暂停运行' }}
-              </el-button>
-
             </el-row>
             <el-row>
               <el-button type="primary" 
@@ -92,7 +81,7 @@
         <el-col :span="8">
           <el-card shadow="hover">
             <div class="task-container">
-              <span class="task-label">将结果页转换成文档:</span>
+              <span class="task-label">将结果页转换成文档:{{ taskCounts.convert }}</span>
               <el-button type="primary" 
                 :disabled="workerStatus.convert || loadingStates.convert"
                 :loading="loadingStates.convert"
@@ -101,10 +90,13 @@
                 :disabled="!workerStatus.convert || loadingStates.convert"
                 :loading="loadingStates.convert"
                 @click="sendRequest('convert', 'stop')">停止运行</el-button>
-                <el-button type="primary" 
+                <el-button type="primary"
                 :disabled="!workerStatus.convert || loadingStates.convert"
                 :loading="loadingStates.convert"
                 @click="sendRequest('convert', 'submit_all')">执行文档转换</el-button>
+              <el-button type="primary"
+                @click="sendRequest('convert', 'clean')">清空剩余任务
+              </el-button>
             </div>
           </el-card>
         </el-col>

+ 3 - 3
ui/fontend/tsconfig.app.json

@@ -4,9 +4,9 @@
     "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.app.tsbuildinfo",
 
     /* Linting */
-    "module": "ESNext",
-    "target": "ESNext", // 目标也设置为 esnext
-    "moduleResolution": "node", // 模块解析策略
+    "module": "ES2020",
+    "target": "ES2020",
+    "moduleResolution": "Node",
     "strict": true,
     "esModuleInterop": true, // 允许 CommonJS 和 ES 模块互操作
     "skipLibCheck": true, // 跳过库文件的类型检查

+ 4 - 0
ui/fontend/vite.config.ts

@@ -12,6 +12,10 @@ const pathSrc = path.resolve(__dirname, 'src')
 
 // https://vite.dev/config/
 export default defineConfig({
+  define: {
+    'import.meta.env': JSON.stringify(process.env)  // FIXME(review): exposes ALL server env vars in the client bundle and overwrites Vite's built-in import.meta.env (MODE/DEV/PROD) — prefer loadEnv() with VITE_-prefixed vars
+  },
+  envDir: './', // 指定环境文件目录
   plugins: [
     vue(),
     AutoImport({

+ 18 - 4
worker/celery/crawl_tasks.py

@@ -52,16 +52,30 @@ def crawl_all_unprocessed_pages_task(config: dict|CrawlTaskConfig):
         
         logger.info(f"找到 {len(page_ids)} 个未处理的页面,开始批量提交任务...")
         
-        task_group = group([
+        # 批量创建任务签名
+        task_signatures = [
             crawl_page_urls_task.s(
                 page_id=page_id,
                 config=config.model_dump()
             ).set(queue=config.queue_name)
             for page_id in page_ids
-        ])
+        ]
         
-        result = task_group.apply_async()
-        return {"status": "success", "task_id": result.id, "message": f"已启动 {len(page_ids)} 个页面爬取任务"}
+        # 分块处理(每100个任务为一组)
+        chunk_size = 100
+        results = []
+        for i in range(0, len(task_signatures), chunk_size):
+            chunk = task_signatures[i:i+chunk_size]
+            task_group = group(chunk)
+            group_result = task_group.apply_async()
+            results.append(group_result.id)
+            logger.info(f"已提交第 {i//chunk_size + 1} 批任务,共 {len(chunk)} 个")
+        
+        return {
+            "status": "success",
+            "task_ids": results,
+            "message": f"已启动 {len(page_ids)} 个页面爬取任务(分{len(results)}批)"
+        }
     except Exception as e:
         logger.error(f"批量提交页面任务失败: {str(e)}")
         raise

+ 51 - 28
worker/celery/html_convert_tasks.py

@@ -1,5 +1,5 @@
 from typing import List, Optional
-from celery import current_app
+from celery import current_app, group
 from pydantic import BaseModel, Field
 from worker.html_convert.pandoc import process_single_example, process_all_results
 from mylib.logu import get_logger
@@ -9,8 +9,9 @@ from worker.search_engine.valid_google_search import ValidSearchResult
 
 logger = get_logger('pandoc_tasks')
 class ConvertTaskParams(BaseModel):
-    result_ids: List[str] = Field(..., min_length=1)
-    batch_size: Optional[int] = Field(10, gt=0)
+    result_ids: List[str] = Field(..., min_length=0)
+    batch_size: Optional[int] = Field(0, gt=-1)
+    queue_name: Optional[str] = Field('convert_queue', description="任务队列名称")  # 新增队列参数
 
 @current_app.task(name='html_convert_tasks.convert_single_result')
 def convert_single_result_task(result_id: int):
@@ -34,36 +35,58 @@ def convert_single_result_task(result_id: int):
 
 @current_app.task(name='html_convert_tasks.convert_all_results')
 def convert_all_results_task(input_params: ConvertTaskParams=None):
-    """
-    Celery task to convert all SearchResultItems using Pandoc.
-    
-    Returns:
-        dict: Task result status.
-    """
+    """批量转换所有结果任务"""
     try:
-        logger.info("Starting Pandoc conversion for all SearchResultItems")
-        test_task_process_all_results()
-        logger.info("Pandoc conversion completed for all SearchResultItems")
-        return {"status": "completed"}
+        config = ConvertTaskParams(**input_params) if isinstance(input_params, dict) else (input_params or ConvertTaskParams(result_ids=[]))
+        valid_search = ValidSearchResult()
+        valid_items = valid_search.get_valid_search_result_items()
+        result_ids = [str(item.id) for item in valid_items]
+        logger.info(f"开始批量转换 {len(result_ids)} 个结果,队列: {config.queue_name}")
+        
+        # 创建任务签名并分组
+        task_signatures = [
+            convert_single_result_task.s(result_id).set(queue=config.queue_name)
+            for result_id in result_ids
+        ]
+        
+        # 分批次提交(每50个为一组)
+        chunk_size = 50
+        task_ids = []
+        for i in range(0, len(task_signatures), chunk_size):
+            chunk = task_signatures[i:i+chunk_size]
+            group_result = group(chunk).apply_async()
+            task_ids.append(group_result.id)
+            logger.info(f"已提交第 {i//chunk_size + 1} 批转换任务,共 {len(chunk)} 个")
+        
+        return {
+            "status": "success",
+            "task_ids": task_ids,
+            "message": f"已提交 {len(result_ids)} 个转换任务(分{len(task_ids)}批)"
+        }
     except Exception as e:
         logger.exception(f"Error during bulk Pandoc conversion: {str(e)}")
         return {"status": "failed", "error": str(e)}
-
 def test_task_process_all_results():
-    # Process all valid results using ValidSearchResult
-    valid_search = ValidSearchResult()
-    valid_items = valid_search.get_valid_search_result_items()
-    
-    logger.info(f"Total valid results: {len(valid_items)}")
-    logger.info(f"First 5 valid result IDs: {[item.id for item in valid_items[:5]]}")
-    
-    for item in valid_items:
-        try:
-            if item.html_path and item.html_path.endswith('.html'):
-                logger.info(f"Submitting task for valid SearchResultItem ID: {item.id}")
-                convert_single_result_task.delay(item.id)
-        except Exception as e:
-            logger.error(f"Error processing valid result {item.id}: {e}")
+    """批量提交转换任务(测试用)"""
+    try:
+        valid_search = ValidSearchResult()
+        valid_items = valid_search.get_valid_search_result_items()
+        
+        logger.info(f"找到 {len(valid_items)} 个有效结果,开始批量提交...")
+        
+        # 创建任务参数
+        params = ConvertTaskParams(
+            result_ids=[str(item.id) for item in valid_items],
+            queue_name='convert_queue'
+        )
+        
+        # 调用转换任务
+        result = convert_all_results_task(params)
+        logger.info(f"批量提交完成,任务ID: {result.get('task_ids', [])}")
+        return result
+    except Exception as e:
+        logger.error(f"批量提交转换任务失败: {str(e)}")
+        raise RuntimeError(f"批量提交转换任务失败 {str(e)}") from e
 
 def clear_existing_tasks():
     """清除所有待处理的任务"""

+ 2 - 2
worker/html_convert/pandoc.py

@@ -81,10 +81,10 @@ class PandocConverter:
             
             return result.returncode == 0
         except subprocess.CalledProcessError as e:
-            logger.error(f"Pandoc conversion error for {md_path}: {e.stderr}")
+            logger.exception(f"Pandoc conversion error for {md_path}: {e.stderr}")
             return False
         except Exception as e:
-            logger.error(f"Error converting {md_path} to docx: {e}")
+            logger.exception(f"Error converting {md_path} to docx: {e}")
             return False
     
     def process_single_result(self, result_id: int, skip_existing: bool = True) -> bool: