Ver Fonte

完成 pandoc 转换,但是数据库重复记录导致无法同时转换多个docx

mrh há 1 ano atrás
pai
commit
5b278ce3be
1 ficheiros alterados com 85 adições e 2 exclusões
  1. 85 2
      worker/html_convert/pandoc.py

+ 85 - 2
worker/html_convert/pandoc.py

@@ -8,6 +8,89 @@ import ssl
 from sqlmodel import select, Session
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, CrawlResult
 from worker.search_engine.search_result_db import SearchResultManager, KeywordTask, SearchPageResult, SearchResultItem
-from mylib.base import ensure_output_dir, save_to_file,load_from_pickle
+from mylib.base import ensure_output_dir, save_to_file, load_from_pickle
 from mylib.logu import logger
-from config.settings import PROXY_POOL_BASE_URL
+from config.settings import PANDOC_EXE
+from worker.html_convert.models import HtmlConvertResult
+import subprocess
+
+async def convert_md_to_docx(md_path: Path, output_path: Path) -> bool:
+    """Convert markdown file to docx using pandoc"""
+    try:
+        cmd = [
+            PANDOC_EXE,
+            '-f', 'markdown',
+            '-t', 'docx',
+            '-o', str(output_path),
+            str(md_path)
+        ]
+        result = subprocess.run(cmd, check=True)
+        return result.returncode == 0
+    except Exception as e:
+        logger.error(f"Error converting {md_path} to docx: {e}")
+        return False
+
+async def process_single_result(result_id: int) -> bool:
+    """Process a single SearchResultItem and convert both filtered and docling markdown files to docx"""
+    db_manager = SearchResultManager()
+    
+    with Session(db_manager.engine) as session:
+        # Get the search result item
+        result_item = session.get(SearchResultItem, result_id)
+        if not result_item:
+            logger.error(f"SearchResultItem with ID {result_id} not found")
+            return False
+        
+        # Get the HTML convert result
+        html_convert = session.exec(
+            select(HtmlConvertResult)
+            .where(HtmlConvertResult.search_result_item_id == result_id)
+        ).first()
+        
+        if not html_convert:
+            logger.error(f"No HtmlConvertResult found for SearchResultItem {result_id}")
+            return False
+        
+        # Initialize success flags
+        docling_success = False
+        filtered_success = False
+        
+        # Convert docling markdown if available
+        if html_convert.docling_md_path:
+            docling_md_path = Path(html_convert.docling_md_path)
+            docling_docx_path = docling_md_path.with_suffix('.docx')
+            docling_success = await convert_md_to_docx(docling_md_path, docling_docx_path)
+            if docling_success:
+                html_convert.pandoc_docx_path = str(docling_docx_path)
+                html_convert.is_pandoc_converted = True
+                logger.info(f"Successfully converted docling markdown to {docling_docx_path}")
+        
+        # Convert filtered markdown if available
+        if html_convert.filter_crawl_md_path:
+            filtered_md_path = Path(html_convert.filter_crawl_md_path)
+            filtered_docx_path = filtered_md_path.with_suffix('.docx')
+            filtered_success = await convert_md_to_docx(filtered_md_path, filtered_docx_path)
+            if filtered_success:
+                html_convert.pandoc_docx_path = str(filtered_docx_path)
+                html_convert.is_pandoc_converted = True
+                logger.info(f"Successfully converted filtered markdown to {filtered_docx_path}")
+        
+        # Update database if either conversion succeeded
+        if docling_success or filtered_success:
+            session.add(html_convert)
+            session.commit()
+            return True
+        
+        return False
+
+async def main():
+    # Example: Process a single result with ID 21567
+    result_id = 21567
+    success = await process_single_result(result_id)
+    if success:
+        logger.info(f"Successfully processed result {result_id}")
+    else:
+        logger.error(f"Failed to process result {result_id}")
+
+# Run the example entry point only when this file is executed as a script.
+if __name__ == "__main__":
+    asyncio.run(main())