há 1 ano atrás · 5b278ce3be
--- a/worker/html_convert/pandoc.py
+++ b/worker/html_convert/pandoc.py
@@ -8,6 +8,89 @@ import ssl
 
				 from sqlmodel import select, Session
			
 
				 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, CrawlResult
			
 
				 from worker.search_engine.search_result_db import SearchResultManager, KeywordTask, SearchPageResult, SearchResultItem
			
 
				-from mylib.base import ensure_output_dir, save_to_file,load_from_pickle
			
 
				+from mylib.base import ensure_output_dir, save_to_file, load_from_pickle
			
 
				 from mylib.logu import logger
			
 
				-from config.settings import PROXY_POOL_BASE_URL
			
 
				+from config.settings import PANDOC_EXE
			
 
				+from worker.html_convert.models import HtmlConvertResult
			
 
				+import subprocess
			
 
				+
			
 
				+async def convert_md_to_docx(md_path: Path, output_path: Path) -> bool:
			
 
				+    """Convert markdown file to docx using pandoc"""
			
 
				+    try:
			
 
				+        cmd = [
			
 
				+            PANDOC_EXE,
			
 
				+            '-f', 'markdown',
			
 
				+            '-t', 'docx',
			
 
				+            '-o', str(output_path),
			
 
				+            str(md_path)
			
 
				+        ]
			
 
				+        result = subprocess.run(cmd, check=True)
			
 
				+        return result.returncode == 0
			
 
				+    except Exception as e:
			
 
				+        logger.error(f"Error converting {md_path} to docx: {e}")
			
 
				+        return False
			
 
				+
			
 
				+async def process_single_result(result_id: int) -> bool:
			
 
				+    """Process a single SearchResultItem and convert both filtered and docling markdown files to docx"""
			
 
				+    db_manager = SearchResultManager()
			
 
				+    
			
 
				+    with Session(db_manager.engine) as session:
			
 
				+        # Get the search result item
			
 
				+        result_item = session.get(SearchResultItem, result_id)
			
 
				+        if not result_item:
			
 
				+            logger.error(f"SearchResultItem with ID {result_id} not found")
			
 
				+            return False
			
 
				+        
			
 
				+        # Get the HTML convert result
			
 
				+        html_convert = session.exec(
			
 
				+            select(HtmlConvertResult)
			
 
				+            .where(HtmlConvertResult.search_result_item_id == result_id)
			
 
				+        ).first()
			
 
				+        
			
 
				+        if not html_convert:
			
 
				+            logger.error(f"No HtmlConvertResult found for SearchResultItem {result_id}")
			
 
				+            return False
			
 
				+        
			
 
				+        # Initialize success flags
			
 
				+        docling_success = False
			
 
				+        filtered_success = False
			
 
				+        
			
 
				+        # Convert docling markdown if available
			
 
				+        if html_convert.docling_md_path:
			
 
				+            docling_md_path = Path(html_convert.docling_md_path)
			
 
				+            docling_docx_path = docling_md_path.with_suffix('.docx')
			
 
				+            docling_success = await convert_md_to_docx(docling_md_path, docling_docx_path)
			
 
				+            if docling_success:
			
 
				+                html_convert.pandoc_docx_path = str(docling_docx_path)
			
 
				+                html_convert.is_pandoc_converted = True
			
 
				+                logger.info(f"Successfully converted docling markdown to {docling_docx_path}")
			
 
				+        
			
 
				+        # Convert filtered markdown if available
			
 
				+        if html_convert.filter_crawl_md_path:
			
 
				+            filtered_md_path = Path(html_convert.filter_crawl_md_path)
			
 
				+            filtered_docx_path = filtered_md_path.with_suffix('.docx')
			
 
				+            filtered_success = await convert_md_to_docx(filtered_md_path, filtered_docx_path)
			
 
				+            if filtered_success:
			
 
				+                html_convert.pandoc_docx_path = str(filtered_docx_path)
			
 
				+                html_convert.is_pandoc_converted = True
			
 
				+                logger.info(f"Successfully converted filtered markdown to {filtered_docx_path}")
			
 
				+        
			
 
				+        # Update database if either conversion succeeded
			
 
				+        if docling_success or filtered_success:
			
 
				+            session.add(html_convert)
			
 
				+            session.commit()
			
 
				+            return True
			
 
				+        
			
 
				+        return False
			
 
				+
			
 
				+async def main():
			
 
				+    # Example: Process a single result with ID 21567
			
 
				+    result_id = 21567
			
 
				+    success = await process_single_result(result_id)
			
 
				+    if success:
			
 
				+        logger.info(f"Successfully processed result {result_id}")
			
 
				+    else:
			
 
				+        logger.error(f"Failed to process result {result_id}")
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    asyncio.run(main())