hai 1 ano · 2989a436d1
--- a/worker/html_convert/pandoc.py
+++ b/worker/html_convert/pandoc.py
@@ -12,85 +12,137 @@ from mylib.base import ensure_output_dir, save_to_file, load_from_pickle
 
				 from mylib.logu import logger
			
 
				 from config.settings import PANDOC_EXE
			
 
				 from worker.html_convert.models import HtmlConvertResult
			
 
				+from worker.html_convert.docling_converter import DoclingConverter
			
 
				+from worker.html_convert.crawl_filter import CrawlFilter
			
 
				 import subprocess
			
 
				 
			
 
				-async def convert_md_to_docx(md_path: Path, output_path: Path) -> bool:
			
 
				-    """Convert markdown file to docx using pandoc"""
			
 
				-    try:
			
 
				-        cmd = [
			
 
				-            PANDOC_EXE,
			
 
				-            '-f', 'markdown',
			
 
				-            '-t', 'docx',
			
 
				-            '-o', str(output_path),
			
 
				-            str(md_path)
			
 
				-        ]
			
 
				-        result = subprocess.run(cmd, check=True)
			
 
				-        return result.returncode == 0
			
 
				-    except Exception as e:
			
 
				-        logger.error(f"Error converting {md_path} to docx: {e}")
			
 
				-        return False
			
 
				-
			
 
				-async def process_single_result(result_id: int) -> bool:
			
 
				-    """Process a single SearchResultItem and convert both filtered and docling markdown files to docx"""
			
 
				-    db_manager = SearchResultManager()
			
 
				+class PandocConverter:
			
 
				+    """Class for handling Pandoc conversions with customizable options"""
			
 
				     
			
 
				-    with Session(db_manager.engine) as session:
			
 
				-        # Get the search result item
			
 
				-        result_item = session.get(SearchResultItem, result_id)
			
 
				-        if not result_item:
			
 
				-            logger.error(f"SearchResultItem with ID {result_id} not found")
			
 
				-            return False
			
 
				+    def __init__(self, font_name: str = "宋体", include_toc: bool = False):
			
 
				+        """
			
 
				+        Initialize PandocConverter with optional parameters
			
 
				         
			
 
				-        # Get the HTML convert result
			
 
				-        html_convert = session.exec(
			
 
				-            select(HtmlConvertResult)
			
 
				-            .where(HtmlConvertResult.search_result_item_id == result_id)
			
 
				-        ).first()
			
 
				-        
			
 
				-        if not html_convert:
			
 
				-            logger.error(f"No HtmlConvertResult found for SearchResultItem {result_id}")
			
 
				+        Args:
			
 
				+            font_name (str): The default font to use in DOCX output
			
 
				+            include_toc (bool): Whether to include table of contents in DOCX output
			
 
				+        """
			
 
				+        self.font_name = font_name
			
 
				+        self.include_toc = include_toc
			
 
				+    
			
 
				+    def convert_md_to_docx(self, md_path: Path, output_path: Path) -> bool:
			
 
				+        """Convert markdown file to docx using pandoc with custom options"""
			
 
				+        try:
			
 
				+            cmd = [
			
 
				+                PANDOC_EXE,
			
 
				+                '-f', 'markdown',
			
 
				+                '-t', 'docx',
			
 
				+                '--reference-doc', self._get_reference_doc(),
			
 
				+                '-o', str(output_path),
			
 
				+                str(md_path)
			
 
				+            ]
			
 
				+            
			
 
				+            if self.include_toc:
			
 
				+                cmd.insert(-1, '--toc')
			
 
				+            
			
 
				+            result = subprocess.run(cmd, check=True)
			
 
				+            return result.returncode == 0
			
 
				+        except Exception as e:
			
 
				+            logger.error(f"Error converting {md_path} to docx: {e}")
			
 
				             return False
			
 
				+    
			
 
				+    def process_single_result(self, result_id: int) -> bool:
			
 
				+        """Process a single SearchResultItem and convert both filtered and docling markdown files to docx"""
			
 
				+        db_manager = SearchResultManager()
			
 
				         
			
 
				-        # Initialize success flags
			
 
				-        docling_success = False
			
 
				-        filtered_success = False
			
 
				-        
			
 
				-        # Convert docling markdown if available
			
 
				-        if html_convert.docling_md_path:
			
 
				-            docling_md_path = Path(html_convert.docling_md_path)
			
 
				-            docling_docx_path = docling_md_path.with_suffix('.docx')
			
 
				-            docling_success = await convert_md_to_docx(docling_md_path, docling_docx_path)
			
 
				-            if docling_success:
			
 
				-                html_convert.pandoc_docx_path = str(docling_docx_path)
			
 
				-                html_convert.is_pandoc_converted = True
			
 
				-                logger.info(f"Successfully converted docling markdown to {docling_docx_path}")
			
 
				-        
			
 
				-        # Convert filtered markdown if available
			
 
				-        if html_convert.filter_crawl_md_path:
			
 
				-            filtered_md_path = Path(html_convert.filter_crawl_md_path)
			
 
				-            filtered_docx_path = filtered_md_path.with_suffix('.docx')
			
 
				-            filtered_success = await convert_md_to_docx(filtered_md_path, filtered_docx_path)
			
 
				-            if filtered_success:
			
 
				-                html_convert.pandoc_docx_path = str(filtered_docx_path)
			
 
				-                html_convert.is_pandoc_converted = True
			
 
				-                logger.info(f"Successfully converted filtered markdown to {filtered_docx_path}")
			
 
				+        with Session(db_manager.engine) as session:
			
 
				+            # Get the search result item
			
 
				+            result_item = session.get(SearchResultItem, result_id)
			
 
				+            if not result_item:
			
 
				+                logger.error(f"SearchResultItem with ID {result_id} not found")
			
 
				+                return False
			
 
				+            
			
 
				+            # Get the HTML convert result
			
 
				+            html_convert = session.exec(
			
 
				+                select(HtmlConvertResult)
			
 
				+                .where(HtmlConvertResult.search_result_item_id == result_id)
			
 
				+            ).first()
			
 
				+            
			
 
				+            if not html_convert:
			
 
				+                logger.error(f"No HtmlConvertResult found for SearchResultItem {result_id}")
			
 
				+                return False
			
 
				+            
			
 
				+            # Initialize success flags
			
 
				+            docling_success = False
			
 
				+            filtered_success = False
			
 
				+            
			
 
				+            # Convert docling markdown if available
			
 
				+            if html_convert.docling_md_path:
			
 
				+                docling_md_path = Path(html_convert.docling_md_path)
			
 
				+                docling_docx_path = docling_md_path.with_suffix('.docx')
			
 
				+                docling_success = self.convert_md_to_docx(docling_md_path, docling_docx_path)
			
 
				+                if docling_success:
			
 
				+                    html_convert.pandoc_docx_path = str(docling_docx_path)
			
 
				+                    html_convert.is_pandoc_converted = True
			
 
				+                    logger.info(f"Successfully converted docling markdown to {docling_docx_path}")
			
 
				+            
			
 
				+            # Convert filtered markdown if available
			
 
				+            if html_convert.filter_crawl_md_path:
			
 
				+                filtered_md_path = Path(html_convert.filter_crawl_md_path)
			
 
				+                filtered_docx_path = filtered_md_path.with_suffix('.docx')
			
 
				+                filtered_success = self.convert_md_to_docx(filtered_md_path, filtered_docx_path)
			
 
				+                if filtered_success:
			
 
				+                    html_convert.pandoc_docx_path = str(filtered_docx_path)
			
 
				+                    html_convert.is_pandoc_converted = True
			
 
				+                    logger.info(f"Successfully converted filtered markdown to {filtered_docx_path}")
			
 
				+            
			
 
				+            # Update database if either conversion succeeded
			
 
				+            if docling_success or filtered_success:
			
 
				+                session.add(html_convert)
			
 
				+                session.commit()
			
 
				+                return True
			
 
				+            
			
 
				+            return False
			
 
				+    
			
 
				+    def _get_reference_doc(self) -> str:
			
 
				+        """Get path to reference document with specified font"""
			
 
				+        reference_dir = Path(__file__).parent / "reference_docs"
			
 
				+        reference_dir.mkdir(exist_ok=True)
			
 
				+        reference_doc = reference_dir / f"{self.font_name.replace(' ', '_')}.docx"
			
 
				         
			
 
				-        # Update database if either conversion succeeded
			
 
				-        if docling_success or filtered_success:
			
 
				-            session.add(html_convert)
			
 
				-            session.commit()
			
 
				-            return True
			
 
				+        if not reference_doc.exists():
			
 
				+            self._create_reference_doc(reference_doc)
			
 
				         
			
 
				-        return False
			
 
				+        return str(reference_doc)
			
 
				+    
			
 
				+    def _create_reference_doc(self, reference_doc: Path):
			
 
				+        """Create reference document with specified font"""
			
 
				+        from docx import Document
			
 
				+        doc = Document()
			
 
				+        style = doc.styles['Normal']
			
 
				+        font = style.font
			
 
				+        font.name = self.font_name
			
 
				+        doc.save(str(reference_doc))
			
 
				 
			
 
				-async def main():
			
 
				+def main():
			
 
				     # Example: Process a single result with ID 21567
			
 
				-    result_id = 21567
			
 
				-    success = await process_single_result(result_id)
			
 
				+    result_id = 21566
			
 
				+    
			
 
				+    # First, process the Docling conversion
			
 
				+    docling_converter = DoclingConverter()
			
 
				+    docling_converter.process_conversion_by_id(result_id)
			
 
				+    
			
 
				+    # Then, process the Crawl Filter conversion
			
 
				+    crawl_filter = CrawlFilter()
			
 
				+    crawl_filter.process_filter_by_id(result_id)
			
 
				+    
			
 
				+    # Finally, convert both results to DOCX using Pandoc with custom options
			
 
				+    pandoc_converter = PandocConverter(font_name="微软雅黑", include_toc=True)
			
 
				+    success = pandoc_converter.process_single_result(result_id)
			
 
				     if success:
			
 
				         logger.info(f"Successfully processed result {result_id}")
			
 
				     else:
			
 
				         logger.error(f"Failed to process result {result_id}")
			
 
				 
			
 
				 if __name__ == "__main__":
			
 
				-    asyncio.run(main())
			
 
				+    main()