1 年之前 · 2989a436d1
--- a/worker/html_convert/pandoc.py
+++ b/worker/html_convert/pandoc.py
@@ -12,85 +12,137 @@ from mylib.base import ensure_output_dir, save_to_file, load_from_pickle
 
															 from mylib.logu import logger
														
 
															 from config.settings import PANDOC_EXE
														
 
															 from worker.html_convert.models import HtmlConvertResult
														
 
															+from worker.html_convert.docling_converter import DoclingConverter
														
 
															+from worker.html_convert.crawl_filter import CrawlFilter
														
 
															 import subprocess
														
 
															-async def convert_md_to_docx(md_path: Path, output_path: Path) -> bool:
														
 
															-    """Convert markdown file to docx using pandoc"""
														
 
															-    try:
														
 
															-        cmd = [
														
 
															-            PANDOC_EXE,
														
 
															-            '-f', 'markdown',
														
 
															-            '-t', 'docx',
														
 
															-            '-o', str(output_path),
														
 
															-            str(md_path)
														
 
															-        ]
														
 
															-        result = subprocess.run(cmd, check=True)
														
 
															-        return result.returncode == 0
														
 
															-    except Exception as e:
														
 
															-        logger.error(f"Error converting {md_path} to docx: {e}")
														
 
															-        return False
														
 
															-
														
 
															-async def process_single_result(result_id: int) -> bool:
														
 
															-    """Process a single SearchResultItem and convert both filtered and docling markdown files to docx"""
														
 
															-    db_manager = SearchResultManager()
														
 
															+class PandocConverter:
														
 
															+    """Class for handling Pandoc conversions with customizable options"""
														
 
															-    with Session(db_manager.engine) as session:
														
 
															-        # Get the search result item
														
 
															-        result_item = session.get(SearchResultItem, result_id)
														
 
															-        if not result_item:
														
 
															-            logger.error(f"SearchResultItem with ID {result_id} not found")
														
 
															-            return False
														
 
															+    def __init__(self, font_name: str = "宋体", include_toc: bool = False):
														
 
															+        """
														
 
															+        Initialize PandocConverter with optional parameters
														
 
															-        # Get the HTML convert result
														
 
															-        html_convert = session.exec(
														
 
															-            select(HtmlConvertResult)
														
 
															-            .where(HtmlConvertResult.search_result_item_id == result_id)
														
 
															-        ).first()
														
 
															-        
														
 
															-        if not html_convert:
														
 
															-            logger.error(f"No HtmlConvertResult found for SearchResultItem {result_id}")
														
 
															+        Args:
														
 
															+            font_name (str): The default font to use in DOCX output
														
 
															+            include_toc (bool): Whether to include table of contents in DOCX output
														
 
															+        """
														
 
															+        self.font_name = font_name
														
 
															+        self.include_toc = include_toc
														
 
															+    
														
 
															+    def convert_md_to_docx(self, md_path: Path, output_path: Path) -> bool:
														
 
															+        """Convert markdown file to docx using pandoc with custom options"""
														
 
															+        try:
														
 
															+            cmd = [
														
 
															+                PANDOC_EXE,
														
 
															+                '-f', 'markdown',
														
 
															+                '-t', 'docx',
														
 
															+                '--reference-doc', self._get_reference_doc(),
														
 
															+                '-o', str(output_path),
														
 
															+                str(md_path)
														
 
															+            ]
														
 
															+            
														
 
															+            if self.include_toc:
														
 
															+                cmd.insert(-1, '--toc')
														
 
															+            
														
 
															+            result = subprocess.run(cmd, check=True)
														
 
															+            return result.returncode == 0
														
 
															+        except Exception as e:
														
 
															+            logger.error(f"Error converting {md_path} to docx: {e}")
														
 
															             return False
														
 
															+    
														
 
															+    def process_single_result(self, result_id: int) -> bool:
														
 
															+        """Process a single SearchResultItem and convert both filtered and docling markdown files to docx"""
														
 
															+        db_manager = SearchResultManager()
														
 
															-        # Initialize success flags
														
 
															-        docling_success = False
														
 
															-        filtered_success = False
														
 
															-        
														
 
															-        # Convert docling markdown if available
														
 
															-        if html_convert.docling_md_path:
														
 
															-            docling_md_path = Path(html_convert.docling_md_path)
														
 
															-            docling_docx_path = docling_md_path.with_suffix('.docx')
														
 
															-            docling_success = await convert_md_to_docx(docling_md_path, docling_docx_path)
														
 
															-            if docling_success:
														
 
															-                html_convert.pandoc_docx_path = str(docling_docx_path)
														
 
															-                html_convert.is_pandoc_converted = True
														
 
															-                logger.info(f"Successfully converted docling markdown to {docling_docx_path}")
														
 
															-        
														
 
															-        # Convert filtered markdown if available
														
 
															-        if html_convert.filter_crawl_md_path:
														
 
															-            filtered_md_path = Path(html_convert.filter_crawl_md_path)
														
 
															-            filtered_docx_path = filtered_md_path.with_suffix('.docx')
														
 
															-            filtered_success = await convert_md_to_docx(filtered_md_path, filtered_docx_path)
														
 
															-            if filtered_success:
														
 
															-                html_convert.pandoc_docx_path = str(filtered_docx_path)
														
 
															-                html_convert.is_pandoc_converted = True
														
 
															-                logger.info(f"Successfully converted filtered markdown to {filtered_docx_path}")
														
 
															+        with Session(db_manager.engine) as session:
														
 
															+            # Get the search result item
														
 
															+            result_item = session.get(SearchResultItem, result_id)
														
 
															+            if not result_item:
														
 
															+                logger.error(f"SearchResultItem with ID {result_id} not found")
														
 
															+                return False
														
 
															+            
														
 
															+            # Get the HTML convert result
														
 
															+            html_convert = session.exec(
														
 
															+                select(HtmlConvertResult)
														
 
															+                .where(HtmlConvertResult.search_result_item_id == result_id)
														
 
															+            ).first()
														
 
															+            
														
 
															+            if not html_convert:
														
 
															+                logger.error(f"No HtmlConvertResult found for SearchResultItem {result_id}")
														
 
															+                return False
														
 
															+            
														
 
															+            # Initialize success flags
														
 
															+            docling_success = False
														
 
															+            filtered_success = False
														
 
															+            
														
 
															+            # Convert docling markdown if available
														
 
															+            if html_convert.docling_md_path:
														
 
															+                docling_md_path = Path(html_convert.docling_md_path)
														
 
															+                docling_docx_path = docling_md_path.with_suffix('.docx')
														
 
															+                docling_success = self.convert_md_to_docx(docling_md_path, docling_docx_path)
														
 
															+                if docling_success:
														
 
															+                    html_convert.pandoc_docx_path = str(docling_docx_path)
														
 
															+                    html_convert.is_pandoc_converted = True
														
 
															+                    logger.info(f"Successfully converted docling markdown to {docling_docx_path}")
														
 
															+            
														
 
															+            # Convert filtered markdown if available
														
 
															+            if html_convert.filter_crawl_md_path:
														
 
															+                filtered_md_path = Path(html_convert.filter_crawl_md_path)
														
 
															+                filtered_docx_path = filtered_md_path.with_suffix('.docx')
														
 
															+                filtered_success = self.convert_md_to_docx(filtered_md_path, filtered_docx_path)
														
 
															+                if filtered_success:
														
 
															+                    html_convert.pandoc_docx_path = str(filtered_docx_path)
														
 
															+                    html_convert.is_pandoc_converted = True
														
 
															+                    logger.info(f"Successfully converted filtered markdown to {filtered_docx_path}")
														
 
															+            
														
 
															+            # Update database if either conversion succeeded
														
 
															+            if docling_success or filtered_success:
														
 
															+                session.add(html_convert)
														
 
															+                session.commit()
														
 
															+                return True
														
 
															+            
														
 
															+            return False
														
 
															+    
														
 
															+    def _get_reference_doc(self) -> str:
														
 
															+        """Get path to reference document with specified font"""
														
 
															+        reference_dir = Path(__file__).parent / "reference_docs"
														
 
															+        reference_dir.mkdir(exist_ok=True)
														
 
															+        reference_doc = reference_dir / f"{self.font_name.replace(' ', '_')}.docx"
														
 
															-        # Update database if either conversion succeeded
														
 
															-        if docling_success or filtered_success:
														
 
															-            session.add(html_convert)
														
 
															-            session.commit()
														
 
															-            return True
														
 
															+        if not reference_doc.exists():
														
 
															+            self._create_reference_doc(reference_doc)
														
 
															-        return False
														
 
															+        return str(reference_doc)
														
 
															+    
														
 
															+    def _create_reference_doc(self, reference_doc: Path):
														
 
															+        """Create reference document with specified font"""
														
 
															+        from docx import Document
														
 
															+        doc = Document()
														
 
															+        style = doc.styles['Normal']
														
 
															+        font = style.font
														
 
															+        font.name = self.font_name
														
 
															+        doc.save(str(reference_doc))
														
 
															-async def main():
														
 
															+def main():
														
 
															     # Example: Process a single result with ID 21567
														
 
															-    result_id = 21567
														
 
															-    success = await process_single_result(result_id)
														
 
															+    result_id = 21566
														
 
															+    
														
 
															+    # First, process the Docling conversion
														
 
															+    docling_converter = DoclingConverter()
														
 
															+    docling_converter.process_conversion_by_id(result_id)
														
 
															+    
														
 
															+    # Then, process the Crawl Filter conversion
														
 
															+    crawl_filter = CrawlFilter()
														
 
															+    crawl_filter.process_filter_by_id(result_id)
														
 
															+    
														
 
															+    # Finally, convert both results to DOCX using Pandoc with custom options
														
 
															+    pandoc_converter = PandocConverter(font_name="微软雅黑", include_toc=True)
														
 
															+    success = pandoc_converter.process_single_result(result_id)
														
 
															     if success:
														
 
															         logger.info(f"Successfully processed result {result_id}")
														
 
															     else:
														
 
															         logger.error(f"Failed to process result {result_id}")
														
 
															 if __name__ == "__main__":
														
 
															-    asyncio.run(main())
														
 
															+    main()