Explorar o código

完成 pandoc 转换,并且成功添加字体修改,但是目录不正常

mrh hai 1 ano
pai
achega
2989a436d1
Modificáronse 1 ficheiros con 118 adicións e 66 borrados
  1. 118 66
      worker/html_convert/pandoc.py

+ 118 - 66
worker/html_convert/pandoc.py

@@ -12,85 +12,137 @@ from mylib.base import ensure_output_dir, save_to_file, load_from_pickle
 from mylib.logu import logger
 from config.settings import PANDOC_EXE
 from worker.html_convert.models import HtmlConvertResult
+from worker.html_convert.docling_converter import DoclingConverter
+from worker.html_convert.crawl_filter import CrawlFilter
 import subprocess
 
-async def convert_md_to_docx(md_path: Path, output_path: Path) -> bool:
-    """Convert markdown file to docx using pandoc"""
-    try:
-        cmd = [
-            PANDOC_EXE,
-            '-f', 'markdown',
-            '-t', 'docx',
-            '-o', str(output_path),
-            str(md_path)
-        ]
-        result = subprocess.run(cmd, check=True)
-        return result.returncode == 0
-    except Exception as e:
-        logger.error(f"Error converting {md_path} to docx: {e}")
-        return False
-
-async def process_single_result(result_id: int) -> bool:
-    """Process a single SearchResultItem and convert both filtered and docling markdown files to docx"""
-    db_manager = SearchResultManager()
+class PandocConverter:
+    """Class for handling Pandoc conversions with customizable options"""
     
-    with Session(db_manager.engine) as session:
-        # Get the search result item
-        result_item = session.get(SearchResultItem, result_id)
-        if not result_item:
-            logger.error(f"SearchResultItem with ID {result_id} not found")
-            return False
+    def __init__(self, font_name: str = "宋体", include_toc: bool = False):
+        """
+        Initialize PandocConverter with optional parameters
         
-        # Get the HTML convert result
-        html_convert = session.exec(
-            select(HtmlConvertResult)
-            .where(HtmlConvertResult.search_result_item_id == result_id)
-        ).first()
-        
-        if not html_convert:
-            logger.error(f"No HtmlConvertResult found for SearchResultItem {result_id}")
+        Args:
+            font_name (str): The default font to use in DOCX output
+            include_toc (bool): Whether to include table of contents in DOCX output
+        """
+        self.font_name = font_name  # also used to name and build the cached reference .docx
+        self.include_toc = include_toc  # when true, '--toc' is added to the pandoc command
+    
+    def convert_md_to_docx(self, md_path: Path, output_path: Path) -> bool:
+        """Convert markdown file to docx using pandoc with custom options"""
+        try:
+            cmd = [
+                PANDOC_EXE,
+                '-f', 'markdown',
+                '-t', 'docx',
+                '--reference-doc', self._get_reference_doc(),  # template .docx, created on first use
+                '-o', str(output_path),
+                str(md_path)
+            ]
+            
+            if self.include_toc:
+                cmd.insert(-1, '--toc')  # index -1 places the flag just before the trailing input path
+            
+            result = subprocess.run(cmd, check=True)  # check=True raises on a non-zero exit status
+            return result.returncode == 0  # always True when reached, given check=True above
+        except Exception as e:  # any failure, including the one raised by check=True, is logged and reported as False
+            logger.error(f"Error converting {md_path} to docx: {e}")
             return False
+    
+    def process_single_result(self, result_id: int) -> bool:
+        """Process a single SearchResultItem and convert both filtered and docling markdown files to docx"""
+        db_manager = SearchResultManager()
         
-        # Initialize success flags
-        docling_success = False
-        filtered_success = False
-        
-        # Convert docling markdown if available
-        if html_convert.docling_md_path:
-            docling_md_path = Path(html_convert.docling_md_path)
-            docling_docx_path = docling_md_path.with_suffix('.docx')
-            docling_success = await convert_md_to_docx(docling_md_path, docling_docx_path)
-            if docling_success:
-                html_convert.pandoc_docx_path = str(docling_docx_path)
-                html_convert.is_pandoc_converted = True
-                logger.info(f"Successfully converted docling markdown to {docling_docx_path}")
-        
-        # Convert filtered markdown if available
-        if html_convert.filter_crawl_md_path:
-            filtered_md_path = Path(html_convert.filter_crawl_md_path)
-            filtered_docx_path = filtered_md_path.with_suffix('.docx')
-            filtered_success = await convert_md_to_docx(filtered_md_path, filtered_docx_path)
-            if filtered_success:
-                html_convert.pandoc_docx_path = str(filtered_docx_path)
-                html_convert.is_pandoc_converted = True
-                logger.info(f"Successfully converted filtered markdown to {filtered_docx_path}")
+        with Session(db_manager.engine) as session:
+            # Get the search result item
+            result_item = session.get(SearchResultItem, result_id)
+            if not result_item:
+                logger.error(f"SearchResultItem with ID {result_id} not found")
+                return False
+            
+            # Get the HTML convert result
+            html_convert = session.exec(
+                select(HtmlConvertResult)
+                .where(HtmlConvertResult.search_result_item_id == result_id)
+            ).first()
+            
+            if not html_convert:
+                logger.error(f"No HtmlConvertResult found for SearchResultItem {result_id}")
+                return False
+            
+            # Initialize success flags
+            docling_success = False
+            filtered_success = False
+            
+            # Convert docling markdown if available
+            if html_convert.docling_md_path:
+                docling_md_path = Path(html_convert.docling_md_path)
+                docling_docx_path = docling_md_path.with_suffix('.docx')  # output sits next to the source .md
+                docling_success = self.convert_md_to_docx(docling_md_path, docling_docx_path)
+                if docling_success:
+                    html_convert.pandoc_docx_path = str(docling_docx_path)
+                    html_convert.is_pandoc_converted = True
+                    logger.info(f"Successfully converted docling markdown to {docling_docx_path}")
+            
+            # Convert filtered markdown if available
+            if html_convert.filter_crawl_md_path:
+                filtered_md_path = Path(html_convert.filter_crawl_md_path)
+                filtered_docx_path = filtered_md_path.with_suffix('.docx')  # output sits next to the source .md
+                filtered_success = self.convert_md_to_docx(filtered_md_path, filtered_docx_path)
+                if filtered_success:
+                    html_convert.pandoc_docx_path = str(filtered_docx_path)  # NOTE(review): replaces the docling path stored above when both conversions succeed
+                    html_convert.is_pandoc_converted = True
+                    logger.info(f"Successfully converted filtered markdown to {filtered_docx_path}")
+            
+            # Update database if either conversion succeeded
+            if docling_success or filtered_success:  # one successful conversion is enough to commit
+                session.add(html_convert)
+                session.commit()
+                return True
+            
+            return False
+    
+    def _get_reference_doc(self) -> str:
+        """Get path to reference document with specified font"""
+        reference_dir = Path(__file__).parent / "reference_docs"  # directory located next to this module
+        reference_dir.mkdir(exist_ok=True)
+        reference_doc = reference_dir / f"{self.font_name.replace(' ', '_')}.docx"  # one file per font name; spaces become underscores
         
-        # Update database if either conversion succeeded
-        if docling_success or filtered_success:
-            session.add(html_convert)
-            session.commit()
-            return True
+        if not reference_doc.exists():  # build lazily; an existing file is reused as-is
+            self._create_reference_doc(reference_doc)
         
-        return False
+        return str(reference_doc)
+    
+    def _create_reference_doc(self, reference_doc: Path):
+        """Create reference document with specified font"""
+        from docx import Document  # imported inside the method rather than at module top
+        doc = Document()  # NOTE(review): pandoc's manual recommends starting from its own default reference.docx rather than a blank document - confirm
+        style = doc.styles['Normal']
+        font = style.font
+        font.name = self.font_name  # NOTE(review): may only affect Latin text; CJK runs are governed by w:eastAsia in rFonts - confirm
+        doc.save(str(reference_doc))
 
-async def main():
+def main():
     # Example: Process a single result with ID 21567
-    result_id = 21567
-    success = await process_single_result(result_id)
+    result_id = 21566
+    
+    # First, process the Docling conversion
+    docling_converter = DoclingConverter()
+    docling_converter.process_conversion_by_id(result_id)
+    
+    # Then, process the Crawl Filter conversion
+    crawl_filter = CrawlFilter()
+    crawl_filter.process_filter_by_id(result_id)
+    
+    # Finally, convert both results to DOCX using Pandoc with custom options
+    pandoc_converter = PandocConverter(font_name="微软雅黑", include_toc=True)
+    success = pandoc_converter.process_single_result(result_id)
     if success:
         logger.info(f"Successfully processed result {result_id}")
     else:
         logger.error(f"Failed to process result {result_id}")
 
 if __name__ == "__main__":
-    asyncio.run(main())
+    main()