|
@@ -12,85 +12,137 @@ from mylib.base import ensure_output_dir, save_to_file, load_from_pickle
|
|
|
from mylib.logu import logger
|
|
from mylib.logu import logger
|
|
|
from config.settings import PANDOC_EXE
|
|
from config.settings import PANDOC_EXE
|
|
|
from worker.html_convert.models import HtmlConvertResult
|
|
from worker.html_convert.models import HtmlConvertResult
|
|
|
|
|
+from worker.html_convert.docling_converter import DoclingConverter
|
|
|
|
|
+from worker.html_convert.crawl_filter import CrawlFilter
|
|
|
import subprocess
|
|
import subprocess
|
|
|
|
|
|
|
|
-async def convert_md_to_docx(md_path: Path, output_path: Path) -> bool:
|
|
|
|
|
- """Convert markdown file to docx using pandoc"""
|
|
|
|
|
- try:
|
|
|
|
|
- cmd = [
|
|
|
|
|
- PANDOC_EXE,
|
|
|
|
|
- '-f', 'markdown',
|
|
|
|
|
- '-t', 'docx',
|
|
|
|
|
- '-o', str(output_path),
|
|
|
|
|
- str(md_path)
|
|
|
|
|
- ]
|
|
|
|
|
- result = subprocess.run(cmd, check=True)
|
|
|
|
|
- return result.returncode == 0
|
|
|
|
|
- except Exception as e:
|
|
|
|
|
- logger.error(f"Error converting {md_path} to docx: {e}")
|
|
|
|
|
- return False
|
|
|
|
|
-
|
|
|
|
|
-async def process_single_result(result_id: int) -> bool:
|
|
|
|
|
- """Process a single SearchResultItem and convert both filtered and docling markdown files to docx"""
|
|
|
|
|
- db_manager = SearchResultManager()
|
|
|
|
|
|
|
+class PandocConverter:
|
|
|
|
|
+ """Class for handling Pandoc conversions with customizable options"""
|
|
|
|
|
|
|
|
- with Session(db_manager.engine) as session:
|
|
|
|
|
- # Get the search result item
|
|
|
|
|
- result_item = session.get(SearchResultItem, result_id)
|
|
|
|
|
- if not result_item:
|
|
|
|
|
- logger.error(f"SearchResultItem with ID {result_id} not found")
|
|
|
|
|
- return False
|
|
|
|
|
|
|
+ def __init__(self, font_name: str = "宋体", include_toc: bool = False):
|
|
|
|
|
+ """
|
|
|
|
|
+ Initialize PandocConverter with optional parameters
|
|
|
|
|
|
|
|
- # Get the HTML convert result
|
|
|
|
|
- html_convert = session.exec(
|
|
|
|
|
- select(HtmlConvertResult)
|
|
|
|
|
- .where(HtmlConvertResult.search_result_item_id == result_id)
|
|
|
|
|
- ).first()
|
|
|
|
|
-
|
|
|
|
|
- if not html_convert:
|
|
|
|
|
- logger.error(f"No HtmlConvertResult found for SearchResultItem {result_id}")
|
|
|
|
|
|
|
+ Args:
|
|
|
|
|
+ font_name (str): The default font to use in DOCX output
|
|
|
|
|
+ include_toc (bool): Whether to include table of contents in DOCX output
|
|
|
|
|
+ """
|
|
|
|
|
+ self.font_name = font_name
|
|
|
|
|
+ self.include_toc = include_toc
|
|
|
|
|
+
|
|
|
|
|
+ def convert_md_to_docx(self, md_path: Path, output_path: Path) -> bool:
|
|
|
|
|
+ """Convert markdown file to docx using pandoc with custom options"""
|
|
|
|
|
+ try:
|
|
|
|
|
+ cmd = [
|
|
|
|
|
+ PANDOC_EXE,
|
|
|
|
|
+ '-f', 'markdown',
|
|
|
|
|
+ '-t', 'docx',
|
|
|
|
|
+ '--reference-doc', self._get_reference_doc(),
|
|
|
|
|
+ '-o', str(output_path),
|
|
|
|
|
+ str(md_path)
|
|
|
|
|
+ ]
|
|
|
|
|
+
|
|
|
|
|
+ if self.include_toc:
|
|
|
|
|
+ cmd.insert(-1, '--toc')
|
|
|
|
|
+
|
|
|
|
|
+ result = subprocess.run(cmd, check=True)
|
|
|
|
|
+ return result.returncode == 0
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ logger.error(f"Error converting {md_path} to docx: {e}")
|
|
|
return False
|
|
return False
|
|
|
|
|
+
|
|
|
|
|
+ def process_single_result(self, result_id: int) -> bool:
|
|
|
|
|
+ """Process a single SearchResultItem and convert both filtered and docling markdown files to docx"""
|
|
|
|
|
+ db_manager = SearchResultManager()
|
|
|
|
|
|
|
|
- # Initialize success flags
|
|
|
|
|
- docling_success = False
|
|
|
|
|
- filtered_success = False
|
|
|
|
|
-
|
|
|
|
|
- # Convert docling markdown if available
|
|
|
|
|
- if html_convert.docling_md_path:
|
|
|
|
|
- docling_md_path = Path(html_convert.docling_md_path)
|
|
|
|
|
- docling_docx_path = docling_md_path.with_suffix('.docx')
|
|
|
|
|
- docling_success = await convert_md_to_docx(docling_md_path, docling_docx_path)
|
|
|
|
|
- if docling_success:
|
|
|
|
|
- html_convert.pandoc_docx_path = str(docling_docx_path)
|
|
|
|
|
- html_convert.is_pandoc_converted = True
|
|
|
|
|
- logger.info(f"Successfully converted docling markdown to {docling_docx_path}")
|
|
|
|
|
-
|
|
|
|
|
- # Convert filtered markdown if available
|
|
|
|
|
- if html_convert.filter_crawl_md_path:
|
|
|
|
|
- filtered_md_path = Path(html_convert.filter_crawl_md_path)
|
|
|
|
|
- filtered_docx_path = filtered_md_path.with_suffix('.docx')
|
|
|
|
|
- filtered_success = await convert_md_to_docx(filtered_md_path, filtered_docx_path)
|
|
|
|
|
- if filtered_success:
|
|
|
|
|
- html_convert.pandoc_docx_path = str(filtered_docx_path)
|
|
|
|
|
- html_convert.is_pandoc_converted = True
|
|
|
|
|
- logger.info(f"Successfully converted filtered markdown to {filtered_docx_path}")
|
|
|
|
|
|
|
+ with Session(db_manager.engine) as session:
|
|
|
|
|
+ # Get the search result item
|
|
|
|
|
+ result_item = session.get(SearchResultItem, result_id)
|
|
|
|
|
+ if not result_item:
|
|
|
|
|
+ logger.error(f"SearchResultItem with ID {result_id} not found")
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ # Get the HTML convert result
|
|
|
|
|
+ html_convert = session.exec(
|
|
|
|
|
+ select(HtmlConvertResult)
|
|
|
|
|
+ .where(HtmlConvertResult.search_result_item_id == result_id)
|
|
|
|
|
+ ).first()
|
|
|
|
|
+
|
|
|
|
|
+ if not html_convert:
|
|
|
|
|
+ logger.error(f"No HtmlConvertResult found for SearchResultItem {result_id}")
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ # Initialize success flags
|
|
|
|
|
+ docling_success = False
|
|
|
|
|
+ filtered_success = False
|
|
|
|
|
+
|
|
|
|
|
+ # Convert docling markdown if available
|
|
|
|
|
+ if html_convert.docling_md_path:
|
|
|
|
|
+ docling_md_path = Path(html_convert.docling_md_path)
|
|
|
|
|
+ docling_docx_path = docling_md_path.with_suffix('.docx')
|
|
|
|
|
+ docling_success = self.convert_md_to_docx(docling_md_path, docling_docx_path)
|
|
|
|
|
+ if docling_success:
|
|
|
|
|
+ html_convert.pandoc_docx_path = str(docling_docx_path)
|
|
|
|
|
+ html_convert.is_pandoc_converted = True
|
|
|
|
|
+ logger.info(f"Successfully converted docling markdown to {docling_docx_path}")
|
|
|
|
|
+
|
|
|
|
|
+ # Convert filtered markdown if available
|
|
|
|
|
+ if html_convert.filter_crawl_md_path:
|
|
|
|
|
+ filtered_md_path = Path(html_convert.filter_crawl_md_path)
|
|
|
|
|
+ filtered_docx_path = filtered_md_path.with_suffix('.docx')
|
|
|
|
|
+ filtered_success = self.convert_md_to_docx(filtered_md_path, filtered_docx_path)
|
|
|
|
|
+ if filtered_success:
|
|
|
|
|
+ html_convert.pandoc_docx_path = str(filtered_docx_path)
|
|
|
|
|
+ html_convert.is_pandoc_converted = True
|
|
|
|
|
+ logger.info(f"Successfully converted filtered markdown to {filtered_docx_path}")
|
|
|
|
|
+
|
|
|
|
|
+ # Update database if either conversion succeeded
|
|
|
|
|
+ if docling_success or filtered_success:
|
|
|
|
|
+ session.add(html_convert)
|
|
|
|
|
+ session.commit()
|
|
|
|
|
+ return True
|
|
|
|
|
+
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ def _get_reference_doc(self) -> str:
|
|
|
|
|
+ """Get path to reference document with specified font"""
|
|
|
|
|
+ reference_dir = Path(__file__).parent / "reference_docs"
|
|
|
|
|
+ reference_dir.mkdir(exist_ok=True)
|
|
|
|
|
+ reference_doc = reference_dir / f"{self.font_name.replace(' ', '_')}.docx"
|
|
|
|
|
|
|
|
- # Update database if either conversion succeeded
|
|
|
|
|
- if docling_success or filtered_success:
|
|
|
|
|
- session.add(html_convert)
|
|
|
|
|
- session.commit()
|
|
|
|
|
- return True
|
|
|
|
|
|
|
+ if not reference_doc.exists():
|
|
|
|
|
+ self._create_reference_doc(reference_doc)
|
|
|
|
|
|
|
|
- return False
|
|
|
|
|
|
|
+ return str(reference_doc)
|
|
|
|
|
+
|
|
|
|
|
+ def _create_reference_doc(self, reference_doc: Path):
|
|
|
|
|
+ """Create reference document with specified font"""
|
|
|
|
|
+ from docx import Document
|
|
|
|
|
+ doc = Document()
|
|
|
|
|
+ style = doc.styles['Normal']
|
|
|
|
|
+ font = style.font
|
|
|
|
|
+ font.name = self.font_name
|
|
|
|
|
+ doc.save(str(reference_doc))
|
|
|
|
|
|
|
|
-async def main():
|
|
|
|
|
|
|
+def main():
|
|
|
# Example: Process a single result with ID 21567
|
|
# Example: Process a single result with ID 21567
|
|
|
- result_id = 21567
|
|
|
|
|
- success = await process_single_result(result_id)
|
|
|
|
|
|
|
+ result_id = 21566
|
|
|
|
|
+
|
|
|
|
|
+ # First, process the Docling conversion
|
|
|
|
|
+ docling_converter = DoclingConverter()
|
|
|
|
|
+ docling_converter.process_conversion_by_id(result_id)
|
|
|
|
|
+
|
|
|
|
|
+ # Then, process the Crawl Filter conversion
|
|
|
|
|
+ crawl_filter = CrawlFilter()
|
|
|
|
|
+ crawl_filter.process_filter_by_id(result_id)
|
|
|
|
|
+
|
|
|
|
|
+ # Finally, convert both results to DOCX using Pandoc with custom options
|
|
|
|
|
+ pandoc_converter = PandocConverter(font_name="微软雅黑", include_toc=True)
|
|
|
|
|
+ success = pandoc_converter.process_single_result(result_id)
|
|
|
if success:
|
|
if success:
|
|
|
logger.info(f"Successfully processed result {result_id}")
|
|
logger.info(f"Successfully processed result {result_id}")
|
|
|
else:
|
|
else:
|
|
|
logger.error(f"Failed to process result {result_id}")
|
|
logger.error(f"Failed to process result {result_id}")
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
if __name__ == "__main__":
|
|
|
- asyncio.run(main())
|
|
|
|
|
|
|
+ main()
|