| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697 |
- from pathlib import Path
- from docling.document_converter import DocumentConverter
- from worker.html_convert.converter_base import ConverterBase
- from worker.html_convert.models import HtmlConvertResult
- from worker.search_engine.search_result_db import SearchResultItem
- from sqlmodel import Session
- from mylib.logu import get_logger
- logger = get_logger('docling_converter')
class DoclingConverter(ConverterBase):
    """Converter that turns saved HTML search results into markdown via Docling."""

    def __init__(self):
        super().__init__()

    def process_conversion(self, html_convert: HtmlConvertResult, skip_existing: bool = True) -> HtmlConvertResult:
        """Convert the saved file behind ``html_convert`` to a markdown file.

        The record is returned untouched when it has no linked search result
        item, when the item's saved file is missing on disk, or when
        ``skip_existing`` is true and a previous Docling output still exists.

        Otherwise the file is converted with ``DocumentConverter``, passed
        through ``self.filter_markdown`` and (when the item has a URL)
        ``self.add_url_header``, and written as ``<stem>_docling.md`` inside
        the directory returned by ``self.ensure_convert_dir``.

        Any exception raised during conversion or writing is logged and
        recorded as ``is_docling_converted = False``; it is not re-raised.

        Args:
            html_convert: Conversion record to update in place.
            skip_existing: Reuse an existing output file instead of converting again.

        Returns:
            The same ``html_convert`` object that was passed in.
        """
        item = html_convert.search_result_item
        if not item:
            logger.warning(f"html_convert id {html_convert.id} has no search_result_item")
            return html_convert

        save_path = Path(item.save_path)
        if not save_path.exists():
            logger.warning(f"save_path {save_path} not exists")
            return html_convert

        # Nothing to do when an earlier run's output is still on disk.
        if skip_existing and html_convert.is_docling_converted:
            previous_md = html_convert.docling_md_path
            if previous_md and Path(previous_md).exists():
                logger.info(f"Skipping already converted content for {html_convert.id}")
                return html_convert

        output_dir = self.ensure_convert_dir(save_path)

        try:
            # Run Docling on the saved file and export the document as markdown.
            conversion = DocumentConverter().convert(save_path)
            markdown = self.filter_markdown(conversion.document.export_to_markdown())

            source_url = html_convert.search_result_item.url
            if source_url:
                markdown = self.add_url_header(markdown, source_url)

            # Write the markdown next to the other conversion artifacts.
            docling_md_path = output_dir / f"{save_path.stem}_docling.md"
            with open(docling_md_path, 'w', encoding='utf-8') as md_file:
                md_file.write(markdown)

            # Record where the output lives and flag the record as converted.
            html_convert.docling_md_path = str(docling_md_path)
            html_convert.is_docling_converted = True

            logger.info(f"Successfully converted HTML to markdown: {docling_md_path}")
        except Exception as e:
            logger.error(f"Error converting HTML to markdown: {e}")
            html_convert.is_docling_converted = False

        return html_convert

    def process_conversion_by_id(self, result_id: int, skip_existing: bool = True) -> HtmlConvertResult:
        """Run the Docling conversion for the search result with ``result_id``.

        A stored conversion record is looked up with
        ``self.get_html_convert_result``. If one exists and its output can be
        reused, it is returned as is without touching the database. If no
        record exists, a new ``HtmlConvertResult`` is built from
        ``self.get_search_result_item``.

        The record produced by ``process_conversion`` is then added,
        committed and refreshed in a new database session.

        Args:
            result_id: ID handed to the lookup helpers.
            skip_existing: Reuse an existing output file instead of converting again.

        Returns:
            The conversion record, either reused or freshly processed.
        """
        stored = self.get_html_convert_result(result_id)

        if stored:
            if stored.is_docling_converted and skip_existing:
                previous_md = stored.docling_md_path
                if previous_md and Path(previous_md).exists():
                    logger.info(f"Skipping already converted content for {result_id}")
                    return stored
            converted = self.process_conversion(stored, skip_existing)
        else:
            item = self.get_search_result_item(result_id)
            fresh_record = HtmlConvertResult(
                search_result_item_id=item.id,
                search_result_item=item,
            )
            converted = self.process_conversion(fresh_record, skip_existing)

        if converted:
            # Persist the new or updated record, then reload it from the database.
            with Session(self.db_manager.engine) as session:
                session.add(converted)
                session.commit()
                session.refresh(converted)
        return converted
def main(result_id: int = 21567) -> None:
    """Run the Docling conversion for one search result and log the outcome.

    Args:
        result_id: ID passed to ``DoclingConverter.process_conversion_by_id``.
            Defaults to the example ID this script used before it became a
            parameter.
    """
    converter = DoclingConverter()
    result = converter.process_conversion_by_id(result_id)

    # process_conversion logs and swallows conversion errors, so the flag on
    # the returned record is the only signal of whether the conversion worked.
    if result is not None and result.is_docling_converted:
        logger.info(f"Successfully processed result {result_id}")
    else:
        logger.error(f"Failed to convert result {result_id}")


if __name__ == "__main__":
    main()
|