# docling_converter.py
  1. from pathlib import Path
  2. from docling.document_converter import DocumentConverter
  3. from worker.html_convert.converter_base import ConverterBase
  4. from worker.html_convert.models import HtmlConvertResult
  5. from worker.search_engine.search_result_db import SearchResultItem
  6. from sqlmodel import Session
  7. from mylib.logu import get_logger
  8. logger = get_logger('docling_converter')
  9. class DoclingConverter(ConverterBase):
  10. """Class for handling Docling conversions"""
  11. def __init__(self):
  12. super().__init__()
  13. def process_conversion(self, html_convert: HtmlConvertResult, skip_existing: bool = True) -> HtmlConvertResult:
  14. """Process HTML to markdown conversion using docling"""
  15. if not html_convert.search_result_item:
  16. logger.warning(f"html_convert id {html_convert.id} has no search_result_item")
  17. return html_convert
  18. save_path = Path(html_convert.search_result_item.save_path)
  19. if not save_path.exists():
  20. logger.warning(f"save_path {save_path} not exists")
  21. return html_convert
  22. # Skip if already converted
  23. if skip_existing and html_convert.is_docling_converted and html_convert.docling_md_path and Path(html_convert.docling_md_path).exists():
  24. logger.info(f"Skipping already converted content for {html_convert.id}")
  25. return html_convert
  26. convert_dir = self.ensure_convert_dir(save_path)
  27. try:
  28. # Perform the conversion
  29. converter = DocumentConverter()
  30. result = converter.convert(save_path)
  31. markdown_content = result.document.export_to_markdown()
  32. # Apply filtering and add URL header
  33. markdown_content = self.filter_markdown(markdown_content)
  34. if html_convert.search_result_item.url:
  35. markdown_content = self.add_url_header(markdown_content, html_convert.search_result_item.url)
  36. # Save the converted markdown
  37. docling_md_path = convert_dir / f"{save_path.stem}_docling.md"
  38. with open(docling_md_path, 'w', encoding='utf-8') as f:
  39. f.write(markdown_content)
  40. # Update the conversion result
  41. html_convert.docling_md_path = str(docling_md_path)
  42. html_convert.is_docling_converted = True
  43. logger.info(f"Successfully converted HTML to markdown: {docling_md_path}")
  44. except Exception as e:
  45. logger.error(f"Error converting HTML to markdown: {e}")
  46. html_convert.is_docling_converted = False
  47. return html_convert
  48. def process_conversion_by_id(self, result_id: int, skip_existing: bool = True) -> HtmlConvertResult:
  49. """Process conversion for a specific result ID"""
  50. existing_html_convert = self.get_html_convert_result(result_id)
  51. result = None
  52. if existing_html_convert:
  53. if existing_html_convert.is_docling_converted and skip_existing and existing_html_convert.docling_md_path and Path(existing_html_convert.docling_md_path).exists():
  54. logger.info(f"Skipping already converted content for {result_id}")
  55. return existing_html_convert
  56. else:
  57. result = self.process_conversion(existing_html_convert, skip_existing)
  58. else:
  59. result_item_model = self.get_search_result_item(result_id)
  60. html_convert = HtmlConvertResult(
  61. search_result_item_id=result_item_model.id,
  62. search_result_item=result_item_model
  63. )
  64. result = self.process_conversion(html_convert, skip_existing)
  65. if result:
  66. with Session(self.db_manager.engine) as session:
  67. session.add(result)
  68. session.commit()
  69. session.refresh(result)
  70. return result
  71. def main():
  72. # Example: Process a single result with ID 21566
  73. result_id = 21567
  74. converter = DoclingConverter()
  75. converter.process_conversion_by_id(result_id)
  76. logger.info(f"Successfully processed result {result_id}")
  77. if __name__ == "__main__":
  78. main()