# part-time-job / zhang_crawl_bio
							from pathlib import Path
from docling.document_converter import DocumentConverter
from worker.html_convert.converter_base import ConverterBase
from worker.html_convert.models import HtmlConvertResult
from worker.search_engine.search_result_db import SearchResultItem
from sqlmodel import Session
from mylib.logu import get_logger

logger = get_logger('docling_converter')

class DoclingConverter(ConverterBase):
    """Convert saved HTML search results to markdown with docling.

    Directory handling, markdown filtering, URL headers and database lookups
    are done through helpers accessed on ``self`` (``ensure_convert_dir``,
    ``filter_markdown``, ``add_url_header``, ``get_html_convert_result``,
    ``get_search_result_item``, ``db_manager``); they are not defined in this
    class and are expected to come from ``ConverterBase``.
    """

    def __init__(self):
        super().__init__()

    @staticmethod
    def _is_already_converted(html_convert: HtmlConvertResult) -> bool:
        """Return True when a docling markdown file is recorded and still exists on disk."""
        return bool(
            html_convert.is_docling_converted
            and html_convert.docling_md_path
            and Path(html_convert.docling_md_path).exists()
        )

    def process_conversion(self, html_convert: HtmlConvertResult, skip_existing: bool = True) -> HtmlConvertResult:
        """Convert the saved HTML behind ``html_convert`` to markdown using docling.

        The record is updated in place: on success ``docling_md_path`` and
        ``is_docling_converted`` are set; on a conversion error
        ``is_docling_converted`` is set to False. Nothing is persisted here.

        Args:
            html_convert: Conversion record whose ``search_result_item`` points
                at the saved HTML file.
            skip_existing: When True, do nothing if a docling markdown file was
                already produced and still exists.

        Returns:
            The same ``html_convert`` object that was passed in.
        """
        item = html_convert.search_result_item
        if not item:
            logger.warning(f"html_convert id {html_convert.id} has no search_result_item")
            return html_convert

        # Guard before building a Path: Path(None) raises TypeError and
        # Path("") silently resolves to the current directory.
        if not item.save_path:
            logger.warning(f"search_result_item for html_convert id {html_convert.id} has no save_path")
            return html_convert

        save_path = Path(item.save_path)
        if not save_path.exists():
            logger.warning(f"save_path {save_path} not exists")
            return html_convert

        # Skip if already converted
        if skip_existing and self._is_already_converted(html_convert):
            logger.info(f"Skipping already converted content for {html_convert.id}")
            return html_convert

        convert_dir = self.ensure_convert_dir(save_path)

        try:
            # Perform the conversion
            converter = DocumentConverter()
            conversion = converter.convert(save_path)
            markdown_content = conversion.document.export_to_markdown()

            # Apply filtering and add URL header
            markdown_content = self.filter_markdown(markdown_content)
            if item.url:
                markdown_content = self.add_url_header(markdown_content, item.url)

            # Save the converted markdown next to the other conversion outputs
            docling_md_path = convert_dir / f"{save_path.stem}_docling.md"
            docling_md_path.write_text(markdown_content, encoding='utf-8')

            # Update the conversion result
            html_convert.docling_md_path = str(docling_md_path)
            html_convert.is_docling_converted = True

            logger.info(f"Successfully converted HTML to markdown: {docling_md_path}")

        except Exception as e:
            # Best-effort: a failed conversion is recorded on the record
            # instead of aborting the caller.
            logger.error(f"Error converting HTML to markdown: {e}")
            html_convert.is_docling_converted = False

        return html_convert

    def process_conversion_by_id(self, result_id: int, skip_existing: bool = True) -> HtmlConvertResult:
        """Convert the search result identified by ``result_id`` and persist the record.

        An existing conversion record is reused when one is found; otherwise a
        new ``HtmlConvertResult`` is created for the search result item.

        Args:
            result_id: Identifier passed to the lookup helpers.
            skip_existing: When True, return the existing record untouched if
                its docling markdown file is already present.

        Returns:
            The conversion record, refreshed after commit unless it was skipped.

        Raises:
            ValueError: If no conversion record exists and the search result
                item lookup returns None.
        """
        existing = self.get_html_convert_result(result_id)

        if existing:
            if skip_existing and self._is_already_converted(existing):
                logger.info(f"Skipping already converted content for {result_id}")
                return existing
            html_convert = existing
        else:
            result_item_model = self.get_search_result_item(result_id)
            # NOTE(review): assumes the lookup signals "not found" with None;
            # fail with a clear message rather than an AttributeError on `.id`.
            if result_item_model is None:
                raise ValueError(f"No search result item found for id {result_id}")
            html_convert = HtmlConvertResult(
                search_result_item_id=result_item_model.id,
                search_result_item=result_item_model
            )

        result = self.process_conversion(html_convert, skip_existing)

        with Session(self.db_manager.engine) as session:
            session.add(result)
            session.commit()
            # Reload attributes so they stay readable after the session closes.
            session.refresh(result)
        return result

def main():
    """Run a one-off docling conversion for a single hard-coded result ID."""
    # Example: process a single search result by its ID
    result_id = 21567
    converter = DoclingConverter()
    result = converter.process_conversion_by_id(result_id)
    # Only report success when the record is actually flagged as converted;
    # conversion errors are swallowed inside the converter.
    if result is not None and result.is_docling_converted:
        logger.info(f"Successfully processed result {result_id}")
    else:
        logger.warning(f"Result {result_id} was not converted")

# Run the example conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()