from datetime import datetime
from pathlib import Path
from typing import Optional

from sqlalchemy.orm import relationship
from sqlmodel import Field, Relationship, SQLModel

from config.settings import DB_URL
from worker.search_engine.search_result_db import SearchResultItem


def _as_path(value: Optional[str]) -> Optional[Path]:
    """Return ``Path(value)``, or ``None`` when *value* is ``None`` or empty."""
    return Path(value) if value else None


class HtmlConvertResult(SQLModel, table=True):
    """Data model that stores the results of converting a crawled HTML page.

    Each row references one ``SearchResultItem`` through
    ``search_result_item_id`` and records where the derived markdown/docx
    files were written, which conversion steps have run, and optional
    quality scores for each converter.

    The ``get_*_path`` helpers return the stored string paths as
    ``pathlib.Path`` objects, or ``None`` when the corresponding value is
    unset or empty.
    """

    id: Optional[int] = Field(default=None, primary_key=True)
    search_result_item_id: int = Field(foreign_key="searchresultitem.id", index=True)

    # Path of the markdown file generated by crawl_multi.
    source_crawl_md_path: Optional[str] = None

    # Filtered output: path of the markdown file after filtering.
    filter_crawl_md_path: Optional[str] = None

    # Conversion output paths.
    docling_md_path: Optional[str] = None  # markdown produced by docling
    pandoc_md_path: Optional[str] = None  # markdown produced by pandoc
    pandoc_docx_path: Optional[str] = None  # docx produced by pandoc

    # Metadata.
    # NOTE(review): both timestamps are filled by ``default_factory`` when the
    # object is created; nothing in this class refreshes ``updated_at`` on
    # later updates, so callers must set it themselves.
    created_at: datetime = Field(default_factory=datetime.now)
    updated_at: datetime = Field(default_factory=datetime.now)

    # Conversion status flags.
    is_docling_converted: bool = Field(default=False)
    is_pandoc_converted: bool = Field(default=False)
    is_filtered: bool = Field(default=False)

    # Conversion quality evaluation.
    docling_quality_score: Optional[float] = None  # quality score of the docling output
    pandoc_quality_score: Optional[float] = None  # quality score of the pandoc output

    # Relationship to the owning SearchResultItem (configured with lazy="joined").
    search_result_item: Optional[SearchResultItem] = Relationship(
        sa_relationship=relationship("SearchResultItem", lazy="joined")
    )

    def get_html_path(self) -> Optional[Path]:
        """Return the HTML file path of the related search result item.

        Returns:
            The related item's ``html_path`` as a ``Path``, or ``None`` when
            there is no related item or its ``html_path`` is unset/empty.
        """
        item = self.search_result_item
        # The relationship is optional; guard against a missing item instead
        # of raising AttributeError on ``None.html_path``.
        if item is None:
            return None
        return _as_path(item.html_path)

    def get_source_md_path(self) -> Optional[Path]:
        """Return the original (crawl_multi) markdown path, or ``None`` if unset."""
        return _as_path(self.source_crawl_md_path)

    def get_filtered_md_path(self) -> Optional[Path]:
        """Return the filtered markdown path, or ``None`` if unset."""
        return _as_path(self.filter_crawl_md_path)

    def get_docling_md_path(self) -> Optional[Path]:
        """Return the docling-converted markdown path, or ``None`` if unset."""
        return _as_path(self.docling_md_path)

    def get_pandoc_md_path(self) -> Optional[Path]:
        """Return the pandoc-converted markdown path, or ``None`` if unset."""
        return _as_path(self.pandoc_md_path)

    def get_pandoc_docx_path(self) -> Optional[Path]:
        """Return the pandoc-converted docx path, or ``None`` if unset."""
        return _as_path(self.pandoc_docx_path)