# models.py
from datetime import datetime
from pathlib import Path
from typing import Optional

from sqlalchemy.orm import relationship
from sqlmodel import Field, Relationship, SQLModel

from config.settings import DB_URL
from worker.search_engine.search_result_db import SearchResultItem
  8. class HtmlConvertResult(SQLModel, table=True):
  9. """存储HTML转换结果的数据模型"""
  10. id: Optional[int] = Field(default=None, primary_key=True)
  11. search_result_item_id: int = Field(foreign_key="searchresultitem.id", index=True)
  12. source_crawl_md_path: Optional[str] = None # 从crawl_multi生成的markdown文件路径
  13. # 过滤后的文件路径
  14. filter_crawl_md_path: Optional[str] = None # 过滤后的markdown文件路径
  15. # 转换结果路径
  16. docling_md_path: Optional[str] = None # 使用docling转换的markdown文件路径
  17. pandoc_md_path: Optional[str] = None # 使用pandoc转换的markdown文件路径
  18. pandoc_docx_path: Optional[str] = None # 使用pandoc转换的docx文件路径
  19. # 元数据
  20. created_at: datetime = Field(default_factory=datetime.now)
  21. updated_at: datetime = Field(default_factory=datetime.now)
  22. # 转换状态
  23. is_docling_converted: bool = Field(default=False)
  24. is_pandoc_converted: bool = Field(default=False)
  25. is_filtered: bool = Field(default=False)
  26. # 转换结果评估
  27. docling_quality_score: Optional[float] = None # docling转换质量评分
  28. pandoc_quality_score: Optional[float] = None # pandoc转换质量评分
  29. # 添加与SearchResultItem的关系
  30. search_result_item: Optional[SearchResultItem] = Relationship(
  31. sa_relationship=relationship("SearchResultItem", lazy="joined")
  32. )
  33. def get_html_path(self) -> Path:
  34. """获取HTML文件路径"""
  35. return Path(self.search_result_item.html_path) if self.search_result_item.html_path else None
  36. def get_source_md_path(self) -> Path:
  37. """获取原始markdown文件路径"""
  38. return Path(self.source_crawl_md_path) if self.source_crawl_md_path else None
  39. def get_filtered_md_path(self) -> Path:
  40. """获取过滤后的markdown文件路径"""
  41. return Path(self.filter_crawl_md_path) if self.filter_crawl_md_path else None
  42. def get_docling_md_path(self) -> Path:
  43. """获取docling转换的markdown文件路径"""
  44. return Path(self.docling_md_path) if self.docling_md_path else None
  45. def get_pandoc_md_path(self) -> Path:
  46. """获取pandoc转换的markdown文件路径"""
  47. return Path(self.pandoc_md_path) if self.pandoc_md_path else None
  48. def get_pandoc_docx_path(self) -> Path:
  49. """获取pandoc转换的docx文件路径"""
  50. return Path(self.pandoc_docx_path) if self.pandoc_docx_path else None