| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364 |
- from typing import Optional
- from datetime import datetime
- from sqlmodel import SQLModel, Field, Relationship
- from sqlalchemy.orm import relationship
- from pathlib import Path
- from config.settings import DB_URL
- from worker.search_engine.search_result_db import SearchResultItem
def _optional_path(value: Optional[str]) -> Optional[Path]:
    """Return ``Path(value)`` for a non-empty string, otherwise ``None``."""
    return Path(value) if value else None


class HtmlConvertResult(SQLModel, table=True):
    """Data model that stores the results of converting a crawled HTML page.

    Each row references one ``SearchResultItem`` through
    ``search_result_item_id`` and records where the markdown/docx artefacts
    were written, which conversion steps have been completed, and optional
    quality scores for each converter.

    All ``get_*_path`` helpers return a ``pathlib.Path`` when the matching
    string column holds a non-empty value and ``None`` otherwise.
    """

    id: Optional[int] = Field(default=None, primary_key=True)
    search_result_item_id: int = Field(foreign_key="searchresultitem.id", index=True)
    # Path of the markdown file generated by crawl_multi.
    source_crawl_md_path: Optional[str] = None

    # Filtered file path: the markdown file after filtering.
    filter_crawl_md_path: Optional[str] = None

    # Conversion output paths.
    docling_md_path: Optional[str] = None  # markdown produced by docling
    pandoc_md_path: Optional[str] = None  # markdown produced by pandoc
    pandoc_docx_path: Optional[str] = None  # docx produced by pandoc

    # Metadata. Both timestamps default to ``datetime.now()`` when the object
    # is created; nothing in this class refreshes ``updated_at`` afterwards.
    created_at: datetime = Field(default_factory=datetime.now)
    updated_at: datetime = Field(default_factory=datetime.now)

    # Conversion status flags.
    is_docling_converted: bool = Field(default=False)
    is_pandoc_converted: bool = Field(default=False)
    is_filtered: bool = Field(default=False)

    # Conversion quality evaluation.
    docling_quality_score: Optional[float] = None  # docling quality score
    pandoc_quality_score: Optional[float] = None  # pandoc quality score

    # Relationship to the owning SearchResultItem (configured with
    # lazy="joined").
    search_result_item: Optional[SearchResultItem] = Relationship(
        sa_relationship=relationship("SearchResultItem", lazy="joined")
    )

    def get_html_path(self) -> Optional[Path]:
        """Return the HTML file path of the related search result item.

        Returns:
            ``Path`` built from ``search_result_item.html_path``, or ``None``
            when the relationship is not populated or the path is empty.
        """
        item = self.search_result_item
        # The relationship is Optional; guard against it being unset instead
        # of raising AttributeError on ``None``.
        if item is None:
            return None
        return _optional_path(item.html_path)

    def get_source_md_path(self) -> Optional[Path]:
        """Return the original (source) markdown file path, or ``None``."""
        return _optional_path(self.source_crawl_md_path)

    def get_filtered_md_path(self) -> Optional[Path]:
        """Return the filtered markdown file path, or ``None``."""
        return _optional_path(self.filter_crawl_md_path)

    def get_docling_md_path(self) -> Optional[Path]:
        """Return the docling-converted markdown file path, or ``None``."""
        return _optional_path(self.docling_md_path)

    def get_pandoc_md_path(self) -> Optional[Path]:
        """Return the pandoc-converted markdown file path, or ``None``."""
        return _optional_path(self.pandoc_md_path)

    def get_pandoc_docx_path(self) -> Optional[Path]:
        """Return the pandoc-converted docx file path, or ``None``."""
        return _optional_path(self.pandoc_docx_path)
|