|
|
@@ -8,6 +8,89 @@ import ssl
|
|
|
from sqlmodel import select, Session
|
|
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, CrawlResult
|
|
|
from worker.search_engine.search_result_db import SearchResultManager, KeywordTask, SearchPageResult, SearchResultItem
|
|
|
-from mylib.base import ensure_output_dir, save_to_file,load_from_pickle
|
|
|
+from mylib.base import ensure_output_dir, save_to_file, load_from_pickle
|
|
|
from mylib.logu import logger
|
|
|
-from config.settings import PROXY_POOL_BASE_URL
|
|
|
+from config.settings import PANDOC_EXE
|
|
|
+from worker.html_convert.models import HtmlConvertResult
|
|
|
+import subprocess
|
|
|
+
|
|
|
async def convert_md_to_docx(md_path: Path, output_path: Path) -> bool:
    """Convert a markdown file to docx using pandoc.

    The pandoc executable configured as ``PANDOC_EXE`` is invoked with
    ``-f markdown -t docx``. The call runs in a worker thread so the event
    loop is not blocked while pandoc is working.

    Args:
        md_path: Markdown file passed to pandoc as its input.
        output_path: Destination passed to pandoc's ``-o`` option.

    Returns:
        True when pandoc exits with status 0. False when pandoc exits with
        a non-zero status or cannot be started; the failure is logged
        rather than raised.
    """
    # Function-scope import so this block does not depend on a file-level
    # import that may live outside the visible part of the module.
    import asyncio

    cmd = [
        PANDOC_EXE,
        '-f', 'markdown',
        '-t', 'docx',
        '-o', str(output_path),
        str(md_path),
    ]
    try:
        # subprocess.run blocks until pandoc exits; hand it to the default
        # executor so other coroutines keep running in the meantime.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(
            None, lambda: subprocess.run(cmd, check=True)
        )
    except Exception as e:
        # Deliberate catch-all: callers rely on a boolean result, so a
        # non-zero exit (CalledProcessError) or a missing executable
        # (OSError) is reported as False instead of propagating.
        logger.error(f"Error converting {md_path} to docx: {e}")
        return False
    return result.returncode == 0
|
|
|
+
|
|
|
async def process_single_result(result_id: int) -> bool:
    """Convert the markdown files recorded for one SearchResultItem to docx.

    Loads the SearchResultItem with the given ID and the HtmlConvertResult
    row that references it, then runs ``convert_md_to_docx`` on the docling
    markdown and on the filtered markdown, in that order, for whichever
    paths are set. The row is committed when at least one conversion
    succeeded.

    Args:
        result_id: ID of the SearchResultItem to process.

    Returns:
        True if at least one conversion succeeded and the row was
        committed. False if the item or its HtmlConvertResult row is
        missing, or if no conversion succeeded.
    """
    manager = SearchResultManager()

    # (label used in the log message, attribute holding the markdown path),
    # listed in processing order.
    markdown_sources = (
        ("docling", "docling_md_path"),
        ("filtered", "filter_crawl_md_path"),
    )

    with Session(manager.engine) as session:
        # The item itself is only needed as an existence check.
        if not session.get(SearchResultItem, result_id):
            logger.error(f"SearchResultItem with ID {result_id} not found")
            return False

        query = select(HtmlConvertResult).where(
            HtmlConvertResult.search_result_item_id == result_id
        )
        convert_row = session.exec(query).first()
        if not convert_row:
            logger.error(f"No HtmlConvertResult found for SearchResultItem {result_id}")
            return False

        converted_any = False
        for label, path_attr in markdown_sources:
            md_location = getattr(convert_row, path_attr)
            if not md_location:
                continue

            source_md = Path(md_location)
            target_docx = source_md.with_suffix('.docx')
            if not await convert_md_to_docx(source_md, target_docx):
                continue

            # NOTE(review): both sources write the same pandoc_docx_path
            # field, so when both conversions succeed the filtered path is
            # the one that ends up stored.
            convert_row.pandoc_docx_path = str(target_docx)
            convert_row.is_pandoc_converted = True
            logger.info(f"Successfully converted {label} markdown to {target_docx}")
            converted_any = True

        # Nothing was converted, so there is nothing to persist.
        if not converted_any:
            return False

        session.add(convert_row)
        session.commit()
        return True
|
|
|
+
|
|
|
async def main(result_id: int = 21567) -> None:
    """Process one SearchResultItem and log whether it succeeded.

    Args:
        result_id: ID of the SearchResultItem whose markdown files should
            be converted. Defaults to 21567, the example ID this script
            previously had hard-coded.
    """
    success = await process_single_result(result_id)
    if success:
        logger.info(f"Successfully processed result {result_id}")
    else:
        logger.error(f"Failed to process result {result_id}")


if __name__ == "__main__":
    # Script entry point: run the example conversion with the default ID.
    asyncio.run(main())
|