html convert: stop converting filter_md to docx because resource loading is very slow

mrh 9 months ago
parent
commit
62c5238036

+ 6 - 6
worker/html_convert/crawl_filter.py

@@ -19,13 +19,13 @@ class CrawlFilter(ConverterBase):
             logger.warning(f"html_convert id {html_convert.id} has no search_result_item")
             return html_convert
             
-        html_path = Path(html_convert.search_result_item.html_path)
-        if not html_path.exists():
-            logger.warning(f"html_path {html_path} not exists")
+        save_path = Path(html_convert.search_result_item.save_path)
+        if not save_path.exists():
+            logger.warning(f"save_path {save_path} not exists")
             return html_convert
         
-        convert_dir = self.ensure_convert_dir(html_path)
-        md_path = html_path.parent / f"{html_path.stem}.md"
+        convert_dir = self.ensure_convert_dir(save_path)
+        md_path = save_path.parent / f"{save_path.stem}.md"
         logger.info(f"save dir {convert_dir}")
         if md_path.exists():
             html_convert.source_crawl_md_path = str(md_path)
@@ -42,7 +42,7 @@ class CrawlFilter(ConverterBase):
                 if html_convert.search_result_item.url:
                     filtered_content = self.add_url_header(filtered_content, html_convert.search_result_item.url)
                 
-                filtered_md_path = convert_dir / f"{html_path.stem}_filtered.md"
+                filtered_md_path = convert_dir / f"{save_path.stem}_filtered.md"
                 with open(filtered_md_path, 'w', encoding='utf-8') as f_out:
                     f_out.write(filtered_content)
                 

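The only functional change in crawl_filter.py is the field rename on `SearchResultItem` (`html_path` → `save_path`); the filtering logic is untouched. For orientation, here is a minimal sketch of what the two `ConverterBase` helpers called above plausibly do — the real base class is not part of this commit, so the directory layout and header format are assumptions inferred from the call sites:

```python
from pathlib import Path


class ConverterBase:
    """Hypothetical sketch of the helpers CrawlFilter relies on;
    the real ConverterBase is not shown in this commit."""

    def ensure_convert_dir(self, save_path: Path) -> Path:
        # Assumption: converted outputs live in a "convert" directory
        # next to the crawled file.
        convert_dir = save_path.parent / "convert"
        convert_dir.mkdir(parents=True, exist_ok=True)
        return convert_dir

    def add_url_header(self, content: str, url: str) -> str:
        # Assumption: prepend the source URL so the filtered markdown
        # keeps a pointer back to the original page.
        return f"<!-- source: {url} -->\n\n{content}"
```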
+ 6 - 6
worker/html_convert/docling_converter.py

@@ -20,9 +20,9 @@ class DoclingConverter(ConverterBase):
             logger.warning(f"html_convert id {html_convert.id} has no search_result_item")
             return html_convert
             
-        html_path = Path(html_convert.search_result_item.html_path)
-        if not html_path.exists():
-            logger.warning(f"html_path {html_path} not exists")
+        save_path = Path(html_convert.search_result_item.save_path)
+        if not save_path.exists():
+            logger.warning(f"save_path {save_path} not exists")
             return html_convert
         
         # Skip if already converted
@@ -30,12 +30,12 @@ class DoclingConverter(ConverterBase):
             logger.info(f"Skipping already converted content for {html_convert.id}")
             return html_convert
         
-        convert_dir = self.ensure_convert_dir(html_path)
+        convert_dir = self.ensure_convert_dir(save_path)
         
         try:
             # Perform the conversion
             converter = DocumentConverter()
-            result = converter.convert(html_path)
+            result = converter.convert(save_path)
             markdown_content = result.document.export_to_markdown()
             
             # Apply filtering and add URL header
@@ -44,7 +44,7 @@ class DoclingConverter(ConverterBase):
                 markdown_content = self.add_url_header(markdown_content, html_convert.search_result_item.url)
             
             # Save the converted markdown
-            docling_md_path = convert_dir / f"{html_path.stem}_docling.md"
+            docling_md_path = convert_dir / f"{save_path.stem}_docling.md"
             with open(docling_md_path, 'w', encoding='utf-8') as f:
                 f.write(markdown_content)
             

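As in crawl_filter.py, the docling_converter.py hunks are purely the `save_path` rename; the docling call chain (`DocumentConverter().convert(...)` → `export_to_markdown()`) is unchanged. For reference, the same conversion in isolation looks roughly like this — `page.html` stands in for `search_result_item.save_path`:

```python
from pathlib import Path

from docling.document_converter import DocumentConverter

# Standalone sketch of the docling call path used above;
# "page.html" is a placeholder for the crawled file.
save_path = Path("page.html")

converter = DocumentConverter()
result = converter.convert(save_path)
markdown_content = result.document.export_to_markdown()

out_path = save_path.parent / f"{save_path.stem}_docling.md"
out_path.write_text(markdown_content, encoding="utf-8")
```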
+ 11 - 9
worker/html_convert/pandoc.py

@@ -53,6 +53,8 @@ class PandocConverter:
                 PANDOC_EXE,
                 '-f', 'markdown+pipe_tables+simple_tables+multiline_tables',
                 '-t', 'docx',
+                '--standalone=true',
+                '--embed-resources=true',
                 '--reference-doc', self._get_reference_doc(),
                 '-o', str(output_path),
                 str(md_path)
@@ -129,7 +131,8 @@ class PandocConverter:
                         logger.info(f"Successfully converted docling markdown to {docling_docx_path}")
             
             # Convert filtered markdown if available
-            if html_convert.filter_crawl_md_path:
+            # Skip filtered markdown for now: pandoc has to load resources, which is very slow
+            if False and html_convert.filter_crawl_md_path:
                 filtered_md_path = Path(html_convert.filter_crawl_md_path)
                 filtered_docx_path = filtered_md_path.with_suffix('.docx')
                 
@@ -138,7 +141,7 @@ class PandocConverter:
                     logger.info(f"Skipping already converted filtered markdown: {filtered_docx_path}")
                     filtered_success = True
                 else:
-                    # filtered_success = self.convert_md_to_docx(filtered_md_path, filtered_docx_path)
+                    filtered_success = self.convert_md_to_docx(filtered_md_path, filtered_docx_path)
                     if filtered_success:
                         html_convert.pandoc_docx_path = str(filtered_docx_path)
                         html_convert.is_pandoc_converted = True
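Gating the branch with `if False and ...` keeps the old path greppable but is easy to forget. An alternative worth noting — sketched below with an invented environment variable, not part of this commit — is to hang the decision on a config flag so the slow path can be re-enabled without a code edit:

```python
import os

# Hypothetical flag, not in this commit: the filtered-md -> docx path
# defaults to off because pandoc's resource loading makes it very slow.
CONVERT_FILTERED_MD = os.environ.get("CONVERT_FILTERED_MD", "0") == "1"


def should_convert_filtered(filter_crawl_md_path: str | None) -> bool:
    # Equivalent to the `if False and ...` guard above, but switchable
    # at deploy time.
    return CONVERT_FILTERED_MD and filter_crawl_md_path is not None
```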
@@ -175,14 +178,13 @@ class PandocConverter:
         doc.save(str(reference_doc))
 
 def process_single_example(result_id: int, skip_existing=True):
-    # Process a single result example
+    # May be removed in the future; keeping it for now. Process a single result example
     docling_converter = DoclingConverter()
     search_result_item = docling_converter.get_search_result_item(result_id)
     if (search_result_item and 
-        search_result_item.html_path and 
-        search_result_item.html_path.endswith('.html')):
+        search_result_item.save_path and 
+        search_result_item.save_path.endswith('.html')):
         docling_converter.process_conversion_by_id(result_id, skip_existing=skip_existing)
-    
     crawl_filter = CrawlFilter()
     crawl_filter.process_filter_by_id(result_id, skip_existing=skip_existing)
     
@@ -199,13 +201,13 @@ def process_all_results():
     global db_manager
     with Session(db_manager.engine) as session:
         # Fetch all IDs with explicit ordering
-        result_ids = session.exec(select(SearchResultItem.id, SearchResultItem.html_path).order_by(SearchResultItem.id)).all()
+        result_ids = session.exec(select(SearchResultItem.id, SearchResultItem.save_path).order_by(SearchResultItem.id)).all()
         logger.info(f"Total results: {len(result_ids)}")
         logger.info(f"First 5 result IDs: {result_ids[:5]}")
         
-        for result_id, html_path in result_ids:
+        for result_id, save_path in result_ids:
             try:
-                if html_path and html_path.endswith('.html'):
+                if save_path and save_path.endswith('.html'):
                     process_single_example(result_id)
             except Exception as e:
                 logger.error(f"Error processing result {result_id}: {e}")
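Beyond the rename, the interesting change in pandoc.py is the command line: `--standalone=true` and `--embed-resources=true` are added, and the filtered-markdown branch is switched off. When producing a docx, pandoc must fetch every image the markdown references (remote URLs included) in order to embed it, which is the slow resource loading the commit message cites. A minimal standalone version of the command assembled in `convert_md_to_docx` — `PANDOC_EXE` resolving on PATH is assumed, and the `--reference-doc` handling from the hunk is omitted for brevity:

```python
import subprocess
from pathlib import Path

PANDOC_EXE = "pandoc"  # assumed to resolve on PATH


def convert_md_to_docx(md_path: Path, output_path: Path) -> bool:
    """Sketch of the pandoc invocation built in PandocConverter above."""
    cmd = [
        PANDOC_EXE,
        "-f", "markdown+pipe_tables+simple_tables+multiline_tables",
        "-t", "docx",
        "--standalone=true",
        "--embed-resources=true",  # inline linked resources into the output
        "-o", str(output_path),
        str(md_path),
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        print(result.stderr)
    return result.returncode == 0
```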

+ 7 - 0
worker/search_engine/search_result_db.py

@@ -277,6 +277,13 @@ class SearchResultManager:
                 select(SearchResultItem)
                 .where(SearchResultItem.save_path.is_not(None))
             ).all()
+    def get_uncomplete_search_result_items(self) -> list[SearchResultItem]:
+        """Get all unsuccessful search result items"""
+        with Session(self.engine) as session:
+            return session.exec(
+                select(SearchResultItem)
+               .where(SearchResultItem.save_path.is_(None))
+            ).all()
     def add_or_update_search_result_item(self, search_result_item: SearchResultItem):
         with Session(self.engine) as session:
             session.add(search_result_item)
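The new `get_uncomplete_search_result_items` query is the inverse of the one above it: `save_path IS NULL` selects items that were never successfully downloaded, presumably so they can be retried. A usage sketch — the import path and no-argument constructor are assumptions, since `SearchResultManager`'s setup is not shown in this commit:

```python
# Assumed module path, matching the file location in this diff.
from worker.search_engine.search_result_db import SearchResultManager

# Assumption: the manager's constructor defaults are sufficient here.
manager = SearchResultManager()

pending = manager.get_uncomplete_search_result_items()
print(f"{len(pending)} search results still have no save_path")
for item in pending:
    # e.g. hand these back to the crawler for a retry
    print(item.id, item.url)
```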