html convert: stop converting filter_md to docx because resource loading is very slow

mrh 9 months ago
parent
commit
62c5238036

+ 6 - 6
worker/html_convert/crawl_filter.py

@@ -19,13 +19,13 @@ class CrawlFilter(ConverterBase):
             logger.warning(f"html_convert id {html_convert.id} has no search_result_item")
             return html_convert
             
-        html_path = Path(html_convert.search_result_item.html_path)
-        if not html_path.exists():
-            logger.warning(f"html_path {html_path} not exists")
+        save_path = Path(html_convert.search_result_item.save_path)
+        if not save_path.exists():
+            logger.warning(f"save_path {save_path} not exists")
             return html_convert
         
-        convert_dir = self.ensure_convert_dir(html_path)
-        md_path = html_path.parent / f"{html_path.stem}.md"
+        convert_dir = self.ensure_convert_dir(save_path)
+        md_path = save_path.parent / f"{save_path.stem}.md"
         logger.info(f"save dir {convert_dir}")
         if md_path.exists():
             html_convert.source_crawl_md_path = str(md_path)
@@ -42,7 +42,7 @@ class CrawlFilter(ConverterBase):
                 if html_convert.search_result_item.url:
                     filtered_content = self.add_url_header(filtered_content, html_convert.search_result_item.url)
                 
-                filtered_md_path = convert_dir / f"{html_path.stem}_filtered.md"
+                filtered_md_path = convert_dir / f"{save_path.stem}_filtered.md"
                 with open(filtered_md_path, 'w', encoding='utf-8') as f_out:
                     f_out.write(filtered_content)
                 

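The only functional change in crawl_filter.py is the field rename on `SearchResultItem` (`html_path` → `save_path`); the filtering logic is untouched. For orientation, here is a minimal sketch of what the two `ConverterBase` helpers called above plausibly do — the real base class is not part of this commit, so the directory layout and header format are assumptions inferred from the call sites:

```python
from pathlib import Path


class ConverterBase:
    """Hypothetical sketch of the helpers CrawlFilter relies on;
    the real ConverterBase is not shown in this commit."""

    def ensure_convert_dir(self, save_path: Path) -> Path:
        # Assumption: converted outputs live in a "convert" directory
        # next to the crawled file.
        convert_dir = save_path.parent / "convert"
        convert_dir.mkdir(parents=True, exist_ok=True)
        return convert_dir

    def add_url_header(self, content: str, url: str) -> str:
        # Assumption: prepend the source URL so the filtered markdown
        # keeps a pointer back to the original page.
        return f"<!-- source: {url} -->\n\n{content}"
```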
+ 6 - 6
worker/html_convert/docling_converter.py

@@ -20,9 +20,9 @@ class DoclingConverter(ConverterBase):
             logger.warning(f"html_convert id {html_convert.id} has no search_result_item")
             return html_convert
             
-        html_path = Path(html_convert.search_result_item.html_path)
-        if not html_path.exists():
-            logger.warning(f"html_path {html_path} not exists")
+        save_path = Path(html_convert.search_result_item.save_path)
+        if not save_path.exists():
+            logger.warning(f"save_path {save_path} not exists")
             return html_convert
         
         # Skip if already converted
@@ -30,12 +30,12 @@ class DoclingConverter(ConverterBase):
             logger.info(f"Skipping already converted content for {html_convert.id}")
             return html_convert
         
-        convert_dir = self.ensure_convert_dir(html_path)
+        convert_dir = self.ensure_convert_dir(save_path)
         
         try:
             # Perform the conversion
             converter = DocumentConverter()
-            result = converter.convert(html_path)
+            result = converter.convert(save_path)
             markdown_content = result.document.export_to_markdown()
             
             # Apply filtering and add URL header
@@ -44,7 +44,7 @@ class DoclingConverter(ConverterBase):
                 markdown_content = self.add_url_header(markdown_content, html_convert.search_result_item.url)
             
             # Save the converted markdown
-            docling_md_path = convert_dir / f"{html_path.stem}_docling.md"
+            docling_md_path = convert_dir / f"{save_path.stem}_docling.md"
             with open(docling_md_path, 'w', encoding='utf-8') as f:
                 f.write(markdown_content)
             

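As in crawl_filter.py, the docling_converter.py hunks are purely the `save_path` rename; the docling call chain (`DocumentConverter().convert(...)` → `export_to_markdown()`) is unchanged. For reference, the same conversion in isolation looks roughly like this — `page.html` stands in for `search_result_item.save_path`:

```python
from pathlib import Path

from docling.document_converter import DocumentConverter

# Standalone sketch of the docling call path used above;
# "page.html" is a placeholder for the crawled file.
save_path = Path("page.html")

converter = DocumentConverter()
result = converter.convert(save_path)
markdown_content = result.document.export_to_markdown()

out_path = save_path.parent / f"{save_path.stem}_docling.md"
out_path.write_text(markdown_content, encoding="utf-8")
```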
+ 11 - 9
worker/html_convert/pandoc.py

@@ -53,6 +53,8 @@ class PandocConverter:
                 PANDOC_EXE,
                 '-f', 'markdown+pipe_tables+simple_tables+multiline_tables',
                 '-t', 'docx',
+                '--standalone=true',
+                '--embed-resources=true',
                 '--reference-doc', self._get_reference_doc(),
                 '-o', str(output_path),
                 str(md_path)
@@ -129,7 +131,8 @@ class PandocConverter:
                         logger.info(f"Successfully converted docling markdown to {docling_docx_path}")
             
             # Convert filtered markdown if available
-            if html_convert.filter_crawl_md_path:
+            # Skip filtered markdown for now: pandoc has to load resources, which is very slow
+            if False and html_convert.filter_crawl_md_path:
                 filtered_md_path = Path(html_convert.filter_crawl_md_path)
                 filtered_docx_path = filtered_md_path.with_suffix('.docx')
                 
@@ -138,7 +141,7 @@ class PandocConverter:
                     logger.info(f"Skipping already converted filtered markdown: {filtered_docx_path}")
                     filtered_success = True
                 else:
-                    # filtered_success = self.convert_md_to_docx(filtered_md_path, filtered_docx_path)
+                    filtered_success = self.convert_md_to_docx(filtered_md_path, filtered_docx_path)
                     if filtered_success:
                         html_convert.pandoc_docx_path = str(filtered_docx_path)
                         html_convert.is_pandoc_converted = True
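Gating the branch with `if False and ...` keeps the old path greppable but is easy to forget. An alternative worth noting — sketched below with an invented environment variable, not part of this commit — is to hang the decision on a config flag so the slow path can be re-enabled without a code edit:

```python
import os

# Hypothetical flag, not in this commit: the filtered-md -> docx path
# defaults to off because pandoc's resource loading makes it very slow.
CONVERT_FILTERED_MD = os.environ.get("CONVERT_FILTERED_MD", "0") == "1"


def should_convert_filtered(filter_crawl_md_path: str | None) -> bool:
    # Equivalent to the `if False and ...` guard above, but switchable
    # at deploy time.
    return CONVERT_FILTERED_MD and filter_crawl_md_path is not None
```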
@@ -175,14 +178,13 @@ class PandocConverter:
         doc.save(str(reference_doc))
 
 def process_single_example(result_id: int, skip_existing=True):
-    # Process a single result example
+    # May be removed in the future; keeping it for now. Process a single result example
     docling_converter = DoclingConverter()
     search_result_item = docling_converter.get_search_result_item(result_id)
     if (search_result_item and 
-        search_result_item.html_path and 
-        search_result_item.html_path.endswith('.html')):
+        search_result_item.save_path and 
+        search_result_item.save_path.endswith('.html')):
         docling_converter.process_conversion_by_id(result_id, skip_existing=skip_existing)
-    
     crawl_filter = CrawlFilter()
     crawl_filter.process_filter_by_id(result_id, skip_existing=skip_existing)
     
@@ -199,13 +201,13 @@ def process_all_results():
     global db_manager
     with Session(db_manager.engine) as session:
         # Fetch all IDs with explicit ordering
-        result_ids = session.exec(select(SearchResultItem.id, SearchResultItem.html_path).order_by(SearchResultItem.id)).all()
+        result_ids = session.exec(select(SearchResultItem.id, SearchResultItem.save_path).order_by(SearchResultItem.id)).all()
         logger.info(f"Total results: {len(result_ids)}")
         logger.info(f"First 5 result IDs: {result_ids[:5]}")
         
-        for result_id, html_path in result_ids:
+        for result_id, save_path in result_ids:
             try:
-                if html_path and html_path.endswith('.html'):
+                if save_path and save_path.endswith('.html'):
                     process_single_example(result_id)
             except Exception as e:
                 logger.error(f"Error processing result {result_id}: {e}")
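Beyond the rename, the interesting change in pandoc.py is the command line: `--standalone=true` and `--embed-resources=true` are added, and the filtered-markdown branch is switched off. When producing a docx, pandoc must fetch every image the markdown references (remote URLs included) in order to embed it, which is the slow resource loading the commit message cites. A minimal standalone version of the command assembled in `convert_md_to_docx` — `PANDOC_EXE` resolving on PATH is assumed, and the `--reference-doc` handling from the hunk is omitted for brevity:

```python
import subprocess
from pathlib import Path

PANDOC_EXE = "pandoc"  # assumed to resolve on PATH


def convert_md_to_docx(md_path: Path, output_path: Path) -> bool:
    """Sketch of the pandoc invocation built in PandocConverter above."""
    cmd = [
        PANDOC_EXE,
        "-f", "markdown+pipe_tables+simple_tables+multiline_tables",
        "-t", "docx",
        "--standalone=true",
        "--embed-resources=true",  # inline linked resources into the output
        "-o", str(output_path),
        str(md_path),
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        print(result.stderr)
    return result.returncode == 0
```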

+ 7 - 0
worker/search_engine/search_result_db.py

@@ -277,6 +277,13 @@ class SearchResultManager:
                 select(SearchResultItem)
                 .where(SearchResultItem.save_path.is_not(None))
             ).all()
+    def get_uncomplete_search_result_items(self) -> list[SearchResultItem]:
+        """Get all unsuccessful search result items"""
+        with Session(self.engine) as session:
+            return session.exec(
+                select(SearchResultItem)
+               .where(SearchResultItem.save_path.is_(None))
+            ).all()
     def add_or_update_search_result_item(self, search_result_item: SearchResultItem):
         with Session(self.engine) as session:
             session.add(search_result_item)
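The new `get_uncomplete_search_result_items` query is the inverse of the one above it: `save_path IS NULL` selects items that were never successfully downloaded, presumably so they can be retried. A usage sketch — the import path and no-argument constructor are assumptions, since `SearchResultManager`'s setup is not shown in this commit:

```python
# Assumed module path, matching the file location in this diff.
from worker.search_engine.search_result_db import SearchResultManager

# Assumption: the manager's constructor defaults are sufficient here.
manager = SearchResultManager()

pending = manager.get_uncomplete_search_result_items()
print(f"{len(pending)} search results still have no save_path")
for item in pending:
    # e.g. hand these back to the crawler for a retry
    print(item.id, item.url)
```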