
pandoc batch processing

mrh committed 10 months ago · commit 72b3a40982
2 changed files with 55 additions and 14 deletions:
  1. worker/html_convert/crawl_filter.py (+1 -1)
  2. worker/html_convert/pandoc.py (+54 -13)

worker/html_convert/crawl_filter.py (+1 -1)

@@ -50,7 +50,7 @@ class CrawlFilter(ConverterBase):
                 html_convert.filter_crawl_md_path = str(filtered_md_path)
                 html_convert.is_filtered = True
                 
-                logger.info(f"{html_convert.model_dump_json(indent=2)}")
+                # logger.info(f"{html_convert.model_dump_json(indent=2)}")
         
         return html_convert
 

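The crawl_filter.py change silences a per-item JSON dump by commenting it out. A lighter-weight alternative, sketched below on the assumption that the project logger exposes the standard debug() method, would keep the dump available behind the log level instead of requiring a code change:

    # Demote the dump to DEBUG so it can be re-enabled via log level
    # (assumes the project logger supports the standard debug() method).
    logger.debug(html_convert.model_dump_json(indent=2))
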
worker/html_convert/pandoc.py (+54 -13)

@@ -5,12 +5,13 @@ import random
 from typing import List
 import httpx
 import ssl
+import os
 from sqlmodel import select, Session
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, CrawlResult
 from worker.search_engine.search_result_db import SearchResultManager, KeywordTask, SearchPageResult, SearchResultItem
 from mylib.base import ensure_output_dir, save_to_file, load_from_pickle
-from mylib.logu import logger
-from config.settings import PANDOC_EXE
+from mylib.logu import get_logger
+from config.settings import PANDOC_EXE, HTTP_PROXY, HTTPS_PROXY
 from worker.html_convert.models import HtmlConvertResult
 from worker.html_convert.docling_converter import DoclingConverter
 from worker.html_convert.crawl_filter import CrawlFilter
@@ -18,6 +19,7 @@ import subprocess
 from docx import Document
 from docx.oxml.ns import qn
 from docx.oxml import OxmlElement
+logger = get_logger('pandoc')
 
 class PandocConverter:
     """Class for handling Pandoc conversions with customizable options"""
@@ -36,6 +38,15 @@ class PandocConverter:
     def convert_md_to_docx(self, md_path: Path, output_path: Path) -> bool:
         """Convert markdown file to docx using pandoc with custom options"""
         try:
+            # Prepare environment variables with proxy settings
+            env = os.environ.copy()
+            if HTTP_PROXY:
+                env['HTTP_PROXY'] = HTTP_PROXY
+                env['http_proxy'] = HTTP_PROXY  # Some systems use lowercase
+            if HTTPS_PROXY:
+                env['HTTPS_PROXY'] = HTTPS_PROXY
+                env['https_proxy'] = HTTPS_PROXY
+            
             cmd = [
                 PANDOC_EXE,
                 '-f', 'markdown+yaml_metadata_block',
@@ -50,9 +61,26 @@ class PandocConverter:
                 cmd.insert(-1, '--toc')
                 cmd.insert(-1, '--toc-depth=3')  # Include up to level 3 headings
             
-            result = subprocess.run(cmd, check=True)
+            # Add verbose flag to capture more information about resource fetching
+            cmd.append('--verbose')
+            
+            result = subprocess.run(
+                cmd,
+                check=True,
+                env=env,  # Pass the environment with proxies
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True
+            )
+            
+            # Log any warnings about resources
+            if "Could not fetch resource" in result.stderr:
+                logger.warning(f"Resource fetching issue detected for {md_path}: {result.stderr}")
             
             return result.returncode == 0
+        except subprocess.CalledProcessError as e:
+            logger.error(f"Pandoc conversion error for {md_path}: {e.stderr}")
+            return False
         except Exception as e:
             logger.error(f"Error converting {md_path} to docx: {e}")
             return False
@@ -144,26 +172,39 @@ class PandocConverter:
         normal_style.font.size = Pt(12)
         doc.save(str(reference_doc))
 
-def main():
-    # Example: Process a single result with ID 21567
-    result_id = 21566
-    
-    # First, process the Docling conversion
+def process_single_example(result_id: int):
+    # Run the full conversion pipeline for one result: Docling, crawl filter, then Pandoc
     docling_converter = DoclingConverter()
     docling_converter.process_conversion_by_id(result_id)
     
-    # Then, process the Crawl Filter conversion
     crawl_filter = CrawlFilter()
     crawl_filter.process_filter_by_id(result_id)
     
-    # Finally, convert both results to DOCX using Pandoc with custom options
-    pandoc_converter = PandocConverter(font_name="微软雅黑", include_toc=True)
-    success = pandoc_converter.process_single_result(result_id)
+    pandoc_converter = PandocConverter(font_name="宋体", include_toc=True)
+    success = pandoc_converter.process_single_result(result_id, skip_existing=True)
     if success:
         logger.info(f"Successfully processed result {result_id}")
         logger.info("Note: You may need to manually update the Table of Contents in Word after opening the document.")
     else:
         logger.error(f"Failed to process result {result_id}")
 
+def process_all_results():
+    # Process all results in the database
+    db_manager = SearchResultManager()
+    with Session(db_manager.engine) as session:
+        # Fetch (id, html_path) pairs with explicit ordering
+        rows = session.exec(select(SearchResultItem.id, SearchResultItem.html_path).order_by(SearchResultItem.id)).all()
+        logger.info(f"Total results: {len(rows)}")
+        logger.info(f"First 5 rows: {rows[:5]}")
+        
+        for result_id, html_path in rows:
+            try:
+                if html_path and html_path.endswith('.html'):  # skip rows without a stored HTML file
+                    process_single_example(result_id)
+            except Exception as e:
+                logger.error(f"Error processing result {result_id}: {e}")
+
 if __name__ == "__main__":
-    main()
+    # Example usage
+    # process_single_example(6)
+    process_all_results()
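
The substance of this commit is forwarding proxy settings to the pandoc subprocess so that remote images referenced in the markdown can be fetched during DOCX conversion, and capturing pandoc's stderr to surface fetch failures. A minimal standalone sketch of that pattern, assuming pandoc is on PATH; the proxy URL and file names are placeholders:

    import os
    import subprocess
    from typing import Optional

    def run_pandoc_with_proxy(md_path: str, docx_path: str, proxy: Optional[str] = None) -> bool:
        # Copy the parent environment so existing variables survive.
        env = os.environ.copy()
        if proxy:
            # Set both spellings: different HTTP stacks read different cases.
            env['HTTP_PROXY'] = env['http_proxy'] = proxy
            env['HTTPS_PROXY'] = env['https_proxy'] = proxy
        try:
            result = subprocess.run(
                ['pandoc', md_path, '-o', docx_path, '--verbose'],
                env=env, check=True, capture_output=True, text=True,
            )
        except subprocess.CalledProcessError as e:
            # check=True raises on a non-zero exit; stderr carries the reason.
            print(e.stderr)
            return False
        # Pandoc exits 0 even when a remote resource cannot be fetched; it
        # only warns on stderr, so scan for the warning explicitly.
        if 'Could not fetch resource' in result.stderr:
            print(result.stderr)
        return True

    # Hypothetical usage: route pandoc's image downloads through a local proxy.
    # run_pandoc_with_proxy('in.md', 'out.docx', proxy='http://127.0.0.1:7890')

Because check=True already raises on failure, reaching the end of the try block implies a zero exit; the original code's result.returncode == 0 check is always true on that path, and the stderr scan is what actually detects missed images.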