
pandoc batch processing

mrh committed 10 months ago · commit 72b3a40982
2 changed files with 55 additions and 14 deletions:
  1. worker/html_convert/crawl_filter.py (+1 -1)
  2. worker/html_convert/pandoc.py (+54 -13)

worker/html_convert/crawl_filter.py (+1 -1)

@@ -50,7 +50,7 @@ class CrawlFilter(ConverterBase):
                 html_convert.filter_crawl_md_path = str(filtered_md_path)
                 html_convert.is_filtered = True
                 
-                logger.info(f"{html_convert.model_dump_json(indent=2)}")
+                # logger.info(f"{html_convert.model_dump_json(indent=2)}")
         
         return html_convert
 

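The crawl_filter.py change silences a per-item JSON dump by commenting it out. A lighter-weight alternative, sketched below on the assumption that the project logger exposes the standard debug() method, would keep the dump available behind the log level instead of requiring a code change:

    # Demote the dump to DEBUG so it can be re-enabled via log level
    # (assumes the project logger supports the standard debug() method).
    logger.debug(html_convert.model_dump_json(indent=2))
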
worker/html_convert/pandoc.py (+54 -13)

@@ -5,12 +5,13 @@ import random
 from typing import List
 import httpx
 import ssl
+import os
 from sqlmodel import select, Session
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, CrawlResult
 from worker.search_engine.search_result_db import SearchResultManager, KeywordTask, SearchPageResult, SearchResultItem
 from mylib.base import ensure_output_dir, save_to_file, load_from_pickle
-from mylib.logu import logger
-from config.settings import PANDOC_EXE
+from mylib.logu import get_logger
+from config.settings import PANDOC_EXE, HTTP_PROXY, HTTPS_PROXY
 from worker.html_convert.models import HtmlConvertResult
 from worker.html_convert.docling_converter import DoclingConverter
 from worker.html_convert.crawl_filter import CrawlFilter
@@ -18,6 +19,7 @@ import subprocess
 from docx import Document
 from docx.oxml.ns import qn
 from docx.oxml import OxmlElement
+logger = get_logger('pandoc')
 
 class PandocConverter:
     """Class for handling Pandoc conversions with customizable options"""
@@ -36,6 +38,15 @@ class PandocConverter:
     def convert_md_to_docx(self, md_path: Path, output_path: Path) -> bool:
         """Convert markdown file to docx using pandoc with custom options"""
         try:
+            # Prepare environment variables with proxy settings
+            env = os.environ.copy()
+            if HTTP_PROXY:
+                env['HTTP_PROXY'] = HTTP_PROXY
+                env['http_proxy'] = HTTP_PROXY  # Some systems use lowercase
+            if HTTPS_PROXY:
+                env['HTTPS_PROXY'] = HTTPS_PROXY
+                env['https_proxy'] = HTTPS_PROXY
+            
             cmd = [
                 PANDOC_EXE,
                 '-f', 'markdown+yaml_metadata_block',
@@ -50,9 +61,26 @@ class PandocConverter:
                 cmd.insert(-1, '--toc')
                 cmd.insert(-1, '--toc-depth=3')  # Include up to level 3 headings
             
-            result = subprocess.run(cmd, check=True)
+            # Add verbose flag to capture more information about resource fetching
+            cmd.append('--verbose')
+            
+            result = subprocess.run(
+                cmd,
+                check=True,
+                env=env,  # Pass the environment with proxies
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True
+            )
+            
+            # Log any warnings about resources
+            if "Could not fetch resource" in result.stderr:
+                logger.warning(f"Resource fetching issue detected for {md_path}: {result.stderr}")
             
             return result.returncode == 0
+        except subprocess.CalledProcessError as e:
+            logger.error(f"Pandoc conversion error for {md_path}: {e.stderr}")
+            return False
         except Exception as e:
             logger.error(f"Error converting {md_path} to docx: {e}")
             return False
@@ -144,26 +172,39 @@ class PandocConverter:
         normal_style.font.size = Pt(12)
         doc.save(str(reference_doc))
 
-def main():
-    # Example: Process a single result with ID 21567
-    result_id = 21566
-    
-    # First, process the Docling conversion
+def process_single_example(result_id: int):
+    # Run the full conversion pipeline for one result: Docling, crawl filter, then Pandoc
     docling_converter = DoclingConverter()
     docling_converter.process_conversion_by_id(result_id)
     
-    # Then, process the Crawl Filter conversion
     crawl_filter = CrawlFilter()
     crawl_filter.process_filter_by_id(result_id)
     
-    # Finally, convert both results to DOCX using Pandoc with custom options
-    pandoc_converter = PandocConverter(font_name="微软雅黑", include_toc=True)
-    success = pandoc_converter.process_single_result(result_id)
+    pandoc_converter = PandocConverter(font_name="宋体", include_toc=True)
+    success = pandoc_converter.process_single_result(result_id, skip_existing=True)
     if success:
         logger.info(f"Successfully processed result {result_id}")
         logger.info("Note: You may need to manually update the Table of Contents in Word after opening the document.")
     else:
         logger.error(f"Failed to process result {result_id}")
 
+def process_all_results():
+    # Process all results in the database
+    db_manager = SearchResultManager()
+    with Session(db_manager.engine) as session:
+        # Fetch (id, html_path) pairs with explicit ordering
+        rows = session.exec(select(SearchResultItem.id, SearchResultItem.html_path).order_by(SearchResultItem.id)).all()
+        logger.info(f"Total results: {len(rows)}")
+        logger.info(f"First 5 rows: {rows[:5]}")
+        
+        for result_id, html_path in rows:
+            try:
+                if html_path and html_path.endswith('.html'):  # skip rows without a stored HTML file
+                    process_single_example(result_id)
+            except Exception as e:
+                logger.error(f"Error processing result {result_id}: {e}")
+
 if __name__ == "__main__":
-    main()
+    # Example usage
+    # process_single_example(6)
+    process_all_results()
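
The substance of this commit is forwarding proxy settings to the pandoc subprocess so that remote images referenced in the markdown can be fetched during DOCX conversion, and capturing pandoc's stderr to surface fetch failures. A minimal standalone sketch of that pattern, assuming pandoc is on PATH; the proxy URL and file names are placeholders:

    import os
    import subprocess
    from typing import Optional

    def run_pandoc_with_proxy(md_path: str, docx_path: str, proxy: Optional[str] = None) -> bool:
        # Copy the parent environment so existing variables survive.
        env = os.environ.copy()
        if proxy:
            # Set both spellings: different HTTP stacks read different cases.
            env['HTTP_PROXY'] = env['http_proxy'] = proxy
            env['HTTPS_PROXY'] = env['https_proxy'] = proxy
        try:
            result = subprocess.run(
                ['pandoc', md_path, '-o', docx_path, '--verbose'],
                env=env, check=True, capture_output=True, text=True,
            )
        except subprocess.CalledProcessError as e:
            # check=True raises on a non-zero exit; stderr carries the reason.
            print(e.stderr)
            return False
        # Pandoc exits 0 even when a remote resource cannot be fetched; it
        # only warns on stderr, so scan for the warning explicitly.
        if 'Could not fetch resource' in result.stderr:
            print(result.stderr)
        return True

    # Hypothetical usage: route pandoc's image downloads through a local proxy.
    # run_pandoc_with_proxy('in.md', 'out.docx', proxy='http://127.0.0.1:7890')

Because check=True already raises on failure, reaching the end of the try block implies a zero exit; the original code's result.returncode == 0 check is always true on that path, and the stderr scan is what actually detects missed images.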