@@ -5,12 +5,13 @@ import random
 from typing import List
 import httpx
 import ssl
+import os
 from sqlmodel import select, Session
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, CrawlResult
 from worker.search_engine.search_result_db import SearchResultManager, KeywordTask, SearchPageResult, SearchResultItem
 from mylib.base import ensure_output_dir, save_to_file, load_from_pickle
-from mylib.logu import logger
-from config.settings import PANDOC_EXE
+from mylib.logu import get_logger
+from config.settings import PANDOC_EXE, HTTP_PROXY, HTTPS_PROXY
 from worker.html_convert.models import HtmlConvertResult
 from worker.html_convert.docling_converter import DoclingConverter
 from worker.html_convert.crawl_filter import CrawlFilter
@@ -18,6 +19,7 @@ import subprocess
 from docx import Document
 from docx.oxml.ns import qn
 from docx.oxml import OxmlElement
+logger = get_logger('pandoc')
 
 class PandocConverter:
     """Class for handling Pandoc conversions with customizable options"""
@@ -36,6 +38,17 @@ class PandocConverter:
     def convert_md_to_docx(self, md_path: Path, output_path: Path) -> bool:
         """Convert markdown file to docx using pandoc with custom options"""
        try:
+            # Prepare environment variables with proxy settings
+            env = os.environ.copy()
+            if HTTP_PROXY:
+                env['HTTP_PROXY'] = HTTP_PROXY
+                env['http_proxy'] = HTTP_PROXY  # Some systems use lowercase
+            if HTTPS_PROXY:
+                env['HTTPS_PROXY'] = HTTPS_PROXY
+                env['https_proxy'] = HTTPS_PROXY
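+            # Assumption: pandoc honors the standard *_proxy environment variables
+            # when fetching remote resources over HTTP(S).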
+
             cmd = [
                 PANDOC_EXE,
                 '-f', 'markdown+yaml_metadata_block',
@@ -50,9 +63,31 @@ class PandocConverter:
             cmd.insert(-1, '--toc')
             cmd.insert(-1, '--toc-depth=3')  # Include up to level 3 headings
 
-            result = subprocess.run(cmd, check=True)
+            # Add verbose flag to capture more information about resource fetching
+            cmd.append('--verbose')
+
+            result = subprocess.run(
+                cmd,
+                check=True,
+                env=env,  # Pass the environment with proxies
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True
+            )
+
+            # Log any warnings about resources
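+            # pandoc prints "[WARNING] Could not fetch resource ..." on stderr when a
+            # remote image or linked resource cannot be retrieved; --verbose makes the
+            # surrounding fetch activity visible as well.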
+            if "Could not fetch resource" in result.stderr:
+                logger.warning(f"Resource fetching issue detected for {md_path}: {result.stderr}")
 
             return result.returncode == 0
+        except subprocess.CalledProcessError as e:
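+            # e.stderr carries pandoc's error output because stderr was captured
+            # with subprocess.PIPE in the run() call above.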
+ logger.error(f"Pandoc conversion error for {md_path}: {e.stderr}")
|
|
|
+ return False
|
|
|
except Exception as e:
|
|
|
logger.error(f"Error converting {md_path} to docx: {e}")
|
|
|
return False
|
|
|
@@ -144,26 +179,41 @@ class PandocConverter:
         normal_style.font.size = Pt(12)
         doc.save(str(reference_doc))
 
-def main():
-    # Example: Process a single result with ID 21567
-    result_id = 21566
-
-    # First, process the Docling conversion
+def process_single_example(result_id: int):
+    # Process a single result example
     docling_converter = DoclingConverter()
     docling_converter.process_conversion_by_id(result_id)
 
-    # Then, process the Crawl Filter conversion
     crawl_filter = CrawlFilter()
     crawl_filter.process_filter_by_id(result_id)
 
-    # Finally, convert both results to DOCX using Pandoc with custom options
-    pandoc_converter = PandocConverter(font_name="微软雅黑", include_toc=True)
-    success = pandoc_converter.process_single_result(result_id)
+    pandoc_converter = PandocConverter(font_name="宋体", include_toc=True)
+    success = pandoc_converter.process_single_result(result_id, skip_existing=True)
     if success:
         logger.info(f"Successfully processed result {result_id}")
         logger.info("Note: You may need to manually update the Table of Contents in Word after opening the document.")
     else:
         logger.error(f"Failed to process result {result_id}")
 
+def process_all_results():
+    # Process all results in the database
+    db_manager = SearchResultManager()
+    with Session(db_manager.engine) as session:
+        # Fetch all IDs with explicit ordering
+        result_ids = session.exec(select(SearchResultItem.id, SearchResultItem.html_path).order_by(SearchResultItem.id)).all()
+        logger.info(f"Total results: {len(result_ids)}")
+        logger.info(f"First 5 result IDs: {result_ids[:5]}")
+
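+        # Each row is an (id, html_path) tuple from the two-column select;
+        # rows without an .html path are skipped below.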
+        for result_id, html_path in result_ids:
+            try:
+                if html_path and html_path.endswith('.html'):
+                    process_single_example(result_id)
+            except Exception as e:
+                logger.error(f"Error processing result {result_id}: {e}")
+
 if __name__ == "__main__":
-    main()
+    # Example usage
+    # process_single_example(6)
+    process_all_results()