|
|
@@ -53,6 +53,8 @@ class PandocConverter:
|
|
|
PANDOC_EXE,
|
|
|
'-f', 'markdown+pipe_tables+simple_tables+multiline_tables',
|
|
|
'-t', 'docx',
|
|
|
+ '--standalone=true',
|
|
|
+ '--embed-resources=true',
|
|
|
'--reference-doc', self._get_reference_doc(),
|
|
|
'-o', str(output_path),
|
|
|
str(md_path)
|
|
|
@@ -129,7 +131,8 @@ class PandocConverter:
|
|
|
logger.info(f"Successfully converted docling markdown to {docling_docx_path}")
|
|
|
|
|
|
# Convert filtered markdown if available
|
|
|
- if html_convert.filter_crawl_md_path:
|
|
|
+ # Temporarily skip the filtered markdown: pandoc has to load external resources for it, which is very slow
|
|
|
+ if False and html_convert.filter_crawl_md_path:
|
|
|
filtered_md_path = Path(html_convert.filter_crawl_md_path)
|
|
|
filtered_docx_path = filtered_md_path.with_suffix('.docx')
|
|
|
|
|
|
@@ -138,7 +141,7 @@ class PandocConverter:
|
|
|
logger.info(f"Skipping already converted filtered markdown: {filtered_docx_path}")
|
|
|
filtered_success = True
|
|
|
else:
|
|
|
- # filtered_success = self.convert_md_to_docx(filtered_md_path, filtered_docx_path)
|
|
|
+ filtered_success = self.convert_md_to_docx(filtered_md_path, filtered_docx_path)
|
|
|
if filtered_success:
|
|
|
html_convert.pandoc_docx_path = str(filtered_docx_path)
|
|
|
html_convert.is_pandoc_converted = True
|
|
|
@@ -175,14 +178,13 @@ class PandocConverter:
|
|
|
doc.save(str(reference_doc))
|
|
|
|
|
|
def process_single_example(result_id: int, skip_existing=True):
|
|
|
- # Process a single result example
|
|
|
+ # May be removed in the future; keeping it for now. Process a single result example
|
|
|
docling_converter = DoclingConverter()
|
|
|
search_result_item = docling_converter.get_search_result_item(result_id)
|
|
|
if (search_result_item and
|
|
|
- search_result_item.html_path and
|
|
|
- search_result_item.html_path.endswith('.html')):
|
|
|
+ search_result_item.save_path and
|
|
|
+ search_result_item.save_path.endswith('.html')):
|
|
|
docling_converter.process_conversion_by_id(result_id, skip_existing=skip_existing)
|
|
|
-
|
|
|
crawl_filter = CrawlFilter()
|
|
|
crawl_filter.process_filter_by_id(result_id, skip_existing=skip_existing)
|
|
|
|
|
|
@@ -199,13 +201,13 @@ def process_all_results():
|
|
|
global db_manager
|
|
|
with Session(db_manager.engine) as session:
|
|
|
# Fetch all IDs with explicit ordering
|
|
|
- result_ids = session.exec(select(SearchResultItem.id, SearchResultItem.html_path).order_by(SearchResultItem.id)).all()
|
|
|
+ result_ids = session.exec(select(SearchResultItem.id, SearchResultItem.save_path).order_by(SearchResultItem.id)).all()
|
|
|
logger.info(f"Total results: {len(result_ids)}")
|
|
|
logger.info(f"First 5 result IDs: {result_ids[:5]}")
|
|
|
|
|
|
- for result_id, html_path in result_ids:
|
|
|
+ for result_id, save_path in result_ids:
|
|
|
try:
|
|
|
- if html_path and html_path.endswith('.html'):
|
|
|
+ if save_path and save_path.endswith('.html'):
|
|
|
process_single_example(result_id)
|
|
|
except Exception as e:
|
|
|
logger.error(f"Error processing result {result_id}: {e}")
|