|
@@ -51,7 +51,7 @@ class PandocConverter:
|
|
|
|
|
|
|
|
cmd = [
|
|
cmd = [
|
|
|
PANDOC_EXE,
|
|
PANDOC_EXE,
|
|
|
- '-f', 'markdown+yaml_metadata_block',
|
|
|
|
|
|
|
+ '-f', 'markdown+pipe_tables+simple_tables+multiline_tables',
|
|
|
'-t', 'docx',
|
|
'-t', 'docx',
|
|
|
'--reference-doc', self._get_reference_doc(),
|
|
'--reference-doc', self._get_reference_doc(),
|
|
|
'-o', str(output_path),
|
|
'-o', str(output_path),
|
|
@@ -61,7 +61,7 @@ class PandocConverter:
|
|
|
if self.include_toc:
|
|
if self.include_toc:
|
|
|
# Specify heading levels for TOC
|
|
# Specify heading levels for TOC
|
|
|
cmd.insert(-1, '--toc')
|
|
cmd.insert(-1, '--toc')
|
|
|
- cmd.insert(-1, '--toc-depth=3') # Include up to level 3 headings
|
|
|
|
|
|
|
+ cmd.insert(-1, '--toc-depth=2') # Include up to level 2 headings
|
|
|
|
|
|
|
|
# Add verbose flag to capture more information about resource fetching
|
|
# Add verbose flag to capture more information about resource fetching
|
|
|
cmd.append('--verbose')
|
|
cmd.append('--verbose')
|
|
@@ -99,7 +99,7 @@ class PandocConverter:
|
|
|
return False
|
|
return False
|
|
|
|
|
|
|
|
# Get the HTML convert result
|
|
# Get the HTML convert result
|
|
|
- html_convert = session.exec(
|
|
|
|
|
|
|
+ html_convert:HtmlConvertResult = session.exec(
|
|
|
select(HtmlConvertResult)
|
|
select(HtmlConvertResult)
|
|
|
.where(HtmlConvertResult.search_result_item_id == result_id)
|
|
.where(HtmlConvertResult.search_result_item_id == result_id)
|
|
|
).first()
|
|
).first()
|
|
@@ -138,7 +138,7 @@ class PandocConverter:
|
|
|
logger.info(f"Skipping already converted filtered markdown: {filtered_docx_path}")
|
|
logger.info(f"Skipping already converted filtered markdown: {filtered_docx_path}")
|
|
|
filtered_success = True
|
|
filtered_success = True
|
|
|
else:
|
|
else:
|
|
|
- filtered_success = self.convert_md_to_docx(filtered_md_path, filtered_docx_path)
|
|
|
|
|
|
|
+ # filtered_success = self.convert_md_to_docx(filtered_md_path, filtered_docx_path)
|
|
|
if filtered_success:
|
|
if filtered_success:
|
|
|
html_convert.pandoc_docx_path = str(filtered_docx_path)
|
|
html_convert.pandoc_docx_path = str(filtered_docx_path)
|
|
|
html_convert.is_pandoc_converted = True
|
|
html_convert.is_pandoc_converted = True
|
|
@@ -174,21 +174,21 @@ class PandocConverter:
|
|
|
normal_style.font.size = Pt(12)
|
|
normal_style.font.size = Pt(12)
|
|
|
doc.save(str(reference_doc))
|
|
doc.save(str(reference_doc))
|
|
|
|
|
|
|
|
-def process_single_example(result_id: int):
|
|
|
|
|
|
|
+def process_single_example(result_id: int, skip_existing=True):
|
|
|
# Process a single result example
|
|
# Process a single result example
|
|
|
docling_converter = DoclingConverter()
|
|
docling_converter = DoclingConverter()
|
|
|
search_result_item = docling_converter.get_search_result_item(result_id)
|
|
search_result_item = docling_converter.get_search_result_item(result_id)
|
|
|
if search_result_item.html_path.endswith('.html'):
|
|
if search_result_item.html_path.endswith('.html'):
|
|
|
- docling_converter.process_conversion_by_id(result_id)
|
|
|
|
|
|
|
+ docling_converter.process_conversion_by_id(result_id, skip_existing=skip_existing)
|
|
|
|
|
|
|
|
crawl_filter = CrawlFilter()
|
|
crawl_filter = CrawlFilter()
|
|
|
- crawl_filter.process_filter_by_id(result_id)
|
|
|
|
|
|
|
+ crawl_filter.process_filter_by_id(result_id, skip_existing=skip_existing)
|
|
|
|
|
|
|
|
pandoc_converter = PandocConverter(font_name="宋体", include_toc=True)
|
|
pandoc_converter = PandocConverter(font_name="宋体", include_toc=True)
|
|
|
- success = pandoc_converter.process_single_result(result_id, skip_existing=True)
|
|
|
|
|
|
|
+ logger.info(f"skip_existing {skip_existing}")
|
|
|
|
|
+ success = pandoc_converter.process_single_result(result_id, skip_existing=skip_existing)
|
|
|
if success:
|
|
if success:
|
|
|
logger.info(f"Successfully processed result {result_id}")
|
|
logger.info(f"Successfully processed result {result_id}")
|
|
|
- logger.info("Note: You may need to manually update the Table of Contents in Word after opening the document.")
|
|
|
|
|
else:
|
|
else:
|
|
|
logger.error(f"Failed to process result {result_id}")
|
|
logger.error(f"Failed to process result {result_id}")
|
|
|
|
|
|
|
@@ -209,6 +209,10 @@ def process_all_results():
|
|
|
logger.error(f"Error processing result {result_id}: {e}")
|
|
logger.error(f"Error processing result {result_id}: {e}")
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
if __name__ == "__main__":
|
|
|
- # Example usage
|
|
|
|
|
- # process_single_example(6)
|
|
|
|
|
- process_all_results()
|
|
|
|
|
|
|
+ # Measure the total run time
|
|
|
|
|
+ import time
|
|
|
|
|
+ start_time = time.time()
|
|
|
|
|
+ process_single_example(996, skip_existing=False)
|
|
|
|
|
+ end_time = time.time()
|
|
|
|
|
+ print(f"Total time: {end_time - start_time} seconds")
|
|
|
|
|
+ # process_all_results()
|