pandoc failed to convert. Cause to be determined.

mrh committed 10 months ago
commit 4875fb3dc9
3 changed files with 38 additions and 17 deletions
  1. crawl_multi.py (+33 -16)
  2. mytest/news_paper_t.py (+2 -1)
  3. search_keyward.py (+3 -0)

+ 33 - 16
crawl_multi.py

@@ -4,6 +4,7 @@ import re
 import subprocess
 import sys
 import pickle
+from dotenv import load_dotenv
 from crawl4ai import *
 from search_keyward import test_dir_links_not_local
 from mylib.base import ensure_output_dir, load_from_pickle, save_to_file
@@ -30,7 +31,7 @@ parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.append(parent_dir)
 
 from typing import List
-
+load_dotenv()
 async def crawl_parallel(urls: List[str], max_concurrent: int = 3):
     # Minimal browser config
     browser_config = BrowserConfig(
@@ -71,11 +72,14 @@ async def multi_crawl_one_dir_html(dir_path:Path):
     urls = [link['href'] for link in links]
     
     
+    # Pickle path for the crawl results; skip this directory if already crawled
+    output_file = dir_path / "crawl_results.pkl"
+    if output_file.exists():
+        print(f"{output_file} already exists. Skipping...")
+        return output_file
     # Perform parallel crawling
     results = await crawl_parallel(urls, max_concurrent=10)
     
-    # Save results using pickle
-    output_file = os.path.join(dir_path, "crawl_results.pkl")
     with open(output_file, "wb") as f:
         pickle.dump(results, f)
     
@@ -93,7 +97,7 @@ def docling_html2md(html_file_path: Path):
     return markdown_content
 
 def pandoc_html2md(html_file_path: Path, output_file_path: Path):
-    # pandoc -f html -t docx -o 0.docx "K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil\all_paper\0.html"
+    # pandoc -f html -t markdown -o 0.md "K:\code\upwork\zhang_crawl_bio\output\google_search\Acalypha menavody essential oil\all_page_html\1.html"
     pandoc_exe = r'K:\code\upwork\zhang_crawl_bio\venv\Library\bin\pandoc.exe'
     # Run the conversion command
     cmd = [
@@ -131,7 +135,8 @@ async def load_result(dir_path:Path):
         print(f"{idx} {result.url}")
         # Save the HTML file
         html_file_path = output_all_page_html_dir / f"{idx}.html"
-        save_to_file(result.html, html_file_path)
+        if not html_file_path.exists():
+            save_to_file(result.html, html_file_path)
         
         # If markdown content exists, convert it to DOCX and save
         if result.markdown:
@@ -141,21 +146,23 @@ async def load_result(dir_path:Path):
         try:
             # Use docling to convert the HTML to Markdown and save it to the all_docling_markdown directory
             docling_md_path = output_all_docling_markdown_dir / f"{idx}.md"
-            docling_md = docling_html2md(html_file_path)
-            # Prepend the url link as the first line of the md
-            docling_md = f"[{result.url}]({result.url})\n\n" + docling_md
-            save_to_file(docling_md, docling_md_path)
+            if not docling_md_path.exists():
+                docling_md = docling_html2md(html_file_path)
+                # Prepend the url link as the first line of the md
+                docling_md = f"[{result.url}]({result.url})\n\n" + docling_md
+                save_to_file(docling_md, docling_md_path)
         except Exception as e:
             print(f"Error converting HTML to Markdown using docling: {e}")
             print(f"html_file_path {html_file_path}")
             
         md_file_path = output_all_pandoc_markdown_dir / f"{idx}.md"
-        pandoc_html2md(html_file_path, output_all_pandoc_markdown_dir / f"{idx}.md")
-        # Prepend the url link to the first line of the md file just written
-        with open(md_file_path, 'r', encoding='utf-8') as f:
-            md_content = f.read()
-            md_content = f"[{result.url}]({result.url})\n\n" + md_content
-            save_to_file(md_content, md_file_path)
+        if not md_file_path.exists():
+            pandoc_html2md(html_file_path, md_file_path)
+            # Prepend the url link to the first line of the md file just written
+            with open(md_file_path, 'r', encoding='utf-8') as f:
+                md_content = f.read()
+                md_content = f"[{result.url}]({result.url})\n\n" + md_content
+                save_to_file(md_content, md_file_path)
         
     return result
 
@@ -186,11 +193,21 @@ def conver_all_md_to_docx(dir_path:Path):
             print(f"转换失败!{all_md_save_path}")
             print(e)
 
+async def all_search_key_main():
+    # Get all subdirectories under GOOGLE_SEARCH_DIR
+    search_key_dirs = [f for f in GOOGLE_SEARCH_DIR.iterdir() if f.is_dir()]
+    # print(search_key_dirs)
+    for dir_path in search_key_dirs:
+        await multi_crawl_one_dir_html(dir_path)
+        await load_result(dir_path)
+        conver_all_md_to_docx(dir_path)
+
 async def main():
     dir_path = Path(r"K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil")
     # await multi_crawl_one_dir_html(dir_path)
     # await load_result(dir_path)
-    conver_all_md_to_docx(dir_path)
+    # conver_all_md_to_docx(dir_path)
+    await all_search_key_main()
     
 
 if __name__ == "__main__":
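
A quick way to investigate the "cause to be determined" from the commit message is to stop discarding pandoc's output: capture the exit code and stderr when invoking it. A minimal sketch, reusing the pandoc.exe path from pandoc_html2md above; the run_pandoc_verbose helper name is hypothetical:

import subprocess
from pathlib import Path

PANDOC_EXE = r'K:\code\upwork\zhang_crawl_bio\venv\Library\bin\pandoc.exe'

def run_pandoc_verbose(html_file_path: Path, output_file_path: Path) -> bool:
    # Same html -> markdown conversion as pandoc_html2md, but keep
    # stdout/stderr so a failure comes with pandoc's own error message.
    cmd = [PANDOC_EXE, '-f', 'html', '-t', 'markdown',
           '-o', str(output_file_path), str(html_file_path)]
    proc = subprocess.run(cmd, capture_output=True, text=True)
    if proc.returncode != 0:
        print(f"pandoc failed (exit {proc.returncode}): {proc.stderr.strip()}")
        return False
    return True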

+ 2 - 1
mytest/news_paper_t.py

@@ -3,7 +3,7 @@ import os
 
 def main():
     # Path to the local HTML file
-    file_path = r"K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil\all_paper\0.html"
+    file_path = r"K:\code\upwork\zhang_crawl_bio\output\google_search\Acalypha martiana essential oil\all_page_html\3.html"
     
     # Read the contents of the local HTML file
     with open(file_path, 'r', encoding='utf-8') as file:
@@ -20,6 +20,7 @@ def main():
 
     # Print the article title and additional data
     print(first_article.title)
+    print(first_article.authors)
     print(first_article.additional_data)
     print(f"summary {first_article.summary}")
     print(f"url {first_article.url}")

+ 3 - 0
search_keyward.py

@@ -161,6 +161,9 @@ async def test_sigle_html_links(save_html_path=None):
     return links_not_local
 
 async def test_dir_links_not_local(dir_path:Path):
+    '''
+    Collect all html files in the directory and parse them into a filtered list of domain links; base_domain does not include local paths
+    '''
     html_files = [f for f in dir_path.iterdir() if f.suffix == '.html']
     all_links = []
     for html_file in html_files:
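
To illustrate the filter the docstring describes, hrefs whose URL has no network host (relative paths, anchors, file:// URLs) can be treated as local and dropped, keeping only links with a real domain. A minimal sketch with urllib.parse; the is_local_href helper is hypothetical, not the actual implementation of test_dir_links_not_local:

from urllib.parse import urlparse

def is_local_href(href: str) -> bool:
    # Local if there is no network host: relative paths, anchors, file:// URLs
    parsed = urlparse(href)
    return parsed.scheme in ('', 'file') or parsed.netloc == ''

hrefs = [
    "https://example.com/paper.pdf",  # external domain: kept
    "/assets/style.css",              # relative path: dropped
    "file:///K:/code/local.html",     # local file: dropped
]
links_not_local = [h for h in hrefs if not is_local_href(h)]
print(links_not_local)  # ['https://example.com/paper.pdf']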