@@ -4,6 +4,7 @@ import re
 import subprocess
 import sys
 import pickle
+from dotenv import load_dotenv
 from crawl4ai import *
 from search_keyward import test_dir_links_not_local
 from mylib.base import ensure_output_dir, load_from_pickle, save_to_file
@@ -30,7 +31,7 @@ parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.append(parent_dir)

 from typing import List
-
+load_dotenv()
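+# Assumption (not shown in this diff): later steps read API keys/config from
+# os.environ, which load_dotenv() populates from a local .env file.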
 async def crawl_parallel(urls: List[str], max_concurrent: int = 3):
     # Minimal browser config
     browser_config = BrowserConfig(
@@ -71,11 +72,14 @@ async def multi_crawl_one_dir_html(dir_path:Path):
     urls = [link['href'] for link in links]

+    # Save results using pickle
+    output_file = dir_path / "crawl_results.pkl"
+    if output_file.exists():
+        print(f"{output_file} already exists. Skipping...")
+        return output_file
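+    # Checking before crawling makes re-runs idempotent: a directory that
+    # already has crawl_results.pkl is skipped instead of re-crawled.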
     # Perform parallel crawling
     results = await crawl_parallel(urls, max_concurrent=10)

-    # Save results using pickle
-    output_file = os.path.join(dir_path, "crawl_results.pkl")
     with open(output_file, "wb") as f:
         pickle.dump(results, f)
@@ -93,7 +97,7 @@ def docling_html2md(html_file_path: Path):
     return markdown_content


 def pandoc_html2md(html_file_path: Path, output_file_path: Path):
-    # pandoc -f html -t docx -o 0.docx "K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil\all_paper\0.html"
+    # pandoc -f html -t markdown -o 0.md "K:\code\upwork\zhang_crawl_bio\output\google_search\Acalypha menavody essential oil\all_page_html\1.html"
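+    # This helper shells out to an external pandoc binary; the hard-coded path
+    # below assumes the project's Windows venv layout.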
     pandoc_exe = r'K:\code\upwork\zhang_crawl_bio\venv\Library\bin\pandoc.exe'
     # Run the conversion command
     cmd = [
@@ -131,7 +135,8 @@ async def load_result(dir_path:Path):
         print(f"{idx} {result.url}")
         # Save the HTML file
         html_file_path = output_all_page_html_dir / f"{idx}.html"
-        save_to_file(result.html, html_file_path)
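+        # Keep existing snapshots: only write the HTML if it isn't on disk yet.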
+        if not html_file_path.exists():
+            save_to_file(result.html, html_file_path)

         # If markdown content exists, convert it to DOCX and save it
         if result.markdown:
@@ -141,21 +146,23 @@ async def load_result(dir_path:Path):
             try:
                 # Use docling to convert the HTML to Markdown and save it under all_docling_markdown
                 docling_md_path = output_all_docling_markdown_dir / f"{idx}.md"
-                docling_md = docling_html2md(html_file_path)
-                # Prepend the url link as the first line of the md
-                docling_md = f"[{result.url}]({result.url})\n\n" + docling_md
-                save_to_file(docling_md, docling_md_path)
+                if not docling_md_path.exists():
+                    docling_md = docling_html2md(html_file_path)
+                    # Prepend the url link as the first line of the md
+                    docling_md = f"[{result.url}]({result.url})\n\n" + docling_md
+                    save_to_file(docling_md, docling_md_path)
             except Exception as e:
                 print(f"Error converting HTML to Markdown using docling: {e}")
                 print(f"html_file_path {html_file_path}")

             md_file_path = output_all_pandoc_markdown_dir / f"{idx}.md"
-            pandoc_html2md(html_file_path, output_all_pandoc_markdown_dir / f"{idx}.md")
-            # Prepend the url link to the first line of the md file just written
-            with open(md_file_path, 'r', encoding='utf-8') as f:
-                md_content = f.read()
-            md_content = f"[{result.url}]({result.url})\n\n" + md_content
-            save_to_file(md_content, md_file_path)
+            if not md_file_path.exists():
+                pandoc_html2md(html_file_path, md_file_path)
+                # Prepend the url link to the first line of the md file just written
+                with open(md_file_path, 'r', encoding='utf-8') as f:
+                    md_content = f.read()
+                md_content = f"[{result.url}]({result.url})\n\n" + md_content
+                save_to_file(md_content, md_file_path)
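+                # The existence guard also keeps the URL header from being
+                # prepended a second time when old results are re-processed.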

         return result
@@ -186,11 +193,21 @@ def conver_all_md_to_docx(dir_path:Path):
         print(f"Conversion failed! {all_md_save_path}")
         print(e)

+async def all_search_key_main():
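+    # Full pipeline for every search-keyword directory: crawl the collected
+    # links, unpack crawl results to HTML/Markdown, then convert the markdown
+    # to DOCX.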
+    # Get every subdirectory under GOOGLE_SEARCH_DIR
+    search_key_dirs = [f for f in GOOGLE_SEARCH_DIR.iterdir() if f.is_dir()]
+    # print(search_key_dirs)
+    for dir_path in search_key_dirs:
+        await multi_crawl_one_dir_html(dir_path)
+        await load_result(dir_path)
+        conver_all_md_to_docx(dir_path)
+
 async def main():
     dir_path = Path(r"K:\code\upwork\zhang_crawl_bio\output\Acalypha malabarica essential oil")
     # await multi_crawl_one_dir_html(dir_path)
     # await load_result(dir_path)
-    conver_all_md_to_docx(dir_path)
+    # conver_all_md_to_docx(dir_path)
+    await all_search_key_main()


 if __name__ == "__main__":