import asyncio
import base64
import json
import pickle
from pathlib import Path

from crawl4ai import CrawlResult
from crawl4ai.async_configs import BrowserConfig

OUTPUT_DIR = Path("output").absolute()

# Browser configuration; the commented-out options show proxy / stealth alternatives.
browser_config = BrowserConfig(
    # proxy="http://localhost:1881",
    headless=False,
    # user_agent_mode="random",
    # use_managed_browser=True,
    verbose=True,
    # use_persistent_context=True,
    user_data_dir=str(OUTPUT_DIR / "user_data_dir"),
)


# Create the directory if it does not exist.
def ensure_output_dir(output_dir: Path):
    output_dir.mkdir(parents=True, exist_ok=True)


# Save content to a text file. Non-string content is serialized as JSON
# when possible, falling back to str().
def save_to_file(content, filename: Path):
    ensure_output_dir(filename.parent)
    if not isinstance(content, str):
        try:
            content = json.dumps(content, indent=4, ensure_ascii=False)
        except (TypeError, ValueError):
            content = str(content)
    with open(filename, "w", encoding="utf-8") as file:
        file.write(content)


def save_to_pickle(obj, filename):
    with open(filename, "wb") as file:
        pickle.dump(obj, file)
    return filename


def save_base64_to_file(base64_str, file_path):
    with open(file_path, "wb") as file:
        file.write(base64.b64decode(base64_str))
    return file_path


def load_from_pickle(filename):
    with open(filename, "rb") as file:
        return pickle.load(file)


# Persist the full CrawlResult as a pickle, then dump each public
# attribute to its own text file for inspection.
def save_all_result(result: CrawlResult, output_dir):
    ensure_output_dir(output_dir)
    save_to_pickle(result, output_dir / "result.pickle")
    for attr in dir(result):
        if attr.startswith("__"):
            continue
        format_str = f"{attr}: {getattr(result, attr)}"
        save_to_file(format_str, output_dir / f"{attr}.txt")


# Replace spaces with plus signs (e.g. for building a search query URL).
def replace_space(search_key: str):
    return search_key.replace(" ", "+")
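
# --- Usage sketch (assumption): a minimal async entry point showing how the
# helpers above fit together. AsyncWebCrawler and CrawlerRunConfig are part of
# crawl4ai's public API; the search key and URL below are placeholders, not
# anything prescribed by the original script.
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig


async def main():
    search_key = replace_space("crawl4ai quick start")  # -> "crawl4ai+quick+start"
    url = f"https://www.bing.com/search?q={search_key}"  # placeholder search URL

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=url, config=CrawlerRunConfig())
        # Writes result.pickle plus one .txt per attribute under output/result/.
        save_all_result(result, OUTPUT_DIR / "result")


if __name__ == "__main__":
    asyncio.run(main())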