import asyncio
import base64
from crawl4ai import *
from pathlib import Path
import json
import pickle
from crawl4ai.async_configs import BrowserConfig
# Browser configuration; the proxy, random user agent, and persistent-context
# options are left commented out as optional switches.
browser_config = BrowserConfig(
    # proxy="http://localhost:1881",
    headless=False,
    # user_agent_mode="random",
    # use_managed_browser=True,
    verbose=True,
    # use_persistent_context=True,
    # user_data_dir=OUTPUT_DIR / "user_data_dir",
    viewport_height=1080,
    viewport_width=1920,
)
# Create the output directory if it does not exist.
def ensure_output_dir(output_dir: Path):
    output_dir.mkdir(parents=True, exist_ok=True)
# Save content to a text file, serializing non-string content as JSON when possible.
def save_to_file(content, filename: Path):
    ensure_output_dir(filename.parent)
    if not isinstance(content, str):
        try:
            # Prefer pretty-printed JSON for serializable objects.
            content = json.dumps(content, indent=4, ensure_ascii=False)
        except (TypeError, ValueError):
            # Fall back to the plain string representation.
            content = str(content)

    with open(filename, "w", encoding="utf-8") as file:
        file.write(content)
    return filename
# Pickle an arbitrary Python object to disk.
def save_to_pickle(obj, filename):
    with open(filename, "wb") as file:
        pickle.dump(obj, file)
    return filename
# Decode a base64 string (e.g. a screenshot) and write the raw bytes to a file.
def save_base64_to_file(base64_str, file_path):
    with open(file_path, "wb") as file:
        file.write(base64.b64decode(base64_str))
    return file_path
# Load a previously pickled object back from disk.
def load_from_pickle(filename):
    with open(filename, "rb") as file:
        return pickle.load(file)

# Persist a CrawlResult: pickle the whole object, then write one text file
# per non-dunder attribute for easy inspection.
def save_all_result(result: CrawlResult, output_dir):
    save_to_pickle(result, output_dir / "result.pickle")
    for attr in dir(result):
        if hasattr(result, attr) and not attr.startswith("__"):
            format_str = f"{attr}: {getattr(result, attr)}"
            save_to_file(format_str, output_dir / f"{attr}.txt")
# Replace spaces with plus signs, e.g. when building a search query URL.
def replace_space(search_key: str):
    return search_key.replace(" ", "+")
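
# --- Usage sketch (not part of the original snippet) ---
# A minimal example of how the helpers above could be tied together, assuming
# a recent crawl4ai API where AsyncWebCrawler accepts a BrowserConfig via
# `config` and `arun()` returns a CrawlResult. The URL and output directory
# are hypothetical placeholders.
async def main():
    output_dir = Path("output") / "example_run"
    ensure_output_dir(output_dir)
    search_url = "https://www.bing.com/search?q=" + replace_space("crawl4ai example")
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=search_url)
        # Pickle the full result and dump each attribute to its own text file.
        save_all_result(result, output_dir)
        # Screenshots (if enabled in the run config) come back base64-encoded,
        # so decode them before writing.
        if getattr(result, "screenshot", None):
            save_base64_to_file(result.screenshot, output_dir / "screenshot.png")

if __name__ == "__main__":
    asyncio.run(main())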