| 1234567891011121314151617181920212223242526272829 |
- import asyncio
- from crawl4ai import *
- from pathlib import Path
- import json
# Base output directory for crawl artifacts.
# NOTE(review): main() below hard-codes "output/links.json" instead of using
# this constant — consider unifying.
output_dir = Path("output")
# Load the "external" entry from output/links.json and return it as a list.
def read_links_from_json(file_path):
    """Load *file_path* as JSON and return its "external" list.

    Args:
        file_path: path to a UTF-8 JSON file (e.g. output/links.json).

    Returns:
        The value stored under the top-level "external" key, or an empty
        list when that key is absent.
    """
    with open(file_path, 'r', encoding='utf-8') as fh:
        payload = json.load(fh)
    return payload.get("external", [])
# Keep only the links whose "base_domain" value does not contain "google".
def filter_links(links):
    """Filter Google-owned entries out of *links*.

    Args:
        links: list of dicts, each expected to carry a "base_domain" key
            (as produced by the crawler's links.json output).

    Returns:
        A new list with every entry whose "base_domain" does not contain
        the substring "google". Entries lacking a "base_domain" key are
        kept (previously they raised KeyError).
    """
    # .get with a "" default makes a missing key non-fatal instead of crashing.
    return [link for link in links if "google" not in link.get("base_domain", "")]
def get_rearch_result_links(file_path):
    """Return the non-Google external links stored in *file_path*.

    Composes read_links_from_json and filter_links.

    NOTE(review): "rearch" looks like a typo for "search"; the name is
    kept unchanged for backward compatibility with existing callers.
    """
    return filter_links(read_links_from_json(file_path))
def main():
    """Print the filtered external links from output/links.json and their count."""
    # Consistency fix: build the path from the module-level output_dir
    # instead of repeating the "output" literal (same resulting path).
    filtered_links = get_rearch_result_links(output_dir / "links.json")
    print(filtered_links)
    print(len(filtered_links))
# Script entry point: run main() only when executed directly, not on import.
if __name__ == "__main__":
    main()
|