# get_article_info.py (872 B)
  1. import asyncio
  2. from crawl4ai import *
  3. from pathlib import Path
  4. import json
# Base directory for crawl output files (e.g. output/links.json).
# NOTE(review): not referenced anywhere in this script's visible code — confirm it is needed.
output_dir = Path("output")
  6. # 读取output\links.json "external" 键值,得到列表
  7. def read_links_from_json(file_path):
  8. with open(file_path, 'r', encoding='utf-8') as file:
  9. data = json.load(file,)
  10. links = data.get("external", [])
  11. return links
  12. # 筛选列表中键 "base_domain" 的值不含有 google
  13. def filter_links(links):
  14. filtered_links = [link for link in links if "google" not in link["base_domain"]]
  15. return filtered_links
  16. def get_rearch_result_links(file_path):
  17. links = read_links_from_json(file_path)
  18. filtered_links = filter_links(links)
  19. return filtered_links
  20. def main():
  21. filtered_links = get_rearch_result_links("output/links.json")
  22. print(filtered_links)
  23. print(len(filtered_links))
# Standard script entry point: run main() only when executed directly.
if __name__ == "__main__":
    main()