crawl_lib_func.py

import re


def filter_links(links):
    '''
    Keep only external links whose base_domain does not contain "google".

    input: {
        'internal': [{}],
        'external': [
            {
                "href": "xx",
                "text": "xxm",
                "title": "",
                "base_domain": "benlcollins.com"
            }
        ],
    }
    '''
    external_links = links["external"]
    filtered_links = [link for link in external_links if "google" not in link["base_domain"]]
    return filtered_links


def is_valid_domain(domain):
    # Regular expression for a valid domain name: one or more labels followed by a TLD
    pattern = r'^([a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}$'
    return re.match(pattern, domain) is not None


def filter_local_domain(links):
    '''
    Keep only links whose base_domain looks like a syntactically valid domain.

    input: [{
        "href": "xx",
        "text": "xxm",
        "title": "",
        "base_domain": "benlcollins.com"
    }]
    '''
    filtered_links = []
    for link in links:
        if 'base_domain' in link and is_valid_domain(link['base_domain']):
            filtered_links.append(link)
    return filtered_links
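

if __name__ == '__main__':
    # Minimal usage sketch, not part of the original file: the sample data
    # below is illustrative only and assumes the link structure shown in the
    # docstrings above.
    sample_links = {
        'internal': [{}],
        'external': [
            {
                "href": "https://benlcollins.com/",
                "text": "xxm",
                "title": "",
                "base_domain": "benlcollins.com"
            },
            {
                "href": "https://www.google.com/",
                "text": "Google",
                "title": "",
                "base_domain": "google.com"
            },
            {
                "href": "xx",
                "text": "broken link",
                "title": "",
                "base_domain": "not a domain"
            },
        ],
    }

    # Drop Google-owned domains, then drop entries whose base_domain is not a
    # syntactically valid domain name.
    external = filter_links(sample_links)
    cleaned = filter_local_domain(external)
    print(cleaned)  # expected: only the benlcollins.com entry remains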