# get_suport_ua.py (3.4 KB)

  1. # 支持的系统和浏览器品牌
  2. supported_systems = ["Windows", "Linux", "Mac"]
  3. supported_browsers = {
  4. "Chrome": 91, # 假设 Chrome 的最低支持版本是 91
  5. "Firefox": 89, # 假设 Firefox 的最低支持版本是 89
  6. "Safari": 14, # 假设 Safari 的最低支持版本是 14
  7. "Edg": 91, # 假设 Edge 的最低支持版本是 91
  8. }
  9. # 输入和输出文件路径
  10. input_file = "K:/code/upwork/zhang_crawl_bio/mylib/user_agents.txt"
  11. output_file = "K:/code/upwork/zhang_crawl_bio/mylib/filtered_user_agents.txt"
  12. def extract_version(ua, browser):
  13. """
  14. 从用户代理字符串中提取浏览器版本号
  15. """
  16. try:
  17. if browser == "Chrome":
  18. # Chrome 的版本号通常在 "Chrome/" 后面
  19. start = ua.find("Chrome/")
  20. if start == -1:
  21. return None
  22. start += len("Chrome/")
  23. # 提取直到非数字字符为止
  24. version_str = ""
  25. for char in ua[start:]:
  26. if char.isdigit():
  27. version_str += char
  28. else:
  29. break
  30. return int(version_str) if version_str else None
  31. elif browser == "Firefox":
  32. # Firefox 的版本号通常在 "Firefox/" 后面
  33. start = ua.find("Firefox/")
  34. if start == -1:
  35. return None
  36. start += len("Firefox/")
  37. # 提取直到非数字字符为止
  38. version_str = ""
  39. for char in ua[start:]:
  40. if char.isdigit():
  41. version_str += char
  42. else:
  43. break
  44. return int(version_str) if version_str else None
  45. elif browser == "Safari":
  46. # Safari 的版本号通常在 "Version/" 后面
  47. start = ua.find("Version/")
  48. if start == -1:
  49. return None
  50. start += len("Version/")
  51. # 提取直到非数字字符为止
  52. version_str = ""
  53. for char in ua[start:]:
  54. if char.isdigit():
  55. version_str += char
  56. else:
  57. break
  58. return int(version_str) if version_str else None
  59. elif browser == "Edg":
  60. # Edge 的版本号通常在 "Edg/" 后面
  61. start = ua.find("Edg/")
  62. if start == -1:
  63. return None
  64. start += len("Edg/")
  65. # 提取直到非数字字符为止
  66. version_str = ""
  67. for char in ua[start:]:
  68. if char.isdigit():
  69. version_str += char
  70. else:
  71. break
  72. return int(version_str) if version_str else None
  73. except Exception as e:
  74. print(f"Error extracting version from UA: {ua}, Error: {e}")
  75. return None
  76. # 读取并过滤用户代理
  77. with open(input_file, "r") as infile, open(output_file, "w") as outfile:
  78. for line in infile:
  79. line = line.strip()
  80. # 检查是否包含支持的系统和浏览器品牌
  81. if any(system in line for system in supported_systems):
  82. for browser, min_version in supported_browsers.items():
  83. if browser in line:
  84. version = extract_version(line, browser)
  85. if version is not None and version >= min_version:
  86. outfile.write(line + "\n")
  87. break
  88. print(f"过滤后的用户代理已保存到 {output_file}")