| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192 |
# Operating-system markers a user agent must contain to be kept.
supported_systems = [
    "Windows",
    "Linux",
    "Mac",
]

# Browser marker -> assumed minimum supported major version.
supported_browsers = dict(
    Chrome=91,   # assumed minimum supported Chrome version
    Firefox=89,  # assumed minimum supported Firefox version
    Safari=14,   # assumed minimum supported Safari version
    Edg=91,      # assumed minimum supported Edge version ("Edg" is the UA token)
)

# Input and output file paths.
input_file = "K:/code/upwork/zhang_crawl_bio/mylib/user_agents.txt"
output_file = "K:/code/upwork/zhang_crawl_bio/mylib/filtered_user_agents.txt"
# Substring that immediately precedes the major version for each browser key.
# Safari reports its version after "Version/", not after "Safari/".
_VERSION_MARKERS = {
    "Chrome": "Chrome/",
    "Firefox": "Firefox/",
    "Safari": "Version/",
    "Edg": "Edg/",
}

# Only ASCII digits are accepted: str.isdigit() also matches characters such
# as superscript digits that int() cannot parse.
_ASCII_DIGITS = frozenset("0123456789")


def extract_version(ua, browser):
    """Extract the major browser version from a user-agent string.

    The version is the run of ASCII digits that directly follows the first
    occurrence of the browser's marker ("Chrome/", "Firefox/", "Version/"
    for Safari, "Edg/" for Edge).

    Args:
        ua: The user-agent string to inspect.
        browser: One of "Chrome", "Firefox", "Safari" or "Edg".

    Returns:
        The major version as an int, or None when the browser key is not
        recognised, the marker is absent, no digits follow the marker, or
        ``ua`` is not a string (in which case an error line is printed).
    """
    marker = _VERSION_MARKERS.get(browser)
    if marker is None:
        return None

    try:
        start = ua.find(marker)
        if start == -1:
            return None
        start += len(marker)

        # Advance to the first character that is not an ASCII digit.
        end = start
        while end < len(ua) and ua[end] in _ASCII_DIGITS:
            end += 1
        digits = ua[start:end]
    except (AttributeError, TypeError) as e:
        # Only a non-string ``ua`` can get here; stay best-effort and report it.
        print(f"Error extracting version from UA: {ua}, Error: {e}")
        return None

    return int(digits) if digits else None
# Read the candidate user agents and keep only those that mention a supported
# system and a supported browser at or above its minimum version.
# The encoding is given explicitly so the result does not depend on the
# platform's locale default.
with open(input_file, "r", encoding="utf-8") as infile, open(
    output_file, "w", encoding="utf-8"
) as outfile:
    for line in infile:
        line = line.strip()

        # Skip lines that mention none of the supported systems.
        if not any(system in line for system in supported_systems):
            continue

        for browser, min_version in supported_browsers.items():
            if browser not in line:
                continue
            version = extract_version(line, browser)
            if version is not None and version >= min_version:
                outfile.write(line + "\n")
                # Stop at the first qualifying browser so each line is
                # written at most once.
                break

print(f"过滤后的用户代理已保存到 {output_file}")
|