import csv
import logging
import sys
from typing import List

import chardet

logger = logging.getLogger(__name__)


def detect_encoding(file_path: str, sample_size: int = 100000) -> str:
    """Detect the text encoding of a file.

    Args:
        file_path: Path of the file to inspect.
        sample_size: Number of leading bytes read and passed to chardet.

    Returns:
        The encoding name reported by chardet, or ``'utf-8-sig'`` when the
        result carries no encoding or its confidence is below 0.7.

    Note:
        Any error while reading or detecting is logged and terminates the
        process through ``sys.exit(1)``.
    """
    try:
        with open(file_path, 'rb') as f:
            # Only a leading sample is read; detection does not need the whole file.
            raw_data = f.read(sample_size)

        result = chardet.detect(raw_data)
        encoding = result['encoding']
        # Treat a missing confidence as 0.0 so the format spec below cannot
        # fail and the value falls into the low-confidence branch.
        confidence = result['confidence'] or 0.0
        logger.info(f"Detected encoding: {encoding} (confidence: {confidence:.2%})")

        # With no usable guess, or a weak one, fall back to UTF-8 (BOM-tolerant).
        if not encoding or confidence < 0.7:
            logger.warning(f"Low confidence in detected encoding {encoding}, trying common encodings")
            return 'utf-8-sig'

        return encoding
    except Exception as e:
        logger.error(f"Error detecting encoding for {file_path}: {e}")
        sys.exit(1)


def read_csv(file_path: str, to_encode: str = 'utf-8') -> List[List[str]]:
    """Read a CSV file, trying several encodings until one decodes cleanly.

    The detected encoding is tried first, followed by a fixed list of common
    encodings. Decoding is strict, so a wrong guess raises and the next
    candidate is tried instead of silently producing replacement characters.

    Args:
        file_path: Path of the CSV file.
        to_encode: Target encoding name, ``'utf-8'`` by default. It is only
            compared against the source encoding to decide whether the
            normalisation pass runs; the returned cells are ``str`` objects.

    Returns:
        The CSV content as a list of rows, each row a list of cell strings.

    Note:
        If every candidate fails, the error is logged and the process is
        terminated through ``sys.exit(1)``.
    """
    # Common encodings, in priority order.
    encodings_to_try = ['utf-8-sig', 'gb18030', 'shift_jis', 'euc-jp', 'iso-8859-1', 'latin1']

    # The detected encoding gets the first attempt.
    detected_encoding = detect_encoding(file_path)
    if detected_encoding:
        encodings_to_try.insert(0, detected_encoding)

    # Drop repeated names (case-insensitively) so no encoding is tried twice.
    seen = set()
    candidates = []
    for name in encodings_to_try:
        key = name.lower()
        if key not in seen:
            seen.add(key)
            candidates.append(name)

    for encoding in candidates:
        try:
            logger.info(f"Trying encoding: {encoding}")
            # errors='strict' lets a wrong encoding raise UnicodeDecodeError so
            # the fallback list is actually used; newline='' is what the csv
            # module requires to handle newlines embedded in quoted fields.
            with open(file_path, 'r', encoding=encoding, errors='strict', newline='') as f:
                data = list(csv.reader(f))

            # Source and target encodings differ: run the normalisation pass.
            if encoding.lower() != to_encode.lower():
                logger.info(f"Converting from {encoding} to {to_encode}")
                # Cells are already Unicode str; the UTF-8 round trip only
                # replaces code points that cannot be encoded.
                data = [
                    [cell.encode('utf-8', errors='replace').decode('utf-8') for cell in row]
                    for row in data
                ]

            logger.info(f"Successfully read file with encoding: {encoding}")
            return data
        except UnicodeDecodeError as e:
            logger.warning(f"Failed to decode with {encoding}: {e}")
            continue
        except Exception as e:
            # E.g. unknown codec name, I/O error, or csv.Error: try the next one.
            logger.error(f"Error with encoding {encoding}: {e}")
            continue

    logger.error("Failed to read file with all attempted encodings")
    sys.exit(1)