import codecs
import csv
import logging
import os
import sys
from typing import List

import chardet

from mylib.logging_config import setup_logging

setup_logging()
logger = logging.getLogger(__name__)
logger.info(f"{__file__}")

# Encoding returned when detection is inconclusive or fails outright.
_FALLBACK_ENCODING = 'shift_jis'
# Detections with a confidence below this value are not trusted.
_MIN_CONFIDENCE = 0.7
# Candidates whose decoded text must contain kana before being accepted.
_JAPANESE_ENCODING_PREFIXES = ('shift_jis', 'cp932', 'euc-jp')
# Number of leading rows that are logged and inspected for kana.
_PREVIEW_ROWS = 5


def detect_encoding(file_path: str, sample_size: int = 100000) -> str:
    """Guess the text encoding of a file using chardet.

    Args:
        file_path: Path of the file to inspect.
        sample_size: Number of leading bytes fed to the detector.

    Returns:
        The detected encoding name. ``'utf-8-sig'`` is returned when the
        file does not exist, and ``'shift_jis'`` when the detection is
        missing, has a confidence below 0.7, or raises an error.
    """
    try:
        if not os.path.exists(file_path):
            logger.error(f"File does not exist: {file_path}")
            return 'utf-8-sig'

        with open(file_path, 'rb') as f:
            # Only a leading sample is read; the detector does not need
            # the whole file.
            raw_data = f.read(sample_size)

        result = chardet.detect(raw_data)
        encoding = result.get('encoding')
        # Coerce a missing or None confidence to 0.0 so that neither the
        # percent formatting nor the comparison below can raise.
        confidence = result.get('confidence') or 0.0

        logger.info(f"Detected encoding: {encoding} (confidence: {confidence:.2%})")

        if not encoding or confidence < _MIN_CONFIDENCE:
            logger.warning(f"Low confidence in detected encoding {encoding}, trying common encodings")
            return _FALLBACK_ENCODING  # prefer a Japanese encoding

        return encoding
    except Exception as e:
        # Detection is best-effort: never propagate, fall back instead.
        logger.error(f"Error detecting encoding for {file_path}: {e}")
        return _FALLBACK_ENCODING


def _candidate_encodings(detected_encoding: str) -> List[str]:
    """Return the encodings to try, detected one first, one entry per codec."""
    candidates = [
        'cp936',
        'shift_jis',    # common Japanese encoding
        'cp932',        # Windows Japanese encoding
        'euc-jp',       # Japanese EUC encoding
        'iso-2022-jp',  # Japanese JIS encoding
        'cp936',        # Simplified Chinese Windows encoding
        'utf-8-sig',    # UTF-8 with BOM
        'gb18030',      # Chinese encoding
        'big5',         # Traditional Chinese
        'iso-8859-1',
        'latin1',
    ]
    if detected_encoding:
        candidates.insert(0, detected_encoding)

    seen = set()
    unique = []
    for name in candidates:
        try:
            # Aliases such as 'latin1' / 'iso-8859-1' resolve to the same
            # codec; retrying them would only repeat the same outcome.
            codec_name = codecs.lookup(name).name
        except LookupError:
            logger.warning(f"Skipping unknown encoding: {name}")
            continue
        if codec_name in seen:
            continue
        seen.add(codec_name)
        unique.append(name)
    return unique


def _contains_kana(rows: List[List[str]]) -> bool:
    """Return True if any cell in *rows* holds a hiragana/katakana character."""
    return any(
        '\u3040' <= char <= '\u30ff'
        for row in rows
        for cell in row
        for char in cell
    )


def _write_utf8_copy(rows: List[List[str]], output_file_path: str) -> bool:
    """Write *rows* as a UTF-8 CSV file; return True on success."""
    try:
        with open(output_file_path, 'w', encoding='utf-8', newline='') as f_out:
            csv.writer(f_out).writerows(rows)
    except (OSError, csv.Error, UnicodeError) as e:
        logger.error(f"Failed to write UTF-8 file {output_file_path}: {e}")
        return False
    logger.info(f"File saved as UTF-8: {output_file_path}")
    return True


def read_csv(file_path: str) -> List[List[str]]:
    """Read a CSV file of unknown encoding and save a UTF-8 copy of it.

    The detected encoding is tried first, followed by a fixed list of
    common Japanese, Chinese and Western encodings. The first candidate
    that decodes the whole file without error is used. Candidates named
    ``shift_jis*``, ``cp932*`` or ``euc-jp*`` are additionally required to
    produce at least one kana character in the first five rows. On
    success the rows are written to ``<file_path>.utf8.csv``.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The rows of the file as a list of lists of strings, or an empty
        list if the file cannot be read, no candidate encoding is
        accepted, or the UTF-8 copy cannot be written.
    """
    detected_encoding = detect_encoding(file_path)

    for encoding in _candidate_encodings(detected_encoding):
        logger.info(f"Trying encoding: {encoding}")
        try:
            # errors='strict' makes a wrong encoding actually fail so the
            # next candidate gets a chance; newline='' lets the csv module
            # handle line endings inside quoted fields itself.
            with open(file_path, 'r', encoding=encoding, errors='strict', newline='') as f:
                data = list(csv.reader(f))
        except UnicodeDecodeError as e:
            logger.warning(f"Failed to decode with {encoding}: {e}")
            continue
        except OSError as e:
            # Missing/unreadable file: no other encoding can succeed.
            logger.error(f"Cannot read file {file_path}: {e}")
            return []
        except Exception as e:
            logger.error(f"Error with encoding {encoding}: {e}")
            continue

        logger.info(f"Successfully read file with encoding: {encoding}")

        preview = data[:_PREVIEW_ROWS]
        for row in preview:
            logger.debug(f"Row: {row}")

        # NOTE(review): this prefix match is case-sensitive, so a candidate
        # spelled in another case (e.g. an upper-case name coming from the
        # detector) skips the kana heuristic. Kept as-is so that such
        # candidates are not newly rejected.
        if encoding.startswith(_JAPANESE_ENCODING_PREFIXES) and not _contains_kana(preview):
            logger.warning(f"Japanese characters not detected with {encoding}, trying next encoding")
            continue

        # A write failure does not depend on the input encoding, so it is
        # reported once instead of re-reading the file with every candidate.
        if not _write_utf8_copy(data, file_path + '.utf8.csv'):
            return []
        return data

    logger.error("Failed to read file with all attempted encodings")
    return []  # report failure with an empty list instead of exiting


def main():
    """Convert one CSV file with DEBUG logging enabled.

    The path is taken from the first command-line argument when given,
    otherwise the built-in default path is used. Logging is already
    configured by the module-level ``setup_logging()`` call, so it is not
    configured a second time here.
    """
    logger.setLevel(logging.DEBUG)
    default_path = "/home/mrh/code/excel_tool/temp/测试.csv"
    file_path = sys.argv[1] if len(sys.argv) > 1 else default_path
    read_csv(file_path)


if __name__ == "__main__":
    main()