| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107 |
- import csv
- import chardet
- import logging
- from typing import List
- import os
- from mylib.logging_config import setup_logging
- setup_logging()
- logger = logging.getLogger(__name__)
- logger.info(f"{__file__}")
def detect_encoding(file_path: str, sample_size: int = 100000) -> str:
    """Guess the text encoding of a file from a leading byte sample.

    Args:
        file_path: Path of the file to inspect.
        sample_size: Maximum number of bytes read from the start of the
            file and passed to ``chardet.detect``.

    Returns:
        The encoding name reported by chardet. ``'utf-8-sig'`` is returned
        instead when the file does not exist, when chardet reports no
        encoding, when the reported confidence is below 0.7, or when any
        exception is raised during detection.
    """
    fallback = 'utf-8-sig'

    try:
        if not os.path.exists(file_path):
            logger.error(f"File does not exist: {file_path}")
            return fallback

        # Only a prefix of the file is sampled for detection.
        with open(file_path, 'rb') as stream:
            guess = chardet.detect(stream.read(sample_size))

        encoding = guess.get('encoding')
        confidence = guess.get('confidence', 0)
        logger.info(f"Detected encoding: {encoding} (confidence: {confidence:.2%})")

        # A missing encoding or a weak guess is not trusted.
        untrusted = (not encoding) or confidence < 0.7
        if not untrusted:
            return encoding

        logger.warning(f"Low confidence in detected encoding {encoding}, trying common encodings")
        return fallback

    except Exception as e:
        # Best effort: fall back to the default instead of aborting.
        logger.error(f"Error detecting encoding for {file_path}: {e}")
        return fallback
def _dedupe_encodings(names: List[str]) -> List[str]:
    """Return *names* in order, dropping unknown codecs and alias duplicates."""
    # Local import: keeps this block self-contained without touching the
    # file's import header.
    import codecs

    seen = set()
    unique = []
    for name in names:
        try:
            # Aliases such as 'latin1' and 'iso-8859-1' share one codec name.
            canonical = codecs.lookup(name).name
        except LookupError:
            logger.warning(f"Skipping unknown encoding: {name}")
            continue
        if canonical not in seen:
            seen.add(canonical)
            unique.append(name)
    return unique


def read_csv(file_path: str) -> List[List[str]]:
    """Read a CSV file and save a UTF-8 copy next to it.

    The encoding returned by ``detect_encoding`` is tried first, followed
    by a fixed list of common encodings (Japanese encodings first). Each
    candidate is decoded strictly, so a wrong guess fails and the next
    candidate is tried instead of silently producing replacement
    characters.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The parsed rows as a list of lists of strings. An empty list is
        returned when the file cannot be opened, when no candidate
        encoding decodes it, or when the UTF-8 copy
        (``file_path + '.utf8.csv'``) cannot be written.
    """
    # Common encodings in priority order, Japanese encodings first.
    encodings_to_try = ['shift_jis', 'euc-jp', 'utf-8-sig', 'gb18030', 'iso-8859-1', 'latin1']

    detected_encoding = detect_encoding(file_path)
    if detected_encoding:
        encodings_to_try.insert(0, detected_encoding)

    for encoding in _dedupe_encodings(encodings_to_try):
        logger.info(f"Trying encoding: {encoding}")
        try:
            # errors='strict' makes a wrong encoding raise so the fallback
            # chain actually runs; newline='' lets the csv module handle
            # line endings inside quoted fields itself.
            with open(file_path, 'r', encoding=encoding, errors='strict', newline='') as f:
                data = list(csv.reader(f))
        except (UnicodeError, csv.Error) as e:
            logger.warning(f"Failed to decode with {encoding}: {e}")
            continue
        except OSError as e:
            # A missing or unreadable file fails the same way for every
            # encoding, so stop instead of retrying.
            logger.error(f"Error with encoding {encoding}: {e}")
            return []

        logger.info(f"Successfully read file with encoding: {encoding}")
        break
    else:
        logger.error("Failed to read file with all attempted encodings")
        return []

    # Log the first few rows at DEBUG level.
    for row in data[:5]:
        logger.debug(f"Row: {row}")

    # Write the decoded rows to a new UTF-8 file.
    output_file_path = file_path + '.utf8.csv'
    try:
        with open(output_file_path, 'w', encoding='utf-8', newline='') as f_out:
            csv.writer(f_out).writerows(data)
    except OSError as e:
        # A failed write has always produced an empty result; keep that
        # contract, but do not re-read the input for every encoding.
        logger.error(f"Failed to write UTF-8 copy {output_file_path}: {e}")
        return []

    logger.info(f"File saved as UTF-8: {output_file_path}")
    return data
def main(file_path: str = "/home/mrh/code/excel_tool/temp/测试.csv") -> None:
    """Convert one CSV file to UTF-8 with DEBUG logging enabled.

    Args:
        file_path: Path of the CSV file passed to ``read_csv``. Defaults to
            the developer's local test file.
    """
    # NOTE(review): the file header imports setup_logging from
    # `mylib.logging_config`, while this local import uses `logging_config`.
    # Confirm which module path is intended.
    from logging_config import setup_logging
    setup_logging()
    logging.getLogger(__name__).setLevel(logging.DEBUG)
    read_csv(file_path)


if __name__ == "__main__":
    main()
|