# read_encoding_cvs.py (3.6 KB)
import codecs
import csv
import logging
import os
from typing import List

import chardet
  6. logger = logging.getLogger(__name__)
  7. def detect_encoding(file_path: str, sample_size: int = 100000) -> str:
  8. """检测文件编码
  9. Args:
  10. file_path: 文件路径
  11. sample_size: 用于检测的样本大小
  12. Returns:
  13. 检测到的编码字符串
  14. """
  15. try:
  16. if not os.path.exists(file_path):
  17. logger.error(f"File does not exist: {file_path}")
  18. return 'utf-8-sig'
  19. with open(file_path, 'rb') as f:
  20. # 读取样本数据用于检测
  21. raw_data = f.read(sample_size)
  22. result = chardet.detect(raw_data)
  23. # 获取置信度最高的编码
  24. encoding = result.get('encoding')
  25. confidence = result.get('confidence', 0)
  26. logger.info(f"Detected encoding: {encoding} (confidence: {confidence:.2%})")
  27. # 如果置信度低于阈值或编码为 None,尝试其他常见编码
  28. if not encoding or confidence < 0.7:
  29. logger.warning(f"Low confidence in detected encoding {encoding}, trying common encodings")
  30. return 'utf-8-sig'
  31. return encoding
  32. except Exception as e:
  33. logger.error(f"Error detecting encoding for {file_path}: {e}")
  34. return 'utf-8-sig' # 返回默认编码而不是退出
  35. def read_csv(file_path: str) -> List[List[str]]:
  36. """读取CSV文件并转换为utf-8编码
  37. Args:
  38. file_path: 文件路径
  39. Returns:
  40. 包含CSV数据的二维列表
  41. """
  42. # 常见编码列表,按优先级排序
  43. encodings_to_try = ['utf-8-sig', 'gb18030', 'shift_jis', 'euc-jp', 'iso-8859-1', 'latin1']
  44. # 先尝试检测编码
  45. detected_encoding = detect_encoding(file_path)
  46. if detected_encoding:
  47. encodings_to_try.insert(0, detected_encoding)
  48. # 尝试用不同编码读取文件
  49. for encoding in encodings_to_try:
  50. try:
  51. logger.info(f"Trying encoding: {encoding}")
  52. with open(file_path, 'r', encoding=encoding, errors='replace') as f:
  53. reader = csv.reader(f)
  54. data = list(reader)
  55. logger.info(f"Successfully read file with encoding: {encoding}")
  56. # 打印前几行内容,使用DEBUG级别
  57. for row in data[:5]:
  58. logger.debug(f"Row: {row}")
  59. # 将数据写入新的utf-8编码文件
  60. output_file_path = file_path + '.utf8.csv'
  61. with open(output_file_path, 'w', encoding='utf-8', newline='') as f_out:
  62. writer = csv.writer(f_out)
  63. writer.writerows(data)
  64. logger.info(f"File saved as UTF-8: {output_file_path}")
  65. return data
  66. except UnicodeDecodeError as e:
  67. logger.warning(f"Failed to decode with {encoding}: {e}")
  68. continue
  69. except Exception as e:
  70. logger.error(f"Error with encoding {encoding}: {e}")
  71. continue
  72. logger.error("Failed to read file with all attempted encodings")
  73. return [] # 返回空列表而不是退出
  74. def main():
  75. from logging_config import setup_logging
  76. setup_logging()
  77. logging.getLogger(__name__).setLevel(logging.DEBUG)
  78. file_path = "/home/mrh/code/excel_tool/temp/测试.csv"
  79. read_csv(file_path)
  80. if __name__ == "__main__":
  81. main()