read_encoding_cvs.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107
  1. import csv
  2. import chardet
  3. import logging
  4. from typing import List
  5. import os
  6. from mylib.logging_config import setup_logging
  # Configure logging first so the module logger created below picks up
  # whatever setup_logging() installs.
  setup_logging()
  logger = logging.getLogger(__name__)
  # Record this module's file path once, at import time.
  logger.info(__file__)
  def detect_encoding(file_path: str, sample_size: int = 100000) -> str:
      """Guess a file's text encoding from a sample of its leading bytes.

      Args:
          file_path: Path of the file to inspect.
          sample_size: Maximum number of bytes read from the start of the
              file and passed to ``chardet.detect``.

      Returns:
          The encoding name reported by chardet. ``'utf-8-sig'`` is returned
          instead when the file does not exist, when chardet reports no
          encoding, when its confidence is below 0.7, or when any exception
          is raised during detection.
      """
      fallback = 'utf-8-sig'
      try:
          if not os.path.exists(file_path):
              logger.error(f"File does not exist: {file_path}")
              return fallback

          # Only a leading sample is read; detection does not need the whole file.
          with open(file_path, 'rb') as handle:
              sample = handle.read(sample_size)

          guess = chardet.detect(sample)
          encoding = guess.get('encoding')
          confidence = guess.get('confidence', 0)
          logger.info(f"Detected encoding: {encoding} (confidence: {confidence:.2%})")

          # Accept the guess only when chardet named an encoding and is
          # confident enough about it.
          if encoding and not confidence < 0.7:
              return encoding

          logger.warning(f"Low confidence in detected encoding {encoding}, trying common encodings")
          return fallback
      except Exception as exc:
          # Best effort: report the problem and hand back the default
          # encoding rather than aborting the caller.
          logger.error(f"Error detecting encoding for {file_path}: {exc}")
          return fallback
  def read_csv(file_path: str) -> List[List[str]]:
      """Read a CSV file of unknown encoding and save a UTF-8 copy beside it.

      Candidate encodings are tried in order: the one returned by
      ``detect_encoding`` first, then a fixed list that prefers Japanese
      encodings. Decoding is strict, so a wrong candidate raises
      ``UnicodeDecodeError`` and the next candidate is tried, instead of
      silently producing U+FFFD replacement characters with the first one.

      On success the rows are also written, UTF-8 encoded, to
      ``file_path + '.utf8.csv'``. A failure to write that copy is logged
      but does not discard the rows that were read.

      Args:
          file_path: Path of the CSV file to read.

      Returns:
          The CSV rows as a list of lists of strings, or an empty list when
          the file cannot be opened or no candidate encoding could read it.
      """
      import codecs  # local import: only used to canonicalise encoding names

      # Common encodings in priority order; Japanese encodings come first.
      encodings_to_try = ['shift_jis', 'euc-jp', 'utf-8-sig', 'gb18030', 'iso-8859-1', 'latin1']

      # The detected encoding, when there is one, gets the first attempt.
      detected_encoding = detect_encoding(file_path)
      if detected_encoding:
          encodings_to_try.insert(0, detected_encoding)

      seen_codecs = set()
      for encoding in encodings_to_try:
          # Resolve aliases (e.g. 'iso-8859-1' and 'latin1') to one codec so
          # the same decoder is not tried twice, and skip names Python does
          # not know instead of failing inside open().
          try:
              codec_name = codecs.lookup(encoding).name
          except LookupError:
              logger.warning(f"Skipping unknown encoding: {encoding}")
              continue
          if codec_name in seen_codecs:
              continue
          seen_codecs.add(codec_name)

          try:
              logger.info(f"Trying encoding: {encoding}")
              # newline='' lets the csv module handle line endings itself,
              # which keeps newlines embedded in quoted fields intact.
              with open(file_path, 'r', encoding=encoding, newline='') as f:
                  data = list(csv.reader(f))
          except UnicodeDecodeError as e:
              logger.warning(f"Failed to decode with {encoding}: {e}")
              continue
          except OSError as e:
              # Missing or unreadable file: another encoding cannot help.
              logger.error(f"Error with encoding {encoding}: {e}")
              break
          except Exception as e:
              # Best effort: e.g. csv.Error from text decoded with the wrong
              # codec; move on to the next candidate.
              logger.error(f"Error with encoding {encoding}: {e}")
              continue

          logger.info(f"Successfully read file with encoding: {encoding}")
          # Show the first few rows at DEBUG level.
          for row in data[:5]:
              logger.debug(f"Row: {row}")

          # Write the rows to a new UTF-8 encoded file next to the input.
          output_file_path = file_path + '.utf8.csv'
          try:
              with open(output_file_path, 'w', encoding='utf-8', newline='') as f_out:
                  csv.writer(f_out).writerows(data)
          except OSError as e:
              logger.error(f"Failed to save UTF-8 copy {output_file_path}: {e}")
          else:
              logger.info(f"File saved as UTF-8: {output_file_path}")
          return data

      logger.error("Failed to read file with all attempted encodings")
      return []  # return an empty list rather than exiting
  def main(file_path: str = "/home/mrh/code/excel_tool/temp/测试.csv") -> None:
      """Run ``read_csv`` on one file with DEBUG logging for this module.

      Args:
          file_path: CSV file to read and convert. Defaults to the sample
              path used during development, so ``main()`` without arguments
              processes the same file as before.
      """
      # Use the setup_logging already imported at module level; importing it
      # again here through a different path ('logging_config') could fail or
      # load a second copy of that module.
      setup_logging()
      logging.getLogger(__name__).setLevel(logging.DEBUG)
      read_csv(file_path)


  if __name__ == "__main__":
      main()