# read_encoding_cvs.py (4.4 KB)

  1. import csv
  2. import chardet
  3. import logging
  4. from typing import List
  5. import os
  6. from mylib.logging_config import setup_logging
  7. setup_logging()
  8. logger = logging.getLogger(__name__)
  9. logger.info(f"{__file__}")
  10. def detect_encoding(file_path: str, sample_size: int = 100000) -> str:
  11. """检测文件编码
  12. Args:
  13. file_path: 文件路径
  14. sample_size: 用于检测的样本大小
  15. Returns:
  16. 检测到的编码字符串
  17. """
  18. try:
  19. if not os.path.exists(file_path):
  20. logger.error(f"File does not exist: {file_path}")
  21. return 'utf-8-sig'
  22. with open(file_path, 'rb') as f:
  23. # 读取样本数据用于检测
  24. raw_data = f.read(sample_size)
  25. result = chardet.detect(raw_data)
  26. # 获取置信度最高的编码
  27. encoding = result.get('encoding')
  28. confidence = result.get('confidence', 0)
  29. logger.info(f"Detected encoding: {encoding} (confidence: {confidence:.2%})")
  30. # 如果置信度低于阈值或编码为 None,尝试其他常见编码
  31. if not encoding or confidence < 0.7:
  32. logger.warning(f"Low confidence in detected encoding {encoding}, trying common encodings")
  33. return 'shift_jis' # 优先返回日文编码
  34. return encoding
  35. except Exception as e:
  36. logger.error(f"Error detecting encoding for {file_path}: {e}")
  37. return 'shift_jis' # 返回日文编码作为默认值
  38. def read_csv(file_path: str) -> List[List[str]]:
  39. """读取CSV文件并转换为utf-8编码
  40. Args:
  41. file_path: 文件路径
  42. Returns:
  43. 包含CSV数据的二维列表
  44. """
  45. # 常见编码列表,优先尝试日文编码
  46. encodings_to_try = [
  47. 'shift_jis', # 日文常用编码
  48. 'cp932', # Windows日文编码
  49. 'euc-jp', # 日文EUC编码
  50. 'iso-2022-jp',# 日文JIS编码
  51. 'utf-8-sig', # UTF-8 with BOM
  52. 'gb18030', # 中文编码
  53. 'big5', # 繁体中文
  54. 'iso-8859-1',
  55. 'latin1'
  56. ]
  57. # 先尝试检测编码
  58. detected_encoding = detect_encoding(file_path)
  59. if detected_encoding:
  60. encodings_to_try.insert(0, detected_encoding)
  61. # 尝试用不同编码读取文件
  62. for encoding in encodings_to_try:
  63. try:
  64. logger.info(f"Trying encoding: {encoding}")
  65. with open(file_path, 'r', encoding=encoding, errors='replace') as f:
  66. reader = csv.reader(f)
  67. data = list(reader)
  68. logger.info(f"Successfully read file with encoding: {encoding}")
  69. # 打印前几行内容,使用DEBUG级别
  70. for row in data[:5]:
  71. logger.debug(f"Row: {row}")
  72. # 检查日文字符是否正确解码
  73. if encoding.startswith(('shift_jis', 'cp932', 'euc-jp')):
  74. japanese_chars = ''.join([cell for row in data[:5] for cell in row])
  75. if not any('\u3040' <= char <= '\u30ff' for char in japanese_chars): # 检查是否包含日文字符
  76. logger.warning(f"Japanese characters not detected with {encoding}, trying next encoding")
  77. continue
  78. # 将数据写入新的utf-8编码文件
  79. output_file_path = file_path + '.utf8.csv'
  80. with open(output_file_path, 'w', encoding='utf-8', newline='') as f_out:
  81. writer = csv.writer(f_out)
  82. writer.writerows(data)
  83. logger.info(f"File saved as UTF-8: {output_file_path}")
  84. return data
  85. except UnicodeDecodeError as e:
  86. logger.warning(f"Failed to decode with {encoding}: {e}")
  87. continue
  88. except Exception as e:
  89. logger.error(f"Error with encoding {encoding}: {e}")
  90. continue
  91. logger.error("Failed to read file with all attempted encodings")
  92. return [] # 返回空列表而不是退出
  93. def main():
  94. from logging_config import setup_logging
  95. setup_logging()
  96. logging.getLogger(__name__).setLevel(logging.DEBUG)
  97. file_path = "/home/mrh/code/excel_tool/temp/测试.csv"
  98. read_csv(file_path)
  99. if __name__ == "__main__":
  100. main()