read_encoding_cvs.py 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169
  1. import csv
  2. import chardet
  3. import logging
  4. from typing import List, Optional
  5. import os
  6. from mylib.logging_config import setup_logging
  7. setup_logging()
  8. logger = logging.getLogger("excel_tool" + '.' + __name__)
  9. def detect_encoding(file_path: str, sample_size: int = 100000) -> str:
  10. """检测文件编码
  11. Args:
  12. file_path: 文件路径
  13. sample_size: 用于检测的样本大小
  14. Returns:
  15. 检测到的编码字符串
  16. """
  17. try:
  18. if not os.path.exists(file_path):
  19. logger.error(f"File does not exist: {file_path}")
  20. return 'utf-8-sig'
  21. with open(file_path, 'rb') as f:
  22. # 读取样本数据用于检测
  23. raw_data = f.read(sample_size)
  24. result = chardet.detect(raw_data)
  25. # 获取置信度最高的编码
  26. encoding = result.get('encoding')
  27. confidence = result.get('confidence', 0)
  28. logger.info(f"Detected encoding: {encoding} (confidence: {confidence:.2%})")
  29. # 如果置信度低于阈值或编码为 None,尝试其他常见编码
  30. if not encoding or confidence < 0.7:
  31. logger.warning(f"Low confidence in detected encoding {encoding}, trying common encodings")
  32. return 'shift_jis' # 优先返回日文编码
  33. return encoding
  34. except Exception as e:
  35. logger.error(f"Error detecting encoding for {file_path}: {e}")
  36. return 'shift_jis' # 返回日文编码作为默认值
  37. def save_csv(data: List[List[str]], file_path: str) -> None:
  38. """将CSV数据保存为UTF-8编码文件
  39. Args:
  40. data: 要保存的CSV数据
  41. file_path: 目标文件路径
  42. """
  43. try:
  44. with open(file_path, 'w', encoding='utf-8', newline='') as f:
  45. writer = csv.writer(f)
  46. writer.writerows(data)
  47. logger.info(f"File saved as UTF-8: {file_path}")
  48. except Exception as e:
  49. logger.error(f"Error saving file {file_path}: {e}")
  50. raise
  51. def read_with_cp936(file_path: str) -> List[List[str]]:
  52. """使用cp936编码读取CSV文件
  53. Args:
  54. file_path: 文件路径
  55. Returns:
  56. 包含CSV数据的二维列表
  57. """
  58. try:
  59. logger.info(f"Reading file with cp936 encoding: {file_path}")
  60. with open(file_path, 'r', encoding='cp936', errors='replace') as f:
  61. reader = csv.reader(f)
  62. data = list(reader)
  63. # 保存转换后的文件
  64. output_file_path = file_path + '.utf8.csv'
  65. save_csv(data, output_file_path)
  66. return data
  67. except Exception as e:
  68. logger.error(f"Error reading file with cp936 encoding: {e}")
  69. return []
  70. def read_csv(file_path: str, specified_encoding: Optional[str] = None) -> List[List[str]]:
  71. """读取CSV文件并转换为utf-8编码
  72. Args:
  73. file_path: 文件路径
  74. specified_encoding: 用户指定的编码方式
  75. Returns:
  76. 包含CSV数据的二维列表
  77. """
  78. # 如果指定了cp936编码,直接使用专用函数
  79. if specified_encoding == 'cp936':
  80. return read_with_cp936(file_path)
  81. # 常见编码列表,优先尝试日文编码
  82. encodings_to_try = [
  83. 'shift_jis', # 日文常用编码
  84. 'cp932', # Windows日文编码
  85. 'euc-jp', # 日文EUC编码
  86. 'iso-2022-jp',# 日文JIS编码
  87. 'utf-8-sig', # UTF-8 with BOM
  88. 'gb18030', # 中文编码
  89. 'big5', # 繁体中文
  90. 'iso-8859-1',
  91. 'latin1'
  92. ]
  93. # 如果用户指定了编码,优先使用
  94. if specified_encoding:
  95. encodings_to_try.insert(0, specified_encoding)
  96. else:
  97. # 先尝试检测编码
  98. detected_encoding = detect_encoding(file_path)
  99. if detected_encoding:
  100. encodings_to_try.insert(0, detected_encoding)
  101. # 尝试用不同编码读取文件
  102. for encoding in encodings_to_try:
  103. try:
  104. logger.info(f"Trying encoding: {encoding}")
  105. with open(file_path, 'r', encoding=encoding, errors='replace') as f:
  106. reader = csv.reader(f)
  107. data = list(reader)
  108. logger.info(f"Successfully read file with encoding: {encoding}")
  109. # 打印前几行内容,使用DEBUG级别
  110. for row in data[:5]:
  111. logger.debug(f"Row: {row}")
  112. # 检查日文字符是否正确解码
  113. if encoding.startswith(('shift_jis', 'cp932', 'euc-jp')):
  114. japanese_chars = ''.join([cell for row in data[:5] for cell in row])
  115. if not any('\u3040' <= char <= '\u30ff' for char in japanese_chars): # 检查是否包含日文字符
  116. logger.warning(f"Japanese characters not detected with {encoding}, trying next encoding")
  117. continue
  118. return data
  119. except UnicodeDecodeError as e:
  120. logger.warning(f"Failed to decode with {encoding}: {e}")
  121. continue
  122. except Exception as e:
  123. logger.error(f"Error with encoding {encoding}: {e}")
  124. continue
  125. logger.error("Failed to read file with all attempted encodings")
  126. return [] # 返回空列表而不是退出
  127. def main():
  128. file_path = "/home/mrh/code/excel_tool/temp/测试.csv"
  129. data =read_csv(file_path, 'cp936')
  130. # 保存转换后的文件
  131. # output_file_path = file_path + '.utf8.csv'
  132. # save_csv(data, output_file_path)
  133. logger.info(data)
  134. if __name__ == "__main__":
  135. main()