read_encoding_cvs.py 1.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849
  1. import csv
  2. import chardet
  3. import sys
  4. import logging
  5. logger = logging.getLogger(__name__)
  6. def detect_encoding(file_path):
  7. try:
  8. with open(file_path, 'rb') as f:
  9. raw_data = f.read()
  10. result = chardet.detect(raw_data)
  11. return result['encoding']
  12. except Exception as e:
  13. logger.error(f"Error detecting encoding for {file_path}: {e}")
  14. sys.exit(1)
  15. def read_csv(file_path, to_encode='utf-8'):
  16. encodings_to_try = ['utf-8-sig', 'gb18030', 'shift_jis', 'euc-jp']
  17. detected_encoding = detect_encoding(file_path)
  18. logger.info(f"Detected encoding: {detected_encoding}")
  19. if detected_encoding:
  20. encodings_to_try.insert(0, detected_encoding)
  21. for encoding in encodings_to_try:
  22. try:
  23. with open(file_path, 'r', encoding=encoding) as f:
  24. reader = csv.reader(f)
  25. data = list(reader)
  26. # Convert to UTF-8 if needed
  27. if encoding.lower() != to_encode.lower():
  28. data = [
  29. [cell.encode('utf-8').decode('utf-8') if isinstance(cell, str) else cell
  30. for cell in row]
  31. for row in data
  32. ]
  33. return data
  34. except UnicodeDecodeError:
  35. continue
  36. except Exception as e:
  37. logger.error(f"Error with encoding {encoding}: {e}")
  38. continue
  39. logger.error("Failed to read file with all attempted encodings")
  40. sys.exit(1)