Переглянути джерело

feat: Set UTF-8 as default encoding for `read_csv` output

mrh (aider) 1 рік тому
батько
коміт
3bd414e118
1 змінений файл: 12 додано та 5 видалено
  1. 12 5
      mylib/read_encoding_cvs.py

+ 12 - 5
mylib/read_encoding_cvs.py

@@ -1,5 +1,4 @@
 import csv
-import csv
 import chardet
 import sys
 import logging
@@ -18,8 +17,7 @@ def detect_encoding(file_path):
         sys.exit(1)
 
 
-
-def read_csv(file_path, to_encode=''):
+def read_csv(file_path, to_encode='utf-8'):
     encodings_to_try = ['utf-8-sig', 'gb18030', 'shift_jis', 'euc-jp']
     detected_encoding = detect_encoding(file_path)
     logger.info(f"Detected encoding: {detected_encoding}")
@@ -31,7 +29,16 @@ def read_csv(file_path, to_encode=''):
         try:
             with open(file_path, 'r', encoding=encoding) as f:
                 reader = csv.reader(f)
-                return list(reader)
+                data = list(reader)
+                
+                # Convert to UTF-8 if needed
+                if encoding.lower() != to_encode.lower():
+                    data = [
+                        [cell.encode('utf-8').decode('utf-8') if isinstance(cell, str) else cell 
+                         for cell in row] 
+                        for row in data
+                    ]
+                return data
         except UnicodeDecodeError:
             continue
         except Exception as e:
@@ -39,4 +46,4 @@ def read_csv(file_path, to_encode=''):
             continue
     
     logger.error("Failed to read file with all attempted encodings")
-    sys.exit(1)
+    sys.exit(1)