فهرست منبع

feat: improve CSV encoding detection and handling with type hints and error handling

mrh (aider) 1 سال پیش
والد
کامیت
ab0cae1b34
1 فایل تغییر یافته به همراه 53 افزوده شده و 10 حذف شده
  1. 53 10
      mylib/read_encoding_cvs.py

+ 53 - 10
mylib/read_encoding_cvs.py

@@ -2,44 +2,87 @@ import csv
 import chardet
 import sys
 import logging
+from typing import List
 
 logger = logging.getLogger(__name__)
 
 
-def detect_encoding(file_path):
+def detect_encoding(file_path: str, sample_size: int = 100000) -> str:
+    """检测文件编码
+    
+    Args:
+        file_path: 文件路径
+        sample_size: 用于检测的样本大小
+        
+    Returns:
+        检测到的编码字符串
+    """
     try:
         with open(file_path, 'rb') as f:
-            raw_data = f.read()
+            # 读取样本数据用于检测
+            raw_data = f.read(sample_size)
             result = chardet.detect(raw_data)
-            return result['encoding']
+            
+            # 获取置信度最高的编码
+            encoding = result['encoding']
+            confidence = result['confidence']
+            
+            logger.info(f"Detected encoding: {encoding} (confidence: {confidence:.2%})")
+            
+            # 如果置信度低于阈值,尝试其他常见编码
+            if confidence < 0.7:
+                logger.warning(f"Low confidence in detected encoding {encoding}, trying common encodings")
+                return 'utf-8-sig'
+                
+            return encoding
+            
     except Exception as e:
         logger.error(f"Error detecting encoding for {file_path}: {e}")
         sys.exit(1)
 
 
-def read_csv(file_path, to_encode='utf-8'):
-    encodings_to_try = ['utf-8-sig', 'gb18030', 'shift_jis', 'euc-jp']
-    detected_encoding = detect_encoding(file_path)
-    logger.info(f"Detected encoding: {detected_encoding}")
+def read_csv(file_path: str, to_encode: str = 'utf-8') -> List[List[str]]:
+    """读取CSV文件并转换为指定编码
     
+    Args:
+        file_path: 文件路径
+        to_encode: 目标编码,默认为utf-8
+        
+    Returns:
+        包含CSV数据的二维列表
+    """
+    # 常见编码列表,按优先级排序
+    encodings_to_try = ['utf-8-sig', 'gb18030', 'shift_jis', 'euc-jp', 'iso-8859-1']
+    
+    # 先尝试检测编码
+    detected_encoding = detect_encoding(file_path)
     if detected_encoding:
         encodings_to_try.insert(0, detected_encoding)
     
+    # 尝试用不同编码读取文件
     for encoding in encodings_to_try:
         try:
+            logger.info(f"Trying encoding: {encoding}")
+            
             with open(file_path, 'r', encoding=encoding) as f:
                 reader = csv.reader(f)
                 data = list(reader)
                 
-                # Convert to UTF-8 if needed
+                # 如果源编码与目标编码不同,进行转换
                 if encoding.lower() != to_encode.lower():
+                    logger.info(f"Converting from {encoding} to {to_encode}")
                     data = [
-                        [cell.encode('utf-8').decode('utf-8') if isinstance(cell, str) else cell 
+                        [cell.encode('utf-8', errors='replace').decode('utf-8') 
+                         if isinstance(cell, str) else cell 
                          for cell in row] 
                         for row in data
                     ]
+                    
+                logger.info(f"Successfully read file with encoding: {encoding}")
                 return data
-        except UnicodeDecodeError:
+                
+        except UnicodeDecodeError as e:
+            logger.warning(f"Failed to decode with {encoding}: {e}")
             continue
         except Exception as e:
             logger.error(f"Error with encoding {encoding}: {e}")