1 jaar geleden · ab0cae1b34
--- a/mylib/read_encoding_cvs.py
+++ b/mylib/read_encoding_cvs.py
@@ -2,44 +2,87 @@ import csv
 
				 import chardet
			
 
				 import sys
			
 
				 import logging
			
 
				+from typing import List
			
 
				 
			
 
				 logger = logging.getLogger(__name__)
			
 
				 
			
 
				 
			
 
				-def detect_encoding(file_path):
			
 
				+def detect_encoding(file_path: str, sample_size: int = 100000) -> str:
			
 
				+    """检测文件编码
			
 
				+    
			
 
				+    Args:
			
 
				+        file_path: 文件路径
			
 
				+        sample_size: 用于检测的样本大小
			
 
				+        
			
 
				+    Returns:
			
 
				+        检测到的编码字符串
			
 
				+    """
			
 
				     try:
			
 
				         with open(file_path, 'rb') as f:
			
 
				-            raw_data = f.read()
			
 
				+            # 读取样本数据用于检测
			
 
				+            raw_data = f.read(sample_size)
			
 
				             result = chardet.detect(raw_data)
			
 
				-            return result['encoding']
			
 
				+            
			
 
				+            # 获取置信度最高的编码
			
 
				+            encoding = result['encoding']
			
 
				+            confidence = result['confidence']
			
 
				+            
			
 
				+            logger.info(f"Detected encoding: {encoding} (confidence: {confidence:.2%})")
			
 
				+            
			
 
				+            # 如果置信度低于阈值，尝试其他常见编码
			
 
				+            if confidence < 0.7:
			
 
				+                logger.warning(f"Low confidence in detected encoding {encoding}, trying common encodings")
			
 
				+                return 'utf-8-sig'
			
 
				+                
			
 
				+            return encoding
			
 
				+            
			
 
				     except Exception as e:
			
 
				         logger.error(f"Error detecting encoding for {file_path}: {e}")
			
 
				         sys.exit(1)
			
 
				 
			
 
				 
			
 
				-def read_csv(file_path, to_encode='utf-8'):
			
 
				-    encodings_to_try = ['utf-8-sig', 'gb18030', 'shift_jis', 'euc-jp']
			
 
				-    detected_encoding = detect_encoding(file_path)
			
 
				-    logger.info(f"Detected encoding: {detected_encoding}")
			
 
				+def read_csv(file_path: str, to_encode: str = 'utf-8') -> List[List[str]]:
			
 
				+    """读取CSV文件并转换为指定编码
			
 
				     
			
 
				+    Args:
			
 
				+        file_path: 文件路径
			
 
				+        to_encode: 目标编码，默认为utf-8
			
 
				+        
			
 
				+    Returns:
			
 
				+        包含CSV数据的二维列表
			
 
				+    """
			
 
				+    # 常见编码列表，按优先级排序
			
 
				+    encodings_to_try = ['utf-8-sig', 'gb18030', 'shift_jis', 'euc-jp', 'iso-8859-1']
			
 
				+    
			
 
				+    # 先尝试检测编码
			
 
				+    detected_encoding = detect_encoding(file_path)
			
 
				     if detected_encoding:
			
 
				         encodings_to_try.insert(0, detected_encoding)
			
 
				     
			
 
				+    # 尝试用不同编码读取文件
			
 
				     for encoding in encodings_to_try:
			
 
				         try:
			
 
				+            logger.info(f"Trying encoding: {encoding}")
			
 
				+            
			
 
				             with open(file_path, 'r', encoding=encoding) as f:
			
 
				                 reader = csv.reader(f)
			
 
				                 data = list(reader)
			
 
				                 
			
 
				-                # Convert to UTF-8 if needed
			
 
				+                # 如果源编码与目标编码不同，进行转换
			
 
				                 if encoding.lower() != to_encode.lower():
			
 
				+                    logger.info(f"Converting from {encoding} to {to_encode}")
			
 
				                     data = [
			
 
				-                        [cell.encode('utf-8').decode('utf-8') if isinstance(cell, str) else cell 
			
 
				+                        [cell.encode('utf-8', errors='replace').decode('utf-8') 
			
 
				+                         if isinstance(cell, str) else cell 
			
 
				                          for cell in row] 
			
 
				                         for row in data
			
 
				                     ]
			
 
				+                    
			
 
				+                logger.info(f"Successfully read file with encoding: {encoding}")
			
 
				                 return data
			
 
				-        except UnicodeDecodeError:
			
 
				+                
			
 
				+        except UnicodeDecodeError as e:
			
 
				+            logger.warning(f"Failed to decode with {encoding}: {e}")
			
 
				             continue
			
 
				         except Exception as e:
			
 
				             logger.error(f"Error with encoding {encoding}: {e}")