|
|
@@ -2,44 +2,87 @@ import csv
|
|
|
import chardet
|
|
|
import sys
|
|
|
import logging
|
|
|
+from typing import List
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
def detect_encoding(
    file_path: str,
    sample_size: int = 100000,
    min_confidence: float = 0.7,
) -> str:
    """Detect the text encoding of a file using chardet.

    Only the first ``sample_size`` bytes are read, so large files are not
    loaded into memory just to guess their encoding.

    Args:
        file_path: Path of the file to inspect.
        sample_size: Number of leading bytes handed to chardet.
        min_confidence: Guesses with a confidence below this value are
            discarded and ``'utf-8-sig'`` is returned instead.

    Returns:
        The encoding name reported by chardet, or ``'utf-8-sig'`` when
        chardet has no guess or its confidence is below ``min_confidence``.

    Note:
        Any failure (unreadable file, detector error) is logged and the
        process is terminated with ``sys.exit(1)``; no exception reaches
        the caller.
    """
    try:
        with open(file_path, 'rb') as f:
            # Read a bounded sample rather than the whole file.
            raw_data = f.read(sample_size)

        result = chardet.detect(raw_data)
        encoding = result['encoding']
        # Guard against a missing/None confidence so the formatting and the
        # comparison below cannot raise and be turned into a process exit.
        confidence = result['confidence'] or 0.0

        logger.info(f"Detected encoding: {encoding} (confidence: {confidence:.2%})")

        # chardet reports encoding=None when it has no guess (e.g. an empty
        # sample); treat that the same as a low-confidence guess so that a
        # str is always returned.
        if not encoding or confidence < min_confidence:
            logger.warning(f"Low confidence in detected encoding {encoding}, trying common encodings")
            return 'utf-8-sig'

        return encoding

    except Exception as e:
        # Kept from the original: a detection failure aborts the program.
        logger.error(f"Error detecting encoding for {file_path}: {e}")
        sys.exit(1)
|
|
|
|
|
|
|
|
|
-def read_csv(file_path, to_encode='utf-8'):
|
|
|
- encodings_to_try = ['utf-8-sig', 'gb18030', 'shift_jis', 'euc-jp']
|
|
|
- detected_encoding = detect_encoding(file_path)
|
|
|
- logger.info(f"Detected encoding: {detected_encoding}")
|
|
|
+def read_csv(file_path: str, to_encode: str = 'utf-8') -> List[List[str]]:
|
|
|
+ """读取CSV文件并转换为指定编码
|
|
|
|
|
|
+ Args:
|
|
|
+ file_path: 文件路径
|
|
|
+ to_encode: 目标编码,默认为utf-8
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ 包含CSV数据的二维列表
|
|
|
+ """
|
|
|
+ # 常见编码列表,按优先级排序
|
|
|
+ encodings_to_try = ['utf-8-sig', 'gb18030', 'shift_jis', 'euc-jp', 'iso-8859-1']
|
|
|
+
|
|
|
+ # 先尝试检测编码
|
|
|
+ detected_encoding = detect_encoding(file_path)
|
|
|
if detected_encoding:
|
|
|
encodings_to_try.insert(0, detected_encoding)
|
|
|
|
|
|
+ # 尝试用不同编码读取文件
|
|
|
for encoding in encodings_to_try:
|
|
|
try:
|
|
|
+ logger.info(f"Trying encoding: {encoding}")
|
|
|
+
|
|
|
with open(file_path, 'r', encoding=encoding) as f:
|
|
|
reader = csv.reader(f)
|
|
|
data = list(reader)
|
|
|
|
|
|
- # Convert to UTF-8 if needed
|
|
|
+ # 如果源编码与目标编码不同,进行转换
|
|
|
if encoding.lower() != to_encode.lower():
|
|
|
+ logger.info(f"Converting from {encoding} to {to_encode}")
|
|
|
data = [
|
|
|
- [cell.encode('utf-8').decode('utf-8') if isinstance(cell, str) else cell
|
|
|
+ [cell.encode('utf-8', errors='replace').decode('utf-8')
|
|
|
+ if isinstance(cell, str) else cell
|
|
|
for cell in row]
|
|
|
for row in data
|
|
|
]
|
|
|
+
|
|
|
+ logger.info(f"Successfully read file with encoding: {encoding}")
|
|
|
return data
|
|
|
- except UnicodeDecodeError:
|
|
|
+
|
|
|
+ except UnicodeDecodeError as e:
|
|
|
+ logger.warning(f"Failed to decode with {encoding}: {e}")
|
|
|
continue
|
|
|
except Exception as e:
|
|
|
logger.error(f"Error with encoding {encoding}: {e}")
|