# amazon/excel_tool
							import csv
import chardet
import sys
import logging
from typing import List

logger = logging.getLogger(__name__)


def detect_encoding(file_path: str, sample_size: int = 100000) -> str:
    """Guess a file's text encoding from a leading sample of its bytes.

    Up to ``sample_size`` bytes are read from the start of the file and
    passed to ``chardet.detect``. Any exception raised while reading or
    detecting is logged and the default encoding is returned instead, so
    this function does not propagate errors to the caller.

    Args:
        file_path: Path of the file to inspect.
        sample_size: Maximum number of bytes read from the start of the
            file and used for detection.

    Returns:
        The encoding name reported by chardet. ``'utf-8-sig'`` is returned
        when chardet reports no encoding, when its confidence is below
        0.7, or when an error occurs. A detected ``ascii`` or ``gb2312``
        is returned as its superset (``'utf-8'`` / ``'gb18030'``).
    """
    default_encoding = 'utf-8-sig'

    # Only the first ``sample_size`` bytes are examined, so the rest of the
    # file may contain characters outside the repertoire seen in the sample.
    # UTF-8 decodes every ASCII byte identically and GB18030 is a superset
    # of GB2312, so returning the superset never changes how the sampled
    # bytes decode but keeps later characters readable.
    superset_of = {'ascii': 'utf-8', 'gb2312': 'gb18030'}

    try:
        # Close the file as soon as the sample is in memory.
        with open(file_path, 'rb') as f:
            raw_data = f.read(sample_size)

        result = chardet.detect(raw_data)
        encoding = result.get('encoding')
        # Treat a missing or None confidence as "no confidence at all".
        confidence = result.get('confidence') or 0.0

        logger.info(f"Detected encoding: {encoding} (confidence: {confidence:.2%})")

        # No result or a weak guess: fall back to the default encoding.
        if not encoding or confidence < 0.7:
            logger.warning(f"Low confidence in detected encoding {encoding}, trying common encodings")
            return default_encoding

        widened = superset_of.get(encoding.lower())
        if widened:
            logger.info(f"Widening detected encoding {encoding} to its superset {widened}")
            return widened

        return encoding

    except Exception as e:
        # Deliberate best effort: report the problem and return the default
        # encoding rather than aborting the caller.
        logger.error(f"Error detecting encoding for {file_path}: {e}")
        return default_encoding


def read_csv(file_path: str, to_encode: str = 'utf-8') -> List[List[str]]:
    """Read a CSV file into a list of rows, trying several encodings.

    The encoding returned by ``detect_encoding`` is tried first, followed
    by a fixed list of common encodings. Every attempt decodes strictly,
    so a wrong guess raises ``UnicodeDecodeError`` and the next candidate
    is tried instead of silently yielding replacement characters.

    Args:
        file_path: Path of the CSV file to read.
        to_encode: Target encoding, ``'utf-8'`` by default. When its name
            differs from the encoding used for reading, every cell is
            round-tripped through it, which replaces characters the target
            encoding cannot represent.

    Returns:
        The CSV content as a list of rows, each row a list of strings.
        An empty list is returned when the file cannot be opened or read,
        or when no candidate encoding succeeds.
    """
    # Common encodings in priority order. iso-8859-1 maps every byte to a
    # character, so it cannot fail to decode and acts as the last resort.
    candidates = ['utf-8-sig', 'gb18030', 'shift_jis', 'euc-jp', 'iso-8859-1', 'latin1']

    # The detected encoding gets the first attempt.
    detected_encoding = detect_encoding(file_path)
    if detected_encoding:
        candidates.insert(0, detected_encoding)

    # Drop case-insensitive duplicates (e.g. a detected 'UTF-8-SIG') so the
    # same codec is not tried twice; the first occurrence keeps its position.
    seen = set()
    encodings_to_try = []
    for name in candidates:
        key = name.lower()
        if key not in seen:
            seen.add(key)
            encodings_to_try.append(name)

    for encoding in encodings_to_try:
        logger.info(f"Trying encoding: {encoding}")
        try:
            # errors='strict' lets a wrong encoding fail so the fallback
            # list is actually used. newline='' is what the csv module
            # requires to handle line breaks inside quoted fields.
            with open(file_path, 'r', encoding=encoding, errors='strict', newline='') as f:
                data = list(csv.reader(f))

            # Make the text representable in the target encoding when it
            # differs from the one used to read the file.
            if encoding.lower() != to_encode.lower():
                logger.info(f"Converting from {encoding} to {to_encode}")
                data = [
                    [cell.encode(to_encode, errors='replace').decode(to_encode) for cell in row]
                    for row in data
                ]

        except UnicodeDecodeError as e:
            logger.warning(f"Failed to decode with {encoding}: {e}")
            continue
        except OSError as e:
            # Missing file, permissions, I/O failure: independent of the
            # encoding, so retrying with another one cannot help.
            logger.error(f"Cannot read {file_path}: {e}")
            return []
        except Exception as e:
            # E.g. an unknown codec name or a csv parsing error; keep the
            # best-effort behaviour and move on to the next candidate.
            logger.error(f"Error with encoding {encoding}: {e}")
            continue

        logger.info(f"Successfully read file with encoding: {encoding}")
        return data

    logger.error("Failed to read file with all attempted encodings")
    return []  # Return an empty list instead of exiting.