Procházet zdrojové kódy

fix: Handle None encoding and empty file cases in CSV reading

mrh (aider) před 1 rokem
rodič
revize
b4838272b8
Změněny 2 soubory: 11 přidání a 5 odebrání
  1. 5 5
      mylib/read_encoding_cvs.py
  2. 6 0
      mylib/translate_utils.py

+ 5 - 5
mylib/read_encoding_cvs.py

@@ -24,13 +24,13 @@ def detect_encoding(file_path: str, sample_size: int = 100000) -> str:
             result = chardet.detect(raw_data)
             
             # 获取置信度最高的编码
-            encoding = result['encoding']
-            confidence = result['confidence']
+            encoding = result.get('encoding')
+            confidence = result.get('confidence', 0)
             
             logger.info(f"Detected encoding: {encoding} (confidence: {confidence:.2%})")
             
-            # 如果置信度低于阈值,尝试其他常见编码
-            if confidence < 0.7:
+            # 如果置信度低于阈值或编码为 None,尝试其他常见编码
+            if not encoding or confidence < 0.7:
                 logger.warning(f"Low confidence in detected encoding {encoding}, trying common encodings")
                 return 'utf-8-sig'
                 
@@ -72,7 +72,7 @@ def read_csv(file_path: str, to_encode: str = 'utf-8') -> List[List[str]]:
                 if encoding.lower() != to_encode.lower():
                     logger.info(f"Converting from {encoding} to {to_encode}")
                     data = [
-                        [cell.encode('utf-8', errors='replace').decode('utf-8') 
+                        [cell.encode(to_encode, errors='replace').decode(to_encode) 
                          if isinstance(cell, str) else cell 
                          for cell in row] 
                         for row in data

+ 6 - 0
mylib/translate_utils.py

@@ -45,10 +45,12 @@ def read_csv_with_header(file_path: str, header_row: int = 1, encoding: str = No
         data = read_csv(file_path, encoding)
         
         if not data:
+            logger.error("读取的文件为空")
             raise ValueError("读取的文件为空")
             
         # 确保header_row在有效范围内
         if header_row >= len(data):
+            logger.error(f"标题行 {header_row} 超出文件范围")
             raise ValueError(f"标题行 {header_row} 超出文件范围")
             
         # 使用指定行作为列名,前面的行丢弃
@@ -75,6 +77,7 @@ def extract_column_data(df: pd.DataFrame, column_identifier: Union[str, int], st
     """
     try:
         if df.empty:
+            logger.error("DataFrame为空")
             return pd.Series()
             
         # 处理列号或列名或列字母
@@ -82,15 +85,18 @@ def extract_column_data(df: pd.DataFrame, column_identifier: Union[str, int], st
             column_identifier = column_letter_to_index(column_identifier)
         if isinstance(column_identifier, int):
             if column_identifier < 0 or column_identifier >= len(df.columns):
+                logger.error(f"列号 {column_identifier} 超出范围")
                 raise ValueError(f"列号 {column_identifier} 超出范围")
             column_identifier = df.columns[column_identifier]
             
         # 确保列名存在
         if column_identifier not in df.columns:
+            logger.error(f"列名 {column_identifier} 不存在")
             raise ValueError(f"列名 {column_identifier} 不存在")
             
         # 确保开始行在有效范围内
         if start_row >= len(df) or start_row < 0:
+            logger.error(f"开始行 {start_row} 超出范围")
             raise ValueError(f"开始行 {start_row} 超出范围")
             
         # 提取指定列的数据