1 year ago · b4838272b8
--- a/mylib/read_encoding_cvs.py
+++ b/mylib/read_encoding_cvs.py
@@ -24,13 +24,13 @@ def detect_encoding(file_path: str, sample_size: int = 100000) -> str:
 
															             result = chardet.detect(raw_data)
														
 
															             # 获取置信度最高的编码
														
 
															-            encoding = result['encoding']
														
 
															-            confidence = result['confidence']
														
 
															+            encoding = result.get('encoding')
														
 
															+            confidence = result.get('confidence', 0)
														
 
															             logger.info(f"Detected encoding: {encoding} (confidence: {confidence:.2%})")
														
 
															-            # 如果置信度低于阈值，尝试其他常见编码
														
 
															-            if confidence < 0.7:
														
 
															+            # 如果置信度低于阈值或编码为 None，尝试其他常见编码
														
 
															+            if not encoding or confidence < 0.7:
														
 
															                 logger.warning(f"Low confidence in detected encoding {encoding}, trying common encodings")
														
 
															                 return 'utf-8-sig'
														
@@ -72,7 +72,7 @@ def read_csv(file_path: str, to_encode: str = 'utf-8') -> List[List[str]]:
 
															                 if encoding.lower() != to_encode.lower():
														
 
															                     logger.info(f"Converting from {encoding} to {to_encode}")
														
 
															                     data = [
														
 
															-                        [cell.encode('utf-8', errors='replace').decode('utf-8') 
														
 
															+                        [cell.encode(to_encode, errors='replace').decode(to_encode) 
														
 
															                          if isinstance(cell, str) else cell 
														
 
															                          for cell in row] 
														
 
															                         for row in data
														
--- a/mylib/translate_utils.py
+++ b/mylib/translate_utils.py
@@ -45,10 +45,12 @@ def read_csv_with_header(file_path: str, header_row: int = 1, encoding: str = No
 
															         data = read_csv(file_path, encoding)
														
 
															         if not data:
														
 
															+            logger.error("读取的文件为空")
														
 
															             raise ValueError("读取的文件为空")
														
 
															         # 确保header_row在有效范围内
														
 
															         if header_row >= len(data):
														
 
															+            logger.error(f"标题行 {header_row} 超出文件范围")
														
 
															             raise ValueError(f"标题行 {header_row} 超出文件范围")
														
 
															         # 使用指定行作为列名，前面的行丢弃
														
@@ -75,6 +77,7 @@ def extract_column_data(df: pd.DataFrame, column_identifier: Union[str, int], st
 
															     """
														
 
															     try:
														
 
															         if df.empty:
														
 
															+            logger.error("DataFrame为空")
														
 
															             return pd.Series()
														
 
															         # 处理列号或列名或列字母
														
@@ -82,15 +85,18 @@ def extract_column_data(df: pd.DataFrame, column_identifier: Union[str, int], st
 
															             column_identifier = column_letter_to_index(column_identifier)
														
 
															         if isinstance(column_identifier, int):
														
 
															             if column_identifier < 0 or column_identifier >= len(df.columns):
														
 
															+                logger.error(f"列号 {column_identifier} 超出范围")
														
 
															                 raise ValueError(f"列号 {column_identifier} 超出范围")
														
 
															             column_identifier = df.columns[column_identifier]
														
 
															         # 确保列名存在
														
 
															         if column_identifier not in df.columns:
														
 
															+            logger.error(f"列名 {column_identifier} 不存在")
														
 
															             raise ValueError(f"列名 {column_identifier} 不存在")
														
 
															         # 确保开始行在有效范围内
														
 
															         if start_row >= len(df) or start_row < 0:
														
 
															+            logger.error(f"开始行 {start_row} 超出范围")
														
 
															             raise ValueError(f"开始行 {start_row} 超出范围")
														
 
															         # 提取指定列的数据