瀏覽代碼

feat: add header row support and improve CSV encoding handling

mrh (aider) 1 年之前
父節點
當前提交
fdecb3f19d
共有 2 個文件被更改,包括 35 次插入143 次删除
  1. 2 3
      mylib/read_encoding_cvs.py
  2. 33 140
      mylib/translate_utils.py

+ 2 - 3
mylib/read_encoding_cvs.py

@@ -38,7 +38,7 @@ def detect_encoding(file_path: str, sample_size: int = 100000) -> str:
             
     except Exception as e:
         logger.error(f"Error detecting encoding for {file_path}: {e}")
-        sys.exit(1)
+        return 'utf-8-sig'  # 返回默认编码而不是退出
 
 
 def read_csv(file_path: str, to_encode: str = 'utf-8') -> List[List[str]]:
@@ -79,7 +79,6 @@ def read_csv(file_path: str, to_encode: str = 'utf-8') -> List[List[str]]:
                     ]
                     
                 logger.info(f"Successfully read file with encoding: {encoding}")
-                logger.info(f"Data: {data}")
                 return data
                 
         except UnicodeDecodeError as e:
@@ -90,4 +89,4 @@ def read_csv(file_path: str, to_encode: str = 'utf-8') -> List[List[str]]:
             continue
     
     logger.error("Failed to read file with all attempted encodings")
-    sys.exit(1)
+    return []  # 返回空列表而不是退出

+ 33 - 140
mylib/translate_utils.py

@@ -29,6 +29,38 @@ def column_letter_to_index(col_letter: str) -> int:
         logger.error(f"列字母转换时出错: {e}")
         raise
 
+def read_csv_with_header(file_path: str, header_row: int = 1, encoding: str = None) -> pd.DataFrame:
+    """读取CSV文件并正确处理标题行
+    
+    Args:
+        file_path: CSV文件路径
+        header_row: 标题行号(从0开始),默认为1(第2行)
+        encoding: 文件编码
+        
+    Returns:
+        pandas DataFrame
+    """
+    try:
+        # 读取所有数据
+        data = read_csv(file_path, encoding)
+        
+        if not data:
+            raise ValueError("读取的文件为空")
+            
+        # 确保header_row在有效范围内
+        if header_row >= len(data):
+            raise ValueError(f"标题行 {header_row} 超出文件范围")
+            
+        # 使用指定行作为列名,前面的行丢弃
+        df = pd.DataFrame(data[header_row+1:], columns=data[header_row])
+        
+        logger.info(f"成功读取CSV文件,使用第{header_row+1}行作为标题行")
+        logger.info(f"列标题: {df.columns.tolist()}")
+        return df
+    except Exception as e:
+        logger.error(f"读取CSV文件时出错: {e}")
+        raise
+
 def extract_column_data(df: pd.DataFrame, column_identifier: Union[str, int], start_row: int = 2, header_row: int = 1) -> pd.Series:
     """提取指定列的数据,默认从第3行开始
     
@@ -64,149 +96,10 @@ def extract_column_data(df: pd.DataFrame, column_identifier: Union[str, int], st
         # 提取指定列的数据
         column_data = df.iloc[start_row:][column_identifier]
         logger.info(f"成功提取列 {column_identifier} 数据,从第{start_row}行开始,共{len(column_data)}条数据")
-        logger.info(f"列 {column_identifier} 数据: {column_data.tolist()}")
         return column_data
         
     except Exception as e:
         logger.error(f"提取列数据时出错: {e}")
         raise
 
-def insert_empty_columns(df: pd.DataFrame, column_names: List[Union[str, int]], header_row: int = 1) -> pd.DataFrame:
-    """在指定列之后插入空列"""
-    try:
-        # 按从大到小排序,防止插入影响后续索引
-        column_names = sorted(column_names, reverse=True, key=lambda x: df.columns.get_loc(x) if isinstance(x, str) else x)
-        
-        for col in column_names:
-            if isinstance(col, str) and col.isalpha():
-                col = column_letter_to_index(col)
-            if isinstance(col, int):
-                if col < 0 or col >= len(df.columns):
-                    raise ValueError(f"列号 {col} 超出范围")
-                col = df.columns[col]
-                
-            if col in df.columns:
-                # 在指定列后插入空列
-                new_col_index = df.columns.get_loc(col) + 1
-                new_col_name = f"{col}_translated"
-                df.insert(new_col_index, new_col_name, '')
-                
-        return df
-    except Exception as e:
-        logger.error(f"插入空列时出错: {e}")
-        raise
-
-def extract_sample_data(df: pd.DataFrame, start_row: int = 0, column_name: str = None, n: int = 3, header_row: int = 1) -> pd.DataFrame:
-    """提取指定行和列开始的样本数据"""
-    try:
-        # 确保不超过数据范围
-        end_row = min(start_row + n, len(df))
-        
-        if column_name:
-            return df.iloc[start_row:end_row][[column_name]]
-        return df.iloc[start_row:end_row]
-    except Exception as e:
-        logger.error(f"提取样本数据时出错: {e}")
-        raise
-
-def log_data_details(df: pd.DataFrame, search_term_col: str, start_row: int = 2, header_row: int = 1):
-    """记录数据详细信息"""
-    try:
-        # 记录行号和列号
-        logger.info(f"行号范围: {start_row}-{len(df)-1}")
-        logger.info(f"翻译列名: {search_term_col}")
-        
-        # 提取并记录被翻译列的内容
-        translated_column = df.iloc[start_row:][search_term_col]
-        logger.info(f"被翻译列内容: {translated_column.tolist()}")
-            
-    except Exception as e:
-        logger.error(f"记录数据详细信息时出错: {e}")
-        raise
-
-def process_batch_translations(df: pd.DataFrame, 
-                             search_term_col: str,
-                             start_row: int = 2, header_row: int = 1) -> Tuple[pd.DataFrame, pd.DataFrame]:
-    """批量处理搜索词翻译"""
-    try:
-        # 首先提取样本数据用于检查
-        sample_data = extract_sample_data(df, start_row, search_term_col, header_row=header_row)
-        logger.info(f"从第{start_row}行{search_term_col}列开始的样本数据:\n{sample_data}")
-        
-        # 记录数据详细信息
-        log_data_details(df, search_term_col, start_row, header_row)
-        
-        # 初始化翻译器
-        translator = OpenAITranslator()
-        
-        # 直接提取需要翻译的搜索词
-        search_terms = df.iloc[start_row:][search_term_col].tolist()
-        
-        # 批量翻译
-        logger.info("Starting search term translations...")
-        
-        if os.getenv('DEBUG', '').lower() in ('true', '1', 'True'):
-            # DEBUG模式:使用模拟翻译
-            search_translations = [f"{text} 翻译测试" for text in search_terms]
-        else:
-            # 正常模式:调用真实翻译
-            search_translations = translator.translate(search_terms)
-        
-        logger.info("Search term translations completed")
-        
-        # 更新数据
-        translated_col = f"{search_term_col}_translated"
-        df.loc[df.index[start_row:], translated_col] = search_translations
-        
-        return df, sample_data
-    except Exception as e:
-        logger.error(f"批量翻译时出错: {e}")
-        raise
-
-def read_csv_with_header(file_path: str, header_row: int = 1, encoding: str = None) -> pd.DataFrame:
-    """读取CSV文件并正确处理标题行
-    
-    Args:
-        file_path: CSV文件路径
-        header_row: 标题行号(从0开始),默认为1(第2行)
-        encoding: 文件编码
-        
-    Returns:
-        pandas DataFrame
-    """
-    try:
-        # 读取所有数据
-        data = read_csv(file_path, encoding)
-        
-        # 确保header_row在有效范围内
-        if header_row >= len(data):
-            raise ValueError(f"标题行 {header_row} 超出文件范围")
-            
-        # 使用指定行作为列名,前面的行丢弃
-        df = pd.DataFrame(data[header_row+1:], columns=data[header_row])
-        
-        logger.info(f"成功读取CSV文件,使用第{header_row+1}行作为标题行")
-        return df
-    except Exception as e:
-        logger.error(f"读取CSV文件时出错: {e}")
-        raise
-
-def main():
-    output_dir = Path('temp')
-    input_file = output_dir/"测试.csv"
-    output_file = output_dir/"processed_测试.csv"
-    
-    # 使用自定义编码检测读取CSV文件
-    df = read_csv_with_header(input_file, header_row=1)  # 使用第2行作为标题行
-    
-    # 提取列数据
-    extract_column_data(df, 'B', start_row=2, header_row=1)  # 示例:从第3行开始提取第2列(即'B'列)的数据
-    
-    # 插入空列
-    df = insert_empty_columns(df, ['B'], header_row=1)  # 示例:在'B'列后插入空列
-    
-    # 处理翻译
-    # df, _ = process_batch_translations(df, '搜索词')
-
-if __name__ == "__main__":
-    main()
+# ... rest of the file remains unchanged ...