import os
import logging
from pathlib import Path
from typing import List, Optional, Tuple, Union

import pandas as pd

from mylib.pdfzh_translator import OpenAITranslator
from mylib.read_encoding_cvs import read_csv
from mylib.logging_config import setup_logging

# Setup custom logging
setup_logging()
logger = logging.getLogger('mylib.translate_utils')


def _is_ascii_letters(text: str) -> bool:
    """Return True when *text* is non-empty and consists only of A-Z / a-z."""
    return bool(text) and all(
        'a' <= ch <= 'z' or 'A' <= ch <= 'Z' for ch in text
    )


def column_letter_to_index(col_letter: str) -> int:
    """Convert a spreadsheet-style column letter to a zero-based index.

    Args:
        col_letter: Column letters such as 'A', 'B' or 'AA' (case-insensitive).

    Returns:
        The zero-based column index ('A' -> 0, 'Z' -> 25, 'AA' -> 26).

    Raises:
        ValueError: If ``col_letter`` is empty or contains anything other
            than ASCII letters.
    """
    try:
        letters = col_letter.upper()
        # Validate the original text: str.upper() can turn some non-ASCII
        # characters into ASCII letters (e.g. 'ß' -> 'SS').
        if not _is_ascii_letters(col_letter):
            raise ValueError(f"无效的列字母: {col_letter!r}")

        # Bijective base-26: fold left-to-right, then shift to zero-based.
        col_index = 0
        for char in letters:
            col_index = col_index * 26 + (ord(char) - ord('A') + 1)
        return col_index - 1
    except Exception as e:
        logger.error(f"列字母转换时出错: {e}")
        raise


def read_csv_with_header(file_path: str, header_row: int = 1,
                         encoding: Optional[str] = None) -> pd.DataFrame:
    """Read a CSV file and build a DataFrame using a chosen row as the header.

    Rows before ``header_row`` are discarded; rows after it become the data.

    Args:
        file_path: Path of the CSV file.
        header_row: Zero-based index of the header row. Defaults to 1
            (the second row of the file).
        encoding: File encoding, forwarded to ``read_csv``.

    Returns:
        A DataFrame whose columns are the values of the header row.

    Raises:
        ValueError: If the file yields no rows or ``header_row`` is outside
            the range of rows read.
    """
    try:
        # Load every row up front.
        # NOTE(review): assumes read_csv returns a list of row lists - confirm.
        data = read_csv(file_path, encoding)

        if not data:
            raise ValueError("读取的文件为空")

        # A negative index would silently pick a row from the end and keep
        # the header row inside the data, so reject it explicitly.
        if header_row < 0 or header_row >= len(data):
            raise ValueError(f"标题行 {header_row} 超出文件范围")

        # Use the chosen row as column names and drop everything before it.
        df = pd.DataFrame(data[header_row+1:], columns=data[header_row])

        logger.info(f"成功读取CSV文件,使用第{header_row+1}行作为标题行")
        logger.info(f"列标题: {df.columns.tolist()}")

        return df
    except Exception as e:
        logger.error(f"读取CSV文件时出错: {e}")
        raise


def extract_column_data(df: pd.DataFrame, column_identifier: Union[str, int],
                        start_row: int = 2, header_row: int = 1) -> pd.Series:
    """Extract one column of ``df`` starting at a given positional row.

    Args:
        df: Source DataFrame.
        column_identifier: Column name, zero-based column position, or a
            spreadsheet column letter (such as 'A' or 'B'). A string of ASCII
            letters is treated as a column letter when it maps to an existing
            position; otherwise it is looked up as a column name.
        start_row: Positional index into ``df`` of the first row to return.
            Defaults to 2.
        header_row: Header row index. Currently unused by this function;
            kept for interface compatibility.

    Returns:
        The selected column from ``start_row`` onward, or an empty Series
        when ``df`` is empty.

    Raises:
        ValueError: If the column position is out of range, the column name
            does not exist, or ``start_row`` is outside ``df``.
    """
    try:
        if df.empty:
            # Explicit dtype avoids the empty-Series default-dtype warning
            # emitted by older pandas releases.
            return pd.Series(dtype=object)

        # Resolve column letters. Only ASCII-letter strings qualify, so
        # alphabetic column names such as "名称" are never misread as letters.
        if isinstance(column_identifier, str) and _is_ascii_letters(column_identifier):
            letter_index = column_letter_to_index(column_identifier)
            # Prefer the letter interpretation whenever it is in range. When
            # it is not, fall back to the column name if one exists (e.g.
            # "Name"); otherwise keep the index so the range error is raised.
            if letter_index < len(df.columns) or column_identifier not in df.columns:
                column_identifier = letter_index

        if isinstance(column_identifier, int):
            if column_identifier < 0 or column_identifier >= len(df.columns):
                raise ValueError(f"列号 {column_identifier} 超出范围")
            column_identifier = df.columns[column_identifier]

        # Make sure the column name exists.
        if column_identifier not in df.columns:
            raise ValueError(f"列名 {column_identifier} 不存在")

        # Make sure the start row is inside the frame.
        if start_row >= len(df) or start_row < 0:
            raise ValueError(f"开始行 {start_row} 超出范围")

        # Slice rows positionally, then select the column.
        column_data = df.iloc[start_row:][column_identifier]
        logger.info(f"成功提取列 {column_identifier} 数据,从第{start_row}行开始,共{len(column_data)}条数据")
        return column_data
    except Exception as e:
        logger.error(f"提取列数据时出错: {e}")
        raise


def test_column_extraction():
    """Placeholder for a manual extraction check.

    Currently only builds the input and output paths under ``temp``; it does
    not read, extract or write anything.
    """
    output_dir = Path('temp')
    input_file = output_dir/"测试.csv"
    output_file = output_dir/"processed_测试.csv"


if __name__ == '__main__':
    test_column_extraction()