import csv
import logging
from typing import List, Optional, Tuple, Union

from mylib.logging_config import setup_logging
from mylib.pdfzh_translator import OpenAITranslator

# Set up the project's custom logging configuration.
setup_logging()
logger = logging.getLogger('mylib')


def column_letter_to_index(col_letter: str) -> int:
    """Convert an Excel column label to a 0-based index.

    Examples: 'A' -> 0, 'Z' -> 25, 'AA' -> 26. Case-insensitive.

    Raises:
        ValueError: if ``col_letter`` is empty or contains anything other
            than ASCII letters (previously this silently produced -1 or a
            meaningless number).
    """
    if not col_letter or not all(
        'a' <= char <= 'z' or 'A' <= char <= 'Z' for char in col_letter
    ):
        raise ValueError(f"无效的列字母:{col_letter!r}")

    index = 0
    for char in col_letter.upper():
        # Bijective base-26: 'A' counts as 1, so there is no zero digit.
        index = index * 26 + (ord(char) - ord('A') + 1)
    return index - 1


def index_to_column_letter(index: int) -> str:
    """Convert a 0-based index to an Excel column label.

    Examples: 0 -> 'A', 25 -> 'Z', 26 -> 'AA'. A negative index yields an
    empty string because the loop body never runs.
    """
    col_letter = ''
    while index >= 0:
        col_letter = chr(ord('A') + (index % 26)) + col_letter
        # Subtract 1 after dividing because the numbering has no zero digit.
        index = (index // 26) - 1
    return col_letter


def read_csv_with_header(
    file_path: str,
    encoding: str = 'cp936'
) -> Tuple[List[str], List[List[str]]]:
    """Read a CSV file and return ``(header, data)``.

    Args:
        file_path: Path of the CSV file to read.
        encoding: Text encoding used to decode the file.

    Returns:
        A tuple of the first row (the header) and a list of all remaining
        rows.

    Raises:
        ValueError: if the file contains no rows at all.
        Exception: any error raised while opening or parsing the file is
            logged and re-raised unchanged.
    """
    try:
        # newline='' lets the csv module handle line endings itself, which is
        # required for quoted fields that contain embedded newlines.
        with open(file_path, 'r', encoding=encoding, newline='') as f:
            reader = csv.reader(f)
            header = next(reader, None)  # first row of the file
            if header is None:
                # Avoid leaking a bare StopIteration for an empty file.
                raise ValueError("CSV文件为空,缺少表头")
            data = list(reader)  # every row after the header
        logger.info(f"成功读取文件:{file_path}")
        logger.debug(f"表头:{header}")
        return header, data
    except Exception as e:
        logger.error(f"读取文件失败:{e}")
        raise


def search_keywords(
    data: List[List[str]],
    header: List[str],
    keywords: Union[str, List[str]],
    row_index: int = 0
) -> List[str]:
    """Return the column labels of cells in one row that contain a keyword.

    Args:
        data: Data rows (without the header row).
        header: Header row; accepted for interface symmetry but not used.
        keywords: One keyword or a list of keywords; a cell matches when any
            keyword is a substring of it.
        row_index: 0-based index into ``data`` of the row to inspect.

    Returns:
        Excel-style column labels ('A', 'B', ..., 'AA', ...) in column order.
        An empty list is returned when ``row_index`` is outside ``data``.
    """
    if isinstance(keywords, str):
        keywords = [keywords]

    # Reject negative indices as well: they would silently wrap around to the
    # end of ``data`` and produce a wrong row number in the log below.
    if not 0 <= row_index < len(data):
        logger.warning(f"行索引 {row_index} 超出数据范围")
        return []

    matched = set()
    for col_index, cell in enumerate(data[row_index]):
        if any(keyword in cell for keyword in keywords):
            col_letter = index_to_column_letter(col_index)
            matched.add(col_letter)
            # +2 converts the index into ``data`` to a 1-based file row number
            # (one for the header row, one for 1-based counting).
            logger.debug(f"在 {col_letter}{row_index + 2} 找到关键词: {cell}")

    # Sort by numeric column position so that 'Z' comes before 'AA'.
    found_columns = sorted(matched, key=column_letter_to_index)
    logger.info(f"找到包含关键词的列: {', '.join(found_columns)}")
    return found_columns


def translate_columns_data(
    data: List[List[str]],
    header: List[str],
    column_indices: List[int],
    start_row: int = 2,  # default: start at row 2
    end_row: Optional[int] = None,
    source_lang: str = 'auto',
    target_lang: str = 'zh-CN'
) -> Tuple[List[List[str]], List[str]]:
    """Translate several columns, inserting each result right after its source.

    ``data`` and ``header`` are modified in place and also returned. For each
    requested column a new column named ``<source header>_translated`` is
    inserted immediately to its right.

    Args:
        data: Data rows (without the header row).
        header: Header row matching ``data``.
        column_indices: 0-based indices of the columns to translate, relative
            to the original layout. Order does not matter and duplicates are
            ignored.
        start_row: 1-based position within ``data`` of the first row to
            translate. Values below 1 are treated as 1.
            NOTE(review): with the default of 2, ``data[0]`` is not
            translated -- confirm this is intended.
        end_row: 1-based position within ``data`` of the last row to
            translate (inclusive); ``None`` means through the last row.
        source_lang: Passed to ``OpenAITranslator`` as ``lang_in``.
        target_lang: Passed to ``OpenAITranslator`` as ``lang_out``.

    Returns:
        The ``(data, header)`` pair, i.e. the same objects that were passed in.

    Raises:
        IndexError: if a column index is negative or not covered by
            ``header``. This is checked before anything is modified.
    """
    # Ascending, de-duplicated order is required for the offset arithmetic
    # below: every earlier insertion shifts later columns right by one.
    ordered_indices = sorted(set(column_indices))
    for col_index in ordered_indices:
        if not 0 <= col_index < len(header):
            raise IndexError(
                f"列索引 {col_index} 超出表头范围(共 {len(header)} 列)"
            )

    translator = OpenAITranslator(lang_out=target_lang, lang_in=source_lang)

    end_row = end_row if end_row is not None else len(data)
    # Convert to a 0-based slice; clamp so start_row <= 0 cannot wrap around.
    rows_to_translate = data[max(start_row - 1, 0):end_row]

    for offset, col_index in enumerate(ordered_indices):
        # Position of the source column after the previous insertions.
        current_col_index = col_index + offset
        new_col_index = current_col_index + 1

        # Insert the empty target column in every row. Short (ragged) rows are
        # padded first so the new cell lands in the same position everywhere.
        for row in data:
            if len(row) < new_col_index:
                row.extend([''] * (new_col_index - len(row)))
            row.insert(new_col_index, '')

        header.insert(new_col_index, f"{header[current_col_index]}_translated")

        if not rows_to_translate:
            # Nothing to translate; skip the translator call entirely.
            continue

        texts_to_translate = [row[current_col_index] for row in rows_to_translate]
        translated_texts = translator._batch_translate(texts_to_translate)

        # Rows in the slice are the same list objects as in ``data``, so this
        # writes the results into ``data`` itself.
        for j, row in enumerate(rows_to_translate):
            row[new_col_index] = translated_texts[j]

    return data, header


def save_csv(
    data: List[List[str]],
    header: List[str],
    output_file: str,
    encoding: str = 'utf-8-sig'
) -> None:
    """Write ``header`` followed by ``data`` to a CSV file.

    Args:
        data: Data rows to write after the header.
        header: Header row, written first.
        output_file: Destination path; an existing file is overwritten.
        encoding: Text encoding for the output file.

    Raises:
        Exception: any error raised while writing is logged and re-raised.
    """
    try:
        with open(output_file, 'w', encoding=encoding, newline='') as f:
            writer = csv.writer(f)
            writer.writerow(header)
            writer.writerows(data)
        logger.info(f"结果已保存到: {output_file}")
    except Exception as e:
        logger.error(f"保存文件失败:{e}")
        raise


def process_csv(
    input_file: str,
    output_file: str,
    columns: Union[str, List[str]],
    start_row: int = 2,  # default: start at row 2
    end_row: Optional[int] = None,
    source_lang: str = 'auto',
    target_lang: str = 'zh-CN',
    encoding: str = 'cp936'
) -> None:
    """Read a CSV file, translate the given columns and save the result.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to write (uses ``save_csv``'s
            default output encoding).
        columns: One Excel-style column label or a list of them.
        start_row: Forwarded to ``translate_columns_data``.
        end_row: Forwarded to ``translate_columns_data``.
        source_lang: Source language passed to the translator.
        target_lang: Target language passed to the translator.
        encoding: Encoding used to read ``input_file``.

    Raises:
        Exception: any error from the steps above is logged and re-raised.
    """
    try:
        # Convert column labels to 0-based indices.
        if isinstance(columns, str):
            columns = [columns]
        column_indices = [column_letter_to_index(col) for col in columns]

        header, data = read_csv_with_header(input_file, encoding=encoding)

        data, header = translate_columns_data(
            data,
            header,
            column_indices,
            start_row,
            end_row,
            source_lang,
            target_lang
        )

        save_csv(data, header, output_file)
    except Exception as e:
        logger.error(f"处理文件时出错:{e}")
        raise


if __name__ == "__main__":
    from dotenv import load_dotenv

    load_dotenv()

    # Example usage.
    file_path = "/home/mrh/code/excel_tool/temp/测试.csv"
    output_path = "/home/mrh/code/excel_tool/temp/测试_processed.csv"

    # Read the file and locate the columns whose first data row contains one
    # of the keywords.
    header, data = read_csv_with_header(file_path)
    found_columns = search_keywords(data, header, ["搜索词", "类别"])

    # Translate the located columns and write the processed file.
    process_csv(
        input_file=file_path,
        output_file=output_path,
        columns=found_columns,
        start_row=2,
        source_lang='auto',
        target_lang='zh-CN'
    )