# new_col_translate.py (6.5 KB)
  1. import os
  2. import logging
  3. import pandas as pd
  4. from pathlib import Path
  5. from typing import List, Tuple, Union
  6. from mylib.pdfzh_translator import OpenAITranslator
  7. from mylib.read_encoding_cvs import read_csv
  8. from mylib.logging_config import setup_logging
  9. # Setup custom logging
  10. setup_logging()
  11. logger = logging.getLogger('mylib.translate_utils')
  12. def column_letter_to_index(col_letter: str) -> int:
  13. """将列字母转换为列索引(从0开始)
  14. Args:
  15. col_letter: 列字母(如 'A', 'B', 'AA' 等)
  16. Returns:
  17. 列索引(从0开始)
  18. """
  19. try:
  20. col_index = 0
  21. for i, char in enumerate(reversed(col_letter.upper())):
  22. col_index += (ord(char) - ord('A') + 1) * (26 ** i)
  23. return col_index - 1
  24. except Exception as e:
  25. logger.error(f"列字母转换时出错: {e}")
  26. raise
  27. def read_csv_with_header(file_path: str, header_row: int = 1, encoding: str = None) -> pd.DataFrame:
  28. """读取CSV文件并正确处理标题行
  29. Args:
  30. file_path: CSV文件路径
  31. header_row: 标题行号(从0开始),默认为1(第2行)
  32. encoding: 文件编码
  33. Returns:
  34. pandas DataFrame
  35. """
  36. try:
  37. if not os.path.exists(file_path):
  38. logger.error(f"文件不存在: {file_path}")
  39. raise FileNotFoundError(f"文件不存在: {file_path}")
  40. # 读取所有数据
  41. data = read_csv(file_path, encoding)
  42. if not data:
  43. logger.error("读取的文件为空")
  44. raise ValueError("读取的文件为空")
  45. # 确保header_row在有效范围内
  46. if header_row >= len(data):
  47. logger.error(f"标题行 {header_row} 超出文件范围")
  48. raise ValueError(f"标题行 {header_row} 超出文件范围")
  49. # 使用指定行作为列名,前面的行丢弃
  50. df = pd.DataFrame(data[header_row+1:], columns=data[header_row])
  51. logger.info(f"成功读取CSV文件,使用第{header_row+1}行作为标题行")
  52. logger.info(f"列标题: {df.columns.tolist()}")
  53. return df
  54. except Exception as e:
  55. logger.error(f"读取CSV文件时出错: {e}")
  56. raise
  57. def translate_column_data(df: pd.DataFrame, column_identifier: Union[str, int],
  58. start_row: int = 1, end_row: int = None,
  59. source_lang: str = 'auto', target_lang: str = 'zh-CN') -> pd.DataFrame:
  60. """翻译指定列的数据并在右侧插入翻译结果列
  61. Args:
  62. df: pandas DataFrame
  63. column_identifier: 要翻译的列名或列号(从0开始),也可以是列字母(如 'A', 'B')
  64. start_row: 开始翻译的行号,默认为1(第2行)
  65. end_row: 结束翻译的行号,默认为None(到最后一行)
  66. source_lang: 源语言代码,默认为'auto'
  67. target_lang: 目标语言代码,默认为'zh-CN'
  68. Returns:
  69. 包含翻译结果的DataFrame
  70. """
  71. try:
  72. if df.empty:
  73. logger.error("DataFrame为空")
  74. return df
  75. # 处理列号或列名或列字母
  76. if isinstance(column_identifier, str) and column_identifier.isalpha():
  77. column_identifier = column_letter_to_index(column_identifier)
  78. if isinstance(column_identifier, int):
  79. if column_identifier < 0 or column_identifier >= len(df.columns):
  80. logger.error(f"列号 {column_identifier} 超出范围")
  81. raise ValueError(f"列号 {column_identifier} 超出范围")
  82. column_identifier = df.columns[column_identifier]
  83. # 确保列名存在
  84. if column_identifier not in df.columns:
  85. logger.error(f"列名 {column_identifier} 不存在")
  86. raise ValueError(f"列名 {column_identifier} 不存在")
  87. # 处理行范围
  88. if end_row is None:
  89. end_row = len(df)
  90. if start_row < 0 or start_row >= len(df) or end_row < 0 or end_row > len(df):
  91. logger.error(f"行范围 {start_row}-{end_row} 超出范围")
  92. raise ValueError(f"行范围 {start_row}-{end_row} 超出范围")
  93. # 提取要翻译的数据
  94. texts_to_translate = df.iloc[start_row:end_row][column_identifier].tolist()
  95. logger.info(f"准备翻译 {len(texts_to_translate)} 条数据,从第{start_row}行到第{end_row}行")
  96. # 初始化翻译器
  97. translator = OpenAITranslator(lang_out=target_lang, lang_in=source_lang)
  98. # 执行翻译
  99. translated_texts = translator._batch_translate(texts_to_translate)
  100. # 在右侧插入新列
  101. new_column_name = f"{column_identifier}_translated"
  102. df.insert(df.columns.get_loc(column_identifier) + 1, new_column_name, "")
  103. # 填充翻译结果
  104. df.loc[start_row:end_row-1, new_column_name] = translated_texts
  105. logger.info(f"翻译完成,已插入新列 {new_column_name}")
  106. return df
  107. except Exception as e:
  108. logger.error(f"翻译列数据时出错: {e}")
  109. raise
  110. def process_csv(input_file: str, output_file: str, column_identifier: Union[str, int],
  111. start_row: int = 1, end_row: int = None,
  112. source_lang: str = 'auto', target_lang: str = 'zh-CN'):
  113. """处理CSV文件并保存翻译结果
  114. Args:
  115. input_file: 输入CSV文件路径
  116. output_file: 输出CSV文件路径
  117. column_identifier: 要翻译的列名或列号(从0开始),也可以是列字母(如 'A', 'B')
  118. start_row: 开始翻译的行号,默认为1(第2行)
  119. end_row: 结束翻译的行号,默认为None(到最后一行)
  120. source_lang: 源语言代码,默认为'auto'
  121. target_lang: 目标语言代码,默认为'zh-CN'
  122. """
  123. try:
  124. # 读取CSV文件
  125. df = read_csv_with_header(input_file)
  126. # 翻译指定列
  127. df = translate_column_data(df, column_identifier, start_row, end_row, source_lang, target_lang)
  128. # 保存结果
  129. df.to_csv(output_file, index=False, encoding='utf-8-sig')
  130. logger.info(f"翻译结果已保存到 {output_file}")
  131. except Exception as e:
  132. logger.error(f"处理CSV文件时出错: {e}")
  133. raise
  134. if __name__ == '__main__':
  135. # 示例用法
  136. input_file = Path('/path/to/input.csv')
  137. output_file = Path('/path/to/output.csv')
  138. process_csv(input_file, output_file, column_identifier='B', start_row=1, end_row=10)