# translate_utils.py
import logging
import os
from pathlib import Path
from typing import List, Optional, Tuple

import pandas as pd

from mylib.logging_config import setup_logging
from mylib.pdfzh_translator import OpenAITranslator
from mylib.read_encoding_cvs import read_csv
  9. # Setup custom logging
  10. setup_logging()
  11. logger = logging.getLogger('mylib.translate_utils')
  12. def extract_column_data(df: pd.DataFrame, column_name: str, start_row: int = 2) -> pd.Series:
  13. """提取指定列的数据,默认从第3行开始
  14. Args:
  15. df: pandas DataFrame
  16. column_name: 要提取的列名
  17. start_row: 开始提取的行号,默认为2(第3行)
  18. Returns:
  19. 包含指定列数据的Series
  20. """
  21. try:
  22. if df.empty:
  23. return pd.Series()
  24. # 确保列名存在
  25. if column_name not in df.columns:
  26. raise ValueError(f"列名 {column_name} 不存在")
  27. # 确保开始行在有效范围内
  28. if start_row >= len(df) or start_row < 0:
  29. raise ValueError(f"开始行 {start_row} 超出范围")
  30. # 提取指定列的数据
  31. column_data = df.iloc[start_row:][column_name]
  32. logger.info(f"成功提取列 {column_name} 数据,从第{start_row}行开始,共{len(column_data)}条数据")
  33. logger.info(f"列 {column_name} 数据: {column_data.tolist()}")
  34. return column_data
  35. except Exception as e:
  36. logger.error(f"提取列数据时出错: {e}")
  37. raise
  38. def insert_empty_columns(df: pd.DataFrame, column_names: List[str]) -> pd.DataFrame:
  39. """在指定列之后插入空列"""
  40. try:
  41. # 按从大到小排序,防止插入影响后续索引
  42. column_names = sorted(column_names, reverse=True)
  43. for col in column_names:
  44. if col in df.columns:
  45. # 在指定列后插入空列
  46. new_col_index = df.columns.get_loc(col) + 1
  47. new_col_name = f"{col}_translated"
  48. df.insert(new_col_index, new_col_name, '')
  49. return df
  50. except Exception as e:
  51. logger.error(f"插入空列时出错: {e}")
  52. raise
  53. def extract_sample_data(df: pd.DataFrame, start_row: int = 0, column_name: str = None, n: int = 3) -> pd.DataFrame:
  54. """提取指定行和列开始的样本数据"""
  55. try:
  56. # 确保不超过数据范围
  57. end_row = min(start_row + n, len(df))
  58. if column_name:
  59. return df.iloc[start_row:end_row][[column_name]]
  60. return df.iloc[start_row:end_row]
  61. except Exception as e:
  62. logger.error(f"提取样本数据时出错: {e}")
  63. raise
  64. def log_data_details(df: pd.DataFrame, search_term_col: str, start_row: int = 2):
  65. """记录数据详细信息"""
  66. try:
  67. # 记录行号和列号
  68. logger.info(f"行号范围: {start_row}-{len(df)-1}")
  69. logger.info(f"翻译列名: {search_term_col}")
  70. # 提取并记录被翻译列的内容
  71. translated_column = df.iloc[start_row:][search_term_col]
  72. logger.info(f"被翻译列内容: {translated_column.tolist()}")
  73. except Exception as e:
  74. logger.error(f"记录数据详细信息时出错: {e}")
  75. raise
  76. def process_batch_translations(df: pd.DataFrame,
  77. search_term_col: str,
  78. start_row: int = 2) -> Tuple[pd.DataFrame, pd.DataFrame]:
  79. """批量处理搜索词翻译"""
  80. try:
  81. # 首先提取样本数据用于检查
  82. sample_data = extract_sample_data(df, start_row, search_term_col)
  83. logger.info(f"从第{start_row}行{search_term_col}列开始的样本数据:\n{sample_data}")
  84. # 记录数据详细信息
  85. log_data_details(df, search_term_col, start_row)
  86. # 初始化翻译器
  87. translator = OpenAITranslator()
  88. # 直接提取需要翻译的搜索词
  89. search_terms = df.iloc[start_row:][search_term_col].tolist()
  90. # 批量翻译
  91. logger.info("Starting search term translations...")
  92. if os.getenv('DEBUG', '').lower() in ('true', '1', 'True'):
  93. # DEBUG模式:使用模拟翻译
  94. search_translations = [f"{text} 翻译测试" for text in search_terms]
  95. else:
  96. # 正常模式:调用真实翻译
  97. search_translations = translator.translate(search_terms)
  98. logger.info("Search term translations completed")
  99. # 更新数据
  100. translated_col = f"{search_term_col}_translated"
  101. df.loc[df.index[start_row:], translated_col] = search_translations
  102. return df, sample_data
  103. except Exception as e:
  104. logger.error(f"批量翻译时出错: {e}")
  105. raise
  106. def main():
  107. output_dir = Path('temp')
  108. input_file = output_dir/"测试.csv"
  109. output_file = output_dir/"processed_测试.csv"
  110. # 读取CSV文件
  111. df = pd.read_csv(input_file)
  112. # 提取列数据
  113. extract_column_data(df, '搜索词')
  114. # 插入空列
  115. df = insert_empty_columns(df, ['搜索词'])
  116. # 处理翻译
  117. # df, _ = process_batch_translations(df, '搜索词')
  118. if __name__ == "__main__":
  119. main()