# translate_utils.py
import logging
import os
from pathlib import Path
from typing import List, Tuple

# NOTE(review): setup_logging is imported twice; the later (mylib) import is
# the binding that stays in effect. Confirm whether the first one is needed.
from logging_config import setup_logging
from mylib.logging_config import setup_logging
from mylib.pdfzh_translator import OpenAITranslator
from mylib.read_encoding_cvs import read_csv
  9. # Setup custom logging
  10. setup_logging()
  11. logger = logging.getLogger(__name__)
  12. def insert_empty_columns(data: List[List[str]], column_indices: List[int]) -> List[List[str]]:
  13. """在指定列之后插入空列"""
  14. try:
  15. # 按从大到小排序,防止插入影响后续索引
  16. column_indices.sort(reverse=True)
  17. for row in data:
  18. for index in column_indices:
  19. row.insert(index + 1, '')
  20. return data
  21. except Exception as e:
  22. logger.error(f"Error inserting empty columns: {e}")
  23. raise
  24. def extract_sample_data(data: List[List[str]], start_row: int = 0, column_index: int = 0, n: int = 3, m: int = 2) -> List[List[str]]:
  25. """提取指定行和列开始的样本数据"""
  26. try:
  27. sample = []
  28. # 确保不超过数据范围
  29. end_row = min(start_row + n, len(data))
  30. end_col = min(column_index + m, len(data[0]) if data else 0)
  31. for row in data[start_row:end_row]:
  32. sample.append(row[column_index:end_col])
  33. return sample
  34. except Exception as e:
  35. logger.error(f"Error extracting sample data: {e}")
  36. raise
  37. def log_data_details(data: List[List[str]], search_term_index: int, start_row: int = 3):
  38. """记录数据详细信息"""
  39. try:
  40. # 记录行号和列号
  41. logger.info(f"行号范围: {start_row}-{len(data)-1}")
  42. logger.info(f"翻译列号: {search_term_index}")
  43. # 提取并记录被翻译列的内容
  44. translated_column = [row[search_term_index] for row in data[start_row:]]
  45. logger.info(f"被翻译列内容: {translated_column}")
  46. except Exception as e:
  47. logger.error(f"记录数据详细信息时出错: {e}")
  48. raise
  49. def process_batch_translations(data: List[List[str]],
  50. search_term_index: int,
  51. start_row: int = 3) -> Tuple[List[List[str]], List[List[str]]]:
  52. """批量处理搜索词翻译"""
  53. try:
  54. # 首先提取样本数据用于检查
  55. sample_data = extract_sample_data(data, start_row, search_term_index)
  56. logger.info(f"从第{start_row}行第{search_term_index}列开始的样本数据:\n{sample_data}")
  57. # 记录数据详细信息
  58. log_data_details(data, search_term_index, start_row)
  59. # 初始化翻译器
  60. translator = OpenAITranslator()
  61. # 直接提取需要翻译的搜索词
  62. search_terms = [row[search_term_index] for row in data[start_row-1:]]
  63. # 批量翻译
  64. logger.info("Starting search term translations...")
  65. if os.getenv('DEBUG', '').lower() in ('true', '1', 'True'):
  66. # DEBUG模式:使用模拟翻译
  67. search_translations = [f"{text} 翻译测试" for text in search_terms]
  68. else:
  69. # 正常模式:调用真实翻译
  70. search_translations = translator.translate(search_terms)
  71. logger.info("Search term translations completed")
  72. # 更新数据
  73. for i, row in enumerate(data[start_row-1:], start=start_row-1):
  74. try:
  75. # 更新搜索词翻译列
  76. row[search_term_index + 1] = search_translations[i-(start_row-1)]
  77. except Exception as e:
  78. logger.error(f"Error processing row {i}: {e}")
  79. raise
  80. return data, sample_data
  81. except Exception as e:
  82. logger.error(f"Error in batch translation: {e}")
  83. raise
  84. def main():
  85. output_dir = Path('temp')
  86. input_file = output_dir/"测试.csv"
  87. output_file = output_dir/"processed_测试.csv"
  88. data = read_csv(input_file)
  89. process_batch_translations(data, 2)
  90. if __name__ == "__main__":
  91. main()