# new_col_translate.py (4.4 KB)

# (Line-number gutter 1-143 from the original file listing.)
import csv
import logging
from typing import List, Optional, Tuple, Union

from mylib.logging_config import setup_logging
from mylib.pdfzh_translator import OpenAITranslator
  6. # Setup custom logging
  7. setup_logging()
  8. logger = logging.getLogger('new_col_translate')
  9. def column_letter_to_index(col_letter: str) -> int:
  10. """将Excel列字母转换为0-based索引"""
  11. index = 0
  12. for char in col_letter.upper():
  13. index = index * 26 + (ord(char) - ord('A') + 1)
  14. return index - 1
  15. def read_csv_with_header(file_path: str, encoding: str = 'cp936') -> List[List[str]]:
  16. """读取CSV文件并返回数据和表头"""
  17. try:
  18. with open(file_path, 'r', encoding=encoding) as f:
  19. reader = csv.reader(f)
  20. header = next(reader)
  21. data = [row for row in reader]
  22. logger.info(f"成功读取文件:{file_path}")
  23. logger.debug(f"表头:{header}")
  24. return header, data
  25. except Exception as e:
  26. logger.error(f"读取文件失败:{e}")
  27. raise
  28. def translate_columns_data(
  29. data: List[List[str]],
  30. header: List[str],
  31. column_indices: List[int],
  32. start_row: int = 1,
  33. end_row: Optional[int] = None,
  34. source_lang: str = 'auto',
  35. target_lang: str = 'zh-CN'
  36. ) -> List[List[str]]:
  37. """翻译多个指定列的数据"""
  38. translator = OpenAITranslator(lang_out=target_lang, lang_in=source_lang)
  39. end_row = end_row if end_row is not None else len(data)
  40. rows_to_translate = data[start_row:end_row]
  41. logger.info(f"开始翻译 {start_row} 到 {end_row} 行的数据")
  42. # 按顺序处理每一列
  43. for i, col_index in enumerate(column_indices):
  44. # 计算当前列的实际索引(考虑之前插入的列)
  45. current_col_index = col_index + i
  46. # 插入新列
  47. for row in data:
  48. row.insert(current_col_index + 1, '')
  49. # 更新表头
  50. header.insert(current_col_index + 1, f"{header[current_col_index]}_translated")
  51. # 提取要翻译的文本
  52. texts_to_translate = [row[current_col_index] for row in rows_to_translate]
  53. logger.debug(f"待翻译文本示例(列 {current_col_index}):{texts_to_translate[:3]}")
  54. # 批量翻译
  55. translated_texts = translator._batch_translate(texts_to_translate)
  56. # 将翻译结果插入新列
  57. for j, row in enumerate(rows_to_translate):
  58. row[current_col_index + 1] = translated_texts[j]
  59. logger.info(f"列 {current_col_index} 翻译完成")
  60. logger.info("所有列翻译完成")
  61. return data, header
  62. def save_csv(
  63. data: List[List[str]],
  64. header: List[str],
  65. output_file: str,
  66. encoding: str = 'utf-8-sig'
  67. ):
  68. """保存CSV文件"""
  69. try:
  70. with open(output_file, 'w', encoding=encoding, newline='') as f:
  71. writer = csv.writer(f)
  72. writer.writerow(header)
  73. writer.writerows(data)
  74. logger.info(f"结果已保存到: {output_file}")
  75. except Exception as e:
  76. logger.error(f"保存文件失败:{e}")
  77. raise
  78. def process_csv(
  79. input_file: str,
  80. output_file: str,
  81. columns: Union[str, List[str]],
  82. start_row: int = 1,
  83. end_row: Optional[int] = None,
  84. source_lang: str = 'auto',
  85. target_lang: str = 'zh-CN',
  86. encoding: str = 'cp936'
  87. ):
  88. """处理CSV文件的主函数"""
  89. try:
  90. # 转换列字母为索引
  91. if isinstance(columns, str):
  92. columns = [columns]
  93. column_indices = [column_letter_to_index(col) for col in columns]
  94. # 读取文件
  95. header, data = read_csv_with_header(input_file, encoding=encoding)
  96. # 翻译指定列
  97. data, header = translate_columns_data(
  98. data,
  99. header,
  100. column_indices,
  101. start_row,
  102. end_row,
  103. source_lang,
  104. target_lang
  105. )
  106. # 保存结果
  107. save_csv(data, header, output_file)
  108. except Exception as e:
  109. logger.error(f"处理文件时出错:{e}")
  110. raise
  111. if __name__ == "__main__":
  112. # 示例用法
  113. file_path = "/home/mrh/code/excel_tool/temp/测试.csv"
  114. output_path = "/home/mrh/code/excel_tool/temp/测试_processed.csv"
  115. process_csv(
  116. input_file=file_path,
  117. output_file=output_path,
  118. columns=['B', 'F', 'G', 'H'], # 现在支持多个列
  119. start_row=1,
  120. source_lang='auto',
  121. target_lang='zh-CN'
  122. )