translate_csv.py 1.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061
  1. import pandas as pd
  2. from dotenv import load_dotenv
  3. from mylib.pdfzh_translator import OpenAITranslator
  4. import time
  5. import logging
  # Configure logging: the root logger emits INFO and above, and this module
  # logs through its own named logger.
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)
  def _translated_path(input_file):
      """Return the output path for ``input_file``.

      ``_translated`` is inserted before the final suffix only, so
      ``data.csv`` becomes ``data_translated.csv`` and
      ``data.csv.utf8.csv`` becomes ``data.csv.utf8_translated.csv``.
      The result therefore never equals the input path.
      """
      from pathlib import Path  # function-scope import; the file's import block is untouched

      source = Path(input_file)
      return str(source.with_name(f"{source.stem}_translated{source.suffix}"))


  def translate_csv(input_file, chunk_size=100, *, translator=None,
                    max_retries=5, delay=1, retry_delay=5):
      """Translate the second column of a CSV file into a new column ``C``.

      The file is read with pandas, an empty column named ``C`` is inserted at
      position 2 (directly to the right of the second column), and the second
      column is sent to the translator in chunks of ``chunk_size`` rows. After
      every successful chunk the whole frame is written to
      ``<name>_translated<suffix>`` next to the input, so intermediate results
      are kept on disk.

      Args:
          input_file: Path of the CSV file to translate.
          chunk_size: Number of rows sent to the translator per call (>= 1).
          translator: Object exposing ``_batch_translate(list)``; it is
              expected to return one translation per input item. When omitted,
              environment variables are loaded with ``load_dotenv()`` and an
              ``OpenAITranslator`` is created.
          max_retries: Number of consecutive retries allowed for a single
              chunk before the last error is re-raised.
          delay: Seconds to sleep between successful chunks (rate limiting).
          retry_delay: Seconds to sleep before retrying a failed chunk.

      Raises:
          ValueError: If ``chunk_size`` is smaller than 1, or the file has
              fewer than two columns. ``DataFrame.insert`` also raises it when
              a column named ``C`` already exists.
          Exception: The last error of a chunk, once ``max_retries`` is
              exhausted.
      """
      if chunk_size < 1:
          # A non-positive chunk never advances the cursor and would loop forever.
          raise ValueError(f"chunk_size must be >= 1, got {chunk_size!r}")

      # Same object as the module-level logger; looked up here so the function
      # does not depend on module globals.
      log = logging.getLogger(__name__)

      # Read the CSV file; the header line becomes the column labels.
      df = pd.read_csv(input_file)
      if df.shape[1] < 2:
          raise ValueError(
              f"{input_file!r} needs at least two columns, found {df.shape[1]}"
          )

      # Insert an empty column to the right of column B.
      df.insert(2, 'C', '')

      # Initialise the translator only when the caller did not supply one.
      if translator is None:
          load_dotenv()
          translator = OpenAITranslator()

      output_file = _translated_path(input_file)
      total_rows = len(df)
      if total_rows == 0:
          # Nothing to translate, but still produce the output file.
          df.to_csv(output_file, index=False)

      # read_csv already consumed the header, so row 0 is the first data row.
      start_row = 0
      failures = 0
      while start_row < total_rows:
          end_row = min(start_row + chunk_size, total_rows)
          log.info("Processing rows %s to %s", start_row, end_row)
          try:
              # NOTE(review): empty cells arrive as NaN floats; confirm the
              # translator tolerates them.
              data_to_translate = df.iloc[start_row:end_row, 1].tolist()
              translated_texts = translator._batch_translate(data_to_translate)
              # Write the translations into column C.
              df.iloc[start_row:end_row, 2] = translated_texts
              # Save the intermediate result.
              df.to_csv(output_file, index=False)
          except Exception as e:
              failures += 1
              log.error(
                  "Error processing rows %s-%s: %s", start_row, end_row, e
              )
              if failures > max_retries:
                  # A persistent failure must surface instead of looping forever.
                  raise
              log.info("Retrying after %s seconds...", retry_delay)
              time.sleep(retry_delay)
              continue

          failures = 0
          start_row = end_row
          if start_row < total_rows:
              # Pause between chunks to stay under the rate limit; there is
              # nothing to wait for after the final chunk.
              time.sleep(delay)

      log.info("Translation completed successfully")
  if __name__ == '__main__':
      import sys

      # Take the CSV path from the first command-line argument when one is
      # given; otherwise fall back to the previously hard-coded path.
      default_input = '/home/mrh/code/excel_tool/temp/测试.csv.utf8.csv'
      input_file = sys.argv[1] if len(sys.argv) > 1 else default_input
      translate_csv(input_file)