ソースを参照

feat: add chunk processing and error handling for CSV translation

mrh (aider) 1 年前
コミット
d4400ae2fa
1 ファイル変更、41 行追加、13 行削除
  1. 41 13
      excel2sql/translate_csv.py

+ 41 - 13
excel2sql/translate_csv.py

@@ -1,8 +1,14 @@
 import pandas as pd
 from dotenv import load_dotenv
 from mylib.pdfzh_translator import OpenAITranslator
+import time
+import logging
 
-def translate_csv(input_file):
+# 配置日志
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+def translate_csv(input_file, chunk_size=100):
     # 加载环境变量
     load_dotenv()
 
@@ -12,21 +18,43 @@ def translate_csv(input_file):
     # 在B列右边插入一列空列
     df.insert(2, 'C', '')
 
-    # 获取B列从第二行开始的数据(跳过标题行)
-    data_to_translate = df.iloc[1:, 1].tolist()
-
     # 初始化翻译器
     translator = OpenAITranslator()
 
-    # 批量翻译
-    translated_texts = translator._batch_translate(data_to_translate)
-
-    # 将翻译结果写入C列,从第二行开始
-    df.iloc[1:, 2] = translated_texts
-
-    # 保存修改后的CSV文件
-    output_file = input_file.replace('.csv', '_translated.csv')
-    df.to_csv(output_file, index=False)
+    # 分块处理数据
+    total_rows = len(df)
+    start_row = 1  # 从第二行开始(跳过标题)
+    
+    while start_row < total_rows:
+        end_row = min(start_row + chunk_size, total_rows)
+        logger.info(f"Processing rows {start_row} to {end_row}")
+        
+        try:
+            # 获取当前块的数据
+            data_to_translate = df.iloc[start_row:end_row, 1].tolist()
+            
+            # 批量翻译
+            translated_texts = translator._batch_translate(data_to_translate)
+            
+            # 将翻译结果写入C列
+            df.iloc[start_row:end_row, 2] = translated_texts
+            
+            # 保存中间结果
+            output_file = input_file.replace('.csv', '_translated.csv')
+            df.to_csv(output_file, index=False)
+            
+            # 更新起始行
+            start_row = end_row
+            
+            # 添加延迟以避免速率限制
+            time.sleep(1)
+            
+        except Exception as e:
+            logger.error(f"Error processing rows {start_row}-{end_row}: {str(e)}")
+            logger.info("Retrying after 5 seconds...")
+            time.sleep(5)
+
+    logger.info("Translation completed successfully")
 
 if __name__ == '__main__':
     input_file = '/home/mrh/code/excel_tool/temp/测试.csv.utf8.csv'