|
|
@@ -1,8 +1,14 @@
|
|
|
import pandas as pd
|
|
|
from dotenv import load_dotenv
|
|
|
from mylib.pdfzh_translator import OpenAITranslator
|
|
|
+import time
|
|
|
+import logging
|
|
|
|
|
|
-def translate_csv(input_file):
|
|
|
+# 配置日志
|
|
|
+logging.basicConfig(level=logging.INFO)
|
|
|
+logger = logging.getLogger(__name__)
|
|
|
+
|
|
|
+def translate_csv(input_file, chunk_size=100):
|
|
|
# 加载环境变量
|
|
|
load_dotenv()
|
|
|
|
|
|
@@ -12,21 +18,43 @@ def translate_csv(input_file):
|
|
|
# 在B列右边插入一列空列
|
|
|
df.insert(2, 'C', '')
|
|
|
|
|
|
- # 获取B列从第二行开始的数据(跳过标题行)
|
|
|
- data_to_translate = df.iloc[1:, 1].tolist()
|
|
|
-
|
|
|
# 初始化翻译器
|
|
|
translator = OpenAITranslator()
|
|
|
|
|
|
- # 批量翻译
|
|
|
- translated_texts = translator._batch_translate(data_to_translate)
|
|
|
-
|
|
|
- # 将翻译结果写入C列,从第二行开始
|
|
|
- df.iloc[1:, 2] = translated_texts
|
|
|
-
|
|
|
- # 保存修改后的CSV文件
|
|
|
- output_file = input_file.replace('.csv', '_translated.csv')
|
|
|
- df.to_csv(output_file, index=False)
|
|
|
+ # 分块处理数据
|
|
|
+ total_rows = len(df)
|
|
|
+ start_row = 1 # 从第二行开始(跳过标题)
|
|
|
+
|
|
|
+ while start_row < total_rows:
|
|
|
+ end_row = min(start_row + chunk_size, total_rows)
|
|
|
+ logger.info(f"Processing rows {start_row} to {end_row}")
|
|
|
+
|
|
|
+ try:
|
|
|
+ # 获取当前块的数据
|
|
|
+ data_to_translate = df.iloc[start_row:end_row, 1].tolist()
|
|
|
+
|
|
|
+ # 批量翻译
|
|
|
+ translated_texts = translator._batch_translate(data_to_translate)
|
|
|
+
|
|
|
+ # 将翻译结果写入C列
|
|
|
+ df.iloc[start_row:end_row, 2] = translated_texts
|
|
|
+
|
|
|
+ # 保存中间结果
|
|
|
+ output_file = input_file.replace('.csv', '_translated.csv')
|
|
|
+ df.to_csv(output_file, index=False)
|
|
|
+
|
|
|
+ # 更新起始行
|
|
|
+ start_row = end_row
|
|
|
+
|
|
|
+ # 添加延迟以避免速率限制
|
|
|
+ time.sleep(1)
|
|
|
+
|
|
|
+ except Exception as e:
|
|
|
+ logger.error(f"Error processing rows {start_row}-{end_row}: {str(e)}")
|
|
|
+ logger.info("Retrying after 5 seconds...")
|
|
|
+ time.sleep(5)
|
|
|
+
|
|
|
+ logger.info("Translation completed successfully")
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
input_file = '/home/mrh/code/excel_tool/temp/测试.csv.utf8.csv'
|