# process_data.py — CSV translation and Amazon-link enrichment script.
  1. import csv
  2. import chardet
  3. import sys
  4. import logging
  5. from pathlib import Path
  6. from mylib.pdfzh_translator import OpenAITranslator
  7. from brand_add_url_link import create_hyperlink, create_asin_link
  8. from mylib.logging_config import setup_logging
# Configure the project-wide logging handlers/format (project helper),
# then obtain a module-level logger for this script.
setup_logging()
logger = logging.getLogger(__name__)
  12. def detect_encoding(file_path):
  13. try:
  14. with open(file_path, 'rb') as f:
  15. raw_data = f.read()
  16. result = chardet.detect(raw_data)
  17. return result['encoding']
  18. except Exception as e:
  19. logger.error(f"Error detecting encoding for {file_path}: {e}")
  20. sys.exit(1)
  21. def read_csv(file_path):
  22. encodings_to_try = ['utf-8-sig', 'gb18030', 'shift_jis', 'euc-jp']
  23. detected_encoding = detect_encoding(file_path)
  24. logger.info(f"Detected encoding: {detected_encoding}")
  25. if detected_encoding:
  26. encodings_to_try.insert(0, detected_encoding)
  27. for encoding in encodings_to_try:
  28. try:
  29. with open(file_path, 'r', encoding=encoding) as f:
  30. reader = csv.reader(f)
  31. return list(reader)
  32. except UnicodeDecodeError:
  33. continue
  34. except Exception as e:
  35. logger.error(f"Error with encoding {encoding}: {e}")
  36. continue
  37. logger.error("Failed to read file with all attempted encodings")
  38. sys.exit(1)
  39. def insert_empty_columns(data, column_indices):
  40. """在指定列之后插入空列"""
  41. try:
  42. # 按从大到小排序,防止插入影响后续索引
  43. column_indices.sort(reverse=True)
  44. for row in data:
  45. for index in column_indices:
  46. row.insert(index + 1, '')
  47. return data
  48. except Exception as e:
  49. logger.error(f"Error inserting empty columns: {e}")
  50. sys.exit(1)
  51. def process_batch_translations(data, search_term_index, category_indices):
  52. """批量处理翻译"""
  53. try:
  54. # 初始化翻译器
  55. translator = OpenAITranslator("openai", "zh-CN", "en", "gpt-3.5-turbo")
  56. # 收集所有需要翻译的文本
  57. translation_batches = {
  58. 'search_terms': [row[search_term_index] for row in data[2:]], # 从第三行开始
  59. 'categories': []
  60. }
  61. # 收集类别翻译
  62. for index in category_indices:
  63. translation_batches['categories'].extend([row[index] for row in data[2:]]) # 从第三行开始
  64. # 批量翻译
  65. logger.info("Starting batch translations...")
  66. search_translations = translator.translate(translation_batches['search_terms'])
  67. category_translations = translator.translate(translation_batches['categories'])
  68. logger.info("Batch translations completed")
  69. # 更新数据
  70. for i, row in enumerate(data[2:], start=2): # 从第三行开始处理
  71. try:
  72. # 更新搜索词翻译列
  73. row[search_term_index + 1] = search_translations[i-2]
  74. # 添加亚马逊搜索链接(跳过标题行)
  75. amazon_url = f"https://www.amazon.co.jp/s?k={row[search_term_index]}"
  76. row[search_term_index] = create_hyperlink(row[search_term_index], amazon_url)
  77. # 更新类别翻译
  78. category_trans_index = (i-2) * len(category_indices)
  79. for cat_index in category_indices:
  80. row[cat_index + 1] = category_translations[category_trans_index]
  81. category_trans_index += 1
  82. except Exception as e:
  83. logger.error(f"Error processing row {i}: {e}")
  84. sys.exit(1)
  85. return data
  86. except Exception as e:
  87. logger.error(f"Error in batch translation: {e}")
  88. sys.exit(1)
  89. def add_brand_asin_links(data, brand_indices, asin_indices):
  90. """为品牌和ASIN列添加链接"""
  91. try:
  92. for row in data[2:]: # 从第三行开始处理
  93. # 处理品牌列
  94. for index in brand_indices:
  95. if index < len(row) and row[index]:
  96. row[index] = create_hyperlink(row[index], 'https://www.amazon.co.jp/s?k=')
  97. # 处理ASIN列
  98. for index in asin_indices:
  99. if index < len(row) and row[index]:
  100. row[index] = create_asin_link(row[index])
  101. return data
  102. except Exception as e:
  103. logger.error(f"Error adding brand/ASIN links: {e}")
  104. sys.exit(1)
  105. def save_csv(data, file_path):
  106. try:
  107. with open(file_path, 'w', encoding='utf-8-sig', newline='') as f:
  108. writer = csv.writer(f)
  109. writer.writerows(data)
  110. except Exception as e:
  111. logger.error(f"Error saving CSV to {file_path}: {e}")
  112. sys.exit(1)
def main(input_file, output_file):
    """End-to-end pipeline: read CSV, add translation columns, translate,
    add brand/ASIN hyperlinks, and save the result."""
    try:
        # Read CSV with proper encoding
        data = read_csv(input_file)
        # Column indices to process (0-based, pre-insertion layout)
        search_term_index = 1  # search-term column
        brand_indices = [2, 3, 4]  # brand columns
        asin_indices = [7, 11, 15]  # ASIN columns
        category_indices = [5, 6, 7]  # category columns
        # Insert empty columns that will hold the translations
        insert_indices = [search_term_index] + category_indices
        data = insert_empty_columns(data, insert_indices)
        # Update the header row.
        # NOTE(review): insert_empty_columns already inserted empty cells
        # into EVERY row (header included), yet the loop below inserts
        # into data[0] a second time; also category_indices, brand_indices
        # and asin_indices are not adjusted for the column inserted after
        # search_term_index, so downstream steps may read shifted columns.
        # Verify the intended final layout against a real input file.
        data[0][search_term_index + 1] = "中文翻译"
        for index in category_indices:
            data[0].insert(index + 1, "中文翻译")
        # Run the batch translations
        data = process_batch_translations(data, search_term_index, category_indices)
        # Add brand and ASIN hyperlinks
        data = add_brand_asin_links(data, brand_indices, asin_indices)
        # Save the processed data
        save_csv(data, output_file)
        logger.info(f"Successfully processed and saved to {output_file}")
    except Exception as e:
        logger.error(f"Error processing file: {e}")
        sys.exit(1)
  139. if __name__ == "__main__":
  140. output_dir = Path('temp')
  141. input_file = output_dir/"测试.csv"
  142. output_file = output_dir/"processed_测试.csv"
  143. main(input_file, output_file)