Kaynağa Gözat

fix: improve encoding detection and error handling in CSV processing

Your Name (aider) 1 yıl önce
ebeveyn
işleme
ad43844bf3
1 değiştirilmiş dosya ile 27 ekleme ve 24 silme
  translate_new_col.py  +27 −24

+ 27 - 24
translate_new_col.py

@@ -2,6 +2,7 @@ import os
 import csv
 from urllib.parse import quote
 from ai_trans import translate_sentences
+import chardet
 
 def create_search_link(value):
     """为搜索词创建亚马逊搜索链接"""
@@ -9,29 +10,25 @@ def create_search_link(value):
 
 def detect_encoding(file_path):
     """检测文件编码"""
-    # 常见日文编码列表
-    encodings = ['utf-8-sig', 'shift_jis', 'euc-jp', 'utf-16', 'cp932', 'iso-2022-jp']
-    
-    # 尝试读取文件
-    for encoding in encodings:
-        try:
-            with open(file_path, 'r', encoding=encoding) as f:
-                f.read(1024)  # 读取前1024字节测试
-                return encoding
-        except UnicodeDecodeError:
-            continue
-    
-    # 如果常见编码都失败,尝试二进制读取检测BOM
+    # 使用chardet进行更可靠的编码检测
     with open(file_path, 'rb') as f:
-        bom = f.read(4)
-        if bom.startswith(b'\xef\xbb\xbf'):
-            return 'utf-8-sig'
-        elif bom.startswith(b'\xff\xfe'):
-            return 'utf-16'
-        elif bom.startswith(b'\xfe\xff'):
-            return 'utf-16-be'
-    
-    return 'cp932'  # 默认返回日文常用编码
+        raw_data = f.read(10000)  # 读取前10000字节用于检测
+        result = chardet.detect(raw_data)
+        encoding = result['encoding']
+        
+        # 处理一些常见的不准确检测结果
+        if encoding == 'SHIFT_JIS':
+            return 'shift_jis'
+        elif encoding == 'EUC-JP':
+            return 'euc-jp'
+        elif encoding == 'ISO-8859-1':
+            # 可能是UTF-8被误判为ISO-8859-1
+            try:
+                raw_data.decode('utf-8')
+                return 'utf-8'
+            except:
+                return 'cp932'
+        return encoding or 'utf-8'
 
 def read_csv(file_path):
     """读取CSV文件并返回数据列表"""
@@ -39,11 +36,17 @@ def read_csv(file_path):
     print(f"Detected encoding: {encoding}")
     
     try:
-        with open(file_path, mode='r', encoding=encoding) as file:
+        with open(file_path, mode='r', encoding=encoding, errors='replace') as file:
             # 使用csv.Sniffer检测分隔符
-            dialect = csv.Sniffer().sniff(file.read(1024))
+            sample = file.read(1024)
             file.seek(0)
             
+            try:
+                dialect = csv.Sniffer().sniff(sample)
+            except:
+                # 如果无法自动检测,使用默认设置
+                dialect = csv.excel
+                
             reader = csv.reader(file, dialect)
             data = [row for row in reader]