Kaynağa Gözat

fix: improve encoding detection and error handling in CSV processing

Your Name (aider) 1 yıl önce
ebeveyn
işleme
ad43844bf3
1 değiştirilmiş dosya ile 27 ekleme ve 24 silme
  translate_new_col.py  +27 −24

+ 27 - 24
translate_new_col.py

@@ -2,6 +2,7 @@ import os
 import csv
 from urllib.parse import quote
 from ai_trans import translate_sentences
+import chardet
 
 def create_search_link(value):
     """为搜索词创建亚马逊搜索链接"""
@@ -9,29 +10,25 @@ def create_search_link(value):
 
 def detect_encoding(file_path):
     """检测文件编码"""
-    # 常见日文编码列表
-    encodings = ['utf-8-sig', 'shift_jis', 'euc-jp', 'utf-16', 'cp932', 'iso-2022-jp']
-    
-    # 尝试读取文件
-    for encoding in encodings:
-        try:
-            with open(file_path, 'r', encoding=encoding) as f:
-                f.read(1024)  # 读取前1024字节测试
-                return encoding
-        except UnicodeDecodeError:
-            continue
-    
-    # 如果常见编码都失败,尝试二进制读取检测BOM
+    # 使用chardet进行更可靠的编码检测
     with open(file_path, 'rb') as f:
-        bom = f.read(4)
-        if bom.startswith(b'\xef\xbb\xbf'):
-            return 'utf-8-sig'
-        elif bom.startswith(b'\xff\xfe'):
-            return 'utf-16'
-        elif bom.startswith(b'\xfe\xff'):
-            return 'utf-16-be'
-    
-    return 'cp932'  # 默认返回日文常用编码
+        raw_data = f.read(10000)  # 读取前10000字节用于检测
+        result = chardet.detect(raw_data)
+        encoding = result['encoding']
+        
+        # 处理一些常见的不准确检测结果
+        if encoding == 'SHIFT_JIS':
+            return 'shift_jis'
+        elif encoding == 'EUC-JP':
+            return 'euc-jp'
+        elif encoding == 'ISO-8859-1':
+            # 可能是UTF-8被误判为ISO-8859-1
+            try:
+                raw_data.decode('utf-8')
+                return 'utf-8'
+            except:
+                return 'cp932'
+        return encoding or 'utf-8'
 
 def read_csv(file_path):
     """读取CSV文件并返回数据列表"""
@@ -39,11 +36,17 @@ def read_csv(file_path):
     print(f"Detected encoding: {encoding}")
     
     try:
-        with open(file_path, mode='r', encoding=encoding) as file:
+        with open(file_path, mode='r', encoding=encoding, errors='replace') as file:
             # 使用csv.Sniffer检测分隔符
-            dialect = csv.Sniffer().sniff(file.read(1024))
+            sample = file.read(1024)
             file.seek(0)
             
+            try:
+                dialect = csv.Sniffer().sniff(sample)
+            except:
+                # 如果无法自动检测,使用默认设置
+                dialect = csv.excel
+                
             reader = csv.reader(file, dialect)
             data = [row for row in reader]