|
@@ -2,6 +2,7 @@ import os
|
|
|
import csv
|
|
import csv
|
|
|
from urllib.parse import quote
|
|
from urllib.parse import quote
|
|
|
from ai_trans import translate_sentences
|
|
from ai_trans import translate_sentences
|
|
|
|
|
+import chardet
|
|
|
|
|
|
|
|
def create_search_link(value):
|
|
def create_search_link(value):
|
|
|
"""为搜索词创建亚马逊搜索链接"""
|
|
"""为搜索词创建亚马逊搜索链接"""
|
|
@@ -9,29 +10,25 @@ def create_search_link(value):
|
|
|
|
|
|
|
|
def detect_encoding(file_path):
|
|
def detect_encoding(file_path):
|
|
|
"""检测文件编码"""
|
|
"""检测文件编码"""
|
|
|
- # 常见日文编码列表
|
|
|
|
|
- encodings = ['utf-8-sig', 'shift_jis', 'euc-jp', 'utf-16', 'cp932', 'iso-2022-jp']
|
|
|
|
|
-
|
|
|
|
|
- # 尝试读取文件
|
|
|
|
|
- for encoding in encodings:
|
|
|
|
|
- try:
|
|
|
|
|
- with open(file_path, 'r', encoding=encoding) as f:
|
|
|
|
|
- f.read(1024) # 读取前1024字节测试
|
|
|
|
|
- return encoding
|
|
|
|
|
- except UnicodeDecodeError:
|
|
|
|
|
- continue
|
|
|
|
|
-
|
|
|
|
|
- # 如果常见编码都失败,尝试二进制读取检测BOM
|
|
|
|
|
|
|
+ # 使用chardet进行更可靠的编码检测
|
|
|
with open(file_path, 'rb') as f:
|
|
with open(file_path, 'rb') as f:
|
|
|
- bom = f.read(4)
|
|
|
|
|
- if bom.startswith(b'\xef\xbb\xbf'):
|
|
|
|
|
- return 'utf-8-sig'
|
|
|
|
|
- elif bom.startswith(b'\xff\xfe'):
|
|
|
|
|
- return 'utf-16'
|
|
|
|
|
- elif bom.startswith(b'\xfe\xff'):
|
|
|
|
|
- return 'utf-16-be'
|
|
|
|
|
-
|
|
|
|
|
- return 'cp932' # 默认返回日文常用编码
|
|
|
|
|
|
|
+ raw_data = f.read(10000) # 读取前10000字节用于检测
|
|
|
|
|
+ result = chardet.detect(raw_data)
|
|
|
|
|
+ encoding = result['encoding']
|
|
|
|
|
+
|
|
|
|
|
+ # 处理一些常见的不准确检测结果
|
|
|
|
|
+ if encoding == 'SHIFT_JIS':
|
|
|
|
|
+ return 'shift_jis'
|
|
|
|
|
+ elif encoding == 'EUC-JP':
|
|
|
|
|
+ return 'euc-jp'
|
|
|
|
|
+ elif encoding == 'ISO-8859-1':
|
|
|
|
|
+ # 可能是UTF-8被误判为ISO-8859-1
|
|
|
|
|
+ try:
|
|
|
|
|
+ raw_data.decode('utf-8')
|
|
|
|
|
+ return 'utf-8'
|
|
|
|
|
+ except:
|
|
|
|
|
+ return 'cp932'
|
|
|
|
|
+ return encoding or 'utf-8'
|
|
|
|
|
|
|
|
def read_csv(file_path):
|
|
def read_csv(file_path):
|
|
|
"""读取CSV文件并返回数据列表"""
|
|
"""读取CSV文件并返回数据列表"""
|
|
@@ -39,11 +36,17 @@ def read_csv(file_path):
|
|
|
print(f"Detected encoding: {encoding}")
|
|
print(f"Detected encoding: {encoding}")
|
|
|
|
|
|
|
|
try:
|
|
try:
|
|
|
- with open(file_path, mode='r', encoding=encoding) as file:
|
|
|
|
|
|
|
+ with open(file_path, mode='r', encoding=encoding, errors='replace') as file:
|
|
|
# 使用csv.Sniffer检测分隔符
|
|
# 使用csv.Sniffer检测分隔符
|
|
|
- dialect = csv.Sniffer().sniff(file.read(1024))
|
|
|
|
|
|
|
+ sample = file.read(1024)
|
|
|
file.seek(0)
|
|
file.seek(0)
|
|
|
|
|
|
|
|
|
|
+ try:
|
|
|
|
|
+ dialect = csv.Sniffer().sniff(sample)
|
|
|
|
|
+ except:
|
|
|
|
|
+ # 如果无法自动检测,使用默认设置
|
|
|
|
|
+ dialect = csv.excel
|
|
|
|
|
+
|
|
|
reader = csv.reader(file, dialect)
|
|
reader = csv.reader(file, dialect)
|
|
|
data = [row for row in reader]
|
|
data = [row for row in reader]
|
|
|
|
|
|