|
@@ -7,6 +7,7 @@ from mylib.logging_config import setup_logging
|
|
|
setup_logging()
|
|
setup_logging()
|
|
|
logger = logging.getLogger(__name__)
|
|
logger = logging.getLogger(__name__)
|
|
|
logger.info(f"{__file__}")
|
|
logger.info(f"{__file__}")
|
|
|
|
|
+
|
|
|
def detect_encoding(file_path: str, sample_size: int = 100000) -> str:
|
|
def detect_encoding(file_path: str, sample_size: int = 100000) -> str:
|
|
|
"""检测文件编码
|
|
"""检测文件编码
|
|
|
|
|
|
|
@@ -36,13 +37,13 @@ def detect_encoding(file_path: str, sample_size: int = 100000) -> str:
|
|
|
# 如果置信度低于阈值或编码为 None,尝试其他常见编码
|
|
# 如果置信度低于阈值或编码为 None,尝试其他常见编码
|
|
|
if not encoding or confidence < 0.7:
|
|
if not encoding or confidence < 0.7:
|
|
|
logger.warning(f"Low confidence in detected encoding {encoding}, trying common encodings")
|
|
logger.warning(f"Low confidence in detected encoding {encoding}, trying common encodings")
|
|
|
- return 'utf-8-sig'
|
|
|
|
|
|
|
+ return 'shift_jis' # 优先返回日文编码
|
|
|
|
|
|
|
|
return encoding
|
|
return encoding
|
|
|
|
|
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
logger.error(f"Error detecting encoding for {file_path}: {e}")
|
|
logger.error(f"Error detecting encoding for {file_path}: {e}")
|
|
|
- return 'utf-8-sig' # 返回默认编码而不是退出
|
|
|
|
|
|
|
+ return 'shift_jis' # 返回日文编码作为默认值
|
|
|
|
|
|
|
|
|
|
|
|
|
def read_csv(file_path: str) -> List[List[str]]:
|
|
def read_csv(file_path: str) -> List[List[str]]:
|
|
@@ -54,8 +55,18 @@ def read_csv(file_path: str) -> List[List[str]]:
|
|
|
Returns:
|
|
Returns:
|
|
|
包含CSV数据的二维列表
|
|
包含CSV数据的二维列表
|
|
|
"""
|
|
"""
|
|
|
- # 常见编码列表,按优先级排序,优先尝试日文编码
|
|
|
|
|
- encodings_to_try = ['shift_jis', 'euc-jp', 'utf-8-sig', 'gb18030', 'iso-8859-1', 'latin1']
|
|
|
|
|
|
|
+ # 常见编码列表,优先尝试日文编码
|
|
|
|
|
+ encodings_to_try = [
|
|
|
|
|
+ 'shift_jis', # 日文常用编码
|
|
|
|
|
+ 'cp932', # Windows日文编码
|
|
|
|
|
+ 'euc-jp', # 日文EUC编码
|
|
|
|
|
+ 'iso-2022-jp',# 日文JIS编码
|
|
|
|
|
+ 'utf-8-sig', # UTF-8 with BOM
|
|
|
|
|
+ 'gb18030', # 中文编码
|
|
|
|
|
+ 'big5', # 繁体中文
|
|
|
|
|
+ 'iso-8859-1',
|
|
|
|
|
+ 'latin1'
|
|
|
|
|
+ ]
|
|
|
|
|
|
|
|
# 先尝试检测编码
|
|
# 先尝试检测编码
|
|
|
detected_encoding = detect_encoding(file_path)
|
|
detected_encoding = detect_encoding(file_path)
|
|
@@ -77,6 +88,13 @@ def read_csv(file_path: str) -> List[List[str]]:
|
|
|
for row in data[:5]:
|
|
for row in data[:5]:
|
|
|
logger.debug(f"Row: {row}")
|
|
logger.debug(f"Row: {row}")
|
|
|
|
|
|
|
|
|
|
+ # 检查日文字符是否正确解码
|
|
|
|
|
+ if encoding.startswith(('shift_jis', 'cp932', 'euc-jp')):
|
|
|
|
|
+ japanese_chars = ''.join([cell for row in data[:5] for cell in row])
|
|
|
|
|
+ if not any('\u3040' <= char <= '\u30ff' for char in japanese_chars): # 检查是否包含日文字符
|
|
|
|
|
+ logger.warning(f"Japanese characters not detected with {encoding}, trying next encoding")
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
# 将数据写入新的utf-8编码文件
|
|
# 将数据写入新的utf-8编码文件
|
|
|
output_file_path = file_path + '.utf8.csv'
|
|
output_file_path = file_path + '.utf8.csv'
|
|
|
with open(output_file_path, 'w', encoding='utf-8', newline='') as f_out:
|
|
with open(output_file_path, 'w', encoding='utf-8', newline='') as f_out:
|