1 год назад · 4bc0f13871
--- a/mylib/read_encoding_cvs.py
+++ b/mylib/read_encoding_cvs.py
@@ -7,6 +7,7 @@ from mylib.logging_config import setup_logging
 
															 setup_logging()
														
 
															 logger = logging.getLogger(__name__)
														
 
															 logger.info(f"{__file__}")
														
 
															+
														
 
															 def detect_encoding(file_path: str, sample_size: int = 100000) -> str:
														
 
															     """检测文件编码
														
@@ -36,13 +37,13 @@ def detect_encoding(file_path: str, sample_size: int = 100000) -> str:
 
															             # 如果置信度低于阈值或编码为 None，尝试其他常见编码
														
 
															             if not encoding or confidence < 0.7:
														
 
															                 logger.warning(f"Low confidence in detected encoding {encoding}, trying common encodings")
														
 
															-                return 'utf-8-sig'
														
 
															+                return 'shift_jis'  # 优先返回日文编码
														
 
															             return encoding
														
 
															     except Exception as e:
														
 
															         logger.error(f"Error detecting encoding for {file_path}: {e}")
														
 
															-        return 'utf-8-sig'  # 返回默认编码而不是退出
														
 
															+        return 'shift_jis'  # 返回日文编码作为默认值
														
 
															 def read_csv(file_path: str) -> List[List[str]]:
														
@@ -54,8 +55,18 @@ def read_csv(file_path: str) -> List[List[str]]:
 
															     Returns:
														
 
															         包含CSV数据的二维列表
														
 
															     """
														
 
															-    # 常见编码列表，按优先级排序，优先尝试日文编码
														
 
															-    encodings_to_try = ['shift_jis', 'euc-jp', 'utf-8-sig', 'gb18030', 'iso-8859-1', 'latin1']
														
 
															+    # 常见编码列表，优先尝试日文编码
														
 
															+    encodings_to_try = [
														
 
															+        'shift_jis',  # 日文常用编码
														
 
															+        'cp932',      # Windows日文编码
														
 
															+        'euc-jp',     # 日文EUC编码
														
 
															+        'iso-2022-jp',# 日文JIS编码
														
 
															+        'utf-8-sig',  # UTF-8 with BOM
														
 
															+        'gb18030',    # 中文编码
														
 
															+        'big5',       # 繁体中文
														
 
															+        'iso-8859-1',
														
 
															+        'latin1'
														
 
															+    ]
														
 
															     # 先尝试检测编码
														
 
															     detected_encoding = detect_encoding(file_path)
														
@@ -77,6 +88,13 @@ def read_csv(file_path: str) -> List[List[str]]:
 
															                 for row in data[:5]:
														
 
															                     logger.debug(f"Row: {row}")
														
 
															+                # 检查日文字符是否正确解码
														
 
															+                if encoding.startswith(('shift_jis', 'cp932', 'euc-jp')):
														
 
															+                    japanese_chars = ''.join([cell for row in data[:5] for cell in row])
														
 
															+                    if not any('\u3040' <= char <= '\u30ff' for char in japanese_chars):  # 检查是否包含日文字符
														
 
															+                        logger.warning(f"Japanese characters not detected with {encoding}, trying next encoding")
														
 
															+                        continue
														
 
															+                
														
 
															                 # 将数据写入新的utf-8编码文件
														
 
															                 output_file_path = file_path + '.utf8.csv'
														
 
															                 with open(output_file_path, 'w', encoding='utf-8', newline='') as f_out: