1 år sedan · 4bc0f13871
--- a/mylib/read_encoding_cvs.py
+++ b/mylib/read_encoding_cvs.py
@@ -7,6 +7,7 @@ from mylib.logging_config import setup_logging
 
				 setup_logging()
			
 
				 logger = logging.getLogger(__name__)
			
 
				 logger.info(f"{__file__}")
			
 
				+
			
 
				 def detect_encoding(file_path: str, sample_size: int = 100000) -> str:
			
 
				     """检测文件编码
			
 
				     
			
@@ -36,13 +37,13 @@ def detect_encoding(file_path: str, sample_size: int = 100000) -> str:
 
				             # 如果置信度低于阈值或编码为 None，尝试其他常见编码
			
 
				             if not encoding or confidence < 0.7:
			
 
				                 logger.warning(f"Low confidence in detected encoding {encoding}, trying common encodings")
			
 
				-                return 'utf-8-sig'
			
 
				+                return 'shift_jis'  # 优先返回日文编码
			
 
				                 
			
 
				             return encoding
			
 
				             
			
 
				     except Exception as e:
			
 
				         logger.error(f"Error detecting encoding for {file_path}: {e}")
			
 
				-        return 'utf-8-sig'  # 返回默认编码而不是退出
			
 
				+        return 'shift_jis'  # 返回日文编码作为默认值
			
 
				 
			
 
				 
			
 
				 def read_csv(file_path: str) -> List[List[str]]:
			
@@ -54,8 +55,18 @@ def read_csv(file_path: str) -> List[List[str]]:
 
				     Returns:
			
 
				         包含CSV数据的二维列表
			
 
				     """
			
 
				-    # 常见编码列表，按优先级排序，优先尝试日文编码
			
 
				-    encodings_to_try = ['shift_jis', 'euc-jp', 'utf-8-sig', 'gb18030', 'iso-8859-1', 'latin1']
			
 
				+    # 常见编码列表，优先尝试日文编码
			
 
				+    encodings_to_try = [
			
 
				+        'shift_jis',  # 日文常用编码
			
 
				+        'cp932',      # Windows日文编码
			
 
				+        'euc-jp',     # 日文EUC编码
			
 
				+        'iso-2022-jp',# 日文JIS编码
			
 
				+        'utf-8-sig',  # UTF-8 with BOM
			
 
				+        'gb18030',    # 中文编码
			
 
				+        'big5',       # 繁体中文
			
 
				+        'iso-8859-1',
			
 
				+        'latin1'
			
 
				+    ]
			
 
				     
			
 
				     # 先尝试检测编码
			
 
				     detected_encoding = detect_encoding(file_path)
			
@@ -77,6 +88,13 @@ def read_csv(file_path: str) -> List[List[str]]:
 
				                 for row in data[:5]:
			
 
				                     logger.debug(f"Row: {row}")
			
 
				                 
			
 
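				+                # Heuristic check: require at least one hiragana/katakana character (U+3040-U+30FF) in the first 5 rows; kanji-only or ASCII-only data will not pass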
			
 
				+                if encoding.startswith(('shift_jis', 'cp932', 'euc-jp')):
			
 
				+                    japanese_chars = ''.join([cell for row in data[:5] for cell in row])
			
 
				+                    if not any('\u3040' <= char <= '\u30ff' for char in japanese_chars):  # 检查是否包含日文字符
			
 
				+                        logger.warning(f"Japanese characters not detected with {encoding}, trying next encoding")
			
 
				+                        continue
			
 
				+                
			
 
				                 # 将数据写入新的utf-8编码文件
			
 
				                 output_file_path = file_path + '.utf8.csv'
			
 
				                 with open(output_file_path, 'w', encoding='utf-8', newline='') as f_out: