Bläddra i källkod

feat: prioritize Japanese encodings and add character validation

mrh (aider) 1 år sedan
förälder
incheckning
4bc0f13871
1 ändrad fil med 22 tillägg och 4 borttagningar
  1. 22 4
      mylib/read_encoding_cvs.py

+ 22 - 4
mylib/read_encoding_cvs.py

@@ -7,6 +7,7 @@ from mylib.logging_config import setup_logging
 setup_logging()
 logger = logging.getLogger(__name__)
 logger.info(f"{__file__}")
+
 def detect_encoding(file_path: str, sample_size: int = 100000) -> str:
     """检测文件编码
     
@@ -36,13 +37,13 @@ def detect_encoding(file_path: str, sample_size: int = 100000) -> str:
             # 如果置信度低于阈值或编码为 None,尝试其他常见编码
             if not encoding or confidence < 0.7:
                 logger.warning(f"Low confidence in detected encoding {encoding}, trying common encodings")
-                return 'utf-8-sig'
+                return 'shift_jis'  # 优先返回日文编码
                 
             return encoding
             
     except Exception as e:
         logger.error(f"Error detecting encoding for {file_path}: {e}")
-        return 'utf-8-sig'  # 返回默认编码而不是退出
+        return 'shift_jis'  # 返回日文编码作为默认值
 
 
 def read_csv(file_path: str) -> List[List[str]]:
@@ -54,8 +55,18 @@ def read_csv(file_path: str) -> List[List[str]]:
     Returns:
         包含CSV数据的二维列表
     """
-    # 常见编码列表,按优先级排序,优先尝试日文编码
-    encodings_to_try = ['shift_jis', 'euc-jp', 'utf-8-sig', 'gb18030', 'iso-8859-1', 'latin1']
+    # 常见编码列表,优先尝试日文编码
+    encodings_to_try = [
+        'shift_jis',  # 日文常用编码
+        'cp932',      # Windows日文编码
+        'euc-jp',     # 日文EUC编码
+        'iso-2022-jp',# 日文JIS编码
+        'utf-8-sig',  # UTF-8 with BOM
+        'gb18030',    # 中文编码
+        'big5',       # 繁体中文
+        'iso-8859-1',
+        'latin1'
+    ]
     
     # 先尝试检测编码
     detected_encoding = detect_encoding(file_path)
@@ -77,6 +88,13 @@ def read_csv(file_path: str) -> List[List[str]]:
                 for row in data[:5]:
                     logger.debug(f"Row: {row}")
                 
+                # 检查日文字符是否正确解码
+                if encoding.startswith(('shift_jis', 'cp932', 'euc-jp')):
+                    japanese_chars = ''.join([cell for row in data[:5] for cell in row])
+                    if not any('\u3040' <= char <= '\u30ff' for char in japanese_chars):  # 检查是否包含日文字符
+                        logger.warning(f"Japanese characters not detected with {encoding}, trying next encoding")
+                        continue
+                
                 # 将数据写入新的utf-8编码文件
                 output_file_path = file_path + '.utf8.csv'
                 with open(output_file_path, 'w', encoding='utf-8', newline='') as f_out: