encode_converter.h 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109
  1. /**
  2. * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
  3. * MIT License (https://opensource.org/licenses/MIT)
  4. */
  5. #ifndef __WS__ENCODE_CONVERTER_H__
  6. #define __WS__ENCODE_CONVERTER_H__
  7. #include <string>
  8. #include <stdint.h>
  9. #include <vector>
  10. #ifdef _MSC_VER
  11. #include <windows.h>
  12. #endif // _MSC_VER
  namespace funasr {

  // Code-unit and string aliases used throughout the converter API.
  // NOTE(review): the C++ standard only specifies std::char_traits for the
  // built-in character types; std::basic_string over unsigned char / unsigned
  // short relies on the standard library providing a generic char_traits as an
  // extension - confirm that every supported toolchain still does.
  typedef unsigned char U8CHAR_T;                  // one UTF-8 code unit
  typedef unsigned short U16CHAR_T;                // one UTF-16 code unit
  typedef std::basic_string<U8CHAR_T> u8string;    // string of UTF-8 code units
  typedef std::basic_string<U16CHAR_T> u16string;  // string of UTF-16 code units

  /// Static helpers for UTF-8 / UTF-16 conversion and simple text classification.
  /// Only IsChineseCharacter and (on MSVC) UTF8ToLocaleAnsi are defined in this
  /// header; every other member is declared here and implemented elsewhere, so
  /// the notes on those members are limited to what the declarations and the
  /// original comments state.
  class EncodeConverter {
   public:
    /// U+25A1 WHITE SQUARE.
    /// NOTE(review): presumably the substitute emitted for characters that
    /// cannot be converted - confirm in the implementation.
    static const U16CHAR_T defUniChar = 0x25a1;

    /// Byte-order helper for a UTF-16 buffer (see implementation for details).
    /// @param pbuf UTF-16 buffer
    /// @param len  length of the buffer
    static void SwapEndian(U16CHAR_T* pbuf, size_t len);

    /// @param pu16 UTF16 string
    /// @param pu8  UTF8 string (output)
    static size_t Utf16ToUtf8(const U16CHAR_T* pu16, U8CHAR_T* pu8);

    /// @param pu16 UTF16 string
    /// @param ilen length of the UTF16 input
    /// @param pu8  UTF8 string (output)
    /// @param olen size of the UTF8 output buffer
    static size_t Utf16ToUtf8(const U16CHAR_T* pu16, size_t ilen,
                              U8CHAR_T* pu8, size_t olen);

    /// @param u16str UTF16 string
    /// @return the UTF8 string
    static u8string Utf16ToUtf8(const u16string& u16str);

    /// @param pu8  UTF8 string
    /// @param pu16 UTF16 string (output)
    static size_t Utf8ToUtf16(const U8CHAR_T* pu8, U16CHAR_T* pu16);

    /// @param pu8  UTF8 string
    /// @param ilen length of the UTF8 input
    /// @param pu16 UTF16 string (output)
    static size_t Utf8ToUtf16(const U8CHAR_T* pu8, size_t ilen, U16CHAR_T* pu16);

    /// @param pu8  UTF8 string
    /// @param ilen length of the UTF8 input
    /// @param pu16 UTF16 string (output)
    /// @param olen size of the UTF16 output buffer
    static size_t Utf8ToUtf16(const U8CHAR_T* pu8, size_t ilen,
                              U16CHAR_T* pu16, size_t olen);

    /// @param u8str UTF8 string
    /// @return the UTF16 string
    static u16string Utf8ToUtf16(const u8string& u8str);

    /// @param pu8  string to examine
    /// @param ilen length of the string
    /// @return true if the string is encoded as UTF8, otherwise false
    static bool IsUTF8(const U8CHAR_T* pu8, size_t ilen);

    /// @param u8str string to examine
    /// @return true if the string is encoded as UTF8, otherwise false
    static bool IsUTF8(const u8string& u8str);

    /// @param pu8  UTF8 string
    /// @param ilen length of the string
    /// @return the number of words in the UTF8 string
    ///         (presumably the count of encoded characters - TODO confirm)
    static size_t GetUTF8Len(const U8CHAR_T* pu8, size_t ilen);

    /// @param u8str UTF8 string
    /// @return the number of words in the UTF8 string
    ///         (presumably the count of encoded characters - TODO confirm)
    static size_t GetUTF8Len(const u8string& u8str);

    /// @param pu16 UTF16 string
    /// @param ilen UTF16 length
    /// @return UTF8 string length
    static size_t Utf16ToUtf8Len(const U16CHAR_T* pu16, size_t ilen);

    /// NOTE(review): by its name and signature this decodes one character at
    /// `sc` to a 16-bit code unit and reports a length through `len` - confirm
    /// against the implementation.
    static uint16_t ToUni(const char* sc, int &len);

    /// Tests whether a UTF-16 code unit lies in one of the two ranges this
    /// class treats as Chinese characters.
    /// The argument is taken by value (it is only read), so constants,
    /// literals and temporaries can be passed as well as variables.
    /// @param u16 UTF-16 code unit to classify
    /// @return true for 0x4e00..0x9fff or 0x3400..0x4dff, otherwise false
    static bool IsChineseCharacter(U16CHAR_T u16) {
      const bool is_common = (u16 >= 0x4e00 && u16 <= 0x9fff);  // common
      // NOTE(review): CJK Extension A ends at U+4DBF; U+4DC0..U+4DFF are the
      // Yijing Hexagram Symbols. The existing upper bound is kept unchanged
      // here - confirm whether including them is intended.
      const bool is_rare = (u16 >= 0x3400 && u16 <= 0x4dff);  // rare, extension A
      return is_common || is_rare;
    }

    /// Whether the string is all Chinese.
    static bool IsAllChineseCharactor(const U8CHAR_T* pu8, size_t ilen);

    // Text-classification predicates; their exact rules live in the
    // implementation, which is not part of this header.
    static bool HasAlpha(const U8CHAR_T* pu8, size_t ilen);
    static bool NeedAddTailBlank(std::string str);
    static bool IsAllAlpha(const U8CHAR_T* pu8, size_t ilen);
    static bool IsAllAlphaAndPunct(const U8CHAR_T* pu8, size_t ilen);
    static bool IsAllAlphaAndDigit(const U8CHAR_T* pu8, size_t ilen);
    static bool IsAllAlphaAndDigitAndBlank(const U8CHAR_T* pu8, size_t ilen);

    /// @param str_vec_input input strings
    /// @param merge_mask    merge mask (presumably filled or updated by the
    ///                      call - TODO confirm)
    /// @return vector of strings (merged English words, per the name)
    static std::vector<std::string> MergeEnglishWord(std::vector<std::string> &str_vec_input,
                                                     std::vector<int> &merge_mask);

    /// @param input  UTF8 string
    /// @param output receives the resulting strings
    static size_t Utf8ToCharset(const std::string &input, std::vector<std::string> &output);

  #ifdef _MSC_VER
    /// Converts a UTF-8 string to the local ANSI code page (CP_ACP) by way of
    /// UTF-16.
    /// The input is read through c_str() as a NUL-terminated string, so
    /// anything after an embedded NUL is ignored.
    /// @param strUTF8 UTF-8 encoded text
    /// @return the text in the ANSI code page; an empty string if any
    ///         conversion step fails
    static std::string UTF8ToLocaleAnsi(const std::string& strUTF8) {
      // Pass 1: ask for the UTF-16 length (terminator included, because the
      // input length is given as -1).
      const int wide_len = MultiByteToWideChar(CP_UTF8, 0, strUTF8.c_str(), -1, nullptr, 0);
      if (wide_len <= 0) {
        return std::string();
      }
      // Owning buffers replace new[]/delete[] so nothing leaks if a later
      // allocation throws.
      std::wstring wide(static_cast<size_t>(wide_len), L'\0');
      if (MultiByteToWideChar(CP_UTF8, 0, strUTF8.c_str(), -1, &wide[0], wide_len) <= 0) {
        return std::string();
      }

      // Pass 2: the same size-then-convert sequence from UTF-16 to the ANSI
      // code page.
      const int ansi_len = WideCharToMultiByte(CP_ACP, 0, wide.c_str(), -1,
                                               nullptr, 0, nullptr, nullptr);
      if (ansi_len <= 0) {
        return std::string();
      }
      std::string ansi(static_cast<size_t>(ansi_len), '\0');
      const int written = WideCharToMultiByte(CP_ACP, 0, wide.c_str(), -1,
                                              &ansi[0], ansi_len, nullptr, nullptr);
      if (written <= 0) {
        return std::string();
      }
      // The reported size counts the terminating NUL; drop it from the result.
      ansi.resize(static_cast<size_t>(written) - 1);
      return ansi;
    }
  #endif  // _MSC_VER
  };

  }  // namespace funasr
  87. #endif // __WS__ENCODE_CONVERTER_H__