| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109 |
- /**
- * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
- * MIT License (https://opensource.org/licenses/MIT)
- */
- #ifndef __WS__ENCODE_CONVERTER_H__
- #define __WS__ENCODE_CONVERTER_H__
- #include <string>
- #include <stdint.h>
- #include <vector>
- #ifdef _MSC_VER
- #include <windows.h>
- #endif // _MSC_VER
namespace funasr {

// Code-unit and string aliases used throughout the converter API.
// NOTE(review): std::basic_string over `unsigned char` / `unsigned short`
// depends on std::char_traits specializations the C++ standard does not
// define; some standard libraries reject them -- verify against the
// toolchains this header must support. The aliases are part of the public
// interface, so they are left unchanged here.
// Inside namespace funasr, unqualified `u16string` names this alias, not
// std::u16string.
using U8CHAR_T = unsigned char;
using U16CHAR_T = unsigned short;
using u8string = std::basic_string<U8CHAR_T>;
using u16string = std::basic_string<U16CHAR_T>;

/// Collection of static helpers for UTF-8 / UTF-16 conversion and simple
/// character-class queries. Every member is static; the class holds no state.
/// Except where a body is given inline, the functions are only declared here
/// and their exact behavior is defined by the implementation file.
class EncodeConverter {
public:
    /// U+25A1 WHITE SQUARE.
    /// NOTE(review): presumably the placeholder emitted for code points that
    /// cannot be converted -- confirm against the implementation.
    static const U16CHAR_T defUniChar = 0x25a1;

public:
    /// @param pbuf UTF16 buffer
    /// @param len  length associated with pbuf
    /// NOTE(review): by its name this swaps the byte order of the buffer in
    /// place; whether len counts code units or bytes is not visible here.
    static void SwapEndian(U16CHAR_T* pbuf, size_t len);

    /// @param pu16 UTF16 string
    /// @param pu8  UTF8 string
    static size_t Utf16ToUtf8(const U16CHAR_T* pu16, U8CHAR_T* pu8);

    /// @param pu16 UTF16 string
    /// @param ilen length associated with pu16
    /// @param pu8  UTF8 string
    /// @param olen length associated with pu8
    static size_t Utf16ToUtf8(const U16CHAR_T* pu16, size_t ilen,
                              U8CHAR_T* pu8, size_t olen);

    /// @param u16str UTF16 string
    /// @return UTF8 string
    static u8string Utf16ToUtf8(const u16string& u16str);

    /// @param pu8  UTF8 string
    /// @param pu16 UTF16 string
    static size_t Utf8ToUtf16(const U8CHAR_T* pu8, U16CHAR_T* pu16);

    /// @param pu8  UTF8 string
    /// @param ilen length associated with pu8
    /// @param pu16 UTF16 string
    static size_t Utf8ToUtf16(const U8CHAR_T* pu8, size_t ilen, U16CHAR_T* pu16);

    /// @param pu8  UTF8 string
    /// @param ilen length associated with pu8
    /// @param pu16 UTF16 string
    /// @param olen length associated with pu16
    static size_t Utf8ToUtf16(const U8CHAR_T* pu8, size_t ilen,
                              U16CHAR_T* pu16, size_t olen);

    /// @param u8str UTF8 string
    /// @return UTF16 string
    static u16string Utf8ToUtf16(const u8string& u8str);

    /// @param pu8  string
    /// @param ilen length associated with pu8
    /// @return if string is encoded as UTF8 - true, otherwise false
    static bool IsUTF8(const U8CHAR_T* pu8, size_t ilen);

    /// @param u8str string
    /// @return if string is encoded as UTF8 - true, otherwise false
    static bool IsUTF8(const u8string& u8str);

    /// @param pu8  UTF8 string
    /// @param ilen length associated with pu8
    /// @return the word number of UTF8
    static size_t GetUTF8Len(const U8CHAR_T* pu8, size_t ilen);

    /// @param u8str UTF8 string
    /// @return the word number of UTF8
    static size_t GetUTF8Len(const u8string& u8str);

    /// @param pu16 UTF16 string
    /// @param ilen UTF16 length
    /// @return UTF8 string length
    static size_t Utf16ToUtf8Len(const U16CHAR_T* pu16, size_t ilen);

    /// @param sc  input characters
    /// @param len in/out integer owned by the caller
    /// NOTE(review): presumably decodes one character from sc into a UTF16
    /// unit and reports the number of bytes consumed through len -- confirm.
    static uint16_t ToUni(const char* sc, int &len);

    /// Tests whether a UTF16 code unit lies in 0x4e00..0x9fff or
    /// 0x3400..0x4dff.
    /// The argument is taken by value (it is only read), so constants and
    /// temporaries are accepted as well as ordinary variables.
    /// NOTE(review): CJK Extension A ends at U+4DBF; U+4DC0..U+4DFF is the
    /// Yijing Hexagram Symbols block. The bounds are kept as they were so
    /// existing results do not change.
    /// @param u16 UTF16 code unit
    /// @return true when u16 falls in either range
    static bool IsChineseCharacter(U16CHAR_T u16) {
        const bool in_common_range = (u16 >= 0x4e00 && u16 <= 0x9fff);     // common
        const bool in_extension_range = (u16 >= 0x3400 && u16 <= 0x4dff);  // rare, extension A
        return in_common_range || in_extension_range;
    }

    /// whether the string is all Chinese
    static bool IsAllChineseCharactor(const U8CHAR_T* pu8, size_t ilen);

    // Character-class predicates over a buffer of ilen elements. Their exact
    // definitions (which characters count as alpha / punct / digit / blank)
    // live in the implementation file.
    static bool HasAlpha(const U8CHAR_T* pu8, size_t ilen);
    static bool NeedAddTailBlank(std::string str);
    static bool IsAllAlpha(const U8CHAR_T* pu8, size_t ilen);
    static bool IsAllAlphaAndPunct(const U8CHAR_T* pu8, size_t ilen);
    static bool IsAllAlphaAndDigit(const U8CHAR_T* pu8, size_t ilen);
    static bool IsAllAlphaAndDigitAndBlank(const U8CHAR_T* pu8, size_t ilen);

    /// @param str_vec_input input strings
    /// @param merge_mask    integer mask owned by the caller
    /// NOTE(review): whether merge_mask is read, written, or both is not
    /// visible from this header -- confirm against the implementation.
    static std::vector<std::string> MergeEnglishWord(std::vector<std::string> &str_vec_input,
                                                     std::vector<int> &merge_mask);

    /// @param input  source string
    /// @param output receives the resulting strings
    /// NOTE(review): presumably splits input into one string per UTF8
    /// character and returns their count -- confirm.
    static size_t Utf8ToCharset(const std::string &input, std::vector<std::string> &output);

#ifdef _MSC_VER
    /// convert to the local ansi page
    ///
    /// Converts UTF8 text to UTF16 and then to the active ANSI code page
    /// (CP_ACP). Input is read up to its first NUL character. Any conversion
    /// failure yields an empty string.
    /// Buffers are owned by std::wstring / std::string so nothing leaks if an
    /// allocation throws part-way through.
    /// @param strUTF8 UTF8 text
    /// @return text in the active ANSI code page
    static std::string UTF8ToLocaleAnsi(const std::string& strUTF8) {
        // Passing -1 makes the API process the terminating NUL too, so every
        // length it reports includes that terminator.
        const int wide_len = MultiByteToWideChar(CP_UTF8, 0, strUTF8.c_str(), -1, nullptr, 0);
        if (wide_len <= 0) {
            return std::string();
        }
        std::wstring wide(static_cast<size_t>(wide_len), L'\0');
        if (MultiByteToWideChar(CP_UTF8, 0, strUTF8.c_str(), -1, &wide[0], wide_len) <= 0) {
            return std::string();
        }

        const int ansi_len = WideCharToMultiByte(CP_ACP, 0, wide.c_str(), -1,
                                                 nullptr, 0, nullptr, nullptr);
        if (ansi_len <= 0) {
            return std::string();
        }
        std::string ansi(static_cast<size_t>(ansi_len), '\0');
        const int written = WideCharToMultiByte(CP_ACP, 0, wide.c_str(), -1,
                                                &ansi[0], ansi_len, nullptr, nullptr);
        if (written <= 0) {
            return std::string();
        }
        // Drop the terminator the API wrote; std::string keeps its own.
        ansi.resize(static_cast<size_t>(written) - 1);
        return ansi;
    }
#endif  // _MSC_VER
};

}  // namespace funasr
- #endif // __WS__ENCODE_CONVERTER_H__
|