| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575 |
- /**
- * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
- * MIT License (https://opensource.org/licenses/MIT)
- */
- #include "encode_converter.h"
- #include <assert.h>
- namespace funasr {
- using namespace std;
- U16CHAR_T UTF16[8];
- U8CHAR_T UTF8[8];
- size_t MyUtf8ToUtf16(const U8CHAR_T* pu8, size_t ilen, U16CHAR_T* pu16);
- size_t MyUtf16ToUtf8(const U16CHAR_T* pu16, U8CHAR_T* pu8);
- void EncodeConverter::SwapEndian(U16CHAR_T* pbuf, size_t len)
- {
- for (size_t i = 0; i < len; i++) {
- pbuf[i] = ((pbuf[i] >> 8) | (pbuf[i] << 8));
- }
- }
- size_t MyUtf16ToUtf8(const U16CHAR_T* pu16, U8CHAR_T* pu8)
- {
- size_t n = 0;
- if (pu16[0] <= 0x007F)
- {
- pu8[0] = (pu16[0] & 0x7F);
- n = 1;
- }
- else if (pu16[0] >= 0x0080 && pu16[0] <= 0x07FF)
- {
- pu8[1] = (0x80 | (pu16[0] & 0x003F));
- pu8[0] = (0xC0 | ((pu16[0] >> 6) & 0x001F));
- n = 2;
- }
- else if (pu16[0] >= 0x0800)
- {
- pu8[2] = (0x80 | (pu16[0] & 0x003F));
- pu8[1] = (0x80 | ((pu16[0] >> 6) & 0x003F));
- pu8[0] = (0xE0 | ((pu16[0] >> 12) & 0x000F));
- n = 3;
- }
- return n;
- }
- #define is2ByteUtf16(u16) ( (u16) >= 0x0080 && (u16) <= 0x07FF )
- #define is3ByteUtf16(u16) ( (u16) >= 0x0800 )
- size_t EncodeConverter::Utf16ToUtf8(const U16CHAR_T* pu16, U8CHAR_T* pu8)
- {
- size_t n = 0;
- if (pu16[0] <= 0x007F)
- {
- pu8[0] = (pu16[0] & 0x7F);
- n = 1;
- }
- else if (pu16[0] >= 0x0080 && pu16[0] <= 0x07FF)
- {
- pu8[1] = (0x80 | (pu16[0] & 0x003F));
- pu8[0] = (0xC0 | ((pu16[0] >> 6) & 0x001F));
- n = 2;
- }
- else if (pu16[0] >= 0x0800)
- {
- pu8[2] = (0x80 | (pu16[0] & 0x003F));
- pu8[1] = (0x80 | ((pu16[0] >> 6) & 0x003F));
- pu8[0] = (0xE0 | ((pu16[0] >> 12) & 0x000F));
- n = 3;
- }
- return n;
- }
- size_t EncodeConverter::Utf16ToUtf8(const U16CHAR_T* pu16, size_t ilen,
- U8CHAR_T* pu8, size_t olen)
- {
- size_t offset = 0;
- size_t sz = 0;
- /*
- for (size_t i = 0; i < ilen && offset < static_cast<int>(olen) - 3; i++) {
- sz = utf16ToUtf8(pu16 + i, pu8 + offset);
- offset += sz;
- }
- */
- for (size_t i = 0; i < ilen && static_cast<int>(offset) < static_cast<int>(olen); i++) {
- sz = Utf16ToUtf8(pu16 + i, pu8 + offset);
- if (static_cast<int>(offset + static_cast<int>(sz)) <= static_cast<int>(olen))
- offset += sz;
- }
-
- // pu8[offset] = '\0';
- return offset;
- }
- u8string EncodeConverter::Utf16ToUtf8(const u16string& u16str)
- {
- size_t buflen = u16str.length()*3 + 1;
- U8CHAR_T* pu8 = new U8CHAR_T[buflen];
- size_t len = Utf16ToUtf8(u16str.data(), u16str.length(),
- pu8, buflen);
- u8string u8str(pu8, len);
- delete [] pu8;
- return u8str;
- }
- size_t EncodeConverter::Utf8ToUtf16(const U8CHAR_T* pu8, U16CHAR_T* pu16)
- {
- size_t n = 0;
- if ((pu8[0] & 0xF0) == 0xE0)
- {
- if ((pu8[1] & 0xC0) == 0x80 &&
- (pu8[2] & 0xC0) == 0x80)
- {
- pu16[0] = (((pu8[0] & 0x0F) << 4) | ((pu8[1] & 0x3C) >> 2));
- pu16[0] <<= 8;
- pu16[0] |= (((pu8[1] & 0x03) << 6) | (pu8[2] & 0x3F));
- }
- else
- {
- pu16[0] = defUniChar;
- }
- n = 3;
- }
- else if ((pu8[0] & 0xE0) == 0xC0)
- {
- if ((pu8[1] & 0xC0) == 0x80)
- {
- pu16[0] = ((pu8[0] & 0x1C) >> 2);
- pu16[0] <<= 8;
- pu16[0] |= (((pu8[0] & 0x03) << 6) | (pu8[1] & 0x3F));
- }
- else
- {
- pu16[0] = defUniChar;
- }
- n = 2;
- }
- else if ((pu8[0] & 0x80) == 0x00)
- {
- pu16[0] = pu8[0];
- n = 1;
- }
- return n;
- }
- size_t MyUtf8ToUtf16(const U8CHAR_T* pu8, size_t ilen, U16CHAR_T* pu16)
- {
- size_t n = 0;
- if ((pu8[0] & 0xF0) == 0xE0 && ilen >= 3)
- {
- if ((pu8[1] & 0xC0) == 0x80 &&
- (pu8[2] & 0xC0) == 0x80)
- {
- pu16[0] = (((pu8[0] & 0x0F) << 4) | ((pu8[1] & 0x3C) >> 2));
- pu16[0] <<= 8;
- pu16[0] |= (((pu8[1] & 0x03) << 6) | (pu8[2] & 0x3F));
- n = 3;
- }
- else
- {
- pu16[0] = 0x0000;
- n = 1;
- }
- }
- else if ((pu8[0] & 0xE0) == 0xC0 && ilen >= 2)
- {
- if ((pu8[1] & 0xC0) == 0x80)
- {
- pu16[0] = ((pu8[0] & 0x1C) >> 2);
- pu16[0] <<= 8;
- pu16[0] |= (((pu8[0] & 0x03) << 6) | (pu8[1] & 0x3F));
- n = 2;
- }
- else
- {
- pu16[0] = 0x0000;
- n = 1;
- }
- }
- else if ((pu8[0] & 0x80) == 0x00)
- {
- pu16[0] = pu8[0];
- n = 1;
- }
- else
- {
- pu16[0] = 0x0000;
- n = 1;
- }
- return n;
- }
- size_t EncodeConverter::Utf8ToUtf16(const U8CHAR_T* pu8, size_t ilen, U16CHAR_T* pu16)
- {
- size_t n = 0;
- if ((pu8[0] & 0xF0) == 0xE0 && ilen >= 3)
- {
- if ((pu8[1] & 0xC0) == 0x80 &&
- (pu8[2] & 0xC0) == 0x80)
- {
- pu16[0] = (((pu8[0] & 0x0F) << 4) | ((pu8[1] & 0x3C) >> 2));
- pu16[0] <<= 8;
- pu16[0] |= (((pu8[1] & 0x03) << 6) | (pu8[2] & 0x3F));
- n = 3;
- if( !is3ByteUtf16(pu16[0]) )
- {
- pu16[0] = 0x0000;
- n = 1;
- }
- }
- else
- {
- pu16[0] = 0x0000;
- n = 1;
- }
- }
- else if ((pu8[0] & 0xE0) == 0xC0 && ilen >= 2)
- {
- if ((pu8[1] & 0xC0) == 0x80)
- {
- pu16[0] = ((pu8[0] & 0x1C) >> 2);
- pu16[0] <<= 8;
- pu16[0] |= (((pu8[0] & 0x03) << 6) | (pu8[1] & 0x3F));
- n = 2;
- if( !is2ByteUtf16(pu16[0]) )
- {
- pu16[0] = 0x0000;
- n = 1;
- }
- }
- else
- {
- pu16[0] = 0x0000;
- n = 1;
- }
- }
- else if ((pu8[0] & 0x80) == 0x00)
- {
- pu16[0] = pu8[0];
- n = 1;
- }
- else
- {
- pu16[0] = 0x0000;
- n = 1;
- }
- return n;
- /*
- size_t n = 0;
- if ((pu8[0] & 0xF0) == 0xE0)
- {
- if (ilen >= 3 && (pu8[1] & 0xC0) == 0x80 &&
- (pu8[2] & 0xC0) == 0x80)
- {
- pu16[0] = (((pu8[0] & 0x0F) << 4) | ((pu8[1] & 0x3C) >> 2));
- pu16[0] <<= 8;
- pu16[0] |= (((pu8[1] & 0x03) << 6) | (pu8[2] & 0x3F));
- }
- else
- {
- pu16[0] = defUniChar;
- }
- n = 3;
- }
- else if ((pu8[0] & 0xE0) == 0xC0)
- {
- if( ilen >= 2 && (pu8[1] & 0xC0) == 0x80)
- {
- pu16[0] = ((pu8[0] & 0x1C) >> 2);
- pu16[0] <<= 8;
- pu16[0] |= (((pu8[0] & 0x03) << 6) | (pu8[1] & 0x3F));
- }
- else
- {
- pu16[0] = defUniChar;
- }
- n = 2;
- }
- else if ((pu8[0] & 0x80) == 0x00)
- {
- pu16[0] = pu8[0];
- n = 1;
- }
- else
- {
- pu16[0] = defUniChar;
- n = 1;
- for (size_t i = 1; i < ilen; i++)
- {
- if ((pu8[i] & 0xF0) == 0xE0 || (pu8[i] & 0xE0) == 0xC0 || (pu8[i] & 0x80) == 0x00)
- break;
- n++;
- }
- }
- return n;
- */
- }
- size_t EncodeConverter::Utf8ToUtf16(const U8CHAR_T* pu8, size_t ilen,
- U16CHAR_T* pu16, size_t olen)
- {
- int offset = 0;
- size_t sz = 0;
- for (size_t i = 0; i < ilen && offset < static_cast<int>(olen); offset ++)
- {
- sz = Utf8ToUtf16(pu8 + i, ilen - i, pu16 + offset);
- i += sz;
- if (sz == 0) {
- // failed
- // assert(sz != 0);
- break;
- }
- }
- // pu16[offset] = '\0';
- return offset;
- }
- u16string EncodeConverter::Utf8ToUtf16(const u8string& u8str)
- {
- U16CHAR_T* p16 = new U16CHAR_T[u8str.length() + 1];
- size_t len = Utf8ToUtf16(u8str.data(), u8str.length(),
- p16, u8str.length() + 1);
- u16string u16str(p16, len);
- delete[] p16;
- return u16str;
- }
- bool EncodeConverter::IsUTF8(const U8CHAR_T* pu8, size_t ilen)
- {
- size_t i;
- size_t n = 0;
- for (i = 0; i < ilen; i += n)
- {
- if ((pu8[i] & 0xF0) == 0xE0 &&
- (pu8[i + 1] & 0xC0) == 0x80 &&
- (pu8[i + 2] & 0xC0) == 0x80)
- {
- n = 3;
- }
- else if ((pu8[i] & 0xE0) == 0xC0 &&
- (pu8[i + 1] & 0xC0) == 0x80)
- {
- n = 2;
- }
- else if ((pu8[i] & 0x80) == 0x00)
- {
- n = 1;
- }
- else
- {
- break;
- }
- }
- return i == ilen;
- }
- bool EncodeConverter::IsUTF8(const u8string& u8str)
- {
- return IsUTF8(u8str.data(), u8str.length());
- }
-
- size_t EncodeConverter::GetUTF8Len(const U8CHAR_T* pu8, size_t ilen)
- {
- size_t i;
- size_t n = 0;
- size_t rlen = 0;
- for (i = 0; i < ilen; i += n, rlen ++)
- {
- if ((pu8[i] & 0xF0) == 0xE0 &&
- (pu8[i + 1] & 0xC0) == 0x80 &&
- (pu8[i + 2] & 0xC0) == 0x80)
- {
- n = 3;
- }
- else if ((pu8[i] & 0xE0) == 0xC0 &&
- (pu8[i + 1] & 0xC0) == 0x80)
- {
- n = 2;
- }
- else if ((pu8[i] & 0x80) == 0x00)
- {
- n = 1;
- }
- else
- {
- break;
- }
- }
- if (i == ilen)
- return 0;
- else
- return rlen;
- }
- size_t EncodeConverter::GetUTF8Len(const u8string& u8str)
- {
- return GetUTF8Len(u8str.data(), u8str.length());
- }
- size_t EncodeConverter::Utf16ToUtf8Len(const U16CHAR_T* pu16, size_t ilen)
- {
- int offset = 0;
- for (size_t i = 0; i < ilen ; i++) {
- if (pu16[i] <= 0x007F)
- {
- offset += 1;
- }
- else if (pu16[i] >= 0x0080 && pu16[i] <= 0x07FF)
- {
- offset += 2;
- }
- else if (pu16[i] >= 0x0800)
- {
- offset += 3;
- }
- }
-
- return offset;
- }
- uint16_t EncodeConverter::ToUni(const char* sc, int &len)
- {
- uint16_t wide[2];
- len = (int)Utf8ToUtf16((const U8CHAR_T*)sc, wide);
- return wide[0];
- }
- bool EncodeConverter::IsAllChineseCharactor(const U8CHAR_T* pu8, size_t ilen) {
- if (pu8 == nullptr || ilen <= 0) {
- return false;
- }
- U16CHAR_T* p16 = new U16CHAR_T[ilen + 1];
- size_t len = Utf8ToUtf16(pu8, ilen, p16, ilen + 1);
- for (size_t i = 0; i < len; i++) {
- if (p16[i] < 0x4e00 || p16[i] > 0x9fff) {
- delete[] p16;
- return false;
- }
- }
- delete[] p16;
- return true;
- }
- bool EncodeConverter::HasAlpha(const U8CHAR_T* pu8, size_t ilen) {
- if (pu8 == nullptr || ilen <= 0) {
- return false;
- }
- for (size_t i = 0; i < ilen; i++) {
- if (pu8[i]> 0 && isalpha(pu8[i])){
- return true;
- }
- }
- return false;
- }
- bool EncodeConverter::IsAllAlpha(const U8CHAR_T* pu8, size_t ilen) {
- if (pu8 == nullptr || ilen <= 0) {
- return false;
- }
- for (size_t i = 0; i < ilen; i++) {
- if (!(pu8[i]> 0 && isalpha(pu8[i]))){
- return false;
- }
- }
- return true;
- }
- bool EncodeConverter::IsAllAlphaAndPunct(const U8CHAR_T* pu8, size_t ilen) {
- if (pu8 == nullptr || ilen <= 0) {
- return false;
- }
- bool flag1 = HasAlpha(pu8, ilen);
- if (flag1 == false) {
- return false;
- }
- for (size_t i = 0; i < ilen; i++) {
- if (!(pu8[i]> 0 && (isalpha(pu8[i]) || (ispunct(pu8[i]))))){
- return false;
- }
- }
- return true;
- }
- bool EncodeConverter::IsAllAlphaAndDigit(const U8CHAR_T* pu8, size_t ilen) {
- if (pu8 == nullptr || ilen <= 0) {
- return false;
- }
- bool flag1 = HasAlpha(pu8, ilen);
- if (flag1 == false) {
- return false;
- }
- for (size_t i = 0; i < ilen; i++) {
- if (!(pu8[i]> 0 && (isalnum(pu8[i]) || isalpha(pu8[i]) || pu8[i] == '\''))){
- return false;
- }
- }
- return true;
- }
- bool EncodeConverter::IsAllAlphaAndDigitAndBlank(const U8CHAR_T* pu8, size_t ilen) {
- if (pu8 == nullptr || ilen <= 0) {
- return false;
- }
- for (size_t i = 0; i < ilen; i++) {
- if (!(pu8[i]> 0 && (isalnum(pu8[i]) || isalpha(pu8[i]) || isblank(pu8[i]) || pu8[i] == '\''))){
- return false;
- }
- }
- return true;
- }
- bool EncodeConverter::NeedAddTailBlank(std::string str) {
- U8CHAR_T *pu8 = (U8CHAR_T*)str.data();
- size_t ilen = str.size();
- if (pu8 == nullptr || ilen <= 0) {
- return false;
- }
- if (IsAllAlpha(pu8, ilen) || IsAllAlphaAndPunct(pu8, ilen) || IsAllAlphaAndDigit(pu8, ilen)) {
- return true;
- } else {
- return false;
- }
- }
- std::vector<std::string> EncodeConverter::MergeEnglishWord(std::vector<std::string> &str_vec_input,
- std::vector<int> &merge_mask) {
- std::vector<std::string> output;
- for (int i = 0; i < merge_mask.size(); i++) {
- if (merge_mask[i] == 1 && i > 0) {
- output[output.size() - 1] += str_vec_input[i];
- } else {
- output.push_back(str_vec_input[i]);
- }
- }
- str_vec_input.swap(output);
- return str_vec_input;
- }
- size_t EncodeConverter::Utf8ToCharset(const std::string &input, std::vector<std::string> &output) {
- std::string ch;
- for (size_t i = 0, len = 0; i != input.length(); i += len) {
- unsigned char byte = (unsigned)input[i];
- if (byte >= 0xFC) // lenght 6
- len = 6;
- else if (byte >= 0xF8)
- len = 5;
- else if (byte >= 0xF0)
- len = 4;
- else if (byte >= 0xE0)
- len = 3;
- else if (byte >= 0xC0)
- len = 2;
- else
- len = 1;
- ch = input.substr(i, len);
- output.push_back(ch);
- }
- return output.size();
- }
- }
|