encode_converter.cpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575
  1. /**
  2. * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
  3. * MIT License (https://opensource.org/licenses/MIT)
  4. */
  5. #include "encode_converter.h"
  6. #include <assert.h>
  7. namespace funasr {
  8. using namespace std;
  9. U16CHAR_T UTF16[8];
  10. U8CHAR_T UTF8[8];
  11. size_t MyUtf8ToUtf16(const U8CHAR_T* pu8, size_t ilen, U16CHAR_T* pu16);
  12. size_t MyUtf16ToUtf8(const U16CHAR_T* pu16, U8CHAR_T* pu8);
  13. void EncodeConverter::SwapEndian(U16CHAR_T* pbuf, size_t len)
  14. {
  15. for (size_t i = 0; i < len; i++) {
  16. pbuf[i] = ((pbuf[i] >> 8) | (pbuf[i] << 8));
  17. }
  18. }
  19. size_t MyUtf16ToUtf8(const U16CHAR_T* pu16, U8CHAR_T* pu8)
  20. {
  21. size_t n = 0;
  22. if (pu16[0] <= 0x007F)
  23. {
  24. pu8[0] = (pu16[0] & 0x7F);
  25. n = 1;
  26. }
  27. else if (pu16[0] >= 0x0080 && pu16[0] <= 0x07FF)
  28. {
  29. pu8[1] = (0x80 | (pu16[0] & 0x003F));
  30. pu8[0] = (0xC0 | ((pu16[0] >> 6) & 0x001F));
  31. n = 2;
  32. }
  33. else if (pu16[0] >= 0x0800)
  34. {
  35. pu8[2] = (0x80 | (pu16[0] & 0x003F));
  36. pu8[1] = (0x80 | ((pu16[0] >> 6) & 0x003F));
  37. pu8[0] = (0xE0 | ((pu16[0] >> 12) & 0x000F));
  38. n = 3;
  39. }
  40. return n;
  41. }
  /**
   * True when a code unit lies in 0x0080..0x07FF, the range that UTF-8
   * represents with two bytes.
   *
   * A constexpr template instead of a function-like macro: the argument is
   * evaluated exactly once and the name obeys normal scoping rules.
   */
  template <typename T>
  constexpr bool is2ByteUtf16(T u16)
  {
      return u16 >= 0x0080 && u16 <= 0x07FF;
  }

  /**
   * True when a code unit is 0x0800 or above, the range that UTF-8 represents
   * with three bytes.
   */
  template <typename T>
  constexpr bool is3ByteUtf16(T u16)
  {
      return u16 >= 0x0800;
  }
  44. size_t EncodeConverter::Utf16ToUtf8(const U16CHAR_T* pu16, U8CHAR_T* pu8)
  45. {
  46. size_t n = 0;
  47. if (pu16[0] <= 0x007F)
  48. {
  49. pu8[0] = (pu16[0] & 0x7F);
  50. n = 1;
  51. }
  52. else if (pu16[0] >= 0x0080 && pu16[0] <= 0x07FF)
  53. {
  54. pu8[1] = (0x80 | (pu16[0] & 0x003F));
  55. pu8[0] = (0xC0 | ((pu16[0] >> 6) & 0x001F));
  56. n = 2;
  57. }
  58. else if (pu16[0] >= 0x0800)
  59. {
  60. pu8[2] = (0x80 | (pu16[0] & 0x003F));
  61. pu8[1] = (0x80 | ((pu16[0] >> 6) & 0x003F));
  62. pu8[0] = (0xE0 | ((pu16[0] >> 12) & 0x000F));
  63. n = 3;
  64. }
  65. return n;
  66. }
  67. size_t EncodeConverter::Utf16ToUtf8(const U16CHAR_T* pu16, size_t ilen,
  68. U8CHAR_T* pu8, size_t olen)
  69. {
  70. size_t offset = 0;
  71. size_t sz = 0;
  72. /*
  73. for (size_t i = 0; i < ilen && offset < static_cast<int>(olen) - 3; i++) {
  74. sz = utf16ToUtf8(pu16 + i, pu8 + offset);
  75. offset += sz;
  76. }
  77. */
  78. for (size_t i = 0; i < ilen && static_cast<int>(offset) < static_cast<int>(olen); i++) {
  79. sz = Utf16ToUtf8(pu16 + i, pu8 + offset);
  80. if (static_cast<int>(offset + static_cast<int>(sz)) <= static_cast<int>(olen))
  81. offset += sz;
  82. }
  83. // pu8[offset] = '\0';
  84. return offset;
  85. }
  86. u8string EncodeConverter::Utf16ToUtf8(const u16string& u16str)
  87. {
  88. size_t buflen = u16str.length()*3 + 1;
  89. U8CHAR_T* pu8 = new U8CHAR_T[buflen];
  90. size_t len = Utf16ToUtf8(u16str.data(), u16str.length(),
  91. pu8, buflen);
  92. u8string u8str(pu8, len);
  93. delete [] pu8;
  94. return u8str;
  95. }
  96. size_t EncodeConverter::Utf8ToUtf16(const U8CHAR_T* pu8, U16CHAR_T* pu16)
  97. {
  98. size_t n = 0;
  99. if ((pu8[0] & 0xF0) == 0xE0)
  100. {
  101. if ((pu8[1] & 0xC0) == 0x80 &&
  102. (pu8[2] & 0xC0) == 0x80)
  103. {
  104. pu16[0] = (((pu8[0] & 0x0F) << 4) | ((pu8[1] & 0x3C) >> 2));
  105. pu16[0] <<= 8;
  106. pu16[0] |= (((pu8[1] & 0x03) << 6) | (pu8[2] & 0x3F));
  107. }
  108. else
  109. {
  110. pu16[0] = defUniChar;
  111. }
  112. n = 3;
  113. }
  114. else if ((pu8[0] & 0xE0) == 0xC0)
  115. {
  116. if ((pu8[1] & 0xC0) == 0x80)
  117. {
  118. pu16[0] = ((pu8[0] & 0x1C) >> 2);
  119. pu16[0] <<= 8;
  120. pu16[0] |= (((pu8[0] & 0x03) << 6) | (pu8[1] & 0x3F));
  121. }
  122. else
  123. {
  124. pu16[0] = defUniChar;
  125. }
  126. n = 2;
  127. }
  128. else if ((pu8[0] & 0x80) == 0x00)
  129. {
  130. pu16[0] = pu8[0];
  131. n = 1;
  132. }
  133. return n;
  134. }
  135. size_t MyUtf8ToUtf16(const U8CHAR_T* pu8, size_t ilen, U16CHAR_T* pu16)
  136. {
  137. size_t n = 0;
  138. if ((pu8[0] & 0xF0) == 0xE0 && ilen >= 3)
  139. {
  140. if ((pu8[1] & 0xC0) == 0x80 &&
  141. (pu8[2] & 0xC0) == 0x80)
  142. {
  143. pu16[0] = (((pu8[0] & 0x0F) << 4) | ((pu8[1] & 0x3C) >> 2));
  144. pu16[0] <<= 8;
  145. pu16[0] |= (((pu8[1] & 0x03) << 6) | (pu8[2] & 0x3F));
  146. n = 3;
  147. }
  148. else
  149. {
  150. pu16[0] = 0x0000;
  151. n = 1;
  152. }
  153. }
  154. else if ((pu8[0] & 0xE0) == 0xC0 && ilen >= 2)
  155. {
  156. if ((pu8[1] & 0xC0) == 0x80)
  157. {
  158. pu16[0] = ((pu8[0] & 0x1C) >> 2);
  159. pu16[0] <<= 8;
  160. pu16[0] |= (((pu8[0] & 0x03) << 6) | (pu8[1] & 0x3F));
  161. n = 2;
  162. }
  163. else
  164. {
  165. pu16[0] = 0x0000;
  166. n = 1;
  167. }
  168. }
  169. else if ((pu8[0] & 0x80) == 0x00)
  170. {
  171. pu16[0] = pu8[0];
  172. n = 1;
  173. }
  174. else
  175. {
  176. pu16[0] = 0x0000;
  177. n = 1;
  178. }
  179. return n;
  180. }
  181. size_t EncodeConverter::Utf8ToUtf16(const U8CHAR_T* pu8, size_t ilen, U16CHAR_T* pu16)
  182. {
  183. size_t n = 0;
  184. if ((pu8[0] & 0xF0) == 0xE0 && ilen >= 3)
  185. {
  186. if ((pu8[1] & 0xC0) == 0x80 &&
  187. (pu8[2] & 0xC0) == 0x80)
  188. {
  189. pu16[0] = (((pu8[0] & 0x0F) << 4) | ((pu8[1] & 0x3C) >> 2));
  190. pu16[0] <<= 8;
  191. pu16[0] |= (((pu8[1] & 0x03) << 6) | (pu8[2] & 0x3F));
  192. n = 3;
  193. if( !is3ByteUtf16(pu16[0]) )
  194. {
  195. pu16[0] = 0x0000;
  196. n = 1;
  197. }
  198. }
  199. else
  200. {
  201. pu16[0] = 0x0000;
  202. n = 1;
  203. }
  204. }
  205. else if ((pu8[0] & 0xE0) == 0xC0 && ilen >= 2)
  206. {
  207. if ((pu8[1] & 0xC0) == 0x80)
  208. {
  209. pu16[0] = ((pu8[0] & 0x1C) >> 2);
  210. pu16[0] <<= 8;
  211. pu16[0] |= (((pu8[0] & 0x03) << 6) | (pu8[1] & 0x3F));
  212. n = 2;
  213. if( !is2ByteUtf16(pu16[0]) )
  214. {
  215. pu16[0] = 0x0000;
  216. n = 1;
  217. }
  218. }
  219. else
  220. {
  221. pu16[0] = 0x0000;
  222. n = 1;
  223. }
  224. }
  225. else if ((pu8[0] & 0x80) == 0x00)
  226. {
  227. pu16[0] = pu8[0];
  228. n = 1;
  229. }
  230. else
  231. {
  232. pu16[0] = 0x0000;
  233. n = 1;
  234. }
  235. return n;
  236. /*
  237. size_t n = 0;
  238. if ((pu8[0] & 0xF0) == 0xE0)
  239. {
  240. if (ilen >= 3 && (pu8[1] & 0xC0) == 0x80 &&
  241. (pu8[2] & 0xC0) == 0x80)
  242. {
  243. pu16[0] = (((pu8[0] & 0x0F) << 4) | ((pu8[1] & 0x3C) >> 2));
  244. pu16[0] <<= 8;
  245. pu16[0] |= (((pu8[1] & 0x03) << 6) | (pu8[2] & 0x3F));
  246. }
  247. else
  248. {
  249. pu16[0] = defUniChar;
  250. }
  251. n = 3;
  252. }
  253. else if ((pu8[0] & 0xE0) == 0xC0)
  254. {
  255. if( ilen >= 2 && (pu8[1] & 0xC0) == 0x80)
  256. {
  257. pu16[0] = ((pu8[0] & 0x1C) >> 2);
  258. pu16[0] <<= 8;
  259. pu16[0] |= (((pu8[0] & 0x03) << 6) | (pu8[1] & 0x3F));
  260. }
  261. else
  262. {
  263. pu16[0] = defUniChar;
  264. }
  265. n = 2;
  266. }
  267. else if ((pu8[0] & 0x80) == 0x00)
  268. {
  269. pu16[0] = pu8[0];
  270. n = 1;
  271. }
  272. else
  273. {
  274. pu16[0] = defUniChar;
  275. n = 1;
  276. for (size_t i = 1; i < ilen; i++)
  277. {
  278. if ((pu8[i] & 0xF0) == 0xE0 || (pu8[i] & 0xE0) == 0xC0 || (pu8[i] & 0x80) == 0x00)
  279. break;
  280. n++;
  281. }
  282. }
  283. return n;
  284. */
  285. }
  286. size_t EncodeConverter::Utf8ToUtf16(const U8CHAR_T* pu8, size_t ilen,
  287. U16CHAR_T* pu16, size_t olen)
  288. {
  289. int offset = 0;
  290. size_t sz = 0;
  291. for (size_t i = 0; i < ilen && offset < static_cast<int>(olen); offset ++)
  292. {
  293. sz = Utf8ToUtf16(pu8 + i, ilen - i, pu16 + offset);
  294. i += sz;
  295. if (sz == 0) {
  296. // failed
  297. // assert(sz != 0);
  298. break;
  299. }
  300. }
  301. // pu16[offset] = '\0';
  302. return offset;
  303. }
  304. u16string EncodeConverter::Utf8ToUtf16(const u8string& u8str)
  305. {
  306. U16CHAR_T* p16 = new U16CHAR_T[u8str.length() + 1];
  307. size_t len = Utf8ToUtf16(u8str.data(), u8str.length(),
  308. p16, u8str.length() + 1);
  309. u16string u16str(p16, len);
  310. delete[] p16;
  311. return u16str;
  312. }
  313. bool EncodeConverter::IsUTF8(const U8CHAR_T* pu8, size_t ilen)
  314. {
  315. size_t i;
  316. size_t n = 0;
  317. for (i = 0; i < ilen; i += n)
  318. {
  319. if ((pu8[i] & 0xF0) == 0xE0 &&
  320. (pu8[i + 1] & 0xC0) == 0x80 &&
  321. (pu8[i + 2] & 0xC0) == 0x80)
  322. {
  323. n = 3;
  324. }
  325. else if ((pu8[i] & 0xE0) == 0xC0 &&
  326. (pu8[i + 1] & 0xC0) == 0x80)
  327. {
  328. n = 2;
  329. }
  330. else if ((pu8[i] & 0x80) == 0x00)
  331. {
  332. n = 1;
  333. }
  334. else
  335. {
  336. break;
  337. }
  338. }
  339. return i == ilen;
  340. }
  341. bool EncodeConverter::IsUTF8(const u8string& u8str)
  342. {
  343. return IsUTF8(u8str.data(), u8str.length());
  344. }
  345. size_t EncodeConverter::GetUTF8Len(const U8CHAR_T* pu8, size_t ilen)
  346. {
  347. size_t i;
  348. size_t n = 0;
  349. size_t rlen = 0;
  350. for (i = 0; i < ilen; i += n, rlen ++)
  351. {
  352. if ((pu8[i] & 0xF0) == 0xE0 &&
  353. (pu8[i + 1] & 0xC0) == 0x80 &&
  354. (pu8[i + 2] & 0xC0) == 0x80)
  355. {
  356. n = 3;
  357. }
  358. else if ((pu8[i] & 0xE0) == 0xC0 &&
  359. (pu8[i + 1] & 0xC0) == 0x80)
  360. {
  361. n = 2;
  362. }
  363. else if ((pu8[i] & 0x80) == 0x00)
  364. {
  365. n = 1;
  366. }
  367. else
  368. {
  369. break;
  370. }
  371. }
  372. if (i == ilen)
  373. return 0;
  374. else
  375. return rlen;
  376. }
  377. size_t EncodeConverter::GetUTF8Len(const u8string& u8str)
  378. {
  379. return GetUTF8Len(u8str.data(), u8str.length());
  380. }
  381. size_t EncodeConverter::Utf16ToUtf8Len(const U16CHAR_T* pu16, size_t ilen)
  382. {
  383. int offset = 0;
  384. for (size_t i = 0; i < ilen ; i++) {
  385. if (pu16[i] <= 0x007F)
  386. {
  387. offset += 1;
  388. }
  389. else if (pu16[i] >= 0x0080 && pu16[i] <= 0x07FF)
  390. {
  391. offset += 2;
  392. }
  393. else if (pu16[i] >= 0x0800)
  394. {
  395. offset += 3;
  396. }
  397. }
  398. return offset;
  399. }
  400. uint16_t EncodeConverter::ToUni(const char* sc, int &len)
  401. {
  402. uint16_t wide[2];
  403. len = (int)Utf8ToUtf16((const U8CHAR_T*)sc, wide);
  404. return wide[0];
  405. }
  406. bool EncodeConverter::IsAllChineseCharactor(const U8CHAR_T* pu8, size_t ilen) {
  407. if (pu8 == nullptr || ilen <= 0) {
  408. return false;
  409. }
  410. U16CHAR_T* p16 = new U16CHAR_T[ilen + 1];
  411. size_t len = Utf8ToUtf16(pu8, ilen, p16, ilen + 1);
  412. for (size_t i = 0; i < len; i++) {
  413. if (p16[i] < 0x4e00 || p16[i] > 0x9fff) {
  414. delete[] p16;
  415. return false;
  416. }
  417. }
  418. delete[] p16;
  419. return true;
  420. }
  421. bool EncodeConverter::HasAlpha(const U8CHAR_T* pu8, size_t ilen) {
  422. if (pu8 == nullptr || ilen <= 0) {
  423. return false;
  424. }
  425. for (size_t i = 0; i < ilen; i++) {
  426. if (pu8[i]> 0 && isalpha(pu8[i])){
  427. return true;
  428. }
  429. }
  430. return false;
  431. }
  432. bool EncodeConverter::IsAllAlpha(const U8CHAR_T* pu8, size_t ilen) {
  433. if (pu8 == nullptr || ilen <= 0) {
  434. return false;
  435. }
  436. for (size_t i = 0; i < ilen; i++) {
  437. if (!(pu8[i]> 0 && isalpha(pu8[i]))){
  438. return false;
  439. }
  440. }
  441. return true;
  442. }
  443. bool EncodeConverter::IsAllAlphaAndPunct(const U8CHAR_T* pu8, size_t ilen) {
  444. if (pu8 == nullptr || ilen <= 0) {
  445. return false;
  446. }
  447. bool flag1 = HasAlpha(pu8, ilen);
  448. if (flag1 == false) {
  449. return false;
  450. }
  451. for (size_t i = 0; i < ilen; i++) {
  452. if (!(pu8[i]> 0 && (isalpha(pu8[i]) || (ispunct(pu8[i]))))){
  453. return false;
  454. }
  455. }
  456. return true;
  457. }
  458. bool EncodeConverter::IsAllAlphaAndDigit(const U8CHAR_T* pu8, size_t ilen) {
  459. if (pu8 == nullptr || ilen <= 0) {
  460. return false;
  461. }
  462. bool flag1 = HasAlpha(pu8, ilen);
  463. if (flag1 == false) {
  464. return false;
  465. }
  466. for (size_t i = 0; i < ilen; i++) {
  467. if (!(pu8[i]> 0 && (isalnum(pu8[i]) || isalpha(pu8[i]) || pu8[i] == '\''))){
  468. return false;
  469. }
  470. }
  471. return true;
  472. }
  473. bool EncodeConverter::IsAllAlphaAndDigitAndBlank(const U8CHAR_T* pu8, size_t ilen) {
  474. if (pu8 == nullptr || ilen <= 0) {
  475. return false;
  476. }
  477. for (size_t i = 0; i < ilen; i++) {
  478. if (!(pu8[i]> 0 && (isalnum(pu8[i]) || isalpha(pu8[i]) || isblank(pu8[i]) || pu8[i] == '\''))){
  479. return false;
  480. }
  481. }
  482. return true;
  483. }
  484. bool EncodeConverter::NeedAddTailBlank(std::string str) {
  485. U8CHAR_T *pu8 = (U8CHAR_T*)str.data();
  486. size_t ilen = str.size();
  487. if (pu8 == nullptr || ilen <= 0) {
  488. return false;
  489. }
  490. if (IsAllAlpha(pu8, ilen) || IsAllAlphaAndPunct(pu8, ilen) || IsAllAlphaAndDigit(pu8, ilen)) {
  491. return true;
  492. } else {
  493. return false;
  494. }
  495. }
  496. std::vector<std::string> EncodeConverter::MergeEnglishWord(std::vector<std::string> &str_vec_input,
  497. std::vector<int> &merge_mask) {
  498. std::vector<std::string> output;
  499. for (int i = 0; i < merge_mask.size(); i++) {
  500. if (merge_mask[i] == 1 && i > 0) {
  501. output[output.size() - 1] += str_vec_input[i];
  502. } else {
  503. output.push_back(str_vec_input[i]);
  504. }
  505. }
  506. str_vec_input.swap(output);
  507. return str_vec_input;
  508. }
  509. size_t EncodeConverter::Utf8ToCharset(const std::string &input, std::vector<std::string> &output) {
  510. std::string ch;
  511. for (size_t i = 0, len = 0; i != input.length(); i += len) {
  512. unsigned char byte = (unsigned)input[i];
  513. if (byte >= 0xFC) // lenght 6
  514. len = 6;
  515. else if (byte >= 0xF8)
  516. len = 5;
  517. else if (byte >= 0xF0)
  518. len = 4;
  519. else if (byte >= 0xE0)
  520. len = 3;
  521. else if (byte >= 0xC0)
  522. len = 2;
  523. else
  524. len = 1;
  525. ch = input.substr(i, len);
  526. output.push_back(ch);
  527. }
  528. return output.size();
  529. }
  530. }