websocket-server-2pass.cpp 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565
  1. /**
  2. * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights
  3. * Reserved. MIT License (https://opensource.org/licenses/MIT)
  4. */
  5. /* 2022-2023 by zhaomingwork */
  6. // WebSocket server for the ASR engine.
  7. // Takes some ideas from https://github.com/k2-fsa/sherpa-onnx
  8. // online-websocket-server-impl.cc, thanks. The WebSocket server has two thread
  9. // pools: one handles network data and the other runs the ASR decoder.
  10. // The decoding mode (offline, online or 2pass) is selected per connection.
  11. #include "websocket-server-2pass.h"
  12. #include <thread>
  13. #include <utility>
  14. #include <vector>
  15. extern std::unordered_map<std::string, int> hws_map_;
  16. extern int fst_inc_wts_;
  17. context_ptr WebSocketServer::on_tls_init(tls_mode mode,
  18. websocketpp::connection_hdl hdl,
  19. std::string& s_certfile,
  20. std::string& s_keyfile) {
  21. namespace asio = websocketpp::lib::asio;
  22. LOG(INFO) << "on_tls_init called with hdl: " << hdl.lock().get();
  23. LOG(INFO) << "using TLS mode: "
  24. << (mode == MOZILLA_MODERN ? "Mozilla Modern"
  25. : "Mozilla Intermediate");
  26. context_ptr ctx = websocketpp::lib::make_shared<asio::ssl::context>(
  27. asio::ssl::context::sslv23);
  28. try {
  29. if (mode == MOZILLA_MODERN) {
  30. // Modern disables TLSv1
  31. ctx->set_options(
  32. asio::ssl::context::default_workarounds |
  33. asio::ssl::context::no_sslv2 | asio::ssl::context::no_sslv3 |
  34. asio::ssl::context::no_tlsv1 | asio::ssl::context::single_dh_use);
  35. } else {
  36. ctx->set_options(asio::ssl::context::default_workarounds |
  37. asio::ssl::context::no_sslv2 |
  38. asio::ssl::context::no_sslv3 |
  39. asio::ssl::context::single_dh_use);
  40. }
  41. ctx->use_certificate_chain_file(s_certfile);
  42. ctx->use_private_key_file(s_keyfile, asio::ssl::context::pem);
  43. } catch (std::exception& e) {
  44. LOG(INFO) << "Exception: " << e.what();
  45. }
  46. return ctx;
  47. }
  48. nlohmann::json handle_result(FUNASR_RESULT result) {
  49. websocketpp::lib::error_code ec;
  50. nlohmann::json jsonresult;
  51. jsonresult["text"] = "";
  52. std::string tmp_online_msg = FunASRGetResult(result, 0);
  53. if (tmp_online_msg != "") {
  54. LOG(INFO) << "online_res :" << tmp_online_msg;
  55. jsonresult["text"] = tmp_online_msg;
  56. jsonresult["mode"] = "2pass-online";
  57. }
  58. std::string tmp_tpass_msg = FunASRGetTpassResult(result, 0);
  59. if (tmp_tpass_msg != "") {
  60. LOG(INFO) << "offline results : " << tmp_tpass_msg;
  61. jsonresult["text"] = tmp_tpass_msg;
  62. jsonresult["mode"] = "2pass-offline";
  63. }
  64. std::string tmp_stamp_msg = FunASRGetStamp(result);
  65. if (tmp_stamp_msg != "") {
  66. LOG(INFO) << "offline stamps : " << tmp_stamp_msg;
  67. jsonresult["timestamp"] = tmp_stamp_msg;
  68. }
  69. return jsonresult;
  70. }
  71. // feed buffer to asr engine for decoder
  72. void WebSocketServer::do_decoder(
  73. std::vector<char>& buffer,
  74. websocketpp::connection_hdl& hdl,
  75. nlohmann::json& msg,
  76. std::vector<std::vector<std::string>>& punc_cache,
  77. std::vector<std::vector<float>> &hotwords_embedding,
  78. websocketpp::lib::mutex& thread_lock,
  79. bool& is_final,
  80. std::string wav_name,
  81. std::string modetype,
  82. bool itn,
  83. int audio_fs,
  84. std::string wav_format,
  85. FUNASR_HANDLE& tpass_online_handle) {
  86. // lock for each connection
  87. if(!tpass_online_handle){
  88. scoped_lock guard(thread_lock);
  89. LOG(INFO) << "tpass_online_handle is free, return";
  90. msg["access_num"]=(int)msg["access_num"]-1;
  91. return;
  92. }
  93. try {
  94. FUNASR_RESULT Result = nullptr;
  95. int asr_mode_ = 2;
  96. if (modetype == "offline") {
  97. asr_mode_ = 0;
  98. } else if (modetype == "online") {
  99. asr_mode_ = 1;
  100. } else if (modetype == "2pass") {
  101. asr_mode_ = 2;
  102. }
  103. while (buffer.size() >= 800 * 2 && !msg["is_eof"]) {
  104. std::vector<char> subvector = {buffer.begin(), buffer.begin() + 800 * 2};
  105. buffer.erase(buffer.begin(), buffer.begin() + 800 * 2);
  106. try {
  107. if (tpass_online_handle) {
  108. Result = FunTpassInferBuffer(tpass_handle, tpass_online_handle,
  109. subvector.data(), subvector.size(),
  110. punc_cache, false, audio_fs,
  111. wav_format, (ASR_TYPE)asr_mode_,
  112. hotwords_embedding, itn);
  113. } else {
  114. scoped_lock guard(thread_lock);
  115. msg["access_num"]=(int)msg["access_num"]-1;
  116. return;
  117. }
  118. } catch (std::exception const& e) {
  119. scoped_lock guard(thread_lock);
  120. LOG(ERROR) << e.what();
  121. msg["access_num"]=(int)msg["access_num"]-1;
  122. return;
  123. }
  124. if (Result) {
  125. websocketpp::lib::error_code ec;
  126. nlohmann::json jsonresult = handle_result(Result);
  127. jsonresult["wav_name"] = wav_name;
  128. jsonresult["is_final"] = false;
  129. if (jsonresult["text"] != "") {
  130. if (is_ssl) {
  131. wss_server_->send(hdl, jsonresult.dump(),
  132. websocketpp::frame::opcode::text, ec);
  133. } else {
  134. server_->send(hdl, jsonresult.dump(),
  135. websocketpp::frame::opcode::text, ec);
  136. }
  137. }
  138. FunASRFreeResult(Result);
  139. }
  140. }
  141. if (is_final && !msg["is_eof"]) {
  142. try {
  143. if (tpass_online_handle) {
  144. Result = FunTpassInferBuffer(tpass_handle, tpass_online_handle,
  145. buffer.data(), buffer.size(), punc_cache,
  146. is_final, audio_fs,
  147. wav_format, (ASR_TYPE)asr_mode_,
  148. hotwords_embedding, itn);
  149. } else {
  150. scoped_lock guard(thread_lock);
  151. msg["access_num"]=(int)msg["access_num"]-1;
  152. return;
  153. }
  154. } catch (std::exception const& e) {
  155. scoped_lock guard(thread_lock);
  156. LOG(ERROR) << e.what();
  157. msg["access_num"]=(int)msg["access_num"]-1;
  158. return;
  159. }
  160. if(punc_cache.size()>0){
  161. for (auto& vec : punc_cache) {
  162. vec.clear();
  163. }
  164. }
  165. if (Result) {
  166. websocketpp::lib::error_code ec;
  167. nlohmann::json jsonresult = handle_result(Result);
  168. jsonresult["wav_name"] = wav_name;
  169. jsonresult["is_final"] = true;
  170. if (is_ssl) {
  171. wss_server_->send(hdl, jsonresult.dump(),
  172. websocketpp::frame::opcode::text, ec);
  173. } else {
  174. server_->send(hdl, jsonresult.dump(),
  175. websocketpp::frame::opcode::text, ec);
  176. }
  177. FunASRFreeResult(Result);
  178. }else{
  179. if(wav_format != "pcm" && wav_format != "PCM"){
  180. websocketpp::lib::error_code ec;
  181. nlohmann::json jsonresult;
  182. jsonresult["text"] = "ERROR. Real-time transcription service ONLY SUPPORT wav_format pcm.";
  183. jsonresult["wav_name"] = wav_name;
  184. jsonresult["is_final"] = true;
  185. if (is_ssl) {
  186. wss_server_->send(hdl, jsonresult.dump(),
  187. websocketpp::frame::opcode::text, ec);
  188. } else {
  189. server_->send(hdl, jsonresult.dump(),
  190. websocketpp::frame::opcode::text, ec);
  191. }
  192. }
  193. }
  194. }
  195. } catch (std::exception const& e) {
  196. std::cerr << "Error: " << e.what() << std::endl;
  197. }
  198. scoped_lock guard(thread_lock);
  199. msg["access_num"]=(int)msg["access_num"]-1;
  200. }
  201. void WebSocketServer::on_open(websocketpp::connection_hdl hdl) {
  202. scoped_lock guard(m_lock); // for threads safty
  203. try{
  204. std::shared_ptr<FUNASR_MESSAGE> data_msg =
  205. std::make_shared<FUNASR_MESSAGE>(); // put a new data vector for new
  206. // connection
  207. data_msg->samples = std::make_shared<std::vector<char>>();
  208. data_msg->thread_lock = std::make_shared<websocketpp::lib::mutex>();
  209. data_msg->msg = nlohmann::json::parse("{}");
  210. data_msg->msg["wav_format"] = "pcm";
  211. data_msg->msg["wav_name"] = "wav-default-id";
  212. data_msg->msg["mode"] = "2pass";
  213. data_msg->msg["itn"] = true;
  214. data_msg->msg["audio_fs"] = 16000;
  215. data_msg->msg["access_num"] = 0; // the number of access for this object, when it is 0, we can free it saftly
  216. data_msg->msg["is_eof"]=false; // if this connection is closed
  217. data_msg->punc_cache =
  218. std::make_shared<std::vector<std::vector<std::string>>>(2);
  219. data_msg->strand_ = std::make_shared<asio::io_context::strand>(io_decoder_);
  220. data_map.emplace(hdl, data_msg);
  221. }catch (std::exception const& e) {
  222. std::cerr << "Error: " << e.what() << std::endl;
  223. }
  224. }
  225. void remove_hdl(
  226. websocketpp::connection_hdl hdl,
  227. std::map<websocketpp::connection_hdl, std::shared_ptr<FUNASR_MESSAGE>,
  228. std::owner_less<websocketpp::connection_hdl>>& data_map) {
  229. std::shared_ptr<FUNASR_MESSAGE> data_msg = nullptr;
  230. auto it_data = data_map.find(hdl);
  231. if (it_data != data_map.end()) {
  232. data_msg = it_data->second;
  233. } else {
  234. return;
  235. }
  236. // scoped_lock guard_decoder(*(data_msg->thread_lock)); //wait for do_decoder
  237. // finished and avoid access freed tpass_online_handle
  238. unique_lock guard_decoder(*(data_msg->thread_lock));
  239. if (data_msg->msg["access_num"]==0 && data_msg->msg["is_eof"]==true) {
  240. FunTpassOnlineUninit(data_msg->tpass_online_handle);
  241. data_msg->tpass_online_handle = nullptr;
  242. data_map.erase(hdl);
  243. }
  244. guard_decoder.unlock();
  245. }
  246. void WebSocketServer::on_close(websocketpp::connection_hdl hdl) {
  247. scoped_lock guard(m_lock);
  248. std::shared_ptr<FUNASR_MESSAGE> data_msg = nullptr;
  249. auto it_data = data_map.find(hdl);
  250. if (it_data != data_map.end()) {
  251. data_msg = it_data->second;
  252. } else {
  253. return;
  254. }
  255. unique_lock guard_decoder(*(data_msg->thread_lock));
  256. data_msg->msg["is_eof"]=true;
  257. guard_decoder.unlock();
  258. }
  259. // remove closed connection
  260. void WebSocketServer::check_and_clean_connection() {
  261. while(true){
  262. std::this_thread::sleep_for(std::chrono::milliseconds(5000));
  263. std::vector<websocketpp::connection_hdl> to_remove; // remove list
  264. auto iter = data_map.begin();
  265. while (iter != data_map.end()) { // loop to find closed connection
  266. websocketpp::connection_hdl hdl = iter->first;
  267. try{
  268. if (is_ssl) {
  269. wss_server::connection_ptr con = wss_server_->get_con_from_hdl(hdl);
  270. if (con->get_state() != 1) { // session::state::open ==1
  271. to_remove.push_back(hdl);
  272. }
  273. } else {
  274. server::connection_ptr con = server_->get_con_from_hdl(hdl);
  275. if (con->get_state() != 1) { // session::state::open ==1
  276. to_remove.push_back(hdl);
  277. }
  278. }
  279. }
  280. catch (std::exception const &e)
  281. {
  282. // if connection is close, we set is_eof = true
  283. std::shared_ptr<FUNASR_MESSAGE> data_msg = nullptr;
  284. auto it_data = data_map.find(hdl);
  285. if (it_data != data_map.end()) {
  286. data_msg = it_data->second;
  287. } else {
  288. continue;
  289. }
  290. unique_lock guard_decoder(*(data_msg->thread_lock));
  291. data_msg->msg["is_eof"]=true;
  292. guard_decoder.unlock();
  293. to_remove.push_back(hdl);
  294. LOG(INFO)<<"connection is closed: "<<e.what();
  295. }
  296. iter++;
  297. }
  298. for (auto hdl : to_remove) {
  299. {
  300. unique_lock lock(m_lock);
  301. remove_hdl(hdl, data_map);
  302. }
  303. }
  304. }
  305. }
  306. void WebSocketServer::on_message(websocketpp::connection_hdl hdl,
  307. message_ptr msg) {
  308. unique_lock lock(m_lock);
  309. // find the sample data vector according to one connection
  310. std::shared_ptr<FUNASR_MESSAGE> msg_data = nullptr;
  311. auto it_data = data_map.find(hdl);
  312. if (it_data != data_map.end()) {
  313. msg_data = it_data->second;
  314. if(msg_data->msg["is_eof"]){
  315. lock.unlock();
  316. return;
  317. }
  318. } else {
  319. lock.unlock();
  320. return;
  321. }
  322. std::shared_ptr<std::vector<char>> sample_data_p = msg_data->samples;
  323. std::shared_ptr<std::vector<std::vector<std::string>>> punc_cache_p =
  324. msg_data->punc_cache;
  325. std::shared_ptr<websocketpp::lib::mutex> thread_lock_p = msg_data->thread_lock;
  326. lock.unlock();
  327. if (sample_data_p == nullptr) {
  328. LOG(INFO) << "error when fetch sample data vector";
  329. return;
  330. }
  331. const std::string& payload = msg->get_payload(); // get msg type
  332. unique_lock guard_decoder(*(thread_lock_p)); // mutex for one connection
  333. switch (msg->get_opcode()) {
  334. case websocketpp::frame::opcode::text: {
  335. nlohmann::json jsonresult;
  336. try{
  337. jsonresult = nlohmann::json::parse(payload);
  338. }catch (std::exception const &e)
  339. {
  340. LOG(ERROR)<<e.what();
  341. msg_data->msg["is_eof"]=true;
  342. guard_decoder.unlock();
  343. return;
  344. }
  345. if (jsonresult.contains("wav_name")) {
  346. msg_data->msg["wav_name"] = jsonresult["wav_name"];
  347. }
  348. if (jsonresult.contains("mode")) {
  349. msg_data->msg["mode"] = jsonresult["mode"];
  350. }
  351. if (jsonresult.contains("wav_format")) {
  352. msg_data->msg["wav_format"] = jsonresult["wav_format"];
  353. }
  354. // hotwords: fst/nn
  355. if(msg_data->hotwords_embedding == NULL){
  356. std::unordered_map<std::string, int> merged_hws_map;
  357. std::string nn_hotwords = "";
  358. if (jsonresult["hotwords"] != nullptr) {
  359. std::string json_string = jsonresult["hotwords"];
  360. if (!json_string.empty()){
  361. nlohmann::json json_fst_hws;
  362. try{
  363. json_fst_hws = nlohmann::json::parse(json_string);
  364. if(json_fst_hws.type() == nlohmann::json::value_t::object){
  365. // fst
  366. try{
  367. std::unordered_map<std::string, int> client_hws_map = json_fst_hws;
  368. merged_hws_map.insert(client_hws_map.begin(), client_hws_map.end());
  369. } catch (const std::exception& e) {
  370. LOG(INFO) << e.what();
  371. }
  372. }
  373. } catch (std::exception const &e)
  374. {
  375. LOG(ERROR)<<e.what();
  376. // nn
  377. std::string client_nn_hws = jsonresult["hotwords"];
  378. nn_hotwords += " " + client_nn_hws;
  379. // LOG(INFO) << "nn hotwords: " << client_nn_hws;
  380. }
  381. }
  382. }
  383. merged_hws_map.insert(hws_map_.begin(), hws_map_.end());
  384. // fst
  385. LOG(INFO) << "hotwords: ";
  386. for (const auto& pair : merged_hws_map) {
  387. nn_hotwords += " " + pair.first;
  388. LOG(INFO) << pair.first << " : " << pair.second;
  389. }
  390. // FunWfstDecoderLoadHwsRes(msg_data->decoder_handle, fst_inc_wts_, merged_hws_map);
  391. // nn
  392. std::vector<std::vector<float>> new_hotwords_embedding = CompileHotwordEmbedding(tpass_handle, nn_hotwords, ASR_TWO_PASS);
  393. msg_data->hotwords_embedding =
  394. std::make_shared<std::vector<std::vector<float>>>(new_hotwords_embedding);
  395. }
  396. if (jsonresult.contains("audio_fs")) {
  397. msg_data->msg["audio_fs"] = jsonresult["audio_fs"];
  398. }
  399. if (jsonresult.contains("chunk_size")) {
  400. if (msg_data->tpass_online_handle == NULL) {
  401. std::vector<int> chunk_size_vec =
  402. jsonresult["chunk_size"].get<std::vector<int>>();
  403. // check chunk_size_vec
  404. if(chunk_size_vec.size() == 3 && chunk_size_vec[1] != 0){
  405. FUNASR_HANDLE tpass_online_handle =
  406. FunTpassOnlineInit(tpass_handle, chunk_size_vec);
  407. msg_data->tpass_online_handle = tpass_online_handle;
  408. }else{
  409. LOG(ERROR) << "Wrong chunk_size!";
  410. break;
  411. }
  412. }
  413. }
  414. if (jsonresult.contains("itn")) {
  415. msg_data->msg["itn"] = jsonresult["itn"];
  416. }
  417. LOG(INFO) << "jsonresult=" << jsonresult
  418. << ", msg_data->msg=" << msg_data->msg;
  419. if ((jsonresult["is_speaking"] == false ||
  420. jsonresult["is_finished"] == true) &&
  421. msg_data->msg["is_eof"] != true &&
  422. msg_data->hotwords_embedding != NULL) {
  423. LOG(INFO) << "client done";
  424. // if it is in final message, post the sample_data to decode
  425. try{
  426. std::vector<std::vector<float>> hotwords_embedding_(*(msg_data->hotwords_embedding));
  427. msg_data->strand_->post(
  428. std::bind(&WebSocketServer::do_decoder, this,
  429. std::move(*(sample_data_p.get())), std::move(hdl),
  430. std::ref(msg_data->msg), std::ref(*(punc_cache_p.get())),
  431. std::move(hotwords_embedding_),
  432. std::ref(*thread_lock_p), std::move(true),
  433. msg_data->msg["wav_name"],
  434. msg_data->msg["mode"],
  435. msg_data->msg["itn"],
  436. msg_data->msg["audio_fs"],
  437. msg_data->msg["wav_format"],
  438. std::ref(msg_data->tpass_online_handle)));
  439. msg_data->msg["access_num"]=(int)(msg_data->msg["access_num"])+1;
  440. }
  441. catch (std::exception const &e)
  442. {
  443. LOG(ERROR)<<e.what();
  444. }
  445. }
  446. break;
  447. }
  448. case websocketpp::frame::opcode::binary: {
  449. // recived binary data
  450. const auto* pcm_data = static_cast<const char*>(payload.data());
  451. int32_t num_samples = payload.size();
  452. if (isonline) {
  453. sample_data_p->insert(sample_data_p->end(), pcm_data,
  454. pcm_data + num_samples);
  455. int setpsize =
  456. 800 * 2; // TODO, need get from client
  457. // if sample_data size > setpsize, we post data to decode
  458. if (sample_data_p->size() > setpsize) {
  459. int chunksize = floor(sample_data_p->size() / setpsize);
  460. // make sure the subvector size is an integer multiple of setpsize
  461. std::vector<char> subvector = {
  462. sample_data_p->begin(),
  463. sample_data_p->begin() + chunksize * setpsize};
  464. // keep remain in sample_data
  465. sample_data_p->erase(sample_data_p->begin(),
  466. sample_data_p->begin() + chunksize * setpsize);
  467. try{
  468. // post to decode
  469. if (msg_data->msg["is_eof"] != true && msg_data->hotwords_embedding != NULL) {
  470. std::vector<std::vector<float>> hotwords_embedding_(*(msg_data->hotwords_embedding));
  471. msg_data->strand_->post(
  472. std::bind(&WebSocketServer::do_decoder, this,
  473. std::move(subvector), std::move(hdl),
  474. std::ref(msg_data->msg),
  475. std::ref(*(punc_cache_p.get())),
  476. std::move(hotwords_embedding_),
  477. std::ref(*thread_lock_p), std::move(false),
  478. msg_data->msg["wav_name"],
  479. msg_data->msg["mode"],
  480. msg_data->msg["itn"],
  481. msg_data->msg["audio_fs"],
  482. msg_data->msg["wav_format"],
  483. std::ref(msg_data->tpass_online_handle)));
  484. msg_data->msg["access_num"]=(int)(msg_data->msg["access_num"])+1;
  485. }
  486. }
  487. catch (std::exception const &e)
  488. {
  489. LOG(ERROR)<<e.what();
  490. }
  491. }
  492. } else {
  493. sample_data_p->insert(sample_data_p->end(), pcm_data,
  494. pcm_data + num_samples);
  495. }
  496. break;
  497. }
  498. default:
  499. break;
  500. }
  501. guard_decoder.unlock();
  502. }
  503. // init asr model
  504. void WebSocketServer::initAsr(std::map<std::string, std::string>& model_path,
  505. int thread_num) {
  506. try {
  507. tpass_handle = FunTpassInit(model_path, thread_num);
  508. if (!tpass_handle) {
  509. LOG(ERROR) << "FunTpassInit init failed";
  510. exit(-1);
  511. }
  512. LOG(INFO) << "initAsr run check_and_clean_connection";
  513. std::thread clean_thread(&WebSocketServer::check_and_clean_connection,this);
  514. clean_thread.detach();
  515. LOG(INFO) << "initAsr run check_and_clean_connection finished";
  516. } catch (const std::exception& e) {
  517. LOG(INFO) << e.what();
  518. }
  519. }