|
|
@@ -255,7 +255,8 @@ void TimestampAdd(std::deque<string> &alignment_str1, std::string str_word){
|
|
|
}
|
|
|
|
|
|
bool TimestampIsPunctuation(const std::string& str) {
|
|
|
- const std::string punctuation = u8",。?、,.?";
|
|
|
+ const std::string punctuation = u8",。?、,?";
|
|
|
+ // const std::string punctuation = u8",。?、,.?";
|
|
|
for (char ch : str) {
|
|
|
if (punctuation.find(ch) == std::string::npos) {
|
|
|
return false;
|
|
|
@@ -557,6 +558,76 @@ std::string TimestampSmooth(std::string &text, std::string &text_itn, std::strin
|
|
|
return timestamps_str;
|
|
|
}
|
|
|
|
|
|
+std::string TimestampSentence(std::string &text, std::string &str_time){
|
|
|
+ std::vector<std::string> characters;
|
|
|
+ funasr::TimestampSplitChiEngCharacters(text, characters);
|
|
|
+ vector<vector<int>> timestamps = funasr::ParseTimestamps(str_time);
|
|
|
+
|
|
|
+ int idx_str = 0, idx_ts = 0;
|
|
|
+ int start = -1, end = -1;
|
|
|
+ std::string text_seg = "";
|
|
|
+ std::string ts_sentences = "";
|
|
|
+ std::string ts_sent = "";
|
|
|
+ vector<vector<int>> ts_seg;
|
|
|
+ while(idx_str < characters.size()){
|
|
|
+ if (TimestampIsPunctuation(characters[idx_str])){
|
|
|
+ if(ts_seg.size() >0){
|
|
|
+ if (ts_seg[0].size() == 2){
|
|
|
+ start = ts_seg[0][0];
|
|
|
+ }
|
|
|
+ if (ts_seg[ts_seg.size()-1].size() == 2){
|
|
|
+ end = ts_seg[ts_seg.size()-1][1];
|
|
|
+ }
|
|
|
+ }
|
|
|
+ // format
|
|
|
+ ts_sent += "{'text':'" + text_seg + "',";
|
|
|
+ ts_sent += "'start':'" + to_string(start) + "',";
|
|
|
+ ts_sent += "'end':'" + to_string(end) + "',";
|
|
|
+ ts_sent += "'ts_list':" + VectorToString(ts_seg) + "}";
|
|
|
+
|
|
|
+ if (idx_str == characters.size()-1){
|
|
|
+ ts_sentences += ts_sent;
|
|
|
+ } else{
|
|
|
+ ts_sentences += ts_sent + ",";
|
|
|
+ }
|
|
|
+
|
|
|
+ // clear
|
|
|
+ idx_str++;
|
|
|
+ text_seg = "";
|
|
|
+ ts_sent = "";
|
|
|
+ start = 0;
|
|
|
+ end = 0;
|
|
|
+ ts_seg.clear();
|
|
|
+ } else if(idx_ts < timestamps.size()) {
|
|
|
+ if (text_seg.empty()){
|
|
|
+ text_seg = characters[idx_str];
|
|
|
+ }else{
|
|
|
+ text_seg += " " + characters[idx_str];
|
|
|
+ }
|
|
|
+ ts_seg.push_back(timestamps[idx_ts]);
|
|
|
+ idx_str++;
|
|
|
+ idx_ts++;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ // for none punc results
|
|
|
+ if(ts_seg.size() >0){
|
|
|
+ if (ts_seg[0].size() == 2){
|
|
|
+ start = ts_seg[0][0];
|
|
|
+ }
|
|
|
+ if (ts_seg[ts_seg.size()-1].size() == 2){
|
|
|
+ end = ts_seg[ts_seg.size()-1][1];
|
|
|
+ }
|
|
|
+ // format
|
|
|
+ ts_sent += "{'text':'" + text_seg + "',";
|
|
|
+ ts_sent += "'start':'" + to_string(start) + "',";
|
|
|
+ ts_sent += "'end':'" + to_string(end) + "',";
|
|
|
+ ts_sent += "'ts_list':" + VectorToString(ts_seg) + "}";
|
|
|
+ ts_sentences += ts_sent;
|
|
|
+ }
|
|
|
+
|
|
|
+ return "[" +ts_sentences + "]";
|
|
|
+}
|
|
|
+
|
|
|
std::vector<std::string> split(const std::string &s, char delim) {
|
|
|
std::vector<std::string> elems;
|
|
|
std::stringstream ss(s);
|