3 年之前 · 1aba91a4b8
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@
 
				 .DS_Store
			
 
				 init_model/
			
 
				 *.tar.gz
			
 
				+test_local/
			
--- a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/__init__.py
+++ b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/__init__.py
@@ -1,4 +1,3 @@
 
				 # -*- encoding: utf-8 -*-
			
 
				 # @Author: SWHL
			
 
				-# @Contact: liekkaskono@163.com
			
 
				-from .paraformer_onnx import Paraformer
			
 
				+# @Contact: liekkaskono@163.com
			
--- a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/LICENSE
+++ b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/LICENSE
@@ -1,201 +0,0 @@
 
				-                                 Apache License
			
 
				-                           Version 2.0, January 2004
			
 
				-                        http://www.apache.org/licenses/
			
 
				-
			
 
				-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
			
 
				-
			
 
				-   1. Definitions.
			
 
				-
			
 
				-      "License" shall mean the terms and conditions for use, reproduction,
			
 
				-      and distribution as defined by Sections 1 through 9 of this document.
			
 
				-
			
 
				-      "Licensor" shall mean the copyright owner or entity authorized by
			
 
				-      the copyright owner that is granting the License.
			
 
				-
			
 
				-      "Legal Entity" shall mean the union of the acting entity and all
			
 
				-      other entities that control, are controlled by, or are under common
			
 
				-      control with that entity. For the purposes of this definition,
			
 
				-      "control" means (i) the power, direct or indirect, to cause the
			
 
				-      direction or management of such entity, whether by contract or
			
 
				-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
			
 
				-      outstanding shares, or (iii) beneficial ownership of such entity.
			
 
				-
			
 
				-      "You" (or "Your") shall mean an individual or Legal Entity
			
 
				-      exercising permissions granted by this License.
			
 
				-
			
 
				-      "Source" form shall mean the preferred form for making modifications,
			
 
				-      including but not limited to software source code, documentation
			
 
				-      source, and configuration files.
			
 
				-
			
 
				-      "Object" form shall mean any form resulting from mechanical
			
 
				-      transformation or translation of a Source form, including but
			
 
				-      not limited to compiled object code, generated documentation,
			
 
				-      and conversions to other media types.
			
 
				-
			
 
				-      "Work" shall mean the work of authorship, whether in Source or
			
 
				-      Object form, made available under the License, as indicated by a
			
 
				-      copyright notice that is included in or attached to the work
			
 
				-      (an example is provided in the Appendix below).
			
 
				-
			
 
				-      "Derivative Works" shall mean any work, whether in Source or Object
			
 
				-      form, that is based on (or derived from) the Work and for which the
			
 
				-      editorial revisions, annotations, elaborations, or other modifications
			
 
				-      represent, as a whole, an original work of authorship. For the purposes
			
 
				-      of this License, Derivative Works shall not include works that remain
			
 
				-      separable from, or merely link (or bind by name) to the interfaces of,
			
 
				-      the Work and Derivative Works thereof.
			
 
				-
			
 
				-      "Contribution" shall mean any work of authorship, including
			
 
				-      the original version of the Work and any modifications or additions
			
 
				-      to that Work or Derivative Works thereof, that is intentionally
			
 
				-      submitted to Licensor for inclusion in the Work by the copyright owner
			
 
				-      or by an individual or Legal Entity authorized to submit on behalf of
			
 
				-      the copyright owner. For the purposes of this definition, "submitted"
			
 
				-      means any form of electronic, verbal, or written communication sent
			
 
				-      to the Licensor or its representatives, including but not limited to
			
 
				-      communication on electronic mailing lists, source code control systems,
			
 
				-      and issue tracking systems that are managed by, or on behalf of, the
			
 
				-      Licensor for the purpose of discussing and improving the Work, but
			
 
				-      excluding communication that is conspicuously marked or otherwise
			
 
				-      designated in writing by the copyright owner as "Not a Contribution."
			
 
				-
			
 
				-      "Contributor" shall mean Licensor and any individual or Legal Entity
			
 
				-      on behalf of whom a Contribution has been received by Licensor and
			
 
				-      subsequently incorporated within the Work.
			
 
				-
			
 
				-   2. Grant of Copyright License. Subject to the terms and conditions of
			
 
				-      this License, each Contributor hereby grants to You a perpetual,
			
 
				-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
			
 
				-      copyright license to reproduce, prepare Derivative Works of,
			
 
				-      publicly display, publicly perform, sublicense, and distribute the
			
 
				-      Work and such Derivative Works in Source or Object form.
			
 
				-
			
 
				-   3. Grant of Patent License. Subject to the terms and conditions of
			
 
				-      this License, each Contributor hereby grants to You a perpetual,
			
 
				-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
			
 
				-      (except as stated in this section) patent license to make, have made,
			
 
				-      use, offer to sell, sell, import, and otherwise transfer the Work,
			
 
				-      where such license applies only to those patent claims licensable
			
 
				-      by such Contributor that are necessarily infringed by their
			
 
				-      Contribution(s) alone or by combination of their Contribution(s)
			
 
				-      with the Work to which such Contribution(s) was submitted. If You
			
 
				-      institute patent litigation against any entity (including a
			
 
				-      cross-claim or counterclaim in a lawsuit) alleging that the Work
			
 
				-      or a Contribution incorporated within the Work constitutes direct
			
 
				-      or contributory patent infringement, then any patent licenses
			
 
				-      granted to You under this License for that Work shall terminate
			
 
				-      as of the date such litigation is filed.
			
 
				-
			
 
				-   4. Redistribution. You may reproduce and distribute copies of the
			
 
				-      Work or Derivative Works thereof in any medium, with or without
			
 
				-      modifications, and in Source or Object form, provided that You
			
 
				-      meet the following conditions:
			
 
				-
			
 
				-      (a) You must give any other recipients of the Work or
			
 
				-          Derivative Works a copy of this License; and
			
 
				-
			
 
				-      (b) You must cause any modified files to carry prominent notices
			
 
				-          stating that You changed the files; and
			
 
				-
			
 
				-      (c) You must retain, in the Source form of any Derivative Works
			
 
				-          that You distribute, all copyright, patent, trademark, and
			
 
				-          attribution notices from the Source form of the Work,
			
 
				-          excluding those notices that do not pertain to any part of
			
 
				-          the Derivative Works; and
			
 
				-
			
 
				-      (d) If the Work includes a "NOTICE" text file as part of its
			
 
				-          distribution, then any Derivative Works that You distribute must
			
 
				-          include a readable copy of the attribution notices contained
			
 
				-          within such NOTICE file, excluding those notices that do not
			
 
				-          pertain to any part of the Derivative Works, in at least one
			
 
				-          of the following places: within a NOTICE text file distributed
			
 
				-          as part of the Derivative Works; within the Source form or
			
 
				-          documentation, if provided along with the Derivative Works; or,
			
 
				-          within a display generated by the Derivative Works, if and
			
 
				-          wherever such third-party notices normally appear. The contents
			
 
				-          of the NOTICE file are for informational purposes only and
			
 
				-          do not modify the License. You may add Your own attribution
			
 
				-          notices within Derivative Works that You distribute, alongside
			
 
				-          or as an addendum to the NOTICE text from the Work, provided
			
 
				-          that such additional attribution notices cannot be construed
			
 
				-          as modifying the License.
			
 
				-
			
 
				-      You may add Your own copyright statement to Your modifications and
			
 
				-      may provide additional or different license terms and conditions
			
 
				-      for use, reproduction, or distribution of Your modifications, or
			
 
				-      for any such Derivative Works as a whole, provided Your use,
			
 
				-      reproduction, and distribution of the Work otherwise complies with
			
 
				-      the conditions stated in this License.
			
 
				-
			
 
				-   5. Submission of Contributions. Unless You explicitly state otherwise,
			
 
				-      any Contribution intentionally submitted for inclusion in the Work
			
 
				-      by You to the Licensor shall be under the terms and conditions of
			
 
				-      this License, without any additional terms or conditions.
			
 
				-      Notwithstanding the above, nothing herein shall supersede or modify
			
 
				-      the terms of any separate license agreement you may have executed
			
 
				-      with Licensor regarding such Contributions.
			
 
				-
			
 
				-   6. Trademarks. This License does not grant permission to use the trade
			
 
				-      names, trademarks, service marks, or product names of the Licensor,
			
 
				-      except as required for reasonable and customary use in describing the
			
 
				-      origin of the Work and reproducing the content of the NOTICE file.
			
 
				-
			
 
				-   7. Disclaimer of Warranty. Unless required by applicable law or
			
 
				-      agreed to in writing, Licensor provides the Work (and each
			
 
				-      Contributor provides its Contributions) on an "AS IS" BASIS,
			
 
				-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
			
 
				-      implied, including, without limitation, any warranties or conditions
			
 
				-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
			
 
				-      PARTICULAR PURPOSE. You are solely responsible for determining the
			
 
				-      appropriateness of using or redistributing the Work and assume any
			
 
				-      risks associated with Your exercise of permissions under this License.
			
 
				-
			
 
				-   8. Limitation of Liability. In no event and under no legal theory,
			
 
				-      whether in tort (including negligence), contract, or otherwise,
			
 
				-      unless required by applicable law (such as deliberate and grossly
			
 
				-      negligent acts) or agreed to in writing, shall any Contributor be
			
 
				-      liable to You for damages, including any direct, indirect, special,
			
 
				-      incidental, or consequential damages of any character arising as a
			
 
				-      result of this License or out of the use or inability to use the
			
 
				-      Work (including but not limited to damages for loss of goodwill,
			
 
				-      work stoppage, computer failure or malfunction, or any and all
			
 
				-      other commercial damages or losses), even if such Contributor
			
 
				-      has been advised of the possibility of such damages.
			
 
				-
			
 
				-   9. Accepting Warranty or Additional Liability. While redistributing
			
 
				-      the Work or Derivative Works thereof, You may choose to offer,
			
 
				-      and charge a fee for, acceptance of support, warranty, indemnity,
			
 
				-      or other liability obligations and/or rights consistent with this
			
 
				-      License. However, in accepting such obligations, You may act only
			
 
				-      on Your own behalf and on Your sole responsibility, not on behalf
			
 
				-      of any other Contributor, and only if You agree to indemnify,
			
 
				-      defend, and hold each Contributor harmless for any liability
			
 
				-      incurred by, or claims asserted against, such Contributor by reason
			
 
				-      of your accepting any such warranty or additional liability.
			
 
				-
			
 
				-   END OF TERMS AND CONDITIONS
			
 
				-
			
 
				-   APPENDIX: How to apply the Apache License to your work.
			
 
				-
			
 
				-      To apply the Apache License to your work, attach the following
			
 
				-      boilerplate notice, with the fields enclosed by brackets "[]"
			
 
				-      replaced with your own identifying information. (Don't include
			
 
				-      the brackets!)  The text should be enclosed in the appropriate
			
 
				-      comment syntax for the file format. We also recommend that a
			
 
				-      file or class name and description of purpose be included on the
			
 
				-      same "printed page" as the copyright notice for easier
			
 
				-      identification within third-party archives.
			
 
				-
			
 
				-   Copyright [yyyy] [name of copyright owner]
			
 
				-
			
 
				-   Licensed under the Apache License, Version 2.0 (the "License");
			
 
				-   you may not use this file except in compliance with the License.
			
 
				-   You may obtain a copy of the License at
			
 
				-
			
 
				-       http://www.apache.org/licenses/LICENSE-2.0
			
 
				-
			
 
				-   Unless required by applicable law or agreed to in writing, software
			
 
				-   distributed under the License is distributed on an "AS IS" BASIS,
			
 
				-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				-   See the License for the specific language governing permissions and
			
 
				-   limitations under the License.
			
--- a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/__init__.py
+++ b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/__init__.py
@@ -1,3 +0,0 @@
 
				-# -*- encoding: utf-8 -*-
			
 
				-from .feature import compute_fbank_feats, compute_mfcc_feats, apply_cmvn_sliding
			
 
				-from .ivector import compute_vad
			
--- a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/feature.py
+++ b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/feature.py
@@ -1,459 +0,0 @@
 
				-import numpy as np
			
 
				-from scipy.fftpack import dct
			
 
				-
			
 
				-
			
 
				-# ---------- feature-window ----------
			
 
				-
			
 
				-def sliding_window(x, window_size, window_shift):
			
 
				-    shape = x.shape[:-1] + (x.shape[-1] - window_size + 1, window_size)
			
 
				-    strides = x.strides + (x.strides[-1],)
			
 
				-    return np.lib.stride_tricks.as_strided(x, shape=shape, strides=strides)[::window_shift]
			
 
				-
			
 
				-
			
 
				-def func_num_frames(num_samples, window_size, window_shift, snip_edges):
			
 
				-    if snip_edges:
			
 
				-        if num_samples < window_size:
			
 
				-            return 0
			
 
				-        else:
			
 
				-            return 1 + ((num_samples - window_size) // window_shift)
			
 
				-    else:
			
 
				-        return (num_samples + (window_shift // 2)) // window_shift
			
 
				-
			
 
				-
			
 
				-def func_dither(waveform, dither_value):
			
 
				-    if dither_value == 0.0:
			
 
				-        return waveform
			
 
				-    waveform += np.random.normal(size=waveform.shape).astype(waveform.dtype) * dither_value
			
 
				-    return waveform
			
 
				-
			
 
				-
			
 
				-def func_remove_dc_offset(waveform):
			
 
				-    return waveform - np.mean(waveform)
			
 
				-
			
 
				-
			
 
				-def func_log_energy(waveform):
			
 
				-    return np.log(np.dot(waveform, waveform).clip(min=np.finfo(waveform.dtype).eps))
			
 
				-
			
 
				-
			
 
				-def func_preemphasis(waveform, preemph_coeff):
			
 
				-    if preemph_coeff == 0.0:
			
 
				-        return waveform
			
 
				-    assert 0 < preemph_coeff <= 1
			
 
				-    waveform[1:] -= preemph_coeff * waveform[:-1]
			
 
				-    waveform[0] -= preemph_coeff * waveform[0]
			
 
				-    return waveform
			
 
				-
			
 
				-
			
 
				-def sine(M):
			
 
				-    if M < 1:
			
 
				-        return np.array([])
			
 
				-    if M == 1:
			
 
				-        return np.ones(1, float)
			
 
				-    n = np.arange(0, M)
			
 
				-    return np.sin(np.pi*n/(M-1))
			
 
				-
			
 
				-
			
 
				-def povey(M):
			
 
				-    if M < 1:
			
 
				-        return np.array([])
			
 
				-    if M == 1:
			
 
				-        return np.ones(1, float)
			
 
				-    n = np.arange(0, M)
			
 
				-    return (0.5 - 0.5*np.cos(2.0*np.pi*n/(M-1)))**0.85
			
 
				-
			
 
				-
			
 
				-def feature_window_function(window_type, window_size, blackman_coeff):
			
 
				-    assert window_size > 0
			
 
				-    if window_type == 'hanning':
			
 
				-        return np.hanning(window_size)
			
 
				-    elif window_type == 'sine':
			
 
				-        return sine(window_size)
			
 
				-    elif window_type == 'hamming':
			
 
				-        return np.hamming(window_size)
			
 
				-    elif window_type == 'povey':
			
 
				-        return povey(window_size)
			
 
				-    elif window_type == 'rectangular':
			
 
				-        return np.ones(window_size)
			
 
				-    elif window_type == 'blackman':
			
 
				-        window_func = np.blackman(window_size)
			
 
				-        if blackman_coeff == 0.42:
			
 
				-            return window_func
			
 
				-        else:
			
 
				-            return window_func - 0.42 + blackman_coeff
			
 
				-    else:
			
 
				-        raise ValueError('Invalid window type {}'.format(window_type))
			
 
				-
			
 
				-
			
 
				-def process_window(window, dither, remove_dc_offset, preemphasis_coefficient, window_function, raw_energy):
			
 
				-    if dither != 0.0:
			
 
				-        window = func_dither(window, dither)
			
 
				-    if remove_dc_offset:
			
 
				-        window = func_remove_dc_offset(window)
			
 
				-    if raw_energy:
			
 
				-        log_energy = func_log_energy(window)
			
 
				-    if preemphasis_coefficient != 0.0:
			
 
				-        window = func_preemphasis(window, preemphasis_coefficient)
			
 
				-    window *= window_function
			
 
				-    if not raw_energy:
			
 
				-        log_energy = func_log_energy(window)
			
 
				-    return window, log_energy
			
 
				-
			
 
				-
			
 
				-def extract_window(waveform, blackman_coeff, dither, window_size, window_shift,
			
 
				-                   preemphasis_coefficient, raw_energy, remove_dc_offset,
			
 
				-                   snip_edges, window_type, dtype):
			
 
				-    num_samples = len(waveform)
			
 
				-    num_frames = func_num_frames(num_samples, window_size, window_shift, snip_edges)
			
 
				-    num_samples_ = (num_frames - 1) * window_shift + window_size
			
 
				-    if snip_edges:
			
 
				-        waveform = waveform[:num_samples_]
			
 
				-    else:
			
 
				-        offset = window_shift // 2 - window_size // 2
			
 
				-        waveform = np.concatenate([
			
 
				-            waveform[-offset - 1::-1],
			
 
				-            waveform,
			
 
				-            waveform[:-(offset + num_samples_ - num_samples + 1):-1]
			
 
				-        ])
			
 
				-    frames = sliding_window(waveform, window_size=window_size, window_shift=window_shift)
			
 
				-    frames = frames.astype(dtype)
			
 
				-    log_enery = np.empty(frames.shape[0], dtype=dtype)
			
 
				-    for i in range(frames.shape[0]):
			
 
				-        frames[i], log_enery[i] = process_window(
			
 
				-            window=frames[i],
			
 
				-            dither=dither,
			
 
				-            remove_dc_offset=remove_dc_offset,
			
 
				-            preemphasis_coefficient=preemphasis_coefficient,
			
 
				-            window_function=feature_window_function(
			
 
				-                window_type=window_type,
			
 
				-                window_size=window_size,
			
 
				-                blackman_coeff=blackman_coeff
			
 
				-            ).astype(dtype),
			
 
				-            raw_energy=raw_energy
			
 
				-        )
			
 
				-    return frames, log_enery
			
 
				-
			
 
				-# ---------- feature-window ----------
			
 
				-
			
 
				-
			
 
				-# ---------- feature-functions ----------
			
 
				-
			
 
				-def compute_spectrum(frames, n):
			
 
				-    complex_spec = np.fft.rfft(frames, n)
			
 
				-    return np.absolute(complex_spec)
			
 
				-
			
 
				-
			
 
				-def compute_power_spectrum(frames, n):
			
 
				-    return np.square(compute_spectrum(frames, n))
			
 
				-
			
 
				-
			
 
				-def apply_cmvn_sliding_internal(feat, center=False, window=600, min_window=100, norm_vars=False):
			
 
				-    num_frames, feat_dim = feat.shape
			
 
				-    std = 1
			
 
				-    if center:
			
 
				-        if num_frames <= window:
			
 
				-            mean = feat.mean(axis=0, keepdims=True).repeat(num_frames, axis=0)
			
 
				-            if norm_vars:
			
 
				-                std = feat.std(axis=0, keepdims=True).repeat(num_frames, axis=0)
			
 
				-        else:
			
 
				-            feat1 = feat[:window]
			
 
				-            feat2 = sliding_window(feat.T, window, 1)
			
 
				-            feat3 = feat[-window:]
			
 
				-            mean1 = feat1.mean(axis=0, keepdims=True).repeat(window // 2, axis=0)
			
 
				-            mean2 = feat2.mean(axis=2).T
			
 
				-            mean3 = feat3.mean(axis=0, keepdims=True).repeat((window - 1) // 2, axis=0)
			
 
				-            mean = np.concatenate([mean1, mean2, mean3])
			
 
				-            if norm_vars:
			
 
				-                std1 = feat1.std(axis=0, keepdims=True).repeat(window // 2, axis=0)
			
 
				-                std2 = feat2.std(axis=2).T
			
 
				-                std3 = feat3.mean(axis=0, keepdims=True).repeat((window - 1) // 2, axis=0)
			
 
				-                std = np.concatenate([std1, std2, std3])
			
 
				-    else:
			
 
				-        if num_frames <= min_window:
			
 
				-            mean = feat.mean(axis=0, keepdims=True).repeat(num_frames, axis=0)
			
 
				-            if norm_vars:
			
 
				-                std = feat.std(axis=0, keepdims=True).repeat(num_frames, axis=0)
			
 
				-        else:
			
 
				-            feat1 = feat[:min_window]
			
 
				-            mean1 = feat1.mean(axis=0, keepdims=True).repeat(min_window, axis=0)
			
 
				-            feat2_cumsum = np.cumsum(feat[:window], axis=0)[min_window:]
			
 
				-            cumcnt = np.arange(min_window + 1, min(window, num_frames) + 1, dtype=feat.dtype)[:, np.newaxis]
			
 
				-            mean2 = feat2_cumsum / cumcnt
			
 
				-            mean = np.concatenate([mean1, mean2])
			
 
				-            if norm_vars:
			
 
				-                std1 = feat1.std(axis=0, keepdims=True).repeat(min_window, axis=0)
			
 
				-                feat2_power_cumsum = np.cumsum(np.square(feat[:window]), axis=0)[min_window:]
			
 
				-                std2 = np.sqrt(feat2_power_cumsum / cumcnt - np.square(mean2))
			
 
				-                std = np.concatenate([std1, std2])
			
 
				-            if num_frames > window:
			
 
				-                feat3 = sliding_window(feat.T, window, 1)
			
 
				-                mean3 = feat3.mean(axis=2).T
			
 
				-                mean = np.concatenate([mean, mean3[1:]])
			
 
				-                if norm_vars:
			
 
				-                    std3 = feat3.std(axis=2).T
			
 
				-                    std = np.concatenate([std, std3[1:]])
			
 
				-    feat = (feat - mean) / std
			
 
				-    return feat
			
 
				-
			
 
				-# ---------- feature-functions ----------
			
 
				-
			
 
				-
			
 
				-# ---------- mel-computations ----------
			
 
				-
			
 
				-def inverse_mel_scale(mel_freq):
			
 
				-    return 700.0 * (np.exp(mel_freq / 1127.0) - 1.0)
			
 
				-
			
 
				-
			
 
				-def mel_scale(freq):
			
 
				-    return 1127.0 * np.log(1.0 + freq / 700.0)
			
 
				-
			
 
				-
			
 
				-def compute_mel_banks(num_bins, sample_frequency, low_freq, high_freq, n):
			
 
				-    """ Compute Mel banks.
			
 
				-
			
 
				-    :param num_bins: Number of triangular mel-frequency bins
			
 
				-    :param sample_frequency: Waveform data sample frequency
			
 
				-    :param low_freq: Low cutoff frequency for mel bins
			
 
				-    :param high_freq: High cutoff frequency for mel bins (if <= 0, offset from Nyquist)
			
 
				-    :param n: Window size
			
 
				-    :return: Mel banks.
			
 
				-    """
			
 
				-    assert num_bins >= 3, 'Must have at least 3 mel bins'
			
 
				-    num_fft_bins = n // 2
			
 
				-
			
 
				-    nyquist = 0.5 * sample_frequency
			
 
				-    if high_freq <= 0:
			
 
				-        high_freq = nyquist + high_freq
			
 
				-    assert 0 <= low_freq < high_freq <= nyquist
			
 
				-
			
 
				-    fft_bin_width = sample_frequency / n
			
 
				-
			
 
				-    mel_low_freq = mel_scale(low_freq)
			
 
				-    mel_high_freq = mel_scale(high_freq)
			
 
				-    mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1)
			
 
				-
			
 
				-    mel_banks = np.zeros([num_bins, num_fft_bins + 1])
			
 
				-    for i in range(num_bins):
			
 
				-        left_mel = mel_low_freq + mel_freq_delta * i
			
 
				-        center_mel = left_mel + mel_freq_delta
			
 
				-        right_mel = center_mel + mel_freq_delta
			
 
				-        for j in range(num_fft_bins):
			
 
				-            mel = mel_scale(fft_bin_width * j)
			
 
				-            if left_mel < mel < right_mel:
			
 
				-                if mel <= center_mel:
			
 
				-                    mel_banks[i, j] = (mel - left_mel) / (center_mel - left_mel)
			
 
				-                else:
			
 
				-                    mel_banks[i, j] = (right_mel - mel) / (right_mel - center_mel)
			
 
				-    return mel_banks
			
 
				-
			
 
				-
			
 
				-def compute_lifter_coeffs(q, M):
			
 
				-    """ Compute liftering coefficients (scaling on cepstral coeffs)
			
 
				-        the zeroth index is C0, which is not affected.
			
 
				-
			
 
				-    :param q: Number of lifters
			
 
				-    :param M: Number of coefficients
			
 
				-    :return: Lifters.
			
 
				-    """
			
 
				-    if M < 1:
			
 
				-        return np.array([])
			
 
				-    if M == 1:
			
 
				-        return np.ones(1, float)
			
 
				-    n = np.arange(0, M)
			
 
				-    return 1 + 0.5*np.sin(np.pi*n/q)*q
			
 
				-
			
 
				-# ---------- mel-computations ----------
			
 
				-
			
 
				-
			
 
				-# ---------- compute-fbank-feats ----------
			
 
				-
			
 
				-def compute_fbank_feats(
			
 
				-        waveform,
			
 
				-        blackman_coeff=0.42,
			
 
				-        dither=1.0,
			
 
				-        energy_floor=1.0,
			
 
				-        frame_length=25,
			
 
				-        frame_shift=10,
			
 
				-        high_freq=0,
			
 
				-        low_freq=20,
			
 
				-        num_mel_bins=23,
			
 
				-        preemphasis_coefficient=0.97,
			
 
				-        raw_energy=True,
			
 
				-        remove_dc_offset=True,
			
 
				-        round_to_power_of_two=True,
			
 
				-        sample_frequency=16000,
			
 
				-        snip_edges=True,
			
 
				-        use_energy=False,
			
 
				-        use_log_fbank=True,
			
 
				-        use_power=True,
			
 
				-        window_type='povey',
			
 
				-        dtype=np.float32):
			
 
				-    """ Compute (log) Mel filter bank energies
			
 
				-
			
 
				-    :param waveform: Input waveform.
			
 
				-    :param blackman_coeff: Constant coefficient for generalized Blackman window. (float, default = 0.42)
			
 
				-    :param dither: Dithering constant (0.0 means no dither). If you turn this off, you should set the --energy-floor option, e.g. to 1.0 or 0.1 (float, default = 1)
			
 
				-    :param energy_floor: Floor on energy (absolute, not relative) in FBANK computation. Only makes a difference if --use-energy=true; only necessary if --dither=0.0.  Suggested values: 0.1 or 1.0 (float, default = 0)
			
 
				-    :param frame_length: Frame length in milliseconds (float, default = 25)
			
 
				-    :param frame_shift: Frame shift in milliseconds (float, default = 10)
			
 
				-    :param high_freq: High cutoff frequency for mel bins (if <= 0, offset from Nyquist) (float, default = 0)
			
 
				-    :param low_freq: Low cutoff frequency for mel bins (float, default = 20)
			
 
				-    :param num_mel_bins: Number of triangular mel-frequency bins (int, default = 23)
			
 
				-    :param preemphasis_coefficient: Coefficient for use in signal preemphasis (float, default = 0.97)
			
 
				-    :param raw_energy: If true, compute energy before preemphasis and windowing (bool, default = true)
			
 
				-    :param remove_dc_offset: Subtract mean from waveform on each frame (bool, default = true)
			
 
				-    :param round_to_power_of_two: If true, round window size to power of two by zero-padding input to FFT. (bool, default = true)
			
 
				-    :param sample_frequency: Waveform data sample frequency (must match the waveform file, if specified there) (float, default = 16000)
			
 
				-    :param snip_edges: If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length.  If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. (bool, default = true)
			
 
				-    :param use_energy: Add an extra energy output. (bool, default = false)
			
 
				-    :param use_log_fbank: If true, produce log-filterbank, else produce linear. (bool, default = true)
			
 
				-    :param use_power: If true, use power, else use magnitude. (bool, default = true)
			
 
				-    :param window_type: Type of window ("hamming"|"hanning"|"povey"|"rectangular"|"sine"|"blackmann") (string, default = "povey")
			
 
				-    :param dtype: Type of array (np.float32|np.float64) (dtype or string, default=np.float32)
			
 
				-    :return: (Log) Mel filter bank energies.
			
 
				-    """
			
 
				-    window_size = int(frame_length * sample_frequency * 0.001)
			
 
				-    window_shift = int(frame_shift * sample_frequency * 0.001)
			
 
				-    frames, log_energy = extract_window(
			
 
				-        waveform=waveform,
			
 
				-        blackman_coeff=blackman_coeff,
			
 
				-        dither=dither,
			
 
				-        window_size=window_size,
			
 
				-        window_shift=window_shift,
			
 
				-        preemphasis_coefficient=preemphasis_coefficient,
			
 
				-        raw_energy=raw_energy,
			
 
				-        remove_dc_offset=remove_dc_offset,
			
 
				-        snip_edges=snip_edges,
			
 
				-        window_type=window_type,
			
 
				-        dtype=dtype
			
 
				-    )
			
 
				-    if round_to_power_of_two:
			
 
				-        n = 1
			
 
				-        while n < window_size:
			
 
				-            n *= 2
			
 
				-    else:
			
 
				-        n = window_size
			
 
				-    if use_power:
			
 
				-        spectrum = compute_power_spectrum(frames, n)
			
 
				-    else:
			
 
				-        spectrum = compute_spectrum(frames, n)
			
 
				-    mel_banks = compute_mel_banks(
			
 
				-        num_bins=num_mel_bins,
			
 
				-        sample_frequency=sample_frequency,
			
 
				-        low_freq=low_freq,
			
 
				-        high_freq=high_freq,
			
 
				-        n=n
			
 
				-    ).astype(dtype)
			
 
				-    feat = np.dot(spectrum, mel_banks.T)
			
 
				-    if use_log_fbank:
			
 
				-        feat = np.log(feat.clip(min=np.finfo(dtype).eps))
			
 
				-    if use_energy:
			
 
				-        if energy_floor > 0.0:
			
 
				-            log_energy.clip(min=np.math.log(energy_floor))
			
 
				-        return feat, log_energy
			
 
				-    return feat
			
 
				-
			
 
				-# ---------- compute-fbank-feats ----------
			
 
				-
			
 
				-
			
 
				-# ---------- compute-mfcc-feats ----------
			
 
				-
			
 
				-def compute_mfcc_feats(
			
 
				-        waveform,
			
 
				-        blackman_coeff=0.42,
			
 
				-        cepstral_lifter=22,
			
 
				-        dither=1.0,
			
 
				-        energy_floor=0.0,
			
 
				-        frame_length=25,
			
 
				-        frame_shift=10,
			
 
				-        high_freq=0,
			
 
				-        low_freq=20,
			
 
				-        num_ceps=13,
			
 
				-        num_mel_bins=23,
			
 
				-        preemphasis_coefficient=0.97,
			
 
				-        raw_energy=True,
			
 
				-        remove_dc_offset=True,
			
 
				-        round_to_power_of_two=True,
			
 
				-        sample_frequency=16000,
			
 
				-        snip_edges=True,
			
 
				-        use_energy=True,
			
 
				-        window_type='povey',
			
 
				-        dtype=np.float32):
			
 
				-    """ Compute mel-frequency cepstral coefficients
			
 
				-
			
 
				-    :param waveform: Input waveform.
			
 
				-    :param blackman_coeff: Constant coefficient for generalized Blackman window. (float, default = 0.42)
			
 
				-    :param cepstral_lifter: Constant that controls scaling of MFCCs (float, default = 22)
			
 
				-    :param dither: Dithering constant (0.0 means no dither). If you turn this off, you should set the --energy-floor option, e.g. to 1.0 or 0.1 (float, default = 1)
			
 
				-    :param energy_floor: Floor on energy (absolute, not relative) in MFCC computation. Only makes a difference if --use-energy=true; only necessary if --dither=0.0.  Suggested values: 0.1 or 1.0 (float, default = 0)
			
 
				-    :param frame_length: Frame length in milliseconds (float, default = 25)
			
 
				-    :param frame_shift: Frame shift in milliseconds (float, default = 10)
			
 
				-    :param high_freq: High cutoff frequency for mel bins (if <= 0, offset from Nyquist) (float, default = 0)
			
 
				-    :param low_freq: Low cutoff frequency for mel bins (float, default = 20)
			
 
				-    :param num_ceps: Number of cepstra in MFCC computation (including C0) (int, default = 13)
			
 
				-    :param num_mel_bins: Number of triangular mel-frequency bins (int, default = 23)
			
 
				-    :param preemphasis_coefficient: Coefficient for use in signal preemphasis (float, default = 0.97)
			
 
				-    :param raw_energy: If true, compute energy before preemphasis and windowing (bool, default = true)
			
 
				-    :param remove_dc_offset: Subtract mean from waveform on each frame (bool, default = true)
			
 
				-    :param round_to_power_of_two: If true, round window size to power of two by zero-padding input to FFT. (bool, default = true)
			
 
				-    :param sample_frequency: Waveform data sample frequency (must match the waveform file, if specified there) (float, default = 16000)
			
 
				-    :param snip_edges: If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length.  If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. (bool, default = true)
			
 
				-    :param use_energy: Use energy (not C0) in MFCC computation (bool, default = true)
			
 
				-    :param window_type: Type of window ("hamming"|"hanning"|"povey"|"rectangular"|"sine"|"blackmann") (string, default = "povey")
			
 
				-    :param dtype: Type of array (np.float32|np.float64) (dtype or string, default=np.float32)
			
 
				-    :return: Mel-frequency cespstral coefficients.
			
 
				-    """
			
 
				-    feat, log_energy = compute_fbank_feats(
			
 
				-        waveform=waveform,
			
 
				-        blackman_coeff=blackman_coeff,
			
 
				-        dither=dither,
			
 
				-        energy_floor=energy_floor,
			
 
				-        frame_length=frame_length,
			
 
				-        frame_shift=frame_shift,
			
 
				-        high_freq=high_freq,
			
 
				-        low_freq=low_freq,
			
 
				-        num_mel_bins=num_mel_bins,
			
 
				-        preemphasis_coefficient=preemphasis_coefficient,
			
 
				-        raw_energy=raw_energy,
			
 
				-        remove_dc_offset=remove_dc_offset,
			
 
				-        round_to_power_of_two=round_to_power_of_two,
			
 
				-        sample_frequency=sample_frequency,
			
 
				-        snip_edges=snip_edges,
			
 
				-        use_energy=use_energy,
			
 
				-        use_log_fbank=True,
			
 
				-        use_power=True,
			
 
				-        window_type=window_type,
			
 
				-        dtype=dtype
			
 
				-    )
			
 
				-    feat = dct(feat, type=2, axis=1, norm='ortho')[:, :num_ceps]
			
 
				-    lifter_coeffs = compute_lifter_coeffs(cepstral_lifter, num_ceps).astype(dtype)
			
 
				-    feat = feat * lifter_coeffs
			
 
				-    if use_energy:
			
 
				-        feat[:, 0] = log_energy
			
 
				-    return feat
			
 
				-
			
 
				-# ---------- compute-mfcc-feats ----------
			
 
				-
			
 
				-
			
 
				-# ---------- apply-cmvn-sliding ----------
			
 
				-
			
 
				-def apply_cmvn_sliding(feat, center=False, window=600, min_window=100, norm_vars=False):
			
 
				-    """ Apply sliding-window cepstral mean (and optionally variance) normalization
			
 
				-
			
 
				-    :param feat: Cepstrum.
			
 
				-    :param center: If true, use a window centered on the current frame (to the extent possible, modulo end effects). If false, window is to the left. (bool, default = false)
			
 
				-    :param window: Window in frames for running average CMN computation (int, default = 600)
			
 
				-    :param min_window: Minimum CMN window used at start of decoding (adds latency only at start). Only applicable if center == false, ignored if center==true (int, default = 100)
			
 
				-    :param norm_vars: If true, normalize variance to one. (bool, default = false)
			
 
				-    :return: Normalized cepstrum.
			
 
				-    """
			
 
				-    # double-precision
			
 
				-    feat = apply_cmvn_sliding_internal(
			
 
				-        feat=feat.astype(np.float64),
			
 
				-        center=center,
			
 
				-        window=window,
			
 
				-        min_window=min_window,
			
 
				-        norm_vars=norm_vars
			
 
				-    ).astype(feat.dtype)
			
 
				-    return feat
			
 
				-
			
 
				-# ---------- apply-cmvn-sliding ----------
			
--- a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/ivector.py
+++ b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/ivector.py
@@ -1,43 +0,0 @@
 
				-import numpy as np
			
 
				-
			
 
				-from .feature import sliding_window
			
 
				-
			
 
				-
			
 
				-# ---------- compute-vad ----------
			
 
				-
			
 
				-def compute_vad(log_energy, energy_mean_scale=0.5, energy_threshold=0.5, frames_context=0, proportion_threshold=0.6):
			
 
				-    """ Apply voice activity detection
			
 
				-
			
 
				-    :param log_energy: Log mel energy.
			
 
				-    :param energy_mean_scale: If this is set to s, to get the actual threshold we let m be the mean log-energy of the file, and use s*m + vad-energy-threshold (float, default = 0.5)
			
 
				-    :param energy_threshold: Constant term in energy threshold for VAD (also see energy_mean_scale) (float, default = 5)
			
 
				-    :param frames_context: Number of frames of context on each side of central frame, in window for which energy is monitored (int, default = 0)
			
 
				-    :param proportion_threshold: Parameter controlling the proportion of frames within the window that need to have more energy than the threshold (float, default = 0.6)
			
 
				-    :return: A vector of boolean that are True if we judge the frame voiced and False otherwise.
			
 
				-    """
			
 
				-    assert len(log_energy.shape) == 1
			
 
				-    assert energy_mean_scale >= 0
			
 
				-    assert frames_context >= 0
			
 
				-    assert 0 < proportion_threshold < 1
			
 
				-    dtype = log_energy.dtype
			
 
				-    energy_threshold += energy_mean_scale * log_energy.mean()
			
 
				-    if frames_context > 0:
			
 
				-        num_frames = len(log_energy)
			
 
				-        window_size = frames_context * 2 + 1
			
 
				-        log_energy_pad = np.concatenate([
			
 
				-            np.zeros(frames_context, dtype=dtype),
			
 
				-            log_energy,
			
 
				-            np.zeros(frames_context, dtype=dtype)
			
 
				-        ])
			
 
				-        log_energy_window = sliding_window(log_energy_pad, window_size, 1)
			
 
				-        num_count = np.count_nonzero(log_energy_window > energy_threshold, axis=1)
			
 
				-        den_count = np.ones(num_frames, dtype=dtype) * window_size
			
 
				-        max_den_count = np.arange(frames_context + 1, min(window_size, num_frames) + 1, dtype=dtype)
			
 
				-        den_count[:-(frames_context + 2):-1] = max_den_count
			
 
				-        den_count[:frames_context + 1] = np.min([den_count[:frames_context + 1], max_den_count], axis=0)
			
 
				-        vad = num_count / den_count >= proportion_threshold
			
 
				-    else:
			
 
				-        vad = log_energy > energy_threshold
			
 
				-    return vad
			
 
				-
			
 
				-# ---------- compute-vad ----------
			
--- a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/paraformer_onnx.py
+++ b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/paraformer_onnx.py
@@ -10,9 +10,10 @@ import librosa
 
				 import numpy as np
			
 
				 
			
 
				 from .utils.utils import (CharTokenizer, Hypothesis, ONNXRuntimeError,
			
 
				-                    OrtInferSession, TokenIDConverter, WavFrontend, get_logger,
			
 
				+                    OrtInferSession, TokenIDConverter, get_logger,
			
 
				                     read_yaml)
			
 
				 from .utils.postprocess_utils import sentence_postprocess
			
 
				+from .utils.frontend import WavFrontend
			
 
				 
			
 
				 logging = get_logger()
			
 
				 
			
@@ -65,7 +66,7 @@ class Paraformer():
 
				                   wav_content: Union[str, np.ndarray, List[str]]) -> List:
			
 
				         def load_wav(path: str) -> np.ndarray:
			
 
				             waveform, _ = librosa.load(path, sr=None)
			
 
				-            return waveform[None, ...]
			
 
				+            return waveform
			
 
				 
			
 
				         if isinstance(wav_content, np.ndarray):
			
 
				             return [wav_content]
			
--- a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/requirements.txt
+++ b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/requirements.txt
@@ -2,4 +2,5 @@ librosa
 
				 numpy
			
 
				 onnxruntime
			
 
				 scipy
			
 
				-typeguard
			
 
				+typeguard
			
 
				+kaldi-native-fbank
			
--- a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/frontend.py
+++ b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/frontend.py
@@ -0,0 +1,137 @@
 
				+# -*- encoding: utf-8 -*-
			
 
				+from pathlib import Path
			
 
				+from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
			
 
				+
			
 
				+import numpy as np
			
 
				+from typeguard import check_argument_types
			
 
				+import kaldi_native_fbank as knf
			
 
				+
			
 
				+root_dir = Path(__file__).resolve().parent
			
 
				+
			
 
				+logger_initialized = {}
			
 
				+
			
 
				+
			
 
				+class WavFrontend():
			
 
				+    """Conventional frontend structure for ASR.
			
 
				+    """
			
 
				+
			
 
				+    def __init__(
			
 
				+            self,
			
 
				+            cmvn_file: str = None,
			
 
				+            fs: int = 16000,
			
 
				+            window: str = 'hamming',
			
 
				+            n_mels: int = 80,
			
 
				+            frame_length: int = 25.0,
			
 
				+            frame_shift: int = 10,
			
 
				+            filter_length_min: int = -1,
			
 
				+            filter_length_max: float = -1,
			
 
				+            lfr_m: int = 1,
			
 
				+            lfr_n: int = 1,
			
 
				+            dither: float = 1.0
			
 
				+    ) -> None:
			
 
				+        check_argument_types()
			
 
				+
			
 
				+        opts = knf.FbankOptions()
			
 
				+        opts.frame_opts.samp_freq = fs
			
 
				+        opts.frame_opts.dither = dither
			
 
				+        opts.frame_opts.window_type = window
			
 
				+        opts.frame_opts.frame_shift_ms = float(frame_shift)
			
 
				+        opts.frame_opts.frame_length_ms = float(frame_length)
			
 
				+        opts.mel_opts.num_bins = n_mels
			
 
				+        opts.energy_floor = 0
			
 
				+        opts.frame_opts.snip_edges = True
			
 
				+        opts.mel_opts.debug_mel = False
			
 
				+        self.opts = opts
			
 
				+
			
 
				+        self.compute_fbank_feats = knf.OnlineFbank(self.opts)
			
 
				+
			
 
				+        self.filter_length_min = filter_length_min
			
 
				+        self.filter_length_max = filter_length_max
			
 
				+        self.lfr_m = lfr_m
			
 
				+        self.lfr_n = lfr_n
			
 
				+        self.cmvn_file = cmvn_file
			
 
				+
			
 
				+        if self.cmvn_file:
			
 
				+            self.cmvn = self.load_cmvn()
			
 
				+
			
 
				+    def fbank(self,
			
 
				+              waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
			
 
				+        waveform = waveform * (1 << 15)
			
 
				+        self.compute_fbank_feats.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
			
 
				+        frames = self.compute_fbank_feats.num_frames_ready
			
 
				+        mat = np.empty([frames, self.opts.mel_opts.num_bins])
			
 
				+        for i in range(frames):
			
 
				+            mat[i, :] = self.compute_fbank_feats.get_frame(i)
			
 
				+        feat = mat.astype(np.float32)
			
 
				+        feat_len = np.array(mat.shape[0]).astype(np.int32)
			
 
				+        return feat, feat_len
			
 
				+
			
 
				+    def lfr_cmvn(self, feat: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
			
 
				+        if self.lfr_m != 1 or self.lfr_n != 1:
			
 
				+            feat = self.apply_lfr(feat, self.lfr_m, self.lfr_n)
			
 
				+
			
 
				+        if self.cmvn_file:
			
 
				+            feat = self.apply_cmvn(feat)
			
 
				+
			
 
				+        feat_len = np.array(feat.shape[0]).astype(np.int32)
			
 
				+        return feat, feat_len
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def apply_lfr(inputs: np.ndarray, lfr_m: int, lfr_n: int) -> np.ndarray:
			
 
				+        LFR_inputs = []
			
 
				+
			
 
				+        T = inputs.shape[0]
			
 
				+        T_lfr = int(np.ceil(T / lfr_n))
			
 
				+        left_padding = np.tile(inputs[0], ((lfr_m - 1) // 2, 1))
			
 
				+        inputs = np.vstack((left_padding, inputs))
			
 
				+        T = T + (lfr_m - 1) // 2
			
 
				+        for i in range(T_lfr):
			
 
				+            if lfr_m <= T - i * lfr_n:
			
 
				+                LFR_inputs.append(
			
 
				+                    (inputs[i * lfr_n:i * lfr_n + lfr_m]).reshape(1, -1))
			
 
				+            else:
			
 
				+                # process last LFR frame
			
 
				+                num_padding = lfr_m - (T - i * lfr_n)
			
 
				+                frame = inputs[i * lfr_n:].reshape(-1)
			
 
				+                for _ in range(num_padding):
			
 
				+                    frame = np.hstack((frame, inputs[-1]))
			
 
				+
			
 
				+                LFR_inputs.append(frame)
			
 
				+        LFR_outputs = np.vstack(LFR_inputs).astype(np.float32)
			
 
				+        return LFR_outputs
			
 
				+
			
 
				+    def apply_cmvn(self, inputs: np.ndarray) -> np.ndarray:
			
 
				+        """
			
 
				+        Apply CMVN with mvn data
			
 
				+        """
			
 
				+        frame, dim = inputs.shape
			
 
				+        means = np.tile(self.cmvn[0:1, :dim], (frame, 1))
			
 
				+        vars = np.tile(self.cmvn[1:2, :dim], (frame, 1))
			
 
				+        inputs = (inputs + means) * vars
			
 
				+        return inputs
			
 
				+
			
 
				+    def load_cmvn(self,) -> np.ndarray:
			
 
				+        with open(self.cmvn_file, 'r', encoding='utf-8') as f:
			
 
				+            lines = f.readlines()
			
 
				+
			
 
				+        means_list = []
			
 
				+        vars_list = []
			
 
				+        for i in range(len(lines)):
			
 
				+            line_item = lines[i].split()
			
 
				+            if line_item[0] == '<AddShift>':
			
 
				+                line_item = lines[i + 1].split()
			
 
				+                if line_item[0] == '<LearnRateCoef>':
			
 
				+                    add_shift_line = line_item[3:(len(line_item) - 1)]
			
 
				+                    means_list = list(add_shift_line)
			
 
				+                    continue
			
 
				+            elif line_item[0] == '<Rescale>':
			
 
				+                line_item = lines[i + 1].split()
			
 
				+                if line_item[0] == '<LearnRateCoef>':
			
 
				+                    rescale_line = line_item[3:(len(line_item) - 1)]
			
 
				+                    vars_list = list(rescale_line)
			
 
				+                    continue
			
 
				+
			
 
				+        means = np.array(means_list).astype(np.float64)
			
 
				+        vars = np.array(vars_list).astype(np.float64)
			
 
				+        cmvn = np.array([means, vars])
			
 
				+        return cmvn
			
--- a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/utils.py
+++ b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/utils.py
@@ -13,7 +13,6 @@ from onnxruntime import (GraphOptimizationLevel, InferenceSession,
 
				                          SessionOptions, get_available_providers, get_device)
			
 
				 from typeguard import check_argument_types
			
 
				 
			
 
				-from funasr.runtime.python.onnxruntime.paraformer.rapid_paraformer.kaldifeat import compute_fbank_feats
			
 
				 import warnings
			
 
				 
			
 
				 root_dir = Path(__file__).resolve().parent
			
@@ -121,128 +120,6 @@ class CharTokenizer():
 
				         )
			
 
				 
			
 
				 
			
 
				-class WavFrontend():
			
 
				-    """Conventional frontend structure for ASR.
			
 
				-    """
			
 
				-
			
 
				-    def __init__(
			
 
				-            self,
			
 
				-            cmvn_file: str = None,
			
 
				-            fs: int = 16000,
			
 
				-            window: str = 'hamming',
			
 
				-            n_mels: int = 80,
			
 
				-            frame_length: int = 25,
			
 
				-            frame_shift: int = 10,
			
 
				-            filter_length_min: int = -1,
			
 
				-            filter_length_max: float = -1,
			
 
				-            lfr_m: int = 1,
			
 
				-            lfr_n: int = 1,
			
 
				-            dither: float = 1.0
			
 
				-    ) -> None:
			
 
				-        check_argument_types()
			
 
				-
			
 
				-        self.fs = fs
			
 
				-        self.window = window
			
 
				-        self.n_mels = n_mels
			
 
				-        self.frame_length = frame_length
			
 
				-        self.frame_shift = frame_shift
			
 
				-        self.filter_length_min = filter_length_min
			
 
				-        self.filter_length_max = filter_length_max
			
 
				-        self.lfr_m = lfr_m
			
 
				-        self.lfr_n = lfr_n
			
 
				-        self.cmvn_file = cmvn_file
			
 
				-        self.dither = dither
			
 
				-
			
 
				-        if self.cmvn_file:
			
 
				-            self.cmvn = self.load_cmvn()
			
 
				-
			
 
				-    def fbank(self,
			
 
				-              input_content: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
			
 
				-        waveform_len = input_content.shape[1]
			
 
				-        waveform = input_content[0][:waveform_len]
			
 
				-        waveform = waveform * (1 << 15)
			
 
				-        mat = compute_fbank_feats(waveform,
			
 
				-                                  num_mel_bins=self.n_mels,
			
 
				-                                  frame_length=self.frame_length,
			
 
				-                                  frame_shift=self.frame_shift,
			
 
				-                                  dither=self.dither,
			
 
				-                                  energy_floor=0.0,
			
 
				-                                  sample_frequency=self.fs,
			
 
				-                                  window_type=self.window)
			
 
				-        feat = mat.astype(np.float32)
			
 
				-        feat_len = np.array(mat.shape[0]).astype(np.int32)
			
 
				-        return feat, feat_len
			
 
				-
			
 
				-    def lfr_cmvn(self, feat: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
			
 
				-        if self.lfr_m != 1 or self.lfr_n != 1:
			
 
				-            feat = self.apply_lfr(feat, self.lfr_m, self.lfr_n)
			
 
				-
			
 
				-        if self.cmvn_file:
			
 
				-            feat = self.apply_cmvn(feat)
			
 
				-
			
 
				-        feat_len = np.array(feat.shape[0]).astype(np.int32)
			
 
				-        return feat, feat_len
			
 
				-
			
 
				-    @staticmethod
			
 
				-    def apply_lfr(inputs: np.ndarray, lfr_m: int, lfr_n: int) -> np.ndarray:
			
 
				-        LFR_inputs = []
			
 
				-
			
 
				-        T = inputs.shape[0]
			
 
				-        T_lfr = int(np.ceil(T / lfr_n))
			
 
				-        left_padding = np.tile(inputs[0], ((lfr_m - 1) // 2, 1))
			
 
				-        inputs = np.vstack((left_padding, inputs))
			
 
				-        T = T + (lfr_m - 1) // 2
			
 
				-        for i in range(T_lfr):
			
 
				-            if lfr_m <= T - i * lfr_n:
			
 
				-                LFR_inputs.append(
			
 
				-                    (inputs[i * lfr_n:i * lfr_n + lfr_m]).reshape(1, -1))
			
 
				-            else:
			
 
				-                # process last LFR frame
			
 
				-                num_padding = lfr_m - (T - i * lfr_n)
			
 
				-                frame = inputs[i * lfr_n:].reshape(-1)
			
 
				-                for _ in range(num_padding):
			
 
				-                    frame = np.hstack((frame, inputs[-1]))
			
 
				-
			
 
				-                LFR_inputs.append(frame)
			
 
				-        LFR_outputs = np.vstack(LFR_inputs).astype(np.float32)
			
 
				-        return LFR_outputs
			
 
				-
			
 
				-    def apply_cmvn(self, inputs: np.ndarray) -> np.ndarray:
			
 
				-        """
			
 
				-        Apply CMVN with mvn data
			
 
				-        """
			
 
				-        frame, dim = inputs.shape
			
 
				-        means = np.tile(self.cmvn[0:1, :dim], (frame, 1))
			
 
				-        vars = np.tile(self.cmvn[1:2, :dim], (frame, 1))
			
 
				-        inputs = (inputs + means) * vars
			
 
				-        return inputs
			
 
				-
			
 
				-    def load_cmvn(self,) -> np.ndarray:
			
 
				-        with open(self.cmvn_file, 'r', encoding='utf-8') as f:
			
 
				-            lines = f.readlines()
			
 
				-
			
 
				-        means_list = []
			
 
				-        vars_list = []
			
 
				-        for i in range(len(lines)):
			
 
				-            line_item = lines[i].split()
			
 
				-            if line_item[0] == '<AddShift>':
			
 
				-                line_item = lines[i + 1].split()
			
 
				-                if line_item[0] == '<LearnRateCoef>':
			
 
				-                    add_shift_line = line_item[3:(len(line_item) - 1)]
			
 
				-                    means_list = list(add_shift_line)
			
 
				-                    continue
			
 
				-            elif line_item[0] == '<Rescale>':
			
 
				-                line_item = lines[i + 1].split()
			
 
				-                if line_item[0] == '<LearnRateCoef>':
			
 
				-                    rescale_line = line_item[3:(len(line_item) - 1)]
			
 
				-                    vars_list = list(rescale_line)
			
 
				-                    continue
			
 
				-
			
 
				-        means = np.array(means_list).astype(np.float64)
			
 
				-        vars = np.array(vars_list).astype(np.float64)
			
 
				-        cmvn = np.array([means, vars])
			
 
				-        return cmvn
			
 
				-
			
 
				 
			
 
				 class Hypothesis(NamedTuple):
			
 
				     """Hypothesis data type."""