| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374 |
- #!/usr/bin/env python3
- # 2020, Technische Universität München; Ludwig Kürzinger
- # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
- """Sliding Window for raw audio input data."""
- import torch
- import torch.nn as nn
- from typing import Tuple
class SlidingWindow(nn.Module):
    """Sliding Window.

    Provides a sliding window over a batched continuous raw audio tensor.
    Optionally, provides padding (Currently not implemented).
    Combine this module with a pre-encoder compatible with raw audio data,
    for example Sinc convolutions.

    Known issues:
        The (padded) time axis of the batch must span at least ``win_length``
        samples; shorter batches are rejected by the underlying
        ``Tensor.unfold`` call.
        WARNING: trailing values are discarded - padding not implemented yet.
        There is currently no additional window function applied to input values.
    """

    def __init__(
        self,
        win_length: int = 400,
        hop_length: int = 160,
        channels: int = 1,
        padding: int = None,
        fs=None,
    ):
        """Initialize.

        Args:
            win_length: Length of frame.
            hop_length: Relative starting point of next frame.
            channels: Number of input channels.
            padding: Padding (placeholder, currently not implemented).
            fs: Sampling rate (placeholder for compatibility, not used).
        """
        super().__init__()
        self.fs = fs
        self.win_length = win_length
        self.hop_length = hop_length
        self.channels = channels
        self.padding = padding

    def forward(
        self, input: torch.Tensor, input_lengths: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Apply a sliding window on the input.

        Args:
            input: Input (B, T, C*D) or (B, T*C*D), with D=C=1.
            input_lengths: Input lengths within batch.

        Returns:
            Tensor: Output with dimensions (B, T, C, D), with D=win_length.
            Tensor: Output lengths within batch. Items shorter than
                ``win_length`` contain no complete frame and are reported
                with length 0.
        """
        batch_size = input.size(0)
        num_samples = input.size(1)
        # (B, T, C) --> (T, B, C), so that the time axis can be unfolded.
        continuous = input.view(batch_size, num_samples, self.channels).permute(
            1, 0, 2
        )
        # Cut the time axis into frames of win_length, hop_length apart;
        # unfold() appends the frame dimension D as the last axis.
        windowed = continuous.unfold(0, self.win_length, self.hop_length)
        # (T, B, C, D) --> (B, T, C, D)
        output = windowed.permute(1, 0, 2, 3).contiguous()
        # After unfold(), windowed lengths change.
        output_lengths = (input_lengths - self.win_length) // self.hop_length + 1
        # An item shorter than one window would otherwise yield a negative
        # frame count; it simply has no frames.
        output_lengths = output_lengths.clamp(min=0)
        return output, output_lengths

    def output_size(self) -> int:
        """Return output length of feature dimension D, i.e. the window length."""
        return self.win_length
|