目录

音频重采样

在这里,我们将演练如何使用 torchaudio 对音频波形进行重采样。

# When running this tutorial in Google Colab, install the required packages
# with the following.
# !pip install torchaudio librosa

import torch
import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as T

# Print the versions of torch and torchaudio that were imported above.
print(torch.__version__)
print(torchaudio.__version__)

输出:

1.11.0+cpu
0.11.0+cpu

准备数据和实用程序函数(跳过本节)

# @title Prepare data and utility functions. {display-mode: "form"}
# @markdown
# @markdown You do not need to look into this cell.
# @markdown Just execute once and you are good to go.

# -------------------------------------------------------------------------------
# Preparation of data and helper functions.
# -------------------------------------------------------------------------------

import math
import time

import librosa
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import Audio, display


# Default `offset` of get_sine_sweep and plot_sweep; the sweep helpers add it
# inside the logarithm (`log(offset + x)`) so the low end stays finite.
DEFAULT_OFFSET = 201
# Default `max_sweep_rate` of plot_sweep.
SWEEP_MAX_SAMPLE_RATE = 48000
# Defaults for the keyword arguments of the same name in benchmark_resample.
DEFAULT_LOWPASS_FILTER_WIDTH = 6
DEFAULT_ROLLOFF = 0.99
DEFAULT_RESAMPLING_METHOD = "sinc_interpolation"


def _get_log_freq(sample_rate, max_sweep_rate, offset):
    """Return ``sample_rate`` frequencies spaced evenly on a log scale.

    The values run from 0 up to ``max_sweep_rate // 2``. ``offset`` is added
    inside the logarithm (``log(offset + x)``) so that the lower bound does
    not map to negative infinity, and is subtracted again from the result.

    Returns a 1-D double-precision tensor with ``sample_rate`` elements.
    """
    log_low = math.log(offset)
    log_high = math.log(offset + max_sweep_rate // 2)
    log_grid = torch.linspace(log_low, log_high, sample_rate, dtype=torch.double)
    return torch.exp(log_grid) - offset


def _get_inverse_log_freq(freq, sample_rate, offset):
    """Return the sample position at which the log sweep reaches ``freq``.

    The position is ``sample_rate`` scaled by the ratio of
    ``log(1 + freq / offset)`` to ``log(1 + (sample_rate // 2) / offset)``,
    so ``freq == 0`` maps to 0 and ``freq == sample_rate // 2`` maps to
    ``sample_rate``.
    """
    nyquist = sample_rate // 2
    log_target = math.log(1 + freq / offset)
    log_full_span = math.log(1 + nyquist / offset)
    return sample_rate * (log_target / log_full_span)


def _get_freq_ticks(sample_rate, offset, f_max):
    """Locate the major log-scale frequencies on the sweep's x-axis.

    ``sample_rate`` is the rate the sweep was originally generated with.
    Every value ``v * 10 ** e`` (``v`` in 1..9, ``e`` in 2..4) that lies below
    ``sample_rate // 2`` is mapped to its x-axis position through
    ``_get_inverse_log_freq`` divided by ``sample_rate``. ``f_max`` is always
    appended as the last tick.

    Returns a ``(positions, frequencies)`` pair of equally long lists.
    """

    def _position(frequency):
        return _get_inverse_log_freq(frequency, sample_rate, offset) / sample_rate

    nyquist = sample_rate // 2
    candidates = (digit * 10 ** power for power in range(2, 5) for digit in range(1, 10))
    frequencies = [candidate for candidate in candidates if candidate < nyquist]
    frequencies.append(f_max)
    positions = [_position(frequency) for frequency in frequencies]
    return positions, frequencies


def get_sine_sweep(sample_rate, offset=DEFAULT_OFFSET):
    """Build a logarithmic sine sweep containing ``sample_rate`` samples.

    The per-sample frequency comes from ``_get_log_freq`` (with the sweep
    rate set to ``sample_rate``); the phase is the running sum of the
    per-sample phase increments ``2 * pi * freq / sample_rate``.

    Returns a tensor with a leading channel dimension of size 1.
    """
    instantaneous_freq = _get_log_freq(sample_rate, sample_rate, offset)
    phase_step = 2 * math.pi * instantaneous_freq / sample_rate
    phase = phase_step.cumsum(dim=0)
    return phase.sin().unsqueeze(dim=0)


def plot_sweep(
    waveform,
    sample_rate,
    title,
    max_sweep_rate=SWEEP_MAX_SAMPLE_RATE,
    offset=DEFAULT_OFFSET,
):
    """Show the spectrogram of a sine sweep with frequency-labelled axes.

    Only the first channel (``waveform[0]``) is plotted. The x-axis ticks are
    placed with ``_get_freq_ticks`` and labelled with the frequency of the
    original sweep; the y-axis shows the frequency of the plotted waveform.
    The call blocks until the figure is closed (``plt.show(block=True)``).
    """
    sweep_nyquist = max_sweep_rate // 2
    plot_nyquist = sample_rate // 2
    # Only these frequencies receive a visible label on each axis.
    x_labelled = {100, 500, 1000, 5000, 10000, 20000, sweep_nyquist}
    y_labelled = {1000, 5000, 10000, 20000, plot_nyquist}

    tick_positions, tick_freqs = _get_freq_ticks(max_sweep_rate, offset, plot_nyquist)
    # Unlabelled x ticks keep their position but get a ``None`` label.
    x_labels = [f if (f in x_labelled and f <= sweep_nyquist) else None for f in tick_freqs]
    y_values = [f for f in tick_freqs if f >= 1000 and f in y_labelled and f <= plot_nyquist]

    figure, axis = plt.subplots(1, 1)
    axis.specgram(waveform[0].numpy(), Fs=sample_rate)
    plt.xticks(tick_positions, x_labels)
    plt.yticks(y_values, y_values)
    axis.set_xlabel("Original Signal Frequency (Hz, log scale)")
    axis.set_ylabel("Waveform Frequency (Hz)")
    for axis_line in (axis.xaxis, axis.yaxis):
        axis_line.grid(True, alpha=0.67)
    figure.suptitle(f"{title} (sample rate: {sample_rate} Hz)")
    plt.show(block=True)


def play_audio(waveform, sample_rate):
    """Display an IPython audio player for a one- or two-channel waveform.

    ``waveform`` is converted with ``.numpy()`` and must be two-dimensional
    (channels first).

    Raises:
        ValueError: if the channel count is neither 1 nor 2.
    """
    samples = waveform.numpy()

    num_channels, _ = samples.shape
    if num_channels not in (1, 2):
        raise ValueError("Waveform with more than 2 channels are not supported.")

    if num_channels == 1:
        payload = samples[0]
    else:
        payload = (samples[0], samples[1])
    display(Audio(payload, rate=sample_rate))


def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
    """Plot one spectrogram per channel, stacked vertically.

    ``waveform`` is converted with ``.numpy()`` and must be two-dimensional
    (channels first). When there is more than one channel each panel is
    labelled ``Channel <n>``; a truthy ``xlim`` is applied to every panel.
    The figure is shown without blocking (``plt.show(block=False)``).
    """
    samples = waveform.numpy()

    num_channels, _ = samples.shape

    figure, axes = plt.subplots(num_channels, 1)
    # With a single row ``plt.subplots`` returns a bare Axes, not a sequence.
    panels = [axes] if num_channels == 1 else axes
    multi_channel = num_channels > 1
    for channel_number, (panel, channel) in enumerate(zip(panels, samples), start=1):
        panel.specgram(channel, Fs=sample_rate)
        if multi_channel:
            panel.set_ylabel(f"Channel {channel_number}")
        if xlim:
            panel.set_xlim(xlim)
    figure.suptitle(title)
    plt.show(block=False)


def benchmark_resample(
    method,
    waveform,
    sample_rate,
    resample_rate,
    lowpass_filter_width=DEFAULT_LOWPASS_FILTER_WIDTH,
    rolloff=DEFAULT_ROLLOFF,
    resampling_method=DEFAULT_RESAMPLING_METHOD,
    beta=None,
    librosa_type=None,
    iters=5,
):
    """Return the mean wall-clock time, in seconds, of one resampling call.

    Args:
        method: ``"functional"`` (``F.resample``), ``"transforms"``
            (``T.Resample``; construction is excluded from the timing) or
            ``"librosa"`` (``librosa.resample``).
        waveform: tensor to resample.
        sample_rate: source sampling rate.
        resample_rate: target sampling rate.
        lowpass_filter_width, rolloff, resampling_method, beta: forwarded to
            the torchaudio resampler; ignored for ``"librosa"``.
        librosa_type: ``res_type`` passed to ``librosa.resample``; ignored for
            the torchaudio methods.
        iters: number of timed repetitions to average over.

    Raises:
        ValueError: if ``method`` is not one of the three supported names or
            ``iters`` is smaller than 1.
    """
    if iters < 1:
        raise ValueError(f"iters must be at least 1, got {iters!r}")

    if method == "functional":

        def run():
            F.resample(
                waveform,
                sample_rate,
                resample_rate,
                lowpass_filter_width=lowpass_filter_width,
                rolloff=rolloff,
                resampling_method=resampling_method,
                # Forward beta so Kaiser-window runs use the requested shape.
                beta=beta,
            )

    elif method == "transforms":
        # Built outside the timed region: only the cached-kernel call is measured.
        resampler = T.Resample(
            sample_rate,
            resample_rate,
            lowpass_filter_width=lowpass_filter_width,
            rolloff=rolloff,
            resampling_method=resampling_method,
            beta=beta,
            dtype=waveform.dtype,
        )

        def run():
            resampler(waveform)

    elif method == "librosa":
        # Conversion to numpy is done once and is not part of the timing.
        waveform_np = waveform.squeeze().numpy()

        def run():
            librosa.resample(waveform_np, orig_sr=sample_rate, target_sr=resample_rate, res_type=librosa_type)

    else:
        raise ValueError(f"Unknown benchmark method: {method!r}")

    # perf_counter is monotonic and higher-resolution than time.time().
    begin = time.perf_counter()
    for _ in range(iters):
        run()
    elapsed = time.perf_counter() - begin
    return elapsed / iters

重采样概述

要将音频波形从一个采样频率重新采样到另一个采样频率,可以使用 torchaudio.transforms.Resample() 或 torchaudio.functional.resample()。transforms.Resample 会预先计算并缓存用于重采样的内核,而 functional.resample 则在每次调用时动态计算内核,因此在使用相同参数对多个波形进行重采样时,使用 torchaudio.transforms.Resample 会更快(参见"性能基准测试"部分)。

两种重采样方法都使用带限 sinc 插值来计算任意时间步上的信号值。其实现涉及卷积,因此我们可以利用 GPU / 多线程来提升性能。在多个子进程中使用重采样时(例如使用多个工作进程加载数据),应用程序创建的线程数可能会超出系统能够高效处理的数量。在这种情况下,设置 torch.set_num_threads(1) 可能会有所帮助。

由于有限数量的样本只能表示有限数量的频率,重采样不会产生完美的结果,可以使用多种参数来控制其质量和计算速度。我们通过对一个对数正弦扫频信号进行重采样来演示这些特性,该信号是一个频率随时间呈指数增长的正弦波。

下面的频谱图显示了信号的频率表示,其中 x 轴对应原始波形的频率(对数刻度),y 轴对应所绘制波形的频率,颜色强度表示振幅。

sample_rate = 48000
resample_rate = 32000

# Reference signal: plot and play the sweep at its original sampling rate.
waveform = get_sine_sweep(sample_rate)
plot_sweep(waveform, sample_rate, title="Original Waveform")
play_audio(waveform, sample_rate)

# Resample with the transform, then plot and play the *resampled* signal at
# the new rate so that both the figure and the audio reflect the result.
resampler = T.Resample(sample_rate, resample_rate, dtype=waveform.dtype)
resampled_waveform = resampler(waveform)
plot_sweep(resampled_waveform, resample_rate, title="Resampled Waveform")
play_audio(resampled_waveform, resample_rate)
  • 原始波形(采样率:48000 Hz)
  • 重新采样的波形(采样率:32000 Hz)

输出:

<IPython.lib.display.Audio object>
<IPython.lib.display.Audio object>

使用参数控制重采样质量

低通滤波器宽度

由于用于插值的滤波器在时间上无限延伸,lowpass_filter_width 参数用于控制对插值进行加窗时所用滤波器的宽度。它也被称为过零点数,因为插值函数在每个时间单位处都经过零点。使用较大的 lowpass_filter_width 可以得到更锐利、更精确的滤波器,但计算开销也更大。

sample_rate, resample_rate = 48000, 32000

# Resample the same sweep with a narrow and a wide filter and plot each result.
for width in (6, 128):
    resampled_waveform = F.resample(waveform, sample_rate, resample_rate, lowpass_filter_width=width)
    plot_sweep(resampled_waveform, resample_rate, title=f"lowpass_filter_width={width}")
  • lowpass_filter_width=6(采样率:32000 Hz)
  • lowpass_filter_width=128(采样率:32000 Hz)

滚降

rolloff 参数表示为奈奎斯特频率的一个比例,奈奎斯特频率是给定有限采样率所能表示的最高频率。rolloff 决定低通滤波器的截止频率,并控制混叠的程度——当高于奈奎斯特频率的频率被映射到较低频率时就会发生混叠。因此,较低的 rolloff 会减少混叠,但同时也会削弱一部分较高的频率。

sample_rate, resample_rate = 48000, 32000

# Resample the same sweep with a high and a low rolloff and plot each result.
for rolloff_value in (0.99, 0.8):
    resampled_waveform = F.resample(waveform, sample_rate, resample_rate, rolloff=rolloff_value)
    plot_sweep(resampled_waveform, resample_rate, title=f"rolloff={rolloff_value}")
  • rolloff=0.99(采样率:32000 Hz)
  • rolloff=0.8(采样率:32000 Hz)

窗函数

默认情况下,torchaudio 的 resample 使用 Hann 窗滤波器,它是一个加权余弦函数。它还支持 Kaiser 窗,这是一种接近最优的窗函数,带有一个额外的 beta 参数,可用于设计滤波器的平滑度和脉冲宽度。窗类型可以通过 resampling_method 参数来选择。

sample_rate, resample_rate = 48000, 32000

# Resample the same sweep once per window type and plot each result.
window_demos = (
    ("sinc_interpolation", "Hann Window Default"),
    ("kaiser_window", "Kaiser Window Default"),
)
for window_method, window_title in window_demos:
    resampled_waveform = F.resample(waveform, sample_rate, resample_rate, resampling_method=window_method)
    plot_sweep(resampled_waveform, resample_rate, title=window_title)
  • Hann Window Default(采样率:32000 Hz)
  • Kaiser Window Default(采样率:32000 Hz)

与 librosa 的比较

torchaudio 的 resample 函数可用于生成与 librosa(resampy)的 Kaiser 窗重采样相近的结果,只是存在少量噪声。

sample_rate = 48000
resample_rate = 32000

# librosa works on a 1-D numpy array; convert once and reuse for both presets.
waveform_np = waveform.squeeze().numpy()

# kaiser_best: torchaudio Kaiser-window settings paired with librosa's
# "kaiser_best" preset.
resampled_waveform = F.resample(
    waveform,
    sample_rate,
    resample_rate,
    lowpass_filter_width=64,
    rolloff=0.9475937167399596,
    resampling_method="kaiser_window",
    beta=14.769656459379492,
)
plot_sweep(resampled_waveform, resample_rate, title="Kaiser Window Best (torchaudio)")

librosa_resampled_waveform = torch.from_numpy(
    librosa.resample(waveform_np, orig_sr=sample_rate, target_sr=resample_rate, res_type="kaiser_best")
).unsqueeze(0)
plot_sweep(librosa_resampled_waveform, resample_rate, title="Kaiser Window Best (librosa)")

mse = torch.square(resampled_waveform - librosa_resampled_waveform).mean().item()
print("torchaudio and librosa kaiser best MSE:", mse)

# kaiser_fast: torchaudio Kaiser-window settings paired with librosa's
# "kaiser_fast" preset.
resampled_waveform = F.resample(
    waveform,
    sample_rate,
    resample_rate,
    lowpass_filter_width=16,
    rolloff=0.85,
    resampling_method="kaiser_window",
    beta=8.555504641634386,
)
# Use plot_sweep (not plot_specgram) so this figure shares axes and ticks with
# the librosa figure it is compared against.
plot_sweep(resampled_waveform, resample_rate, title="Kaiser Window Fast (torchaudio)")

librosa_resampled_waveform = torch.from_numpy(
    librosa.resample(waveform_np, orig_sr=sample_rate, target_sr=resample_rate, res_type="kaiser_fast")
).unsqueeze(0)
plot_sweep(librosa_resampled_waveform, resample_rate, title="Kaiser Window Fast (librosa)")

mse = torch.square(resampled_waveform - librosa_resampled_waveform).mean().item()
print("torchaudio and librosa kaiser fast MSE:", mse)
  • Kaiser Window Best (torchaudio) (采样率:32000 Hz)
  • Kaiser Window Best (librosa) (采样率:32000 Hz)
  • Kaiser Window Fast (torchaudio)
  • Kaiser Window Fast (librosa)(采样率:32000 Hz)

输出:

torchaudio and librosa kaiser best MSE: 2.0806901153659873e-06
torchaudio and librosa kaiser fast MSE: 2.5200744248601027e-05

性能基准测试

以下是在两对采样率之间对波形进行下采样和上采样的基准测试。我们展示了 lowpass_filter_width、窗类型和采样率可能带来的性能影响。此外,我们还提供了与 librosa 的 kaiser_best 和 kaiser_fast 的比较,torchaudio 一侧使用与之对应的参数。

详细说明结果:

  • 较大的 lowpass_filter_width 会产生较大的重采样核,因此会增加内核计算和卷积的计算时间

  • 使用 kaiser_window 会比默认的 sinc_interpolation 计算时间更长,因为计算中间窗口值更为复杂

  • 采样率与重采样率之间的最大公约数(GCD)较大时,可以进行化简,从而允许使用更小的内核并加快内核计算

# Benchmark label -> [source sample rate, target sample rate].
configs = {
    "downsample (48 -> 44.1 kHz)": [48000, 44100],
    "downsample (16 -> 8 kHz)": [16000, 8000],
    "upsample (44.1 -> 48 kHz)": [44100, 48000],
    "upsample (8 -> 16 kHz)": [8000, 16000],
}

# librosa preset name -> torchaudio keyword arguments benchmarked against it.
kaiser_presets = {
    "kaiser_best": {
        "lowpass_filter_width": 64,
        "rolloff": 0.9475937167399596,
        "resampling_method": "kaiser_window",
        "beta": 14.769656459379492,
    },
    "kaiser_fast": {
        "lowpass_filter_width": 16,
        "rolloff": 0.85,
        "resampling_method": "kaiser_window",
        "beta": 8.555504641634386,
    },
}

for label, (sample_rate, resample_rate) in configs.items():
    times, rows = [], []
    waveform = get_sine_sweep(sample_rate)

    # Default sinc interpolation with 64 and then 16 zero-crossings. No
    # librosa timing is taken for these rows, hence the leading None.
    for width in (64, 16):
        f_time = benchmark_resample("functional", waveform, sample_rate, resample_rate, lowpass_filter_width=width)
        t_time = benchmark_resample("transforms", waveform, sample_rate, resample_rate, lowpass_filter_width=width)
        times.append([None, 1000 * f_time, 1000 * t_time])
        rows.append(f"sinc (width {width})")

    # Kaiser window: librosa preset vs. the matching torchaudio parameters.
    for preset, kaiser_kwargs in kaiser_presets.items():
        lib_time = benchmark_resample("librosa", waveform, sample_rate, resample_rate, librosa_type=preset)
        f_time = benchmark_resample("functional", waveform, sample_rate, resample_rate, **kaiser_kwargs)
        t_time = benchmark_resample("transforms", waveform, sample_rate, resample_rate, **kaiser_kwargs)
        times.append([1000 * lib_time, 1000 * f_time, 1000 * t_time])
        rows.append(preset)

    # Timings are converted from seconds to milliseconds above; show one
    # table per configuration, rounded to two decimals.
    df = pd.DataFrame(times, columns=["librosa", "functional", "transforms"], index=rows)
    df.columns = pd.MultiIndex.from_product([[f"{label} time (ms)"], df.columns])
    display(df.round(2))

输出:

                downsample (48 -> 44.1 kHz) time (ms)  ...
                                              librosa  ... transforms
sinc (width 64)                                   NaN  ...       0.39
sinc (width 16)                                   NaN  ...       0.32
kaiser_best                                     36.50  ...       0.39
kaiser_fast                                      9.85  ...       0.36

[4 rows x 3 columns]
                downsample (16 -> 8 kHz) time (ms)  ...
                                           librosa  ... transforms
sinc (width 64)                                NaN  ...       0.78
sinc (width 16)                                NaN  ...       0.33
kaiser_best                                  12.68  ...       0.78
kaiser_fast                                   4.53  ...       0.34

[4 rows x 3 columns]
                upsample (44.1 -> 48 kHz) time (ms)  ...
                                            librosa  ... transforms
sinc (width 64)                                 NaN  ...       0.41
sinc (width 16)                                 NaN  ...       0.37
kaiser_best                                   33.73  ...       0.43
kaiser_fast                                   10.05  ...       0.42

[4 rows x 3 columns]
                upsample (8 -> 16 kHz) time (ms)  ...
                                         librosa  ... transforms
sinc (width 64)                              NaN  ...       0.36
sinc (width 16)                              NaN  ...       0.21
kaiser_best                                22.68  ...       0.41
kaiser_fast                                 5.27  ...       0.18

[4 rows x 3 columns]

脚本总运行时间:(0 分 4.470 秒)

由 Sphinx-Gallery 生成的图库

文档

访问 PyTorch 的全面开发人员文档

查看文档

教程

获取面向初学者和高级开发人员的深入教程

查看教程

资源

查找开发资源并解答您的问题

查看资源