SepformerSeparation cannot load audio.

Open imxtx opened this issue 9 months ago • 0 comments

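### Describe the bug

Calling `SepformerSeparation.separate_file` on a local WAV file raises a `RuntimeError`: the path passed to `torchaudio.load` is a `pathlib.PosixPath`, which the torchaudio SoX backend (`torchaudio_sox::load_audio_file`) rejects because it expects a `str`.

Code: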

from speechbrain.inference import SepformerSeparation as separator
import torchaudio

model = separator.from_hparams(
    source="speechbrain/sepformer-wham16k-enhancement",
    # savedir="pretrained_models/sepformer-wham16k-enhancement",
)

# for custom file, change path
import os
assert os.path.exists("../trajectory/uot3d_mel_t=0.50.wav")
est_sources = model.separate_file(
    path="../trajectory/uot3d_mel_t=0.50.wav"
)

Error:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[99], [line 12](vscode-notebook-cell:?execution_count=99&line=12)
     [10](vscode-notebook-cell:?execution_count=99&line=10) import os
     [11](vscode-notebook-cell:?execution_count=99&line=11) assert os.path.exists("../trajectory/uot3d_mel_t=0.50.wav")
---> [12](vscode-notebook-cell:?execution_count=99&line=12) est_sources = model.separate_file(
     [13](vscode-notebook-cell:?execution_count=99&line=13)     path="../trajectory/uot3d_mel_t=0.50.wav"
     [14](vscode-notebook-cell:?execution_count=99&line=14) )
     [16](vscode-notebook-cell:?execution_count=99&line=16) # torchaudio.save("enhanced_wham16k.wav", est_sources[:, :, 0].detach().cpu(), 16000)

File /mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/speechbrain/inference/separation.py:107, in SepformerSeparation.separate_file(self, path, savedir)
     [99](https://vscode-remote+ssh-002dremote-002b10-002e120-002e17-002e57.vscode-resource.vscode-cdn.net/mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/speechbrain/inference/separation.py:99) source, fl = split_path(path)
    [100](https://vscode-remote+ssh-002dremote-002b10-002e120-002e17-002e57.vscode-resource.vscode-cdn.net/mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/speechbrain/inference/separation.py:100) path = fetch(
    [101](https://vscode-remote+ssh-002dremote-002b10-002e120-002e17-002e57.vscode-resource.vscode-cdn.net/mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/speechbrain/inference/separation.py:101)     fl,
    [102](https://vscode-remote+ssh-002dremote-002b10-002e120-002e17-002e57.vscode-resource.vscode-cdn.net/mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/speechbrain/inference/separation.py:102)     source=source,
    [103](https://vscode-remote+ssh-002dremote-002b10-002e120-002e17-002e57.vscode-resource.vscode-cdn.net/mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/speechbrain/inference/separation.py:103)     savedir=savedir,
    [104](https://vscode-remote+ssh-002dremote-002b10-002e120-002e17-002e57.vscode-resource.vscode-cdn.net/mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/speechbrain/inference/separation.py:104)     local_strategy=LocalStrategy.SYMLINK,
    [105](https://vscode-remote+ssh-002dremote-002b10-002e120-002e17-002e57.vscode-resource.vscode-cdn.net/mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/speechbrain/inference/separation.py:105) )
--> [107](https://vscode-remote+ssh-002dremote-002b10-002e120-002e17-002e57.vscode-resource.vscode-cdn.net/mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/speechbrain/inference/separation.py:107) batch, fs_file = torchaudio.load(path)
    [108](https://vscode-remote+ssh-002dremote-002b10-002e120-002e17-002e57.vscode-resource.vscode-cdn.net/mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/speechbrain/inference/separation.py:108) batch = batch.to(self.device)
    [109](https://vscode-remote+ssh-002dremote-002b10-002e120-002e17-002e57.vscode-resource.vscode-cdn.net/mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/speechbrain/inference/separation.py:109) fs_model = self.hparams.sample_rate

File /mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/torchaudio/_backend/utils.py:205, in get_load_func.<locals>.load(uri, frame_offset, num_frames, normalize, channels_first, format, buffer_size, backend)
    [128](https://vscode-remote+ssh-002dremote-002b10-002e120-002e17-002e57.vscode-resource.vscode-cdn.net/mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/torchaudio/_backend/utils.py:128) """Load audio data from source.
    [129](https://vscode-remote+ssh-002dremote-002b10-002e120-002e17-002e57.vscode-resource.vscode-cdn.net/mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/torchaudio/_backend/utils.py:129) 
    [130](https://vscode-remote+ssh-002dremote-002b10-002e120-002e17-002e57.vscode-resource.vscode-cdn.net/mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/torchaudio/_backend/utils.py:130) By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
   (...)
    [202](https://vscode-remote+ssh-002dremote-002b10-002e120-002e17-002e57.vscode-resource.vscode-cdn.net/mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/torchaudio/_backend/utils.py:202)         `[channel, time]` else `[time, channel]`.
    [203](https://vscode-remote+ssh-002dremote-002b10-002e120-002e17-002e57.vscode-resource.vscode-cdn.net/mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/torchaudio/_backend/utils.py:203) """
    [204](https://vscode-remote+ssh-002dremote-002b10-002e120-002e17-002e57.vscode-resource.vscode-cdn.net/mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/torchaudio/_backend/utils.py:204) backend = dispatcher(uri, format, backend)
--> [205](https://vscode-remote+ssh-002dremote-002b10-002e120-002e17-002e57.vscode-resource.vscode-cdn.net/mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/torchaudio/_backend/utils.py:205) return backend.load(uri, frame_offset, num_frames, normalize, channels_first, format, buffer_size)

File /mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/torchaudio/_backend/sox.py:44, in SoXBackend.load(uri, frame_offset, num_frames, normalize, channels_first, format, buffer_size)
     [39](https://vscode-remote+ssh-002dremote-002b10-002e120-002e17-002e57.vscode-resource.vscode-cdn.net/mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/torchaudio/_backend/sox.py:39)     raise ValueError(
     [40](https://vscode-remote+ssh-002dremote-002b10-002e120-002e17-002e57.vscode-resource.vscode-cdn.net/mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/torchaudio/_backend/sox.py:40)         "SoX backend does not support loading from file-like objects. ",
     [41](https://vscode-remote+ssh-002dremote-002b10-002e120-002e17-002e57.vscode-resource.vscode-cdn.net/mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/torchaudio/_backend/sox.py:41)         "Please use an alternative backend that does support loading from file-like objects, e.g. FFmpeg.",
     [42](https://vscode-remote+ssh-002dremote-002b10-002e120-002e17-002e57.vscode-resource.vscode-cdn.net/mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/torchaudio/_backend/sox.py:42)     )
     [43](https://vscode-remote+ssh-002dremote-002b10-002e120-002e17-002e57.vscode-resource.vscode-cdn.net/mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/torchaudio/_backend/sox.py:43) else:
---> [44](https://vscode-remote+ssh-002dremote-002b10-002e120-002e17-002e57.vscode-resource.vscode-cdn.net/mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/torchaudio/_backend/sox.py:44)     ret = sox_ext.load_audio_file(uri, frame_offset, num_frames, normalize, channels_first, format)
     [45](https://vscode-remote+ssh-002dremote-002b10-002e120-002e17-002e57.vscode-resource.vscode-cdn.net/mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/torchaudio/_backend/sox.py:45)     if not ret:
     [46](https://vscode-remote+ssh-002dremote-002b10-002e120-002e17-002e57.vscode-resource.vscode-cdn.net/mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/torchaudio/_backend/sox.py:46)         raise RuntimeError(f"Failed to load audio from {uri}.")

File /mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/torch/_ops.py:854, in OpOverloadPacket.__call__(self_, *args, **kwargs)
    [846](https://vscode-remote+ssh-002dremote-002b10-002e120-002e17-002e57.vscode-resource.vscode-cdn.net/mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/torch/_ops.py:846) def __call__(self_, *args, **kwargs):  # noqa: B902
    [847](https://vscode-remote+ssh-002dremote-002b10-002e120-002e17-002e57.vscode-resource.vscode-cdn.net/mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/torch/_ops.py:847)     # use `self_` to avoid naming collide with aten ops arguments that
    [848](https://vscode-remote+ssh-002dremote-002b10-002e120-002e17-002e57.vscode-resource.vscode-cdn.net/mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/torch/_ops.py:848)     # named "self". This way, all the aten ops can be called by kwargs.
   (...)
    [852](https://vscode-remote+ssh-002dremote-002b10-002e120-002e17-002e57.vscode-resource.vscode-cdn.net/mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/torch/_ops.py:852)     # We save the function ptr as the `op` attribute on
    [853](https://vscode-remote+ssh-002dremote-002b10-002e120-002e17-002e57.vscode-resource.vscode-cdn.net/mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/torch/_ops.py:853)     # OpOverloadPacket to access it here.
--> [854](https://vscode-remote+ssh-002dremote-002b10-002e120-002e17-002e57.vscode-resource.vscode-cdn.net/mnt/nvme0n1/xietianxin/miniconda3/envs/f5-tts/lib/python3.10/site-packages/torch/_ops.py:854)     return self_._op(*args, **(kwargs or {}))

RuntimeError: torchaudio_sox::load_audio_file() Expected a value of type 'str' for argument '_0' but instead found type 'PosixPath'.
Position: 0
Value: PosixPath('/mnt/nvme0n1/xietianxin/code/github/F5-TTS/notebooks/../trajectory/uot3d_mel_t=0.50.wav')
Declaration: torchaudio_sox::load_audio_file(str _0, int? _1, int? _2, bool? _3, bool? _4, str? _5) -> (Tensor _0, int _1)
Cast error details: Unable to cast Python instance of type <class 'pathlib.PosixPath'> to C++ type '?' (#define PYBIND11_DETAILED_ERROR_MESSAGES or compile in debug mode for details)


### Expected behaviour

It should load and separate the audio file.

### To Reproduce

_No response_

### Environment Details

Python env:

```text
accelerate==1.3.0
aiofiles==23.2.1
aiohappyeyeballs==2.4.6
aiohttp==3.11.12
aiosignal==1.3.2
aliyun-python-sdk-core==2.16.0
aliyun-python-sdk-kms==2.16.5
annotated-types==0.7.0
antlr4-python3-runtime==4.9.3
anyio==4.8.0
asttokens @ file:///home/conda/feedstock_root/build_artifacts/asttokens_1733250440834/work
async-timeout==5.0.1
attrs==25.1.0
audeer==2.2.1
audioread==3.0.1
audobject==0.7.11
audonnx==0.7.0
bitsandbytes==0.45.2
blessed==1.20.0
boto3==1.36.20
botocore==1.36.20
cached_path==1.6.7
cachetools==5.5.1
certifi==2025.1.31
cffi==1.17.1
charset-normalizer==3.4.1
click==8.1.8
coloredlogs==15.0.1
comm @ file:///home/conda/feedstock_root/build_artifacts/comm_1733502965406/work
contourpy==1.3.1
crcmod==1.7
cryptography==44.0.2
cycler==0.12.1
datasets==3.3.0
debugpy @ file:///croot/debugpy_1736267418885/work
decorator @ file:///home/conda/feedstock_root/build_artifacts/decorator_1740384970518/work
dill==0.3.8
docker-pycreds==0.4.0
editdistance==0.8.1
einops==0.8.1
einx==0.3.0
ema-pytorch==0.7.7
encodec==0.1.1
entrypoints @ file:///home/conda/feedstock_root/build_artifacts/entrypoints_1733327148154/work
exceptiongroup @ file:///home/conda/feedstock_root/build_artifacts/exceptiongroup_1733208806608/work
executing @ file:///home/conda/feedstock_root/build_artifacts/executing_1733569351617/work
-e git+https://github.com/SWivid/F5-TTS.git@4b4359bc39ec9a3dfb6ff412345030792f5c37fb#egg=f5_tts
fastapi==0.115.8
fastdtw==0.3.4
fastjsonschema==2.21.1
ffmpy==0.5.0
filelock==3.17.0
flatbuffers==25.2.10
fonttools==4.56.0
frozendict==2.4.6
frozenlist==1.5.0
fsspec==2024.12.0
funasr==1.0.27
gitdb==4.0.12
GitPython==3.1.44
google-api-core==2.24.1
google-auth==2.38.0
google-cloud-core==2.4.1
google-cloud-storage==2.19.0
google-crc32c==1.6.0
google-resumable-media==2.7.2
googleapis-common-protos==1.67.0
gpustat==1.1.1
gradio==5.14.0
gradio_client==1.7.0
h11==0.14.0
httpcore==1.0.7
httpx==0.28.1
huggingface-hub==0.27.1
humanfriendly==10.0
hydra-core==1.3.2
HyperPyYAML==1.2.2
idna==3.10
importlib_metadata==8.6.1
ipykernel @ file:///home/conda/feedstock_root/build_artifacts/ipykernel_1719845459717/work
ipython @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_ipython_1744034854/work
ipywidgets==8.1.6
jaconv==0.4.0
jamo==0.4.1
jedi @ file:///home/conda/feedstock_root/build_artifacts/jedi_1733300866624/work
jieba==0.42.1
Jinja2==3.1.5
jmespath==0.10.0
joblib==1.4.2
jsonschema==4.23.0
jsonschema-specifications==2024.10.1
jupyter-client @ file:///home/conda/feedstock_root/build_artifacts/jupyter_client_1654730843242/work
jupyter_core @ file:///home/conda/feedstock_root/build_artifacts/jupyter_core_1727163409502/work
jupyterlab_widgets==3.0.14
kaldiio==2.18.1
kiwisolver==1.4.8
lazy_loader==0.4
librosa==0.11.0
llvmlite==0.44.0
loguru==0.7.3
markdown-it-py==3.0.0
MarkupSafe==2.1.5
matplotlib==3.10.0
matplotlib-inline @ file:///home/conda/feedstock_root/build_artifacts/matplotlib-inline_1733416936468/work
mdurl==0.1.2
modelscope==1.25.0
more-itertools==10.6.0
mpmath==1.3.0
msgpack==1.1.0
multidict==6.1.0
multiprocess==0.70.16
narwhals==1.36.0
nbformat==5.10.4
nest_asyncio @ file:///home/conda/feedstock_root/build_artifacts/nest-asyncio_1733325553580/work
networkx==3.4.2
numba==0.61.2
numpy==2.2.5
nvidia-cublas-cu11==11.11.3.6
nvidia-cuda-cupti-cu11==11.8.87
nvidia-cuda-nvrtc-cu11==11.8.89
nvidia-cuda-runtime-cu11==11.8.89
nvidia-cudnn-cu11==8.7.0.84
nvidia-cufft-cu11==10.9.0.58
nvidia-curand-cu11==10.3.0.86
nvidia-cusolver-cu11==11.4.1.48
nvidia-cusparse-cu11==11.7.5.86
nvidia-ml-py==12.570.86
nvidia-nccl-cu11==2.20.5
nvidia-nvtx-cu11==11.8.86
omegaconf==2.3.0
onnx==1.17.0
onnxruntime==1.21.1
openai-whisper==20240930
opencv-python==4.11.0.86
orjson==3.10.15
oss2==2.19.1
oyaml==1.0
packaging @ file:///home/conda/feedstock_root/build_artifacts/packaging_1745075690131/work
pandas==2.2.3
parso @ file:///home/conda/feedstock_root/build_artifacts/parso_1733271261340/work
pexpect @ file:///home/conda/feedstock_root/build_artifacts/pexpect_1733301927746/work
pickleshare @ file:///home/conda/feedstock_root/build_artifacts/pickleshare_1733327343728/work
pillow==11.1.0
platformdirs @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_platformdirs_1742485085/work
plotly==6.0.1
pooch==1.8.2
POT==0.9.5
prompt_toolkit @ file:///home/conda/feedstock_root/build_artifacts/prompt-toolkit_1744724089886/work
propcache==0.2.1
proto-plus==1.26.0
protobuf==5.29.4
psutil @ file:///home/conda/feedstock_root/build_artifacts/psutil_1653089181607/work
ptyprocess @ file:///home/conda/feedstock_root/build_artifacts/ptyprocess_1733302279685/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl#sha256=92c32ff62b5fd8cf325bec5ab90d7be3d2a8ca8c8a3813ff487a8d2002630d1f
pure_eval @ file:///home/conda/feedstock_root/build_artifacts/pure_eval_1733569405015/work
pyarrow==19.0.0
pyasn1==0.6.1
pyasn1_modules==0.4.1
pycparser==2.22
pycryptodome==3.22.0
pydantic==2.10.6
pydantic_core==2.27.2
pydub==0.25.1
Pygments @ file:///home/conda/feedstock_root/build_artifacts/pygments_1736243443484/work
pynndescent==0.5.13
pyparsing==3.2.1
pypinyin==0.53.0
python-dateutil @ file:///home/conda/feedstock_root/build_artifacts/python-dateutil_1733215673016/work
python-multipart==0.0.20
pytorch-wpe==0.0.1
pytz==2025.1
PyYAML==6.0.2
pyzmq @ file:///croot/pyzmq_1734687138743/work
referencing==0.36.2
regex==2024.11.6
requests==2.32.3
rich==13.9.4
rotary-embedding-torch==0.8.6
rpds-py==0.24.0
rsa==4.9
ruamel.yaml==0.18.10
ruamel.yaml.clib==0.2.12
ruff==0.9.6
s3transfer==0.11.2
safehttpx==0.1.6
safetensors==0.5.2
scikit-learn==1.6.1
scipy==1.15.2
semantic-version==2.10.0
sentencepiece==0.2.0
sentry-sdk==2.21.0
setproctitle==1.3.4
shellingham==1.5.4
six @ file:///home/conda/feedstock_root/build_artifacts/six_1733380938961/work
smmap==5.0.2
sniffio==1.3.1
socksio==1.0.0
soundfile==0.13.1
soxr==0.5.0.post1
speechbrain==1.0.3
stack_data @ file:///home/conda/feedstock_root/build_artifacts/stack_data_1733569443808/work
starlette==0.45.3
sympy==1.13.3
tensorboardX==2.6.2.2
threadpoolctl==3.6.0
tiktoken==0.9.0
tokenizers==0.21.0
tomli==2.2.1
tomlkit==0.13.2
torch==2.3.0+cu118
torch-complex==0.4.4
torchaudio==2.3.0+cu118
torchdiffeq==0.2.5
tornado @ file:///home/conda/feedstock_root/build_artifacts/tornado_1648827254365/work
tqdm==4.67.1
traitlets @ file:///home/conda/feedstock_root/build_artifacts/traitlets_1733367359838/work
transformers==4.48.3
transformers-stream-generator==0.0.5
triton==2.3.0
typer==0.15.1
typing_extensions @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_typing_extensions_1744302253/work
tzdata==2025.1
umap-learn==0.5.7
urllib3==2.4.0
uvicorn==0.34.0
vocos==0.1.0
wandb==0.19.9
wcwidth @ file:///home/conda/feedstock_root/build_artifacts/wcwidth_1733231326287/work
websockets==14.2
widgetsnbextension==4.0.14
x-transformers==2.0.2
xxhash==3.5.0
yarl==1.18.3
zipp==3.21.0
```

### Relevant Log Output

### Additional Context

_No response_

May 01 '25 14:05 imxtx