We tried to run inference in the speech-interaction scenario without image inputs, but an error occurred. The command we ran and the resulting traceback are shown below.
#!/bin/bash
export CUDA_VISIBLE_DEVICES=0
export PYTHONPATH=./:./CosyVoice
STREAMOMNI_CKPT=./checkpoints_sllms/stream_omni
python3 stream_omni/eval/run_stream_omni.py \
--model-path ${STREAMOMNI_CKPT} \
--conv-mode stream_omni_llama_3_1 --model-name stream-omni \
--query /eval/VocalBench/audio/knowledge/0000.wav
Traceback (most recent call last):
File "stream_omni/eval/run_stream_omni.py", line 272, in <module>
eval_model(args)
File "stream_omni/eval/run_stream_omni.py", line 193, in eval_model
output_ids = model.generate(
File "/opt/conda/lib/python3.8/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/workspace/projects/Stream-Omni/stream_omni/model/language_model/stream_omni_llama.py", line 1326, in generate
return super().generate(position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/transformers/generation/utils.py", line 2024, in generate
result = self._sample(
File "/opt/conda/lib/python3.8/site-packages/transformers/generation/utils.py", line 2982, in _sample
outputs = self(**model_inputs, return_dict=True)
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/workspace/projects/Stream-Omni/stream_omni/model/language_model/stream_omni_llama.py", line 666, in forward
return self.infer_speech_to_speech(
File "/workspace/projects/Stream-Omni/stream_omni/model/language_model/stream_omni_llama.py", line 854, in infer_speech_to_speech
outputs = self.model(
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/workspace/projects/Stream-Omni/stream_omni/model/language_model/stream_omni_llama.py", line 490, in forward
layer_outputs = decoder_layer(
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/workspace/projects/Stream-Omni/stream_omni/model/language_model/modeling_llama.py", line 648, in forward
hidden_states, self_attn_weights, present_key_value = self.self_attn(
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/workspace/projects/Stream-Omni/stream_omni/model/language_model/modeling_llama.py", line 351, in forward
attn_weights = attn_weights + causal_mask
RuntimeError: The size of tensor a (112) must match the size of tensor b (111) at non-singleton dimension 3
As a workaround, simply padding the mismatched tensor so that the sizes agree lets the model generate both speech and text responses successfully, but we are worried that this may affect the quality of the outputs. Have you encountered this error before? Is there an official solution?
We tried to run inference in the speech-interaction scenario without image inputs, but an error occurred.
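The error is: `RuntimeError: The size of tensor a (112) must match the size of tensor b (111) at non-singleton dimension 3`, raised at `attn_weights = attn_weights + causal_mask` in `stream_omni/model/language_model/modeling_llama.py`.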
As a workaround, simply padding the mismatched tensor so that the sizes agree lets the model generate both speech and text responses successfully, but we are worried that this may affect the quality of the outputs. Have you encountered this error before? Is there an official solution?