Commit ·
68ddd2c
1
Parent(s): d20112c
"fix-convert-to-float32-before-mean-operations"
Browse files
reachy_mini_ha_voice/voice_assistant.py
CHANGED
|
@@ -387,7 +387,7 @@ class VoiceAssistantService:
|
|
| 387 |
if isinstance(audio_data, bytes):
|
| 388 |
audio_data = np.frombuffer(audio_data, dtype=np.int16)
|
| 389 |
elif isinstance(audio_data, (list, tuple)):
|
| 390 |
-
audio_data = np.array(audio_data)
|
| 391 |
|
| 392 |
# Ensure it's a numpy array
|
| 393 |
if not isinstance(audio_data, np.ndarray):
|
|
@@ -397,10 +397,31 @@ class VoiceAssistantService:
|
|
| 397 |
time.sleep(0.01)
|
| 398 |
continue
|
| 399 |
|
| 400 |
-
# Handle string dtype (S1) -
|
| 401 |
-
if audio_data.dtype.kind in ('S', 'U', 'O'):
|
| 402 |
-
#
|
| 403 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 404 |
|
| 405 |
# Handle multi-dimensional arrays (stereo/multi-channel)
|
| 406 |
if audio_data.ndim == 2:
|
|
@@ -411,24 +432,8 @@ class VoiceAssistantService:
|
|
| 411 |
elif audio_data.ndim > 2:
|
| 412 |
audio_data = audio_data.reshape(-1)
|
| 413 |
|
| 414 |
-
#
|
| 415 |
-
|
| 416 |
-
audio_chunk_array = audio_data.astype(np.float32) / 32768.0
|
| 417 |
-
elif audio_data.dtype == np.float32:
|
| 418 |
-
audio_chunk_array = audio_data
|
| 419 |
-
elif audio_data.dtype == np.float64:
|
| 420 |
-
audio_chunk_array = audio_data.astype(np.float32)
|
| 421 |
-
elif np.issubdtype(audio_data.dtype, np.integer):
|
| 422 |
-
# Other integer types
|
| 423 |
-
info = np.iinfo(audio_data.dtype)
|
| 424 |
-
scale = float(max(-info.min, info.max))
|
| 425 |
-
audio_chunk_array = audio_data.astype(np.float32) / scale
|
| 426 |
-
else:
|
| 427 |
-
# Try to convert to float32
|
| 428 |
-
audio_chunk_array = audio_data.astype(np.float32)
|
| 429 |
-
|
| 430 |
-
# Ensure 1D array
|
| 431 |
-
audio_chunk_array = audio_chunk_array.reshape(-1)
|
| 432 |
|
| 433 |
# Resample if needed
|
| 434 |
if input_sample_rate != target_sample_rate and len(audio_chunk_array) > 0:
|
|
|
|
| 387 |
if isinstance(audio_data, bytes):
|
| 388 |
audio_data = np.frombuffer(audio_data, dtype=np.int16)
|
| 389 |
elif isinstance(audio_data, (list, tuple)):
|
| 390 |
+
audio_data = np.array(audio_data, dtype=np.float32)
|
| 391 |
|
| 392 |
# Ensure it's a numpy array
|
| 393 |
if not isinstance(audio_data, np.ndarray):
|
|
|
|
| 397 |
time.sleep(0.01)
|
| 398 |
continue
|
| 399 |
|
| 400 |
+
# Handle string/bytes dtype (S1, U, O) - MUST be done before any numeric operations
|
| 401 |
+
if audio_data.dtype.kind in ('S', 'U', 'O'):
|
| 402 |
+
# Get raw bytes and convert to int16
|
| 403 |
+
raw_bytes = audio_data.tobytes()
|
| 404 |
+
if len(raw_bytes) >= 2:
|
| 405 |
+
audio_data = np.frombuffer(raw_bytes, dtype=np.int16)
|
| 406 |
+
else:
|
| 407 |
+
time.sleep(0.01)
|
| 408 |
+
continue
|
| 409 |
+
|
| 410 |
+
# Skip chunks whose dtype is still non-numeric BEFORE any mean/reshape operations
|
| 411 |
+
if not np.issubdtype(audio_data.dtype, np.number):
|
| 412 |
+
time.sleep(0.01)
|
| 413 |
+
continue
|
| 414 |
+
|
| 415 |
+
# Convert to float32 first (integer samples are scaled by their type's max magnitude) to avoid issues with mean on integer types
|
| 416 |
+
if audio_data.dtype == np.int16:
|
| 417 |
+
audio_data = audio_data.astype(np.float32) / 32768.0
|
| 418 |
+
elif audio_data.dtype != np.float32:
|
| 419 |
+
if np.issubdtype(audio_data.dtype, np.integer):
|
| 420 |
+
info = np.iinfo(audio_data.dtype)
|
| 421 |
+
scale = float(max(-info.min, info.max))
|
| 422 |
+
audio_data = audio_data.astype(np.float32) / scale
|
| 423 |
+
else:
|
| 424 |
+
audio_data = audio_data.astype(np.float32)
|
| 425 |
|
| 426 |
# Handle multi-dimensional arrays (stereo/multi-channel)
|
| 427 |
if audio_data.ndim == 2:
|
|
|
|
| 432 |
elif audio_data.ndim > 2:
|
| 433 |
audio_data = audio_data.reshape(-1)
|
| 434 |
|
| 435 |
+
# Ensure 1D float32 array
|
| 436 |
+
audio_chunk_array = audio_data.reshape(-1).astype(np.float32)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 437 |
|
| 438 |
# Resample if needed
|
| 439 |
if input_sample_rate != target_sample_rate and len(audio_chunk_array) > 0:
|