Desmond-Dong committed on
Commit
68ddd2c
·
1 Parent(s): d20112c

"fix-convert-to-float32-before-mean-operations"

Browse files
reachy_mini_ha_voice/voice_assistant.py CHANGED
@@ -387,7 +387,7 @@ class VoiceAssistantService:
387
  if isinstance(audio_data, bytes):
388
  audio_data = np.frombuffer(audio_data, dtype=np.int16)
389
  elif isinstance(audio_data, (list, tuple)):
390
- audio_data = np.array(audio_data)
391
 
392
  # Ensure it's a numpy array
393
  if not isinstance(audio_data, np.ndarray):
@@ -397,10 +397,31 @@ class VoiceAssistantService:
397
  time.sleep(0.01)
398
  continue
399
 
400
- # Handle string dtype (S1) - this is the actual error case
401
- if audio_data.dtype.kind in ('S', 'U', 'O'): # bytes, unicode, object
402
- # Convert bytes array to int16
403
- audio_data = np.frombuffer(audio_data.tobytes(), dtype=np.int16)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
404
 
405
  # Handle multi-dimensional arrays (stereo/multi-channel)
406
  if audio_data.ndim == 2:
@@ -411,24 +432,8 @@ class VoiceAssistantService:
411
  elif audio_data.ndim > 2:
412
  audio_data = audio_data.reshape(-1)
413
 
414
- # Convert to float32 normalized to [-1.0, 1.0]
415
- if audio_data.dtype == np.int16:
416
- audio_chunk_array = audio_data.astype(np.float32) / 32768.0
417
- elif audio_data.dtype == np.float32:
418
- audio_chunk_array = audio_data
419
- elif audio_data.dtype == np.float64:
420
- audio_chunk_array = audio_data.astype(np.float32)
421
- elif np.issubdtype(audio_data.dtype, np.integer):
422
- # Other integer types
423
- info = np.iinfo(audio_data.dtype)
424
- scale = float(max(-info.min, info.max))
425
- audio_chunk_array = audio_data.astype(np.float32) / scale
426
- else:
427
- # Try to convert to float32
428
- audio_chunk_array = audio_data.astype(np.float32)
429
-
430
- # Ensure 1D array
431
- audio_chunk_array = audio_chunk_array.reshape(-1)
432
 
433
  # Resample if needed
434
  if input_sample_rate != target_sample_rate and len(audio_chunk_array) > 0:
 
387
  if isinstance(audio_data, bytes):
388
  audio_data = np.frombuffer(audio_data, dtype=np.int16)
389
  elif isinstance(audio_data, (list, tuple)):
390
+ audio_data = np.array(audio_data, dtype=np.float32)
391
 
392
  # Ensure it's a numpy array
393
  if not isinstance(audio_data, np.ndarray):
 
397
  time.sleep(0.01)
398
  continue
399
 
400
+ # Handle string/bytes dtype (S1, U, O) - MUST be done before any numeric operations
401
+ if audio_data.dtype.kind in ('S', 'U', 'O'):
402
+ # Get raw bytes and convert to int16
403
+ raw_bytes = audio_data.tobytes()
404
+ if len(raw_bytes) >= 2:
405
+ audio_data = np.frombuffer(raw_bytes, dtype=np.int16)
406
+ else:
407
+ time.sleep(0.01)
408
+ continue
409
+
410
+ # Now convert to numeric type BEFORE any mean/reshape operations
411
+ if not np.issubdtype(audio_data.dtype, np.number):
412
+ time.sleep(0.01)
413
+ continue
414
+
415
+ # Convert to float32 first to avoid issues with mean on integer types
416
+ if audio_data.dtype == np.int16:
417
+ audio_data = audio_data.astype(np.float32) / 32768.0
418
+ elif audio_data.dtype != np.float32:
419
+ if np.issubdtype(audio_data.dtype, np.integer):
420
+ info = np.iinfo(audio_data.dtype)
421
+ scale = float(max(-info.min, info.max))
422
+ audio_data = audio_data.astype(np.float32) / scale
423
+ else:
424
+ audio_data = audio_data.astype(np.float32)
425
 
426
  # Handle multi-dimensional arrays (stereo/multi-channel)
427
  if audio_data.ndim == 2:
 
432
  elif audio_data.ndim > 2:
433
  audio_data = audio_data.reshape(-1)
434
 
435
+ # Ensure 1D float32 array
436
+ audio_chunk_array = audio_data.reshape(-1).astype(np.float32)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
437
 
438
  # Resample if needed
439
  if input_sample_rate != target_sample_rate and len(audio_chunk_array) > 0: