Commit ·
c92d558
1
Parent(s): e1fb914
恢复到4051ebd: 手势识别有依赖冲突,暂时移除
Browse files- PROJECT_PLAN.md +33 -117
- index.html +0 -15
- pyproject.toml +1 -10
- reachy_mini_ha_voice/__init__.py +1 -1
- reachy_mini_ha_voice/camera_server.py +9 -91
- reachy_mini_ha_voice/entity_registry.py +0 -46
- reachy_mini_ha_voice/head_tracker.py +26 -36
PROJECT_PLAN.md
CHANGED
|
@@ -614,7 +614,7 @@ VAD_DB_OFF = -45 # 停止检测阈值
|
|
| 614 |
**技术实现**:
|
| 615 |
- `tap_detector.py` - IMU 加速度突变检测
|
| 616 |
- `satellite.py:_tap_conversation_mode` - 持续对话模式标志
|
| 617 |
-
- 阈值:
|
| 618 |
- 冷却时间: 1.0s (防止重复触发)
|
| 619 |
- 仅限无线版本 (Wireless) 可用
|
| 620 |
|
|
@@ -644,87 +644,7 @@ def _tts_finished(self):
|
|
| 644 |
| 倾斜/倒下 | 播放求助动作 + 语音 "我倒了,帮帮我" | ❌ 未实现 |
|
| 645 |
| 长时间静止 | 进入休眠动画 | ❌ 未实现 |
|
| 646 |
|
| 647 |
-
### Phase 21 -
|
| 648 |
-
|
| 649 |
-
**目标**: 使用 MediaPipe Hands 检测手势,实现非语音交互。
|
| 650 |
-
|
| 651 |
-
**技术方案**:
|
| 652 |
-
- 使用 MediaPipe Hands(完全本地运行,无云端依赖)
|
| 653 |
-
- 与 YOLO 人脸检测并行运行(每隔一帧处理手势,节省 CPU)
|
| 654 |
-
- 手势需保持 0.5 秒才触发,1.5 秒冷却期
|
| 655 |
-
|
| 656 |
-
**已实现手势 (11种)**:
|
| 657 |
-
|
| 658 |
-
| 手势 | 英文值 | 含义 | 检测逻辑 |
|
| 659 |
-
|------|--------|------|---------|
|
| 660 |
-
| 👍 | `thumbs_up` | 确认/点赞 | 拇指向上,其他手指握拳 |
|
| 661 |
-
| 👎 | `thumbs_down` | 拒绝/不喜欢 | 拇指向下,其他手指握拳 |
|
| 662 |
-
| ✋ | `open_palm` | 停止 | 所有手指伸展 |
|
| 663 |
-
| ✊ | `fist` | 暂停/保持 | 所有手指握拳 |
|
| 664 |
-
| ✌️ | `peace` | 胜利/和平 | 食指和中指伸展,其他握拳 |
|
| 665 |
-
| 👌 | `ok` | OK | 拇指和食指形成圆圈,其他伸展 |
|
| 666 |
-
| ☝️ | `pointing_up` | 注意/一 | 仅食指伸展 |
|
| 667 |
-
| 🤘 | `rock` | Rock on | 食指和小指伸展,中指和无名指握拳 |
|
| 668 |
-
| 🤙 | `call` | Call me | 拇指和小指伸展,其他握拳 |
|
| 669 |
-
| 3️⃣ | `three` | 三 | 食指、中指、无名指伸展 |
|
| 670 |
-
| 4️⃣ | `four` | 四 | 除拇指外所有手指伸展 |
|
| 671 |
-
|
| 672 |
-
**Home Assistant 实体**:
|
| 673 |
-
|
| 674 |
-
| ESPHome 实体类型 | 名称 | 说明 |
|
| 675 |
-
|-----------------|------|------|
|
| 676 |
-
| `Text Sensor` | `detected_gesture` | 当前检测到的手势 (英文) |
|
| 677 |
-
| `Switch` | `gesture_detection_enabled` | 手势检测开关 |
|
| 678 |
-
|
| 679 |
-
**代码位置**:
|
| 680 |
-
- `gesture_detector.py` - MediaPipe 手势检测器
|
| 681 |
-
- `camera_server.py` - 集成手势检测到摄像头处理循环
|
| 682 |
-
- `entity_registry.py` - Home Assistant 实体注册
|
| 683 |
-
|
| 684 |
-
**技术细节**:
|
| 685 |
-
```python
|
| 686 |
-
# gesture_detector.py - 手势分类
|
| 687 |
-
class Gesture(Enum):
|
| 688 |
-
NONE = "none"
|
| 689 |
-
THUMBS_UP = "thumbs_up"
|
| 690 |
-
THUMBS_DOWN = "thumbs_down"
|
| 691 |
-
OPEN_PALM = "open_palm"
|
| 692 |
-
FIST = "fist"
|
| 693 |
-
PEACE = "peace"
|
| 694 |
-
OK = "ok"
|
| 695 |
-
POINTING_UP = "pointing_up"
|
| 696 |
-
ROCK = "rock"
|
| 697 |
-
CALL = "call"
|
| 698 |
-
THREE = "three"
|
| 699 |
-
FOUR = "four"
|
| 700 |
-
|
| 701 |
-
# 手势检测参数
|
| 702 |
-
min_detection_confidence = 0.7
|
| 703 |
-
min_tracking_confidence = 0.5
|
| 704 |
-
gesture_hold_threshold = 0.5 # 保持 0.5 秒触发
|
| 705 |
-
gesture_cooldown = 1.5 # 触发后 1.5 秒冷却
|
| 706 |
-
gesture_clear_delay = 2.0 # 手势消失 2 秒后清除
|
| 707 |
-
```
|
| 708 |
-
|
| 709 |
-
**回调支持**:
|
| 710 |
-
```python
|
| 711 |
-
# 可为每种手势设置回调
|
| 712 |
-
camera_server.set_gesture_callbacks(
|
| 713 |
-
on_thumbs_up=lambda: print("确认"),
|
| 714 |
-
on_thumbs_down=lambda: print("拒绝"),
|
| 715 |
-
on_open_palm=lambda: print("停止"),
|
| 716 |
-
on_fist=lambda: print("暂停"),
|
| 717 |
-
on_peace=lambda: print("和平"),
|
| 718 |
-
on_ok=lambda: print("OK"),
|
| 719 |
-
on_pointing_up=lambda: print("注意"),
|
| 720 |
-
on_rock=lambda: print("Rock!"),
|
| 721 |
-
on_call=lambda: print("打电话"),
|
| 722 |
-
on_three=lambda: print("三"),
|
| 723 |
-
on_four=lambda: print("四"),
|
| 724 |
-
)
|
| 725 |
-
```
|
| 726 |
-
|
| 727 |
-
### Phase 22 - Home Assistant 场景联动 (未实现) ❌
|
| 728 |
|
| 729 |
**目标**: 根据 Home Assistant 的场景/自动化触发机器人动作。
|
| 730 |
|
|
@@ -753,20 +673,16 @@ camera_server.set_gesture_callbacks(
|
|
| 753 |
- **音频处理** - AGC、噪声抑制、回声消除
|
| 754 |
- **摄像头流** - MJPEG 实时预览
|
| 755 |
|
| 756 |
-
####
|
| 757 |
-
- **Phase 13** - Sendspin 多房间音频支持 ✅
|
| 758 |
-
- **Phase 15** - YOLO 人脸追踪 ✅
|
| 759 |
-
- **Phase 20** - 拍一拍唤醒 ✅
|
| 760 |
-
- **Phase 21** - 手势识别 (11 种手势,自动安装mediapipe) ✅
|
| 761 |
-
|
| 762 |
-
#### 部分实现功能
|
| 763 |
- **Phase 14** - 情感动作 API 基础设施 (手动触发可用)
|
| 764 |
- **Phase 19** - 重力补偿模式切换 (教学流程未实现)
|
| 765 |
|
| 766 |
### ❌ 未实现功能
|
| 767 |
|
| 768 |
#### 高优先级
|
|
|
|
| 769 |
- **Phase 14** - 自动情感动作反馈 (需与语音助手事件关联)
|
|
|
|
| 770 |
|
| 771 |
#### 中优先级
|
| 772 |
- **Phase 16** - 卡通风格运动模式 (需动态插值切换)
|
|
@@ -775,8 +691,8 @@ camera_server.set_gesture_callbacks(
|
|
| 775 |
|
| 776 |
#### 低优先级
|
| 777 |
- **Phase 19** - 教学模式录制/播放功能
|
| 778 |
-
- **Phase 20** - IMU 环境感知响应
|
| 779 |
-
- **Phase
|
| 780 |
|
| 781 |
---
|
| 782 |
|
|
@@ -786,40 +702,42 @@ camera_server.set_gesture_callbacks(
|
|
| 786 |
- ✅ **Phase 1-12**: 基础 ESPHome 实体 (45+ 个)
|
| 787 |
- ✅ 核心语音助手功能
|
| 788 |
- ✅ 基础运动反馈 (点头、摇头、注视)
|
| 789 |
-
- ✅ **Phase 13**: Sendspin 多房间音频
|
| 790 |
-
- ✅ **Phase 15**: YOLO 人脸追踪
|
| 791 |
-
- ✅ **Phase 21**: 手势识别 (11 种手势)
|
| 792 |
|
| 793 |
### 高优先级 (部分实现 🟡)
|
| 794 |
-
- 🟡 **Phase
|
| 795 |
- ✅ Emotion Selector 实体与 API 基础设施
|
| 796 |
- ❌ 自动根据语音助手响应触发情感动作
|
| 797 |
- ❌ 意图识别与情感匹配
|
| 798 |
- ❌ 舞蹈动作库集成
|
| 799 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 800 |
### 中优先级 (部分实现 🟡)
|
| 801 |
-
- 🟡 **Phase
|
| 802 |
-
- ✅
|
| 803 |
- ✅ 姿态变化检测 + 状态查询缓存 (减少 daemon 负载)
|
| 804 |
- ✅ 平滑插值动作 + 呼吸动画
|
| 805 |
- ❌ 动态插值技术切换 (CARTOON 等)
|
| 806 |
-
- 🟡 **Phase
|
| 807 |
- ✅ 语音驱动头部摆动 (SpeechSwayGenerator)
|
| 808 |
- ❌ 天线随音频节奏摆动
|
| 809 |
|
| 810 |
### 中优先级 (未实现 ❌)
|
| 811 |
-
- ❌ **Phase
|
| 812 |
|
| 813 |
### 低优先级 (部分实现 🟡)
|
| 814 |
-
- 🟡 **Phase
|
| 815 |
- ✅ 重力补偿模式切换
|
| 816 |
- ❌ 教学式交互 (录制/播放功能)
|
| 817 |
-
- 🟡 **Phase 20**: 环境感知响应
|
| 818 |
-
- ✅ 拍一拍唤醒 (IMU 加速度检测)
|
| 819 |
-
- ❌ 摇晃/倾斜检测
|
| 820 |
|
| 821 |
### 低优先级 (未实现 ❌)
|
| 822 |
-
- ❌ **Phase
|
|
|
|
| 823 |
|
| 824 |
---
|
| 825 |
|
|
@@ -827,19 +745,17 @@ camera_server.set_gesture_callbacks(
|
|
| 827 |
|
| 828 |
| 阶段 | 状态 | 完成度 | 说明 |
|
| 829 |
|------|------|--------|------|
|
| 830 |
-
| Phase 1-12 | ✅ 完成 | 100% |
|
| 831 |
-
| Phase 13 |
|
| 832 |
-
| Phase 14 |
|
| 833 |
-
| Phase 15 |
|
| 834 |
-
| Phase 16 | 🟡 部分完成 |
|
| 835 |
-
| Phase 17 |
|
| 836 |
-
| Phase 18 |
|
| 837 |
-
| Phase 19 |
|
| 838 |
-
| Phase 20 |
|
| 839 |
-
|
| 840 |
-
|
| 841 |
-
|
| 842 |
-
**总体完成度**: **Phase 1-12: 100%** | **Phase 13-22: ~60%**
|
| 843 |
|
| 844 |
---
|
| 845 |
|
|
|
|
| 614 |
**技术实现**:
|
| 615 |
- `tap_detector.py` - IMU 加速度突变检测
|
| 616 |
- `satellite.py:_tap_conversation_mode` - 持续对话模式标志
|
| 617 |
+
- 阈值: 2.0g (可配置)
|
| 618 |
- 冷却时间: 1.0s (防止重复触发)
|
| 619 |
- 仅限无线版本 (Wireless) 可用
|
| 620 |
|
|
|
|
| 644 |
| 倾斜/倒下 | 播放求助动作 + 语音 "我倒了,帮帮我" | ❌ 未实现 |
|
| 645 |
| 长时间静止 | 进入休眠动画 | ❌ 未实现 |
|
| 646 |
|
| 647 |
+
### Phase 21 - Home Assistant 场景联动 (未实现) ❌
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 648 |
|
| 649 |
**目标**: 根据 Home Assistant 的场景/自动化触发机器人动作。
|
| 650 |
|
|
|
|
| 673 |
- **音频处理** - AGC、噪声抑制、回声消除
|
| 674 |
- **摄像头流** - MJPEG 实时预览
|
| 675 |
|
| 676 |
+
#### 部分实现功能 (Phase 14-21)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 677 |
- **Phase 14** - 情感动作 API 基础设施 (手动触发可用)
|
| 678 |
- **Phase 19** - 重力补偿模式切换 (教学流程未实现)
|
| 679 |
|
| 680 |
### ❌ 未实现功能
|
| 681 |
|
| 682 |
#### 高优先级
|
| 683 |
+
- ~~**Phase 13** - Sendspin 音频播放支持~~ ✅ **已完成**
|
| 684 |
- **Phase 14** - 自动情感动作反馈 (需与语音助手事件关联)
|
| 685 |
+
- **Phase 15** - 持续声源追踪 (仅唤醒时转向)
|
| 686 |
|
| 687 |
#### 中优先级
|
| 688 |
- **Phase 16** - 卡通风格运动模式 (需动态插值切换)
|
|
|
|
| 691 |
|
| 692 |
#### 低优先级
|
| 693 |
- **Phase 19** - 教学模式录制/播放功能
|
| 694 |
+
- **Phase 20** - IMU 环境感知响应
|
| 695 |
+
- **Phase 21** - Home Assistant 场景联动
|
| 696 |
|
| 697 |
---
|
| 698 |
|
|
|
|
| 702 |
- ✅ **Phase 1-12**: 基础 ESPHome 实体 (45+ 个)
|
| 703 |
- ✅ 核心语音助手功能
|
| 704 |
- ✅ 基础运动反馈 (点头、摇头、注视)
|
|
|
|
|
|
|
|
|
|
| 705 |
|
| 706 |
### 高优先级 (部分实现 🟡)
|
| 707 |
+
- 🟡 **Phase 13**: 情感动作反馈系统
|
| 708 |
- ✅ Emotion Selector 实体与 API 基础设施
|
| 709 |
- ❌ 自动根据语音助手响应触发情感动作
|
| 710 |
- ❌ 意图识别与情感匹配
|
| 711 |
- ❌ 舞蹈动作库集成
|
| 712 |
|
| 713 |
+
### 高优先级 (未实现 ❌)
|
| 714 |
+
- ❌ **Phase 14**: 智能声源追踪增强
|
| 715 |
+
- ✅ 唤醒时转向声源
|
| 716 |
+
- ❌ 持续声源追踪
|
| 717 |
+
- ❌ 多人对话切换
|
| 718 |
+
- ❌ 声源可视化
|
| 719 |
+
|
| 720 |
### 中优先级 (部分实现 🟡)
|
| 721 |
+
- 🟡 **Phase 15**: 卡通风格运动模式
|
| 722 |
+
- ✅ 20Hz 统一控制循环架构 (优化以防止 daemon 崩溃)
|
| 723 |
- ✅ 姿态变化检测 + 状态查询缓存 (减少 daemon 负载)
|
| 724 |
- ✅ 平滑插值动作 + 呼吸动画
|
| 725 |
- ❌ 动态插值技术切换 (CARTOON 等)
|
| 726 |
+
- 🟡 **Phase 16**: 说话时天线同步
|
| 727 |
- ✅ 语音驱动头部摆动 (SpeechSwayGenerator)
|
| 728 |
- ❌ 天线随音频节奏摆动
|
| 729 |
|
| 730 |
### 中优先级 (未实现 ❌)
|
| 731 |
+
- ❌ **Phase 17**: 视觉注视交互 - 眼神交流
|
| 732 |
|
| 733 |
### 低优先级 (部分实现 🟡)
|
| 734 |
+
- 🟡 **Phase 18**: 重力补偿互动模式
|
| 735 |
- ✅ 重力补偿模式切换
|
| 736 |
- ❌ 教学式交互 (录制/播放功能)
|
|
|
|
|
|
|
|
|
|
| 737 |
|
| 738 |
### 低优先级 (未实现 ❌)
|
| 739 |
+
- ❌ **Phase 19**: 环境感知响应 - IMU 触发动作
|
| 740 |
+
- ❌ **Phase 20**: Home Assistant 场景联动 - 智能家居整合
|
| 741 |
|
| 742 |
---
|
| 743 |
|
|
|
|
| 745 |
|
| 746 |
| 阶段 | 状态 | 完成度 | 说明 |
|
| 747 |
|------|------|--------|------|
|
| 748 |
+
| Phase 1-12 | ✅ 完成 | 100% | 40 个 ESPHome 实体已实现(Phase 11 LED 已禁用) |
|
| 749 |
+
| Phase 13 | 🟡 部分完成 | 30% | API 基础设施就绪,缺自动触发 |
|
| 750 |
+
| Phase 14 | ❌ 未完成 | 20% | 仅实现唤醒时转向 |
|
| 751 |
+
| Phase 15 | 🟡 部分完成 | 70% | 20Hz控制循环+姿态变化检测+状态缓存+呼吸动画已实现 |
|
| 752 |
+
| Phase 16 | 🟡 部分完成 | 50% | 语音驱动头部摆动已实现 |
|
| 753 |
+
| Phase 17 | ❌ 未完成 | 10% | 摄像头已实现,缺人脸检测 |
|
| 754 |
+
| Phase 18 | 🟡 部分完成 | 40% | 模式切换已实现,缺教学流程 |
|
| 755 |
+
| Phase 19 | ❌ 未完成 | 10% | IMU 数据已暴露,缺触发逻辑 |
|
| 756 |
+
| Phase 20 | ❌ 未完成 | 0% | 完全未实现 |
|
| 757 |
+
|
| 758 |
+
**总体完成度**: **Phase 1-12: 100%** | **Phase 13-20: ~35%**
|
|
|
|
|
|
|
| 759 |
|
| 760 |
---
|
| 761 |
|
index.html
CHANGED
|
@@ -80,10 +80,6 @@
|
|
| 80 |
<h3>😊 Facial Expressions</h3>
|
| 81 |
<p>Automatic emotional feedback with head movements and antenna animations while listening and responding.</p>
|
| 82 |
</div>
|
| 83 |
-
<div class="info-box">
|
| 84 |
-
<h3>✋ Gesture Detection</h3>
|
| 85 |
-
<p>MediaPipe-based hand gesture recognition with 11 gestures for non-verbal interaction.</p>
|
| 86 |
-
</div>
|
| 87 |
<div class="info-box">
|
| 88 |
<h3>📹 Camera Streaming</h3>
|
| 89 |
<p>MJPEG video stream available in Home Assistant as a Generic Camera for real-time monitoring.</p>
|
|
@@ -101,15 +97,6 @@
|
|
| 101 |
<h2>Changelog</h2>
|
| 102 |
<div class="how-to-use changelog-container">
|
| 103 |
<div class="changelog-scroll">
|
| 104 |
-
<div class="changelog-entry">
|
| 105 |
-
<span class="version">v0.6.0</span>
|
| 106 |
-
<span class="date">2026-01-07</span>
|
| 107 |
-
<ul>
|
| 108 |
-
<li>NEW: MediaPipe gesture detection with 11 gestures</li>
|
| 109 |
-
<li>Gestures: thumbs_up, thumbs_down, open_palm, fist, peace, ok, pointing_up, rock, call, three, four</li>
|
| 110 |
-
<li>Home Assistant entities for gesture detection (detected_gesture, gesture_detection_enabled)</li>
|
| 111 |
-
</ul>
|
| 112 |
-
</div>
|
| 113 |
<div class="changelog-entry">
|
| 114 |
<span class="version">v0.5.0</span>
|
| 115 |
<span class="date">2026-01-07</span>
|
|
@@ -122,8 +109,6 @@
|
|
| 122 |
<li>Noise suppression default reduced to 15%</li>
|
| 123 |
<li>Tap-to-wake default threshold reduced to 0.5g (most sensitive)</li>
|
| 124 |
<li>Fix: Replace non-existent clear_output_buffer with stop_playing</li>
|
| 125 |
-
<li>NEW: Gesture detection (11 gestures) via MediaPipe</li>
|
| 126 |
-
<li>NEW: Home Assistant entities for gesture detection</li>
|
| 127 |
</ul>
|
| 128 |
</div>
|
| 129 |
<div class="changelog-entry">
|
|
|
|
| 80 |
<h3>😊 Facial Expressions</h3>
|
| 81 |
<p>Automatic emotional feedback with head movements and antenna animations while listening and responding.</p>
|
| 82 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
<div class="info-box">
|
| 84 |
<h3>📹 Camera Streaming</h3>
|
| 85 |
<p>MJPEG video stream available in Home Assistant as a Generic Camera for real-time monitoring.</p>
|
|
|
|
| 97 |
<h2>Changelog</h2>
|
| 98 |
<div class="how-to-use changelog-container">
|
| 99 |
<div class="changelog-scroll">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
<div class="changelog-entry">
|
| 101 |
<span class="version">v0.5.0</span>
|
| 102 |
<span class="date">2026-01-07</span>
|
|
|
|
| 109 |
<li>Noise suppression default reduced to 15%</li>
|
| 110 |
<li>Tap-to-wake default threshold reduced to 0.5g (most sensitive)</li>
|
| 111 |
<li>Fix: Replace non-existent clear_output_buffer with stop_playing</li>
|
|
|
|
|
|
|
| 112 |
</ul>
|
| 113 |
</div>
|
| 114 |
<div class="changelog-entry">
|
pyproject.toml
CHANGED
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
| 4 |
|
| 5 |
[project]
|
| 6 |
name = "reachy_mini_ha_voice"
|
| 7 |
-
version = "0.
|
| 8 |
description = "Home Assistant Voice Assistant for Reachy Mini"
|
| 9 |
readme = "README.md"
|
| 10 |
requires-python = ">=3.10"
|
|
@@ -40,16 +40,7 @@ dependencies = [
|
|
| 40 |
|
| 41 |
# Sendspin synchronized audio (optional, for multi-room playback)
|
| 42 |
"aiosendspin>=2.0.1",
|
| 43 |
-
|
| 44 |
-
# Gesture detection dependencies (mediapipe installed separately for ARM64)
|
| 45 |
-
"flatbuffers>=2.0",
|
| 46 |
-
"absl-py",
|
| 47 |
-
"attrs>=19.1.0",
|
| 48 |
]
|
| 49 |
-
|
| 50 |
-
[project.optional-dependencies]
|
| 51 |
-
# For x86_64 systems, install with: pip install reachy_mini_ha_voice[gesture]
|
| 52 |
-
gesture = ["mediapipe>=0.10.31"]
|
| 53 |
keywords = ["reachy-mini-app", "reachy-mini", "home-assistant", "voice-assistant"]
|
| 54 |
|
| 55 |
[project.entry-points."reachy_mini_apps"]
|
|
|
|
| 4 |
|
| 5 |
[project]
|
| 6 |
name = "reachy_mini_ha_voice"
|
| 7 |
+
version = "0.5.0"
|
| 8 |
description = "Home Assistant Voice Assistant for Reachy Mini"
|
| 9 |
readme = "README.md"
|
| 10 |
requires-python = ">=3.10"
|
|
|
|
| 40 |
|
| 41 |
# Sendspin synchronized audio (optional, for multi-room playback)
|
| 42 |
"aiosendspin>=2.0.1",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
keywords = ["reachy-mini-app", "reachy-mini", "home-assistant", "voice-assistant"]
|
| 45 |
|
| 46 |
[project.entry-points."reachy_mini_apps"]
|
reachy_mini_ha_voice/__init__.py
CHANGED
|
@@ -11,7 +11,7 @@ Key features:
|
|
| 11 |
- Reachy Mini motion control integration
|
| 12 |
"""
|
| 13 |
|
| 14 |
-
__version__ = "0.
|
| 15 |
__author__ = "Desmond Dong"
|
| 16 |
|
| 17 |
# Don't import main module here to avoid runpy warning
|
|
|
|
| 11 |
- Reachy Mini motion control integration
|
| 12 |
"""
|
| 13 |
|
| 14 |
+
__version__ = "0.5.0"
|
| 15 |
__author__ = "Desmond Dong"
|
| 16 |
|
| 17 |
# Don't import main module here to avoid runpy warning
|
reachy_mini_ha_voice/camera_server.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
"""
|
| 2 |
-
MJPEG Camera Server for Reachy Mini with Face Tracking
|
| 3 |
|
| 4 |
This module provides an HTTP server that streams camera frames from Reachy Mini
|
| 5 |
as MJPEG, which can be integrated with Home Assistant via Generic Camera.
|
| 6 |
-
Also provides face tracking for head movement control
|
| 7 |
|
| 8 |
Reference: reachy_mini_conversation_app/src/reachy_mini_conversation_app/camera_worker.py
|
| 9 |
"""
|
|
@@ -12,7 +12,7 @@ import asyncio
|
|
| 12 |
import logging
|
| 13 |
import threading
|
| 14 |
import time
|
| 15 |
-
from typing import Optional, Tuple, List,
|
| 16 |
|
| 17 |
import cv2
|
| 18 |
import numpy as np
|
|
@@ -36,15 +36,14 @@ MJPEG_BOUNDARY = "frame"
|
|
| 36 |
|
| 37 |
class MJPEGCameraServer:
|
| 38 |
"""
|
| 39 |
-
MJPEG streaming server for Reachy Mini camera with face tracking
|
| 40 |
|
| 41 |
Provides HTTP endpoints:
|
| 42 |
- /stream - MJPEG video stream
|
| 43 |
- /snapshot - Single JPEG image
|
| 44 |
- / - Simple status page
|
| 45 |
|
| 46 |
-
Also provides face tracking offsets for head movement control
|
| 47 |
-
and gesture detection for interaction (thumbs up, open palm/stop).
|
| 48 |
"""
|
| 49 |
|
| 50 |
def __init__(
|
|
@@ -55,7 +54,6 @@ class MJPEGCameraServer:
|
|
| 55 |
fps: int = 15, # 15fps for smooth face tracking
|
| 56 |
quality: int = 80,
|
| 57 |
enable_face_tracking: bool = True,
|
| 58 |
-
enable_gesture_detection: bool = True,
|
| 59 |
):
|
| 60 |
"""
|
| 61 |
Initialize the MJPEG camera server.
|
|
@@ -67,7 +65,6 @@ class MJPEGCameraServer:
|
|
| 67 |
fps: Target frames per second for the stream
|
| 68 |
quality: JPEG quality (1-100)
|
| 69 |
enable_face_tracking: Enable face tracking for head movement
|
| 70 |
-
enable_gesture_detection: Enable gesture detection (thumbs up, stop)
|
| 71 |
"""
|
| 72 |
self.reachy_mini = reachy_mini
|
| 73 |
self.host = host
|
|
@@ -75,7 +72,6 @@ class MJPEGCameraServer:
|
|
| 75 |
self.fps = fps
|
| 76 |
self.quality = quality
|
| 77 |
self.enable_face_tracking = enable_face_tracking
|
| 78 |
-
self.enable_gesture_detection = enable_gesture_detection
|
| 79 |
|
| 80 |
self._server: Optional[asyncio.Server] = None
|
| 81 |
self._running = False
|
|
@@ -102,10 +98,6 @@ class MJPEGCameraServer:
|
|
| 102 |
|
| 103 |
# Offset scaling (same as conversation_app)
|
| 104 |
self._offset_scale = 0.6
|
| 105 |
-
|
| 106 |
-
# Gesture detection state
|
| 107 |
-
self._gesture_detector = None
|
| 108 |
-
self._gesture_detection_enabled = True
|
| 109 |
|
| 110 |
async def start(self) -> None:
|
| 111 |
"""Start the MJPEG camera server."""
|
|
@@ -129,25 +121,6 @@ class MJPEGCameraServer:
|
|
| 129 |
self._head_tracker = None
|
| 130 |
else:
|
| 131 |
_LOGGER.info("Face tracking disabled by configuration")
|
| 132 |
-
|
| 133 |
-
# Initialize gesture detector if enabled
|
| 134 |
-
if self.enable_gesture_detection:
|
| 135 |
-
try:
|
| 136 |
-
from .gesture_detector import GestureDetector
|
| 137 |
-
self._gesture_detector = GestureDetector()
|
| 138 |
-
if self._gesture_detector.is_available:
|
| 139 |
-
_LOGGER.info("Gesture detection enabled with MediaPipe Hands")
|
| 140 |
-
else:
|
| 141 |
-
_LOGGER.warning("Gesture detection not available (MediaPipe not installed)")
|
| 142 |
-
self._gesture_detector = None
|
| 143 |
-
except ImportError as e:
|
| 144 |
-
_LOGGER.warning("Failed to import gesture detector: %s", e)
|
| 145 |
-
self._gesture_detector = None
|
| 146 |
-
except Exception as e:
|
| 147 |
-
_LOGGER.warning("Failed to initialize gesture detector: %s", e)
|
| 148 |
-
self._gesture_detector = None
|
| 149 |
-
else:
|
| 150 |
-
_LOGGER.info("Gesture detection disabled by configuration")
|
| 151 |
|
| 152 |
# Start frame capture thread
|
| 153 |
self._capture_thread = threading.Thread(
|
|
@@ -184,9 +157,8 @@ class MJPEGCameraServer:
|
|
| 184 |
_LOGGER.info("MJPEG Camera server stopped")
|
| 185 |
|
| 186 |
def _capture_frames(self) -> None:
|
| 187 |
-
"""Background thread to capture frames from Reachy Mini and do face tracking
|
| 188 |
-
_LOGGER.info("Starting camera capture thread (face_tracking=%s
|
| 189 |
-
self._face_tracking_enabled, self._gesture_detection_enabled)
|
| 190 |
|
| 191 |
frame_count = 0
|
| 192 |
last_log_time = time.time()
|
|
@@ -215,16 +187,11 @@ class MJPEGCameraServer:
|
|
| 215 |
# Handle smooth interpolation when face lost
|
| 216 |
self._process_face_lost_interpolation(current_time)
|
| 217 |
|
| 218 |
-
# Do gesture detection if enabled (every other frame to save CPU)
|
| 219 |
-
if self._gesture_detection_enabled and self._gesture_detector is not None:
|
| 220 |
-
if frame_count % 2 == 0: # Process every other frame
|
| 221 |
-
self._gesture_detector.process_frame(frame)
|
| 222 |
-
|
| 223 |
# Log stats every 10 seconds
|
| 224 |
if current_time - last_log_time >= 10.0:
|
| 225 |
fps = frame_count / (current_time - last_log_time)
|
| 226 |
-
_LOGGER.debug("Camera: %.1f fps, face_tracking=%s,
|
| 227 |
-
fps, self._face_tracking_enabled, self.
|
| 228 |
frame_count = 0
|
| 229 |
last_log_time = current_time
|
| 230 |
|
|
@@ -412,55 +379,6 @@ class MJPEGCameraServer:
|
|
| 412 |
self._interpolation_start_time = None
|
| 413 |
_LOGGER.info("Face tracking %s", "enabled" if enabled else "disabled")
|
| 414 |
|
| 415 |
-
# =========================================================================
|
| 416 |
-
# Public API for gesture detection
|
| 417 |
-
# =========================================================================
|
| 418 |
-
|
| 419 |
-
def get_current_gesture(self) -> str:
|
| 420 |
-
"""Get current detected gesture as string.
|
| 421 |
-
|
| 422 |
-
Returns:
|
| 423 |
-
Gesture name: "none", "thumbs_up", "open_palm"
|
| 424 |
-
"""
|
| 425 |
-
if self._gesture_detector is None:
|
| 426 |
-
return "none"
|
| 427 |
-
return self._gesture_detector.current_gesture.value
|
| 428 |
-
|
| 429 |
-
def set_gesture_detection_enabled(self, enabled: bool) -> None:
|
| 430 |
-
"""Enable or disable gesture detection."""
|
| 431 |
-
self._gesture_detection_enabled = enabled
|
| 432 |
-
_LOGGER.info("Gesture detection %s", "enabled" if enabled else "disabled")
|
| 433 |
-
|
| 434 |
-
def set_gesture_callbacks(
|
| 435 |
-
self,
|
| 436 |
-
on_thumbs_up: Optional[Callable[[], None]] = None,
|
| 437 |
-
on_thumbs_down: Optional[Callable[[], None]] = None,
|
| 438 |
-
on_open_palm: Optional[Callable[[], None]] = None,
|
| 439 |
-
on_fist: Optional[Callable[[], None]] = None,
|
| 440 |
-
on_peace: Optional[Callable[[], None]] = None,
|
| 441 |
-
on_ok: Optional[Callable[[], None]] = None,
|
| 442 |
-
on_pointing_up: Optional[Callable[[], None]] = None,
|
| 443 |
-
on_rock: Optional[Callable[[], None]] = None,
|
| 444 |
-
on_call: Optional[Callable[[], None]] = None,
|
| 445 |
-
on_three: Optional[Callable[[], None]] = None,
|
| 446 |
-
on_four: Optional[Callable[[], None]] = None,
|
| 447 |
-
) -> None:
|
| 448 |
-
"""Set gesture detection callbacks."""
|
| 449 |
-
if self._gesture_detector is not None:
|
| 450 |
-
self._gesture_detector.set_callbacks(
|
| 451 |
-
on_thumbs_up=on_thumbs_up,
|
| 452 |
-
on_thumbs_down=on_thumbs_down,
|
| 453 |
-
on_open_palm=on_open_palm,
|
| 454 |
-
on_fist=on_fist,
|
| 455 |
-
on_peace=on_peace,
|
| 456 |
-
on_ok=on_ok,
|
| 457 |
-
on_pointing_up=on_pointing_up,
|
| 458 |
-
on_rock=on_rock,
|
| 459 |
-
on_call=on_call,
|
| 460 |
-
on_three=on_three,
|
| 461 |
-
on_four=on_four,
|
| 462 |
-
)
|
| 463 |
-
|
| 464 |
def _get_camera_frame(self) -> Optional[np.ndarray]:
|
| 465 |
"""Get a frame from Reachy Mini's camera."""
|
| 466 |
if self.reachy_mini is None:
|
|
|
|
| 1 |
"""
|
| 2 |
+
MJPEG Camera Server for Reachy Mini with Face Tracking.
|
| 3 |
|
| 4 |
This module provides an HTTP server that streams camera frames from Reachy Mini
|
| 5 |
as MJPEG, which can be integrated with Home Assistant via Generic Camera.
|
| 6 |
+
Also provides face tracking for head movement control.
|
| 7 |
|
| 8 |
Reference: reachy_mini_conversation_app/src/reachy_mini_conversation_app/camera_worker.py
|
| 9 |
"""
|
|
|
|
| 12 |
import logging
|
| 13 |
import threading
|
| 14 |
import time
|
| 15 |
+
from typing import Optional, Tuple, List, TYPE_CHECKING
|
| 16 |
|
| 17 |
import cv2
|
| 18 |
import numpy as np
|
|
|
|
| 36 |
|
| 37 |
class MJPEGCameraServer:
|
| 38 |
"""
|
| 39 |
+
MJPEG streaming server for Reachy Mini camera with face tracking.
|
| 40 |
|
| 41 |
Provides HTTP endpoints:
|
| 42 |
- /stream - MJPEG video stream
|
| 43 |
- /snapshot - Single JPEG image
|
| 44 |
- / - Simple status page
|
| 45 |
|
| 46 |
+
Also provides face tracking offsets for head movement control.
|
|
|
|
| 47 |
"""
|
| 48 |
|
| 49 |
def __init__(
|
|
|
|
| 54 |
fps: int = 15, # 15fps for smooth face tracking
|
| 55 |
quality: int = 80,
|
| 56 |
enable_face_tracking: bool = True,
|
|
|
|
| 57 |
):
|
| 58 |
"""
|
| 59 |
Initialize the MJPEG camera server.
|
|
|
|
| 65 |
fps: Target frames per second for the stream
|
| 66 |
quality: JPEG quality (1-100)
|
| 67 |
enable_face_tracking: Enable face tracking for head movement
|
|
|
|
| 68 |
"""
|
| 69 |
self.reachy_mini = reachy_mini
|
| 70 |
self.host = host
|
|
|
|
| 72 |
self.fps = fps
|
| 73 |
self.quality = quality
|
| 74 |
self.enable_face_tracking = enable_face_tracking
|
|
|
|
| 75 |
|
| 76 |
self._server: Optional[asyncio.Server] = None
|
| 77 |
self._running = False
|
|
|
|
| 98 |
|
| 99 |
# Offset scaling (same as conversation_app)
|
| 100 |
self._offset_scale = 0.6
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
async def start(self) -> None:
|
| 103 |
"""Start the MJPEG camera server."""
|
|
|
|
| 121 |
self._head_tracker = None
|
| 122 |
else:
|
| 123 |
_LOGGER.info("Face tracking disabled by configuration")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
|
| 125 |
# Start frame capture thread
|
| 126 |
self._capture_thread = threading.Thread(
|
|
|
|
| 157 |
_LOGGER.info("MJPEG Camera server stopped")
|
| 158 |
|
| 159 |
def _capture_frames(self) -> None:
|
| 160 |
+
"""Background thread to capture frames from Reachy Mini and do face tracking."""
|
| 161 |
+
_LOGGER.info("Starting camera capture thread (face_tracking=%s)", self._face_tracking_enabled)
|
|
|
|
| 162 |
|
| 163 |
frame_count = 0
|
| 164 |
last_log_time = time.time()
|
|
|
|
| 187 |
# Handle smooth interpolation when face lost
|
| 188 |
self._process_face_lost_interpolation(current_time)
|
| 189 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
# Log stats every 10 seconds
|
| 191 |
if current_time - last_log_time >= 10.0:
|
| 192 |
fps = frame_count / (current_time - last_log_time)
|
| 193 |
+
_LOGGER.debug("Camera: %.1f fps, face_tracking=%s, head_tracker=%s",
|
| 194 |
+
fps, self._face_tracking_enabled, self._head_tracker is not None)
|
| 195 |
frame_count = 0
|
| 196 |
last_log_time = current_time
|
| 197 |
|
|
|
|
| 379 |
self._interpolation_start_time = None
|
| 380 |
_LOGGER.info("Face tracking %s", "enabled" if enabled else "disabled")
|
| 381 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 382 |
def _get_camera_frame(self) -> Optional[np.ndarray]:
|
| 383 |
"""Get a frame from Reachy Mini's camera."""
|
| 384 |
if self.reachy_mini is None:
|
reachy_mini_ha_voice/entity_registry.py
CHANGED
|
@@ -83,9 +83,6 @@ ENTITY_KEYS: Dict[str, int] = {
|
|
| 83 |
# Phase 13: Sendspin - auto-enabled via mDNS, no user entities needed
|
| 84 |
# Phase 20: Tap detection
|
| 85 |
"tap_sensitivity": 1400,
|
| 86 |
-
# Phase 21: Gesture detection
|
| 87 |
-
"detected_gesture": 1500,
|
| 88 |
-
"gesture_detection_enabled": 1501,
|
| 89 |
}
|
| 90 |
|
| 91 |
|
|
@@ -157,7 +154,6 @@ class EntityRegistry:
|
|
| 157 |
# Phase 13 (Sendspin) - auto-enabled via mDNS discovery, no user entities
|
| 158 |
# Phase 14 (head_joints, passive_joints) removed - not needed
|
| 159 |
self._setup_phase20_entities(entities)
|
| 160 |
-
self._setup_phase21_entities(entities)
|
| 161 |
|
| 162 |
_LOGGER.info("All entities registered: %d total", len(entities))
|
| 163 |
|
|
@@ -773,48 +769,6 @@ class EntityRegistry:
|
|
| 773 |
|
| 774 |
_LOGGER.debug("Phase 20 entities registered: tap_sensitivity")
|
| 775 |
|
| 776 |
-
def _setup_phase21_entities(self, entities: List) -> None:
|
| 777 |
-
"""Setup Phase 21 entities: Gesture detection."""
|
| 778 |
-
if self.camera_server is None:
|
| 779 |
-
_LOGGER.debug("Phase 21 skipped: no camera server")
|
| 780 |
-
return
|
| 781 |
-
|
| 782 |
-
def get_detected_gesture() -> str:
|
| 783 |
-
"""Get current detected gesture."""
|
| 784 |
-
return self.camera_server.get_current_gesture()
|
| 785 |
-
|
| 786 |
-
def get_gesture_detection_enabled() -> bool:
|
| 787 |
-
"""Get gesture detection enabled state."""
|
| 788 |
-
return self.camera_server._gesture_detection_enabled
|
| 789 |
-
|
| 790 |
-
def set_gesture_detection_enabled(value: bool) -> None:
|
| 791 |
-
"""Set gesture detection enabled state."""
|
| 792 |
-
self.camera_server.set_gesture_detection_enabled(value)
|
| 793 |
-
|
| 794 |
-
# Text sensor for detected gesture
|
| 795 |
-
entities.append(TextSensorEntity(
|
| 796 |
-
server=self.server,
|
| 797 |
-
key=get_entity_key("detected_gesture"),
|
| 798 |
-
name="Detected Gesture",
|
| 799 |
-
object_id="detected_gesture",
|
| 800 |
-
icon="mdi:hand-wave",
|
| 801 |
-
value_getter=get_detected_gesture,
|
| 802 |
-
))
|
| 803 |
-
|
| 804 |
-
# Switch to enable/disable gesture detection
|
| 805 |
-
entities.append(SwitchEntity(
|
| 806 |
-
server=self.server,
|
| 807 |
-
key=get_entity_key("gesture_detection_enabled"),
|
| 808 |
-
name="Gesture Detection",
|
| 809 |
-
object_id="gesture_detection_enabled",
|
| 810 |
-
icon="mdi:gesture",
|
| 811 |
-
entity_category=1, # config
|
| 812 |
-
value_getter=get_gesture_detection_enabled,
|
| 813 |
-
value_setter=set_gesture_detection_enabled,
|
| 814 |
-
))
|
| 815 |
-
|
| 816 |
-
_LOGGER.debug("Phase 21 entities registered: detected_gesture, gesture_detection_enabled")
|
| 817 |
-
|
| 818 |
def find_entity_references(self, entities: List) -> None:
|
| 819 |
"""Find and store references to special entities from existing list.
|
| 820 |
|
|
|
|
| 83 |
# Phase 13: Sendspin - auto-enabled via mDNS, no user entities needed
|
| 84 |
# Phase 20: Tap detection
|
| 85 |
"tap_sensitivity": 1400,
|
|
|
|
|
|
|
|
|
|
| 86 |
}
|
| 87 |
|
| 88 |
|
|
|
|
| 154 |
# Phase 13 (Sendspin) - auto-enabled via mDNS discovery, no user entities
|
| 155 |
# Phase 14 (head_joints, passive_joints) removed - not needed
|
| 156 |
self._setup_phase20_entities(entities)
|
|
|
|
| 157 |
|
| 158 |
_LOGGER.info("All entities registered: %d total", len(entities))
|
| 159 |
|
|
|
|
| 769 |
|
| 770 |
_LOGGER.debug("Phase 20 entities registered: tap_sensitivity")
|
| 771 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 772 |
def find_entity_references(self, entities: List) -> None:
|
| 773 |
"""Find and store references to special entities from existing list.
|
| 774 |
|
reachy_mini_ha_voice/head_tracker.py
CHANGED
|
@@ -1,12 +1,12 @@
|
|
| 1 |
"""Lightweight head tracker using YOLO for face detection.
|
| 2 |
|
| 3 |
-
|
|
|
|
|
|
|
| 4 |
"""
|
| 5 |
|
| 6 |
from __future__ import annotations
|
| 7 |
import logging
|
| 8 |
-
import time
|
| 9 |
-
from pathlib import Path
|
| 10 |
from typing import Tuple, Optional
|
| 11 |
|
| 12 |
import numpy as np
|
|
@@ -15,38 +15,43 @@ from numpy.typing import NDArray
|
|
| 15 |
|
| 16 |
logger = logging.getLogger(__name__)
|
| 17 |
|
| 18 |
-
# Model config
|
| 19 |
-
_MODEL_REPO = "AdamCodd/YOLOv11n-face-detection"
|
| 20 |
-
_MODEL_FILENAME = "model.pt"
|
| 21 |
-
_MAX_RETRIES = 3
|
| 22 |
-
_RETRY_DELAY = 5 # seconds
|
| 23 |
-
|
| 24 |
|
| 25 |
class HeadTracker:
|
| 26 |
-
"""Lightweight head tracker using YOLO for face detection.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
def __init__(
|
| 29 |
self,
|
|
|
|
|
|
|
| 30 |
confidence_threshold: float = 0.3,
|
| 31 |
device: str = "cpu",
|
| 32 |
) -> None:
|
| 33 |
"""Initialize YOLO-based head tracker.
|
| 34 |
|
| 35 |
Args:
|
|
|
|
|
|
|
| 36 |
confidence_threshold: Minimum confidence for face detection
|
| 37 |
device: Device to run inference on ('cpu' or 'cuda')
|
| 38 |
"""
|
| 39 |
self.confidence_threshold = confidence_threshold
|
| 40 |
self.model = None
|
|
|
|
|
|
|
| 41 |
self._device = device
|
| 42 |
self._detections_class = None
|
| 43 |
self._model_load_attempted = False
|
| 44 |
self._model_load_error: Optional[str] = None
|
| 45 |
|
|
|
|
| 46 |
self._load_model()
|
| 47 |
|
| 48 |
def _load_model(self) -> None:
|
| 49 |
-
"""Load YOLO model
|
| 50 |
if self._model_load_attempted:
|
| 51 |
return
|
| 52 |
|
|
@@ -59,34 +64,19 @@ class HeadTracker:
|
|
| 59 |
|
| 60 |
self._detections_class = Detections
|
| 61 |
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
for attempt in range(_MAX_RETRIES):
|
| 67 |
-
try:
|
| 68 |
-
model_path = hf_hub_download(
|
| 69 |
-
repo_id=_MODEL_REPO,
|
| 70 |
-
filename=_MODEL_FILENAME,
|
| 71 |
-
)
|
| 72 |
-
break
|
| 73 |
-
except Exception as e:
|
| 74 |
-
last_error = e
|
| 75 |
-
if attempt < _MAX_RETRIES - 1:
|
| 76 |
-
logger.warning(
|
| 77 |
-
"Model download failed (attempt %d/%d): %s. Retrying in %ds...",
|
| 78 |
-
attempt + 1, _MAX_RETRIES, e, _RETRY_DELAY
|
| 79 |
-
)
|
| 80 |
-
time.sleep(_RETRY_DELAY)
|
| 81 |
-
|
| 82 |
-
if model_path is None:
|
| 83 |
-
raise last_error
|
| 84 |
-
|
| 85 |
self.model = YOLO(model_path).to(self._device)
|
| 86 |
-
logger.info("YOLO face detection model loaded")
|
| 87 |
except ImportError as e:
|
| 88 |
self._model_load_error = f"Missing dependencies: {e}"
|
| 89 |
-
logger.warning(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
self.model = None
|
| 91 |
except Exception as e:
|
| 92 |
self._model_load_error = str(e)
|
|
|
|
| 1 |
"""Lightweight head tracker using YOLO for face detection.
|
| 2 |
|
| 3 |
+
Ported from reachy_mini_conversation_app for voice assistant integration.
|
| 4 |
+
Model is loaded at initialization time (not lazy) to ensure face tracking
|
| 5 |
+
is ready immediately when the camera server starts.
|
| 6 |
"""
|
| 7 |
|
| 8 |
from __future__ import annotations
|
| 9 |
import logging
|
|
|
|
|
|
|
| 10 |
from typing import Tuple, Optional
|
| 11 |
|
| 12 |
import numpy as np
|
|
|
|
| 15 |
|
| 16 |
logger = logging.getLogger(__name__)
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
class HeadTracker:
|
| 20 |
+
"""Lightweight head tracker using YOLO for face detection.
|
| 21 |
+
|
| 22 |
+
Model is loaded at initialization time to ensure face tracking
|
| 23 |
+
is ready immediately (matching conversation_app behavior).
|
| 24 |
+
"""
|
| 25 |
|
| 26 |
def __init__(
|
| 27 |
self,
|
| 28 |
+
model_repo: str = "AdamCodd/YOLOv11n-face-detection",
|
| 29 |
+
model_filename: str = "model.pt",
|
| 30 |
confidence_threshold: float = 0.3,
|
| 31 |
device: str = "cpu",
|
| 32 |
) -> None:
|
| 33 |
"""Initialize YOLO-based head tracker.
|
| 34 |
|
| 35 |
Args:
|
| 36 |
+
model_repo: HuggingFace model repository
|
| 37 |
+
model_filename: Model file name
|
| 38 |
confidence_threshold: Minimum confidence for face detection
|
| 39 |
device: Device to run inference on ('cpu' or 'cuda')
|
| 40 |
"""
|
| 41 |
self.confidence_threshold = confidence_threshold
|
| 42 |
self.model = None
|
| 43 |
+
self._model_repo = model_repo
|
| 44 |
+
self._model_filename = model_filename
|
| 45 |
self._device = device
|
| 46 |
self._detections_class = None
|
| 47 |
self._model_load_attempted = False
|
| 48 |
self._model_load_error: Optional[str] = None
|
| 49 |
|
| 50 |
+
# Load model immediately at init (not lazy)
|
| 51 |
self._load_model()
|
| 52 |
|
| 53 |
def _load_model(self) -> None:
|
| 54 |
+
"""Load YOLO model at initialization time."""
|
| 55 |
if self._model_load_attempted:
|
| 56 |
return
|
| 57 |
|
|
|
|
| 64 |
|
| 65 |
self._detections_class = Detections
|
| 66 |
|
| 67 |
+
model_path = hf_hub_download(
|
| 68 |
+
repo_id=self._model_repo,
|
| 69 |
+
filename=self._model_filename
|
| 70 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
self.model = YOLO(model_path).to(self._device)
|
| 72 |
+
logger.info("YOLO face detection model loaded from %s", self._model_repo)
|
| 73 |
except ImportError as e:
|
| 74 |
self._model_load_error = f"Missing dependencies: {e}"
|
| 75 |
+
logger.warning(
|
| 76 |
+
"Face tracking disabled - missing dependencies: %s. "
|
| 77 |
+
"Install with: pip install ultralytics supervision huggingface_hub",
|
| 78 |
+
e
|
| 79 |
+
)
|
| 80 |
self.model = None
|
| 81 |
except Exception as e:
|
| 82 |
self._model_load_error = str(e)
|