Initial commit

1d461bc7 · 胡志宏 · 1d461bc7 · 1d461bc7 · 1d461bc7 · 1d461bc7
Commit 1d461bc7 authored Apr 08, 2026 by 胡志宏 🤗
7 changed files
--- a/.gitignore
+++ b/.gitignore
+.venv/
+__pycache__/
+*.pyc
+app.log
+.idea/
+.vscode/
--- a/app.py
+++ b/app.py
--- a/index.html
+++ b/index.html
--- a/requirements.txt
+++ b/requirements.txt
+fastapi>=0.104.0
+uvicorn>=0.24.0
+websockets>=12.0
+websocket-client>=1.6.0
+openai>=1.0.0
+requests>=2.31.0
--- a/start.bat
+++ b/start.bat
+@echo off
+rem Launcher: runs app.py with the Python interpreter from the local .venv folder.
+rem %~dp0 expands to the drive and directory of this script; /d lets cd change
+rem the drive as well, so the relative paths below resolve from the script folder.
+cd /d "%~dp0"
+.venv\Scripts\python app.py
+rem Wait for a key press so the console window stays open after app.py exits.
+pause
--- a/tencent_speech.py
+++ b/tencent_speech.py
+"""
+Tencent Cloud TTS (Streaming Text-to-Speech v2) and ASR (Real-time Speech Recognition)
+"""
+import base64
+import hashlib
+import hmac
+import json
+import logging
+import threading
+import time
+import uuid
+from urllib.parse import quote
+
+import websocket  # websocket-client (sync)
+
+logger = logging.getLogger(__name__)
+
+
+def _generate_signature(params: dict, secret_key: str, host: str, path: str, prefix: str = "GET") -> str:
+    """Return the URL-quoted, base64-encoded HMAC-SHA1 signature of a request.
+
+    The signed text is ``prefix + host + path + "?" + query`` where ``query``
+    is every ``key=value`` pair of *params*, ordered by key and joined
+    with ``&``. *secret_key* is the HMAC key.
+    """
+    # Dict keys are unique, so sorting the items orders purely by key.
+    pairs = [f"{name}={value}" for name, value in sorted(params.items())]
+    payload = prefix + host + path + "?" + "&".join(pairs)
+    mac = hmac.new(secret_key.encode(), payload.encode(), hashlib.sha1)
+    encoded = base64.b64encode(mac.digest()).decode()
+    return quote(encoded)
+
+
+# ================================================================== #
+#  Tencent TTS - Streaming Text-to-Speech v2
+# ================================================================== #
+
+class TencentStreamingTTS:
+    """Tencent Cloud Streaming TTS using WebSocket v2.
+
+    Supports streaming text input (sentence by sentence from LLM).
+    Audio receiving runs in a background thread for non-blocking sends.
+
+    Call order: ``connect(on_audio)``, any number of ``send_text(...)``
+    calls, then ``complete()``.
+    """
+
+    def __init__(
+        self,
+        app_id: str,
+        secret_id: str,
+        secret_key: str,
+        voice_type: str = "502004",
+        speed: float = 0,
+        volume: int = 0,
+        sample_rate: int = 16000,
+        codec: str = "pcm",
+    ):
+        """Store credentials and synthesis settings; no network I/O happens here.
+
+        Args:
+            app_id: sent as the ``AppId`` query parameter.
+            secret_id: sent as the ``SecretId`` query parameter.
+            secret_key: HMAC key used to sign the connection URL.
+            voice_type: sent as the ``VoiceType`` query parameter.
+            speed: sent as the ``Speed`` query parameter.
+            volume: sent as the ``Volume`` query parameter.
+            sample_rate: sent as the ``SampleRate`` query parameter.
+            codec: sent as the ``Codec`` query parameter.
+        """
+        self.app_id = app_id
+        self.secret_id = secret_id
+        self.secret_key = secret_key
+        self.voice_type = voice_type
+        self.speed = speed
+        self.volume = volume
+        self.sample_rate = sample_rate
+        self.codec = codec
+        self._ws = None
+        self._session_id = None
+        self._on_audio = None
+        self._recv_thread = None
+        # Set by the receiver thread when it stops; complete() waits on it.
+        self._done_event = threading.Event()
+
+    def connect(self, on_audio):
+        """Connect to TTS WebSocket, start background audio receiver.
+
+        Args:
+            on_audio: callback(pcm_bytes) called for each audio chunk
+
+        Raises:
+            RuntimeError: if the handshake reply carries a non-zero ``code``.
+                The socket is closed before the error is raised.
+        """
+        self._on_audio = on_audio
+        self._session_id = str(uuid.uuid4())
+        self._done_event.clear()
+        timestamp = int(time.time())
+
+        params = {
+            "Action": "TextToStreamAudioWSv2",
+            "AppId": str(self.app_id),
+            "Codec": self.codec,
+            # Expiry is placed 86400 s (24 h) after the request timestamp.
+            "Expired": str(timestamp + 86400),
+            "SampleRate": str(self.sample_rate),
+            "SecretId": self.secret_id,
+            "SessionId": self._session_id,
+            "Speed": str(self.speed),
+            "Timestamp": str(timestamp),
+            "VoiceType": str(self.voice_type),
+            "Volume": str(self.volume),
+        }
+
+        sig = _generate_signature(
+            params, self.secret_key,
+            "tts.cloud.tencent.com", "/stream_wsv2",
+        )
+
+        # The query must list the parameters in the same sorted order that was signed.
+        query = "&".join(f"{k}={params[k]}" for k in sorted(params.keys()))
+        url = f"wss://tts.cloud.tencent.com/stream_wsv2?{query}&Signature={sig}"
+
+        ws = websocket.create_connection(url, timeout=10)
+        try:
+            resp = json.loads(ws.recv())
+        except Exception:
+            # Do not leak the socket when the handshake reply is missing or malformed.
+            ws.close()
+            raise
+        if resp.get("code") != 0:
+            ws.close()
+            raise RuntimeError(f"TTS connect failed: {resp}")
+        # Publish the socket only once the handshake succeeded, so send_text()
+        # cannot write to a connection the server rejected.
+        self._ws = ws
+        logger.info("Tencent TTS connected (session=%s)", self._session_id)
+
+        # Start background thread to receive audio
+        self._recv_thread = threading.Thread(target=self._recv_loop, daemon=True)
+        self._recv_thread.start()
+
+    def _recv_loop(self):
+        """Background thread: continuously receive audio and metadata.
+
+        Binary frames go to the ``on_audio`` callback. Text frames are parsed
+        as JSON; a non-zero ``code`` or ``final == 1`` ends the loop. The
+        done event is always set on exit so complete() never waits on a
+        receiver that has already died.
+        """
+        ws = self._ws
+        try:
+            while True:
+                try:
+                    data = ws.recv()
+                except websocket.WebSocketTimeoutException:
+                    # The 10 s connect timeout also applies to reads. An idle
+                    # gap (e.g. waiting for the next sentence) is not an error.
+                    continue
+                except Exception as exc:
+                    logger.debug("TTS receive loop ended: %s", exc)
+                    break
+                if isinstance(data, bytes):
+                    if self._on_audio:
+                        self._on_audio(data)
+                    continue
+                try:
+                    resp = json.loads(data)
+                except ValueError:
+                    # Includes an empty text payload, which is not valid JSON.
+                    logger.error("TTS sent a non-JSON text frame: %r", data)
+                    break
+                if resp.get("code") != 0:
+                    logger.error("TTS error: %s", resp)
+                    break
+                if resp.get("final") == 1:
+                    break
+        except Exception:
+            # Typically a failure inside the on_audio callback.
+            logger.exception("TTS receiver stopped unexpectedly")
+        finally:
+            self._done_event.set()
+
+    def send_text(self, text: str):
+        """Send a text chunk for synthesis (non-blocking, returns immediately).
+
+        Raises:
+            RuntimeError: if there is no open connection.
+        """
+        if not self._ws:
+            raise RuntimeError("TTS not connected")
+        msg = {
+            "session_id": self._session_id,
+            "message_id": str(uuid.uuid4()),
+            "action": "ACTION_SYNTHESIS",
+            "data": text,
+        }
+        self._ws.send(json.dumps(msg))
+
+    def complete(self):
+        """Signal end of text, wait for all audio, close connection.
+
+        Does nothing when not connected. Waits up to 30 s for the receiver to
+        finish, then up to 5 s for its thread to exit. The socket is closed
+        and the connection reset even if sending the completion message fails.
+        """
+        ws = self._ws
+        if not ws:
+            return
+        msg = {
+            "session_id": self._session_id,
+            "message_id": str(uuid.uuid4()),
+            "action": "ACTION_COMPLETE",
+            "data": "",
+        }
+        try:
+            ws.send(json.dumps(msg))
+            # Wait for background receiver to finish
+            if not self._done_event.wait(timeout=30):
+                logger.warning("Tencent TTS did not finish within 30 s; closing anyway")
+            if self._recv_thread:
+                self._recv_thread.join(timeout=5)
+        finally:
+            ws.close()
+            self._ws = None
+        logger.info("Tencent TTS completed")
+
+
+# ================================================================== #
+#  Tencent ASR - Real-time Speech Recognition
+# ================================================================== #
+
+class TencentRealtimeASR:
+    """Tencent Cloud Real-time ASR using WebSocket.
+
+    Accepts PCM audio chunks and returns transcription text.
+
+    Call order: ``connect()``, repeated ``send_audio(...)`` /
+    ``recv_result()``, then ``close()`` to collect the remaining results.
+    """
+
+    def __init__(
+        self,
+        app_id: str,
+        secret_id: str,
+        secret_key: str,
+        engine_type: str = "16k_zh_large",
+        vad_silence_time: int = 800,
+    ):
+        """Store credentials and recognition settings; no network I/O happens here.
+
+        Args:
+            app_id: becomes the last path segment of the ASR URL.
+            secret_id: sent as the ``secretid`` query parameter.
+            secret_key: HMAC key used to sign the connection URL.
+            engine_type: sent as the ``engine_model_type`` query parameter.
+            vad_silence_time: sent as the ``vad_silence_time`` query parameter.
+        """
+        self.app_id = app_id
+        self.secret_id = secret_id
+        self.secret_key = secret_key
+        self.engine_type = engine_type
+        self.vad_silence_time = vad_silence_time
+        self._ws = None
+
+    def connect(self) -> str:
+        """Connect to ASR WebSocket. Returns the generated ``voice_id`` on success.
+
+        Raises:
+            RuntimeError: if the handshake reply carries a non-zero ``code``.
+                The socket is closed before the error is raised.
+        """
+        import random
+        timestamp = int(time.time())
+
+        params = {
+            "secretid": self.secret_id,
+            "timestamp": str(timestamp),
+            # Expiry is placed 86400 s (24 h) after the request timestamp.
+            "expired": str(timestamp + 86400),
+            "nonce": str(random.randint(1000000, 9999999)),
+            "engine_model_type": self.engine_type,
+            "voice_id": str(uuid.uuid4()),
+            "voice_format": "1",  # PCM
+            "needvad": "1",
+            "vad_silence_time": str(self.vad_silence_time),
+        }
+
+        # ASR signing: no "GET" prefix, just host+path+params
+        sig = _generate_signature(
+            params, self.secret_key,
+            "asr.cloud.tencent.com", f"/asr/v2/{self.app_id}",
+            prefix="",
+        )
+
+        # The query must list the parameters in the same sorted order that was signed.
+        query = "&".join(f"{k}={params[k]}" for k in sorted(params.keys()))
+        url = f"wss://asr.cloud.tencent.com/asr/v2/{self.app_id}?{query}&signature={sig}"
+
+        ws = websocket.create_connection(url, timeout=10)
+        # Handshake response
+        try:
+            resp = json.loads(ws.recv())
+        except Exception:
+            # Do not leak the socket when the handshake reply is missing or malformed.
+            ws.close()
+            raise
+        if resp.get("code") != 0:
+            ws.close()
+            raise RuntimeError(f"ASR connect failed: {resp}")
+        # Publish the socket only once the handshake succeeded.
+        self._ws = ws
+        logger.info("Tencent ASR connected (voice_id=%s)", params["voice_id"])
+        return params["voice_id"]
+
+    def send_audio(self, pcm_chunk: bytes):
+        """Send a PCM audio chunk for recognition. No-op when not connected."""
+        if self._ws:
+            self._ws.send_binary(pcm_chunk)
+
+    def recv_result(self) -> dict | None:
+        """Poll for one recognition result, waiting at most 0.05 s.
+
+        Returns the parsed JSON message, or None when not connected, when
+        nothing arrived in time, when the frame was not text, or when
+        receiving/parsing failed (failures are logged at debug level).
+        """
+        if not self._ws:
+            return None
+        # NOTE: this short timeout stays set on the socket after the call.
+        self._ws.settimeout(0.05)
+        try:
+            data = self._ws.recv()
+            if isinstance(data, str):
+                return json.loads(data)
+        except websocket.WebSocketTimeoutException:
+            return None
+        except Exception as exc:
+            # Kept best-effort for callers that poll in a loop, but no longer silent.
+            logger.debug("ASR recv_result failed: %s", exc)
+            return None
+        return None
+
+    def send_end(self):
+        """Signal end of audio stream. Failures are logged and not raised."""
+        if self._ws:
+            try:
+                end_msg = json.dumps({"type": "end"})
+                self._ws.send(end_msg)
+            except Exception as exc:
+                logger.debug("ASR send_end failed: %s", exc)
+
+    def close(self):
+        """Close connection and get final result.
+
+        Sends the end-of-stream message, then collects text frames (each read
+        waits up to 3 s) until one has ``final == 1`` or receiving fails.
+
+        Returns:
+            List of parsed result messages; empty when not connected.
+        """
+        results = []
+        ws = self._ws
+        if not ws:
+            return results
+        try:
+            self.send_end()
+            # Drain remaining results
+            ws.settimeout(3)
+            while True:
+                try:
+                    data = ws.recv()
+                    if isinstance(data, str):
+                        resp = json.loads(data)
+                        results.append(resp)
+                        if resp.get("final") == 1:
+                            break
+                except Exception:
+                    # Timeout, closed socket or unparsable frame: stop draining.
+                    break
+        finally:
+            # Always release the socket and reset state, whatever happened above.
+            ws.close()
+            self._ws = None
+        return results
--- a/使用说明.md
+++ b/使用说明.md
+# 数字人测试客户端 - 使用说明
+
+## 前提条件
+
+1. MuseTalk 流式服务已在服务器上启动（`10.10.0.102:8001`）
+2. 本地已创建好虚拟环境（`.venv` 目录已存在且依赖已安装）
+
+## 启动方式
+
+```bash
+cd D:\work\fusion\Avatar\MuseTalk\test_client
+.venv\Scripts\python app.py
+```
+
+或直接双击 `start.bat`。
+
+启动后在浏览器打开：http://localhost:8002
+
+## 使用流程
+
+1. 打开页面后，系统会自动上传数字人视频到服务器并进行预处理（首次需等待）
+2. 预处理完成后，页面左侧显示数字人画面，右侧为对话框
+3. 在输入框中输入问题，按回车或点击发送
+4. 系统流程：用户文字 → DeepSeek 大模型回复 → CosyVoice 语音合成 → MuseTalk 口型驱动
+5. 数字人会根据回复内容对口型说话，右侧显示对话记录
+
+## 配置说明
+
+如需修改配置，编辑 `app.py` 顶部的常量：
+
+| 配置项 | 说明 | 默认值 |
+|--------|------|--------|
+| `MUSETALK_URL` | MuseTalk 服务地址 | `http://10.10.0.102:8001` |
+| `DEEPSEEK_API_KEY` | DeepSeek 大模型 API Key | 已配置 |
+| `DASHSCOPE_API_KEY` | 千问 TTS API Key | 已配置 |
+| `AVATAR_VIDEO_PATH` | 数字人形象视频路径 | `shuzirenxingxiang.mp4` |
+| `AVATAR_ID` | 数字人形象 ID | `hospital_front` |
+| `SYSTEM_PROMPT` | 大模型系统提示词 | 医院前台角色 |
+
+## 重新安装依赖
+
+如果 `.venv` 损坏或需要重建：
+
+```bash
+cd D:\work\fusion\Avatar\MuseTalk\test_client
+python -m venv .venv
+.venv\Scripts\pip install -r requirements.txt
+```