Commit 1d461bc7 authored by 胡志宏's avatar 胡志宏 🤗

Initial commit

parents
.venv/
__pycache__/
*.pyc
app.log
.idea/
.vscode/
This diff is collapsed.
This diff is collapsed.
fastapi>=0.104.0
uvicorn>=0.24.0
websockets>=12.0
websocket-client>=1.6.0
openai>=1.0.0
requests>=2.31.0
@echo off
rem Launch app.py with the Python interpreter from the local .venv folder.
rem Switch to the directory that contains this script (drive included) so
rem the relative paths below resolve no matter where it was started from.
cd /d "%~dp0"
.venv\Scripts\python app.py
rem Keep the console window open after the app exits so its output stays visible.
pause
"""
Tencent Cloud TTS (Streaming Text-to-Speech v2) and ASR (Real-time Speech Recognition)
"""
import base64
import hashlib
import hmac
import json
import logging
import threading
import time
import uuid
from urllib.parse import quote
import websocket # websocket-client (sync)
logger = logging.getLogger(__name__)
def _generate_signature(params: dict, secret_key: str, host: str, path: str, prefix: str = "GET") -> str:
    """Build the URL-quoted, base64-encoded HMAC-SHA1 signature for a request.

    The signed text is ``prefix + host + path + "?" + query``, where ``query``
    joins ``key=value`` pairs with ``&`` in ascending key order.

    Args:
        params: Query parameters to sign; values are formatted as-is.
        secret_key: Key used for the HMAC.
        host: Host name placed in the signed text.
        path: Request path placed in the signed text.
        prefix: Text placed before the host (``"GET"`` unless overridden).

    Returns:
        The base64 digest passed through ``urllib.parse.quote``.
    """
    pairs = [f"{name}={params[name]}" for name in sorted(params)]
    payload = f"{prefix}{host}{path}?" + "&".join(pairs)
    mac = hmac.new(secret_key.encode(), payload.encode(), hashlib.sha1)
    encoded = base64.b64encode(mac.digest()).decode()
    return quote(encoded)
# ================================================================== #
# Tencent TTS - Streaming Text-to-Speech v2
# ================================================================== #
class TencentStreamingTTS:
    """Tencent Cloud Streaming TTS using WebSocket v2.

    Supports streaming text input (sentence by sentence from LLM).
    Audio receiving runs in a background thread for non-blocking sends.

    Typical use: ``connect(on_audio)``, any number of ``send_text(...)``
    calls, then ``complete()``.
    """

    def __init__(
        self,
        app_id: str,
        secret_id: str,
        secret_key: str,
        voice_type: str = "502004",
        speed: float = 0,
        volume: int = 0,
        sample_rate: int = 16000,
        codec: str = "pcm",
    ):
        """Store credentials and synthesis settings; no network I/O happens here.

        Args:
            app_id: Sent as the ``AppId`` query parameter.
            secret_id: Sent as the ``SecretId`` query parameter.
            secret_key: Key used to sign the connection URL.
            voice_type: Sent as ``VoiceType``.
            speed: Sent as ``Speed``.
            volume: Sent as ``Volume``.
            sample_rate: Sent as ``SampleRate``.
            codec: Sent as ``Codec``.
        """
        self.app_id = app_id
        self.secret_id = secret_id
        self.secret_key = secret_key
        self.voice_type = voice_type
        self.speed = speed
        self.volume = volume
        self.sample_rate = sample_rate
        self.codec = codec
        self._ws = None
        self._session_id = None
        self._on_audio = None
        self._recv_thread = None
        # Set by the receive loop when it stops, for whatever reason.
        self._done_event = threading.Event()

    def connect(self, on_audio):
        """Connect to TTS WebSocket, start background audio receiver.

        Args:
            on_audio: callback(pcm_bytes) called for each audio chunk

        Raises:
            RuntimeError: If the first server message has a non-zero ``code``.
        """
        self._on_audio = on_audio
        self._session_id = str(uuid.uuid4())
        self._done_event.clear()
        timestamp = int(time.time())
        params = {
            "Action": "TextToStreamAudioWSv2",
            "AppId": str(self.app_id),
            "Codec": self.codec,
            "Expired": str(timestamp + 86400),  # 24 hours after the timestamp
            "SampleRate": str(self.sample_rate),
            "SecretId": self.secret_id,
            "SessionId": self._session_id,
            "Speed": str(self.speed),
            "Timestamp": str(timestamp),
            "VoiceType": str(self.voice_type),
            "Volume": str(self.volume),
        }
        sig = _generate_signature(
            params, self.secret_key,
            "tts.cloud.tencent.com", "/stream_wsv2",
        )
        query = "&".join(f"{k}={params[k]}" for k in sorted(params))
        url = f"wss://tts.cloud.tencent.com/stream_wsv2?{query}&Signature={sig}"
        ws = websocket.create_connection(url, timeout=10)
        self._ws = ws
        try:
            resp = json.loads(ws.recv())
            if resp.get("code") != 0:
                raise RuntimeError(f"TTS connect failed: {resp}")
        except Exception:
            # A rejected or unreadable handshake must not leak the socket or
            # leave a dead connection looking usable to send_text()/complete().
            self._ws = None
            ws.close()
            raise
        logger.info("Tencent TTS connected (session=%s)", self._session_id)
        # Start background thread to receive audio
        self._recv_thread = threading.Thread(target=self._recv_loop, daemon=True)
        self._recv_thread.start()

    def _recv_loop(self):
        """Background thread: continuously receive audio and metadata.

        Binary frames go to the ``on_audio`` callback. Text frames are parsed
        as JSON; a non-zero ``code`` or ``final == 1`` ends the loop. The done
        event is always set on exit, including when the callback or JSON
        parsing raises, so ``complete()`` never waits on a dead receiver.
        """
        try:
            while True:
                try:
                    # NOTE(review): the 10s timeout given to create_connection
                    # presumably applies to this recv() as well, so an idle gap
                    # longer than that ends the loop -- confirm this is intended.
                    data = self._ws.recv()
                except Exception as exc:
                    logger.debug("TTS receive loop stopped: %s", exc)
                    break
                if isinstance(data, bytes):
                    if self._on_audio:
                        self._on_audio(data)
                    continue
                resp = json.loads(data)
                if resp.get("code") != 0:
                    logger.error("TTS error: %s", resp)
                    break
                if resp.get("final") == 1:
                    break
        except Exception:
            # Raised by the audio callback or by malformed server JSON.
            logger.exception("TTS receive loop aborted")
        finally:
            self._done_event.set()

    def send_text(self, text: str):
        """Send a text chunk for synthesis (non-blocking, returns immediately).

        Raises:
            RuntimeError: If there is no open connection.
        """
        if not self._ws:
            raise RuntimeError("TTS not connected")
        msg = {
            "session_id": self._session_id,
            "message_id": str(uuid.uuid4()),
            "action": "ACTION_SYNTHESIS",
            "data": text,
        }
        self._ws.send(json.dumps(msg))

    def complete(self):
        """Signal end of text, wait for all audio, close connection.

        Does nothing when not connected. The socket is closed and forgotten
        even if sending the completion message fails; that error still
        propagates to the caller.
        """
        if not self._ws:
            return
        msg = {
            "session_id": self._session_id,
            "message_id": str(uuid.uuid4()),
            "action": "ACTION_COMPLETE",
            "data": "",
        }
        try:
            self._ws.send(json.dumps(msg))
            # Wait for background receiver to finish
            if not self._done_event.wait(timeout=30):
                logger.warning("Tencent TTS receiver did not finish within 30s")
            if self._recv_thread:
                self._recv_thread.join(timeout=5)
        finally:
            # Drop the reference first so the object never keeps a closed socket.
            ws, self._ws = self._ws, None
            ws.close()
        logger.info("Tencent TTS completed")
# ================================================================== #
# Tencent ASR - Real-time Speech Recognition
# ================================================================== #
class TencentRealtimeASR:
"""Tencent Cloud Real-time ASR using WebSocket.
Accepts PCM audio chunks and returns transcription text.
"""
def __init__(
self,
app_id: str,
secret_id: str,
secret_key: str,
engine_type: str = "16k_zh_large",
vad_silence_time: int = 800,
):
self.app_id = app_id
self.secret_id = secret_id
self.secret_key = secret_key
self.engine_type = engine_type
self.vad_silence_time = vad_silence_time
self._ws = None
def connect(self) -> str:
"""Connect to ASR WebSocket. Returns session ID on success."""
import random
timestamp = int(time.time())
params = {
"secretid": self.secret_id,
"timestamp": str(timestamp),
"expired": str(timestamp + 86400),
"nonce": str(random.randint(1000000, 9999999)),
"engine_model_type": self.engine_type,
"voice_id": str(uuid.uuid4()),
"voice_format": "1", # PCM
"needvad": "1",
"vad_silence_time": str(self.vad_silence_time),
}
# ASR signing: no "GET" prefix, just host+path+params
sig = _generate_signature(
params, self.secret_key,
"asr.cloud.tencent.com", f"/asr/v2/{self.app_id}",
prefix="",
)
query = "&".join(f"{k}={params[k]}" for k in sorted(params.keys()))
url = f"wss://asr.cloud.tencent.com/asr/v2/{self.app_id}?{query}&signature={sig}"
self._ws = websocket.create_connection(url, timeout=10)
# Handshake response
resp = json.loads(self._ws.recv())
if resp.get("code") != 0:
raise RuntimeError(f"ASR connect failed: {resp}")
logger.info("Tencent ASR connected (voice_id=%s)", params["voice_id"])
return params["voice_id"]
def send_audio(self, pcm_chunk: bytes):
"""Send a PCM audio chunk for recognition."""
if self._ws:
self._ws.send_binary(pcm_chunk)
def recv_result(self) -> dict | None:
"""Non-blocking receive of recognition result. Returns None if no data."""
if not self._ws:
return None
self._ws.settimeout(0.05)
try:
data = self._ws.recv()
if isinstance(data, str):
return json.loads(data)
except websocket.WebSocketTimeoutException:
return None
except Exception:
return None
return None
def send_end(self):
"""Signal end of audio stream."""
if self._ws:
try:
end_msg = json.dumps({"type": "end"})
self._ws.send(end_msg)
except Exception:
pass
def close(self):
"""Close connection and get final result."""
results = []
if self._ws:
self.send_end()
# Drain remaining results
self._ws.settimeout(3)
while True:
try:
data = self._ws.recv()
if isinstance(data, str):
resp = json.loads(data)
results.append(resp)
if resp.get("final") == 1:
break
except Exception:
break
self._ws.close()
self._ws = None
return results
# 数字人测试客户端 - 使用说明
## 前提条件
1. MuseTalk 流式服务已在服务器上启动(`10.10.0.102:8001`)
2. 本地已创建好虚拟环境(`.venv` 目录已存在且依赖已安装)
## 启动方式
```bash
cd D:\work\fusion\Avatar\MuseTalk\test_client
.venv\Scripts\python app.py
```
或直接双击 `start.bat`
启动后在浏览器打开:http://localhost:8002
## 使用流程
1. 打开页面后,系统会自动上传数字人视频到服务器并进行预处理(首次需等待)
2. 预处理完成后,页面左侧显示数字人画面,右侧为对话框
3. 在输入框中输入问题,按回车或点击发送
4. 系统流程:用户文字 → DeepSeek 大模型回复 → CosyVoice 语音合成 → MuseTalk 口型驱动
5. 数字人会根据回复内容对口型说话,右侧显示对话记录
## 配置说明
如需修改配置,编辑 `app.py` 顶部的常量:
| 配置项 | 说明 | 默认值 |
|--------|------|--------|
| `MUSETALK_URL` | MuseTalk 服务地址 | `http://10.10.0.102:8001` |
| `DEEPSEEK_API_KEY` | DeepSeek 大模型 API Key | 已配置 |
| `DASHSCOPE_API_KEY` | 千问 TTS API Key | 已配置 |
| `AVATAR_VIDEO_PATH` | 数字人形象视频路径 | `shuzirenxingxiang.mp4` |
| `AVATAR_ID` | 数字人形象 ID | `hospital_front` |
| `SYSTEM_PROMPT` | 大模型系统提示词 | 医院前台角色 |
## 重新安装依赖
如果 `.venv` 损坏或需要重建:
```bash
cd D:\work\fusion\Avatar\MuseTalk\test_client
python -m venv .venv
.venv\Scripts\pip install -r requirements.txt
```
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment