Skip to content
Navigation Menu
{{ message }}
forked from SesameAILabs/csm
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathchat2_web.py
More file actions
314 lines (257 loc) · 10.8 KB
/
Copy pathchat2_web.py
File metadata and controls
314 lines (257 loc) · 10.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
import os
import torch
import torchaudio
import gradio as gr
import time
import base64
import whisper
import numpy as np
import requests
import json
from datetime import datetime
from pathlib import Path
from generator import Segment, Generator, load_csm_1b
from io import BytesIO
# Create the wav directory if it does not exist yet.
os.makedirs('wav', exist_ok=True)

# Pick the compute device: CUDA when available, otherwise CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the Whisper model used for speech recognition.
print("Loading Whisper model...")
whisper_model = whisper.load_model("turbo")
print("Whisper model loaded")

# Load the CSM model used for speech generation.
# The checkpoint location can be overridden with the CSM_MODEL_PATH
# environment variable; the default is the original hard-coded path.
print("Loading CSM model...")
model_path = os.environ.get("CSM_MODEL_PATH", "/root/autodl-tmp/csm/model/ckpt.pt")
generator = load_csm_1b(model_path, device)
print("CSM model loaded")

# DeepSeek configuration.
# The API key is read from the DEEPSEEK_API_KEY environment variable so the
# secret does not have to be committed to source control; the original
# placeholder value is kept as the fallback.
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "sk-")
DEEPSEEK_BASE_URL = "https://api.deepseek.com"
# Audio loading helper.
def load_audio(audio_path, target_sample_rate):
    """Load an audio file as a single-channel tensor at ``target_sample_rate``.

    Args:
        audio_path: Path of the audio file, passed to ``torchaudio.load``.
        target_sample_rate: Sample rate (Hz) the returned audio is resampled to.

    Returns:
        A 1-D tensor holding the resampled waveform.
    """
    audio_tensor, sample_rate = torchaudio.load(audio_path)
    # torchaudio.load returns a channels-first (channels, frames) tensor by
    # default. Averaging over the channel axis downmixes any channel count to
    # mono; for a single channel it equals the previous squeeze(0).
    mono = audio_tensor.mean(dim=0)
    return torchaudio.functional.resample(
        mono, orig_freq=sample_rate, new_freq=target_sample_rate
    )
# Call the DeepSeek chat API to produce the text reply.
def get_deepseek_response(user_message, history):
    """Ask DeepSeek for a reply to ``user_message`` given prior ``history``.

    Args:
        user_message: The latest user utterance (plain text).
        history: Iterable of mappings, each with ``"role"`` and ``"content"``
            keys, describing the earlier turns.

    Returns:
        The reply text on success. On a non-200 response or any exception a
        fixed apology string is returned instead; this function does not
        raise.
    """
    # Persona prompt: a short, gentle, affectionate speaking style.
    system_prompt = (
        "You are a gentle and lovely girl, we are in a relationship of lovers, in communication:\n"
        "1. The reply should be short, preferably no more than 30 words\n"
        "2. The tone should be soft and sweet, full of emotion.\n"
        "3. Reply in English, no emoticons allowed\n"
        "You're fun to be around, a little provocative\n"
        "6. Do not display your reply in the form of AI.\n"
        "7. Ensure that users feel present and immersed."
    )
    try:
        print(f"Calling DeepSeek API for user message: '{user_message}'")

        # The chat-completions schema carries the system prompt as the first
        # message; a top-level "system" field is not part of that schema, so
        # the persona previously had no effect.
        messages = [{"role": "system", "content": system_prompt}]
        messages.extend(
            {"role": item["role"], "content": item["content"]} for item in history
        )
        messages.append({"role": "user", "content": user_message})

        payload = {
            "model": "deepseek-chat",
            "messages": messages,
            "temperature": 0.9,  # more creative replies
            "max_tokens": 100,   # cap the reply length
            "top_p": 0.9,        # keep moderate randomness
        }
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {DEEPSEEK_API_KEY}",
        }

        # A timeout keeps a stalled connection from hanging the UI handler
        # forever; a timeout error falls through to the except branch below.
        response = requests.post(
            f"{DEEPSEEK_BASE_URL}/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=60,
        )

        if response.status_code != 200:
            print(f"DeepSeek API error: {response.status_code}")
            print(response.text)
            return "抱歉,我现在无法回应,请稍后再试。"

        ai_response = response.json()["choices"][0]["message"]["content"]
        print(f"DeepSeek response: '{ai_response}'")
        return ai_response
    except Exception as e:
        # Boundary handler: the caller always needs some text to speak.
        print(f"Error calling DeepSeek API: {e}")
        return "抱歉,我遇到了一些问题~"
# Conversation state holder.
class ConversationManager:
    """Keep the rolling conversation in two parallel forms.

    ``history`` holds ``Segment`` objects (text + speaker + audio) and
    ``chatbot_history`` holds ``{"role", "content"}`` dicts for the same
    turns. A fixed reference segment (speaker 0) is loaded at construction
    and is always placed first in the context returned by ``get_context``.
    """

    def __init__(self, max_history=5):
        """Create an empty conversation and load the reference voice prompt.

        Args:
            max_history: Number of rounds to retain; each round counts as
                two messages (one user, one assistant).
        """
        self.max_history = max_history
        self.history = []
        self.chatbot_history = []

        # Reference utterance (female voice) and its transcript.
        self.reference_text = "And Lake turned round upon me, a little abruptly, with his odd yellowish eyes, a little like those of the sea eagle, and the ghost of his smile that flickered on his singularly pale face with a stern and insidious look confronted me."
        self.reference_path = "/root/autodl-tmp/csm/model/prompts/read_speech_a.wav"  # female reference voice

        # Load the reference audio at the generator's sample rate.
        reference_audio = load_audio(self.reference_path, generator.sample_rate)
        self.reference_segment = Segment(
            text=self.reference_text,
            speaker=0,  # female voice
            audio=reference_audio,
        )
        print(f"已初始化女声参考语音: {self.reference_text}")

    def add_message(self, text, audio_tensor, is_user=True):
        """Record one turn and return the ``Segment`` created for it.

        The user is stored as speaker 1 / role ``"user"``, the assistant as
        speaker 0 / role ``"assistant"``. Both histories are then cut back to
        the most recent ``max_history * 2`` entries when that is exceeded.
        """
        if is_user:
            speaker_id, role = 1, "user"
        else:
            speaker_id, role = 0, "assistant"

        segment = Segment(text=text, speaker=speaker_id, audio=audio_tensor)
        self.history.append(segment)
        self.chatbot_history.append({"role": role, "content": text})

        # Each round contributes a user message and an assistant message.
        limit = self.max_history * 2
        if len(self.history) > limit:
            self.history = self.history[-limit:]
            self.chatbot_history = self.chatbot_history[-limit:]
        return segment

    def get_context(self):
        """Return a new list: the reference segment followed by the history."""
        return [self.reference_segment, *self.history]

    def get_chatbot_history(self):
        """Return the role/content message list (the internal list itself)."""
        return self.chatbot_history
# Create the module-level conversation manager used by the request handlers.
# Constructing it loads the reference voice prompt from disk.
conv_manager = ConversationManager()
# Convert the user's audio to text with Whisper.
def transcribe_audio(audio_path):
    """Transcribe the audio file at ``audio_path`` with the Whisper model.

    Returns:
        The ``"text"`` field of Whisper's result, or the placeholder
        ``"[转写错误]"`` if anything goes wrong; errors are printed rather
        than raised.
    """
    print(f"转写音频: {audio_path}")
    try:
        whisper_output = whisper_model.transcribe(audio_path)
        text = whisper_output["text"]
        print(f"转写结果: '{text}'")
    except Exception as err:
        print(f"转写过程中出错: {err}")
        return "[转写错误]"
    return text
# Generate the assistant's reply (text + speech).
def generate_response(user_text, user_audio_tensor):
    """Produce the assistant's spoken reply for one user turn.

    The user turn is recorded, DeepSeek supplies the reply text, the CSM
    generator synthesises it with speaker 0, and the result is recorded and
    packaged for the UI.

    Args:
        user_text: Transcript of the user's utterance.
        user_audio_tensor: The user's audio, stored with the turn as context.

    Returns:
        A 3-tuple ``(reply_text, gr.HTML, chatbot_history)`` where the HTML
        component holds an autoplaying ``<audio>`` element with the reply
        embedded as a base64 WAV data URI.
    """
    import soundfile as sf

    print(f"处理用户输入: '{user_text}'")

    # Record the user's turn first so it is part of the generation context.
    conv_manager.add_message(user_text, user_audio_tensor, is_user=True)

    # The just-added user message is excluded from the history argument
    # because get_deepseek_response appends it itself.
    system_text = get_deepseek_response(user_text, conv_manager.chatbot_history[:-1])
    print(f"为AI回复生成语音: '{system_text}'")

    # Full context: reference voice prompt followed by the conversation so far.
    context = conv_manager.get_context()

    system_audio = generator.generate(
        text=system_text,            # reply text to speak
        speaker=0,                   # female voice
        context=context,             # reference prompt + dialogue history
        max_audio_length_ms=20_000,  # allow for longer replies
    )

    # Record the assistant's turn.
    conv_manager.add_message(system_text, system_audio, is_user=False)

    # Convert to 16-bit PCM. Samples are clipped to [-1, 1] first: without
    # clipping, any overshoot wraps around when cast to int16.
    samples = system_audio.cpu().numpy()
    audio_numpy = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)

    # Encode as WAV in memory, then as base64 for a data URI.
    wav_io = BytesIO()
    sf.write(wav_io, audio_numpy, samplerate=generator.sample_rate, format="WAV")
    audio_b64 = base64.b64encode(wav_io.getvalue()).decode('utf-8')

    # HTML audio element that starts playing as soon as it is rendered.
    html_audio = f'<audio controls autoplay src="data:audio/wav;base64,{audio_b64}"></audio>'
    print(f"生成的系统回复音频长度: {len(audio_numpy)} 采样点")

    return system_text, gr.HTML(html_audio), conv_manager.get_chatbot_history()
# Reset the conversation.
def clear_history():
    """Replace the global conversation manager and blank the UI outputs.

    Returns:
        ``("", gr.HTML(""), [])`` — empty reply text, empty audio HTML and an
        empty chat history.
    """
    global conv_manager
    conv_manager = ConversationManager()
    blank_text = ""
    blank_audio = gr.HTML("")
    blank_chat = []  # empty chat history
    return blank_text, blank_audio, blank_chat
# Handle a recorded user message.
def process_user_input(audio_file):
    """Transcribe a recorded message and generate the assistant's reply.

    Args:
        audio_file: Path of the recorded audio, or ``None`` when nothing was
            recorded.

    Returns:
        The 3-tuple produced by ``generate_response``. When ``audio_file`` is
        ``None`` a prompt asking the user to record first is returned together
        with an empty audio element and the unchanged chat history.
    """
    import tempfile

    if audio_file is None:
        # Return the current history rather than [] so that pressing "send"
        # without a recording does not wipe the displayed conversation.
        return "请先录制音频消息。", gr.HTML(""), conv_manager.get_chatbot_history()

    # Load the recording, downmix to mono (mean over the channel axis; equal
    # to squeeze(0) for single-channel input) and resample for the generator.
    audio_tensor, sample_rate = torchaudio.load(audio_file)
    audio_tensor = torchaudio.functional.resample(
        audio_tensor.mean(dim=0), orig_freq=sample_rate, new_freq=generator.sample_rate
    )

    # Whisper reads from a file. A private temporary directory gives every
    # request its own path (no clashes between concurrent requests) and is
    # removed even if saving or transcription raises.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = os.path.join(temp_dir, "whisper_input.wav")
        torchaudio.save(temp_path, audio_tensor.unsqueeze(0), generator.sample_rate)
        user_text = transcribe_audio(temp_path)

    # Generate the assistant's response.
    return generate_response(user_text, audio_tensor)
# Build the Gradio interface.
# NOTE(review): the nesting of the layout containers below was reconstructed
# from statement order — confirm that the second Row belongs at this level.
demo = gr.Blocks(theme=gr.themes.Soft())
with demo:
    gr.Markdown("# 🎙️ AI语音对话系统")
    with gr.Row():
        with gr.Column(scale=1):
            # Microphone recording, delivered to the handler as a file path.
            audio_input = gr.Audio(
                sources=["microphone"],
                type="filepath",
                label="录制消息"
            )
            submit_btn = gr.Button("发送", variant="primary")
            clear_btn = gr.Button("清除对话历史", variant="secondary")
        with gr.Column(scale=2):
            # Chat log rendered from role/content message dicts.
            chatbot = gr.Chatbot(
                label="对话历史",
                height=400,
                show_label=True,
                type="messages"
            )
    with gr.Row():
        system_text = gr.Textbox(
            label="AI回复文本",
            interactive=False
        )
        # Receives the HTML <audio> element returned by the handlers.
        audio_output = gr.HTML(
            label="AI语音回复"
        )
    # Wire up the event handlers.
    submit_btn.click(
        fn=process_user_input,
        inputs=[audio_input],
        outputs=[system_text, audio_output, chatbot]
    )
    # Reset the conversation.
    # NOTE(review): this re-defines the module-level clear_history with an
    # identical body; one of the two definitions is redundant.
    def clear_history():
        global conv_manager
        conv_manager = ConversationManager()
        return "", gr.HTML(""), []
    clear_btn.click(
        fn=clear_history,
        inputs=[],
        outputs=[system_text, audio_output, chatbot]
    )
    # Front-end JavaScript (not CSS) attached to the page-load event. It polls
    # once per second for an element with id 'audio-response' and forces its
    # '.audio-waveform' child to be visible.
    # NOTE(review): no component created in this block sets
    # elem_id="audio-response", so the lookup presumably never matches —
    # confirm whether this script is still needed.
    demo.load(js="""
function checkAudioVisibility() {
const audioResponse = document.getElementById('audio-response');
if (audioResponse) {
const waveform = audioResponse.querySelector('.audio-waveform');
if (waveform) {
waveform.style.display = 'block';
waveform.style.height = '60px';
}
}
setTimeout(checkAudioVisibility, 1000);
}
document.addEventListener('DOMContentLoaded', checkAudioVisibility);
""")
# Start the application when executed as a script.
if __name__ == "__main__":
    # Make sure the cache directory exists before launching.
    os.makedirs("gradio_cache", exist_ok=True)
    # Listen on all interfaces, port 6006, with a public share link enabled.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 6006,
        "share": True,
    }
    demo.launch(**launch_options)
You can’t perform that action at this time.
