Add chat streaming and audio responses via TTS

This commit is contained in:
eust-w 2025-06-18 11:28:39 +08:00
parent 31581e24a9
commit e29eced691
4 changed files with 234 additions and 55 deletions

125
app.py
View File

@ -5,7 +5,7 @@ import os
import json import json
import base64 import base64
import requests import requests
from flask import Flask, render_template, request, jsonify, redirect, url_for, session, Response from flask import Flask, render_template, request, jsonify, redirect, url_for, session, Response, stream_with_context
from flask_cors import CORS from flask_cors import CORS
from werkzeug.utils import secure_filename from werkzeug.utils import secure_filename
from app.api.baidu_image_search import BaiduImageSearch from app.api.baidu_image_search import BaiduImageSearch
@ -457,6 +457,120 @@ def delete_robot(robot_id):
except Exception as e: except Exception as e:
return jsonify({'error': str(e)}), 500 return jsonify({'error': str(e)}), 500
# ==================== Chat Stream ====================
@app.route('/imgsearcherApi/api/chat-stream', methods=['POST'])
def chat_stream():
    """Stream a chat reply as Server-Sent Events (SSE).

    Emits incremental ``text`` events as GPT tokens arrive and ``audio``
    events (base64-encoded WAV) whenever a complete sentence has been
    synthesized by the CosyVoice TTS service.  The stream is terminated
    with an ``end`` event.

    Expects a JSON body with ``message`` (required) and ``voice``
    (optional reference-WAV name for the TTS timbre).
    """
    if not azure_openai_api:
        return jsonify({'error': 'Azure OpenAI API未正确配置'}), 500

    data = request.get_json()
    if not data or 'message' not in data:
        return jsonify({'error': '缺少消息内容'}), 400

    message = data['message']
    image_path = session.get('chat_image_path')
    robot_id = session.get('robot_id', '')
    if not image_path or not os.path.exists(image_path):
        return jsonify({'error': '没有上传图片或图片已失效'}), 400

    conversation_history = session.get('conversation_history', [])

    robot_info = None
    if robot_id:
        robot = robot_manager.get_robot(robot_id)
        if robot:
            robot_info = {'name': robot.get('name', ''), 'background': robot.get('background', '')}

    # Voice timbre selection: a reference WAV name understood by the TTS service.
    voice_wav = data.get('voice') or 'zh-CN-XiaoXiao-Assistant-Audio.wav'

    # SECURITY: the hardcoded fallback key should be removed and the service
    # configured exclusively via the INFINI_API_KEY environment variable.
    tts_api_key = os.getenv("INFINI_API_KEY", "sk-daooolufaf7ienn6")
    tts_url = "https://cloud.infini-ai.com/AIStudio/v1/inference/api/te-c7zfd4hrdoj2vqmw/tts/CosyVoice/v1/zero_shot"

    # Open the streaming chat completion against Azure OpenAI.
    openai_resp = azure_openai_api.chat_with_image_stream(
        image_path=image_path,
        message=message,
        conversation_history=conversation_history,
        robot_info=robot_info
    )

    def sse_event(event_type, content):
        """Serialize one SSE data frame; json.dumps guarantees valid JSON escaping."""
        payload = json.dumps({"type": event_type, "content": content}, ensure_ascii=False)
        return f"data: {payload}\n\n"

    def event_stream():
        accumulated_text = ""      # full assistant reply, saved to history at the end
        pending = ""               # text received but not yet sent to TTS
        punctuation = "。!?.!?"  # sentence terminators (CJK full-width + ASCII)

        def send_tts(sentence):
            """Synthesize one sentence and yield it as a base64 'audio' SSE event."""
            if not sentence.strip():
                return
            tts_params = {
                'tts_text': sentence,
                'prompt_wav': voice_wav,
                'text_split_method': 'cut5'
            }
            try:
                tts_resp = requests.get(
                    tts_url,
                    params=tts_params,
                    headers={'Authorization': f'Bearer {tts_api_key}'},
                    stream=True,
                    timeout=30
                )
                if tts_resp.status_code == 200:
                    audio_bytes = bytearray()
                    for chunk in tts_resp.iter_content(chunk_size=8192):
                        if chunk:
                            audio_bytes.extend(chunk)
                    if audio_bytes:
                        yield sse_event("audio", base64.b64encode(audio_bytes).decode())
            except Exception as e:
                # Best effort: a TTS failure must not abort the text stream.
                print("TTS 调用失败", e)

        # Consume the Azure OpenAI SSE stream line by line.
        for line in openai_resp.iter_lines():
            if not line:
                continue
            try:
                if line.strip() == b'data: [DONE]':
                    break
                if line.startswith(b'data:'):
                    content_json = json.loads(line[5:].strip())
                    choices = content_json.get('choices', [])
                    if not choices:
                        continue
                    delta = choices[0].get('delta', {}).get('content', '')
                    if not delta:
                        continue
                    accumulated_text += delta
                    pending += delta
                    # Forward the text increment to the client immediately.
                    yield sse_event("text", delta)
                    # Flush every complete sentence in `pending` to TTS.
                    while True:
                        idx = next((i for i, ch in enumerate(pending) if ch in punctuation), -1)
                        if idx == -1:
                            break
                        sentence, pending = pending[:idx + 1], pending[idx + 1:]
                        yield from send_tts(sentence)
            except Exception as e:
                # Skip malformed stream lines rather than killing the whole response.
                print("解析OpenAI流错误", e)
                continue

        # Synthesize any trailing text that never reached a sentence terminator.
        if pending.strip():
            yield from send_tts(pending)

        # Persist the exchange into the session history.
        conversation_history.append({"role": "user", "content": message})
        conversation_history.append({"role": "assistant", "content": accumulated_text})
        session['conversation_history'] = conversation_history

        yield "event: end\ndata: {}\n\n"

    # stream_with_context keeps the request context (and thus `session`)
    # alive while the generator is being consumed.
    return Response(stream_with_context(event_stream()), content_type='text/event-stream')
# ==================== TTS ==================== # ==================== TTS ====================
@app.route('/imgsearcherApi/api/tts', methods=['POST']) @app.route('/imgsearcherApi/api/tts', methods=['POST'])
def tts(): def tts():
@ -467,18 +581,19 @@ def tts():
text = data['text'] text = data['text']
prompt_text = data.get('prompt_text') or "我是威震天,我只代表月亮消灭你" prompt_text = data.get('prompt_text') or "我是威震天,我只代表月亮消灭你"
prompt_wav = data.get('prompt_wav') or "data_workspace/data/workspace_170/我是威震天.wav" prompt_wav = data.get('voice') or data.get('prompt_wav') or "zh-CN-XiaoXiao-Assistant-Audio.wav"
tts_base_url = "http://180.76.186.85:20099/CosyVoice/v1/zero_shot" tts_base_url = "https://cloud.infini-ai.com/AIStudio/v1/inference/api/te-c7zfd4hrdoj2vqmw/tts/CosyVoice/v1/zero_shot"
api_key = os.getenv('INFINI_API_KEY', 'sk-daooolufaf7ienn6')
headers = {'Authorization': f'Bearer {api_key}'}
params = { params = {
'tts_text': text, 'tts_text': text,
'prompt_text': prompt_text,
'prompt_wav': prompt_wav, 'prompt_wav': prompt_wav,
'text_split_method': 'cut5' 'text_split_method': 'cut5'
} }
try: try:
r = requests.get(tts_base_url, params=params, timeout=20) r = requests.get(tts_base_url, params=params, headers=headers, timeout=20)
if r.status_code == 200: if r.status_code == 200:
return Response(r.content, content_type=r.headers.get('Content-Type', 'audio/wav')) return Response(r.content, content_type=r.headers.get('Content-Type', 'audio/wav'))
else: else:

View File

@ -172,3 +172,39 @@ class AzureOpenAI:
return response.json() return response.json()
else: else:
raise Exception(f"Azure OpenAI API请求失败: {response.text}") raise Exception(f"Azure OpenAI API请求失败: {response.text}")
def chat_with_image_stream(self, image_path, message, conversation_history=None, robot_info=None):
    """Stream a GPT-4o chat completion grounded on an image.

    Args:
        image_path: Local path of the image to embed as a base64 JPEG data URL.
        message: The user's new message.
        conversation_history: Prior turns as Azure chat-message dicts; defaults to [].
        robot_info: Optional ``{'name', 'background'}`` dict enabling a
            role-play persona in the system prompt.

    Returns:
        A ``requests.Response`` opened with ``stream=True``; the caller
        iterates its SSE lines (``iter_lines``) for token deltas.
    """
    if conversation_history is None:
        conversation_history = []

    image_type, description = self._get_image_type_description(image_path)
    base64_image = self._encode_image(image_path)

    # Persona system prompt when a robot role is fully configured,
    # generic assistant prompt otherwise.
    if robot_info and robot_info.get('name') and robot_info.get('background'):
        system_message = f"你是{robot_info['name']},一个能够分析图片并回答问题的角色。\n\n你的背景故事:{robot_info['background']}\n\n在对话中,你应该始终保持这个角色的身份和特点,用第一人称回答问题。"
    else:
        system_message = "你是一个智能助手,能够分析图片并回答问题。"
    if image_type and description:
        system_message += f"\n\n这是一张{image_type}的图片。\n描述信息:{description}\n\n请基于这些信息和图片内容回答用户的问题。"

    messages = [{"role": "system", "content": system_message}]
    messages.extend(conversation_history)
    messages.append({
        "role": "user",
        "content": [
            {"type": "text", "text": message},
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
        ]
    })

    headers = {"Content-Type": "application/json", "api-key": self.api_key}
    payload = {
        "messages": messages,
        "max_tokens": 2000,
        "temperature": 0.7,
        "top_p": 0.95,
        "stream": True  # request SSE token deltas instead of a single body
    }
    url = f"{self.endpoint}/openai/deployments/{self.deployment_name}/chat/completions?api-version={self.api_version}"
    # timeout=(connect, read): for streamed responses the read timeout applies
    # between chunks, so long generations still work while a hung connection
    # no longer blocks the caller forever.
    return requests.post(url, headers=headers, json=payload, stream=True, timeout=(10, 60))

View File

@ -178,28 +178,76 @@ function initChatForm() {
chatMessages.scrollTop = chatMessages.scrollHeight; chatMessages.scrollTop = chatMessages.scrollHeight;
try { try {
const response = await fetch('/imgsearcherApi/api/chat', { // 使用流式接口
const response = await fetch('/imgsearcherApi/api/chat-stream', {
method: 'POST', method: 'POST',
headers: { headers: {
'Content-Type': 'application/json' 'Content-Type': 'application/json'
}, },
body: JSON.stringify({ body: JSON.stringify({
message: message message: message,
voice: document.getElementById('voiceSelect') ? document.getElementById('voiceSelect').value : ''
}) })
}); });
const data = await response.json(); if (!response.ok) {
chatMessages.removeChild(typingIndicator);
// 移除正在输入指示器 alert('对话失败');
chatMessages.removeChild(typingIndicator);
if (data.error) {
alert(`对话失败: ${data.error}`);
return; return;
} }
// 添加AI回复到聊天界面 // 创建助手消息占位
addMessage('assistant', data.reply); const assistantElement = document.createElement('div');
assistantElement.className = 'message message-assistant';
const contentDiv = document.createElement('div');
contentDiv.className = 'message-content';
assistantElement.appendChild(contentDiv);
const timeDiv = document.createElement('div');
timeDiv.className = 'message-time';
timeDiv.textContent = getCurrentTime();
assistantElement.appendChild(timeDiv);
chatMessages.appendChild(assistantElement);
chatMessages.scrollTop = chatMessages.scrollHeight;
const reader = response.body.getReader();
const decoder = new TextDecoder('utf-8');
let buf = '';
let streamEnded = false;
while (!streamEnded) {
const { value, done } = await reader.read();
if (done) break;
buf += decoder.decode(value, { stream: true });
let pos;
while ((pos = buf.indexOf('\n\n')) !== -1) {
const raw = buf.slice(0, pos).trim();
buf = buf.slice(pos + 2);
if (!raw) continue;
if (raw.startsWith('event: end')) {
chatMessages.removeChild(typingIndicator);
streamEnded = true;
break;
}
if (!raw.startsWith('data:')) continue;
const payload = raw.slice(5).trim();
if (!payload) continue;
try {
const obj = JSON.parse(payload);
if (obj.type === 'text') {
contentDiv.innerHTML += formatMessage(obj.content);
chatMessages.scrollTop = chatMessages.scrollHeight;
} else if (obj.type === 'audio') {
const audioBytes = Uint8Array.from(atob(obj.content), c => c.charCodeAt(0));
const audioBlob = new Blob([audioBytes], { type: 'audio/wav' });
new Audio(URL.createObjectURL(audioBlob)).play();
}
} catch (e) {
console.error('解析事件失败', e, payload);
}
}
}
} catch (error) { } catch (error) {
// 移除正在输入指示器 // 移除正在输入指示器
@ -226,43 +274,8 @@ async function addMessage(role, content) {
chatMessages.appendChild(messageElement); chatMessages.appendChild(messageElement);
// 如果是AI回复调用TTS并播放
if (role === 'assistant') {
try {
const ttsResp = await fetch('/imgsearcherApi/api/tts', {
method: 'POST',
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify({
text: content
})
});
if (ttsResp.ok) {
const blob = await ttsResp.blob();
const audioUrl = URL.createObjectURL(blob);
const audio = document.createElement('audio');
audio.src = audioUrl;
audio.autoplay = true;
// 可选:提供播放控制按钮
const controls = document.createElement('div');
controls.className = 'audio-controls mt-1';
const playBtn = document.createElement('button');
playBtn.type = 'button';
playBtn.className = 'btn btn-sm btn-outline-secondary';
playBtn.textContent = '🔊 播放';
playBtn.addEventListener('click', () => {
audio.play();
});
controls.appendChild(playBtn);
messageElement.appendChild(controls);
} else {
console.error('TTS 接口错误', await ttsResp.text());
}
} catch (err) {
console.error('TTS 播放失败', err);
}
}
// 滚动到底部 // 滚动到底部
chatMessages.scrollTop = chatMessages.scrollHeight; chatMessages.scrollTop = chatMessages.scrollHeight;

View File

@ -44,6 +44,21 @@
</div> </div>
</div> </div>
</div> </div>
<!-- 语音音色选择 -->
<div class="row">
<div class="col-md-6">
<div class="mb-3">
<label for="voiceSelect" class="form-label">选择音色</label>
<select class="form-select" id="voiceSelect" name="voice">
<option value="zh-CN-XiaoXiao-Assistant-Audio.wav" selected>小小助手</option>
<option value="zh-CN-XiaoXiao-Chat-Audio.wav">小小聊天</option>
<option value="zh-CN-YunYang-CustomerService-Audio.wav">云扬客服</option>
<option value="zh-CN-Yunxi-Boy-Default-Audio.wav">云希男孩</option>
<option value="zh-CN-Yunxi-Narrator-Narrative-Audio.wav">云希旁白</option>
</select>
</div>
</div>
</div>
<div class="d-flex justify-content-between"> <div class="d-flex justify-content-between">
<button type="submit" class="btn btn-primary">上传并开始对话</button> <button type="submit" class="btn btn-primary">上传并开始对话</button>
<a href="/imgsearcherApi/robots" class="btn btn-outline-secondary">管理机器人角色</a> <a href="/imgsearcherApi/robots" class="btn btn-outline-secondary">管理机器人角色</a>