add chat stream and audio response by tts
This commit is contained in:
parent
31581e24a9
commit
e29eced691
125
app.py
125
app.py
@ -5,7 +5,7 @@ import os
|
||||
import json
|
||||
import base64
|
||||
import requests
|
||||
from flask import Flask, render_template, request, jsonify, redirect, url_for, session, Response
|
||||
from flask import Flask, render_template, request, jsonify, redirect, url_for, session, Response, stream_with_context
|
||||
from flask_cors import CORS
|
||||
from werkzeug.utils import secure_filename
|
||||
from app.api.baidu_image_search import BaiduImageSearch
|
||||
@ -457,6 +457,120 @@ def delete_robot(robot_id):
|
||||
except Exception as e:
|
||||
return jsonify({'error': str(e)}), 500
|
||||
|
||||
# ==================== Chat Stream ====================
|
||||
@app.route('/imgsearcherApi/api/chat-stream', methods=['POST'])
|
||||
def chat_stream():
|
||||
"""流式返回文本增量 + 最终语音 (SSE)"""
|
||||
if not azure_openai_api:
|
||||
return jsonify({'error': 'Azure OpenAI API未正确配置'}), 500
|
||||
|
||||
data = request.get_json()
|
||||
if not data or 'message' not in data:
|
||||
return jsonify({'error': '缺少消息内容'}), 400
|
||||
|
||||
message = data['message']
|
||||
image_path = session.get('chat_image_path')
|
||||
robot_id = session.get('robot_id', '')
|
||||
if not image_path or not os.path.exists(image_path):
|
||||
return jsonify({'error': '没有上传图片或图片已失效'}), 400
|
||||
|
||||
conversation_history = session.get('conversation_history', [])
|
||||
robot_info = None
|
||||
if robot_id:
|
||||
robot = robot_manager.get_robot(robot_id)
|
||||
if robot:
|
||||
robot_info = {'name': robot.get('name', ''), 'background': robot.get('background', '')}
|
||||
|
||||
# 语音音色选择
|
||||
voice_wav = data.get('voice') or 'zh-CN-XiaoXiao-Assistant-Audio.wav'
|
||||
# 与Azure流式对话
|
||||
openai_resp = azure_openai_api.chat_with_image_stream(
|
||||
image_path=image_path,
|
||||
message=message,
|
||||
conversation_history=conversation_history,
|
||||
robot_info=robot_info
|
||||
)
|
||||
|
||||
def event_stream():
|
||||
accumulated_text = ""
|
||||
pending = ""
|
||||
punctuation = "。!?.!?"
|
||||
def send_tts(sentence):
|
||||
if not sentence.strip():
|
||||
return
|
||||
tts_params = {
|
||||
'tts_text': sentence,
|
||||
'prompt_wav': voice_wav,
|
||||
'text_split_method': 'cut5'
|
||||
}
|
||||
try:
|
||||
tts_resp = requests.get(
|
||||
"https://cloud.infini-ai.com/AIStudio/v1/inference/api/te-c7zfd4hrdoj2vqmw/tts/CosyVoice/v1/zero_shot",
|
||||
params=tts_params,
|
||||
headers={'Authorization': f'Bearer {os.getenv("INFINI_API_KEY", "sk-daooolufaf7ienn6")}'},
|
||||
stream=True,
|
||||
timeout=30
|
||||
)
|
||||
if tts_resp.status_code == 200:
|
||||
audio_bytes = bytearray()
|
||||
for chunk in tts_resp.iter_content(chunk_size=8192):
|
||||
if chunk:
|
||||
audio_bytes.extend(chunk)
|
||||
if audio_bytes:
|
||||
b64_audio = base64.b64encode(audio_bytes).decode()
|
||||
yield f"data: {{\"type\": \"audio\", \"content\": \"{b64_audio}\" }}\n\n"
|
||||
except Exception as e:
|
||||
print("TTS 调用失败", e)
|
||||
# 读取 GPT 流
|
||||
for line in openai_resp.iter_lines():
|
||||
if not line:
|
||||
continue
|
||||
try:
|
||||
if line.strip() == b'data: [DONE]':
|
||||
break
|
||||
if line.startswith(b'data:'):
|
||||
content_json = json.loads(line[5:].strip())
|
||||
choices = content_json.get('choices', [])
|
||||
if not choices:
|
||||
continue
|
||||
delta = choices[0].get('delta', {}).get('content', '')
|
||||
if not delta:
|
||||
continue
|
||||
accumulated_text += delta
|
||||
pending += delta
|
||||
# 立即发送文本增量
|
||||
yield f"data: {{\"type\": \"text\", \"content\": {json.dumps(delta, ensure_ascii=False)} }}\n\n"
|
||||
# 检测标点
|
||||
while True:
|
||||
idx = -1
|
||||
for i, ch in enumerate(pending):
|
||||
if ch in punctuation:
|
||||
idx = i
|
||||
break
|
||||
if idx == -1:
|
||||
break
|
||||
sentence = pending[:idx+1]
|
||||
pending = pending[idx+1:]
|
||||
# 调用 TTS 并流式发送
|
||||
for audio_evt in send_tts(sentence):
|
||||
yield audio_evt
|
||||
except Exception as e:
|
||||
print("解析OpenAI流错误", e)
|
||||
continue
|
||||
# 处理剩余 pending
|
||||
if pending.strip():
|
||||
for audio_evt in send_tts(pending):
|
||||
yield audio_evt
|
||||
pending = ""
|
||||
# 更新会话历史
|
||||
conversation_history.append({"role": "user", "content": message})
|
||||
conversation_history.append({"role": "assistant", "content": accumulated_text})
|
||||
session['conversation_history'] = conversation_history
|
||||
yield "event: end\ndata: {}\n\n"
|
||||
|
||||
# 保持请求上下文,避免在生成器中操作 session 报错
|
||||
return Response(stream_with_context(event_stream()), content_type='text/event-stream')
|
||||
|
||||
# ==================== TTS ====================
|
||||
@app.route('/imgsearcherApi/api/tts', methods=['POST'])
|
||||
def tts():
|
||||
@ -467,18 +581,19 @@ def tts():
|
||||
|
||||
text = data['text']
|
||||
prompt_text = data.get('prompt_text') or "我是威震天,我只代表月亮消灭你"
|
||||
prompt_wav = data.get('prompt_wav') or "data_workspace/data/workspace_170/我是威震天.wav"
|
||||
prompt_wav = data.get('voice') or data.get('prompt_wav') or "zh-CN-XiaoXiao-Assistant-Audio.wav"
|
||||
|
||||
tts_base_url = "http://180.76.186.85:20099/CosyVoice/v1/zero_shot"
|
||||
tts_base_url = "https://cloud.infini-ai.com/AIStudio/v1/inference/api/te-c7zfd4hrdoj2vqmw/tts/CosyVoice/v1/zero_shot"
|
||||
api_key = os.getenv('INFINI_API_KEY', 'sk-daooolufaf7ienn6')
|
||||
headers = {'Authorization': f'Bearer {api_key}'}
|
||||
params = {
|
||||
'tts_text': text,
|
||||
'prompt_text': prompt_text,
|
||||
'prompt_wav': prompt_wav,
|
||||
'text_split_method': 'cut5'
|
||||
}
|
||||
|
||||
try:
|
||||
r = requests.get(tts_base_url, params=params, timeout=20)
|
||||
r = requests.get(tts_base_url, params=params, headers=headers, timeout=20)
|
||||
if r.status_code == 200:
|
||||
return Response(r.content, content_type=r.headers.get('Content-Type', 'audio/wav'))
|
||||
else:
|
||||
|
||||
@ -172,3 +172,39 @@ class AzureOpenAI:
|
||||
return response.json()
|
||||
else:
|
||||
raise Exception(f"Azure OpenAI API请求失败: {response.text}")
|
||||
|
||||
def chat_with_image_stream(self, image_path, message, conversation_history=None, robot_info=None):
|
||||
"""流式输出 GPT-4o 回复,返回 requests.Response 对象 (stream=True)"""
|
||||
if conversation_history is None:
|
||||
conversation_history = []
|
||||
image_type, description = self._get_image_type_description(image_path)
|
||||
base64_image = self._encode_image(image_path)
|
||||
|
||||
if robot_info and robot_info.get('name') and robot_info.get('background'):
|
||||
system_message = f"你是{robot_info['name']},一个能够分析图片并回答问题的角色。\n\n你的背景故事:{robot_info['background']}\n\n在对话中,你应该始终保持这个角色的身份和特点,用第一人称回答问题。"
|
||||
else:
|
||||
system_message = "你是一个智能助手,能够分析图片并回答问题。"
|
||||
if image_type and description:
|
||||
system_message += f"\n\n这是一张{image_type}的图片。\n描述信息:{description}\n\n请基于这些信息和图片内容回答用户的问题。"
|
||||
|
||||
messages = [{"role": "system", "content": system_message}]
|
||||
for msg in conversation_history:
|
||||
messages.append(msg)
|
||||
messages.append({
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": message},
|
||||
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
|
||||
]
|
||||
})
|
||||
|
||||
headers = {"Content-Type": "application/json", "api-key": self.api_key}
|
||||
payload = {
|
||||
"messages": messages,
|
||||
"max_tokens": 2000,
|
||||
"temperature": 0.7,
|
||||
"top_p": 0.95,
|
||||
"stream": True
|
||||
}
|
||||
url = f"{self.endpoint}/openai/deployments/{self.deployment_name}/chat/completions?api-version={self.api_version}"
|
||||
return requests.post(url, headers=headers, json=payload, stream=True)
|
||||
|
||||
@ -178,28 +178,76 @@ function initChatForm() {
|
||||
chatMessages.scrollTop = chatMessages.scrollHeight;
|
||||
|
||||
try {
|
||||
const response = await fetch('/imgsearcherApi/api/chat', {
|
||||
// 使用流式接口
|
||||
const response = await fetch('/imgsearcherApi/api/chat-stream', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json'
|
||||
},
|
||||
body: JSON.stringify({
|
||||
message: message
|
||||
message: message,
|
||||
voice: document.getElementById('voiceSelect') ? document.getElementById('voiceSelect').value : ''
|
||||
})
|
||||
});
|
||||
|
||||
const data = await response.json();
|
||||
|
||||
// 移除正在输入指示器
|
||||
chatMessages.removeChild(typingIndicator);
|
||||
|
||||
if (data.error) {
|
||||
alert(`对话失败: ${data.error}`);
|
||||
|
||||
if (!response.ok) {
|
||||
chatMessages.removeChild(typingIndicator);
|
||||
alert('对话失败');
|
||||
return;
|
||||
}
|
||||
|
||||
// 添加AI回复到聊天界面
|
||||
addMessage('assistant', data.reply);
|
||||
|
||||
// 创建助手消息占位
|
||||
const assistantElement = document.createElement('div');
|
||||
assistantElement.className = 'message message-assistant';
|
||||
const contentDiv = document.createElement('div');
|
||||
contentDiv.className = 'message-content';
|
||||
assistantElement.appendChild(contentDiv);
|
||||
const timeDiv = document.createElement('div');
|
||||
timeDiv.className = 'message-time';
|
||||
timeDiv.textContent = getCurrentTime();
|
||||
assistantElement.appendChild(timeDiv);
|
||||
chatMessages.appendChild(assistantElement);
|
||||
chatMessages.scrollTop = chatMessages.scrollHeight;
|
||||
|
||||
const reader = response.body.getReader();
|
||||
const decoder = new TextDecoder('utf-8');
|
||||
let buf = '';
|
||||
let streamEnded = false;
|
||||
while (!streamEnded) {
|
||||
const { value, done } = await reader.read();
|
||||
if (done) break;
|
||||
buf += decoder.decode(value, { stream: true });
|
||||
|
||||
let pos;
|
||||
while ((pos = buf.indexOf('\n\n')) !== -1) {
|
||||
const raw = buf.slice(0, pos).trim();
|
||||
buf = buf.slice(pos + 2);
|
||||
if (!raw) continue;
|
||||
|
||||
if (raw.startsWith('event: end')) {
|
||||
chatMessages.removeChild(typingIndicator);
|
||||
streamEnded = true;
|
||||
break;
|
||||
}
|
||||
|
||||
if (!raw.startsWith('data:')) continue;
|
||||
const payload = raw.slice(5).trim();
|
||||
if (!payload) continue;
|
||||
try {
|
||||
const obj = JSON.parse(payload);
|
||||
if (obj.type === 'text') {
|
||||
contentDiv.innerHTML += formatMessage(obj.content);
|
||||
chatMessages.scrollTop = chatMessages.scrollHeight;
|
||||
} else if (obj.type === 'audio') {
|
||||
const audioBytes = Uint8Array.from(atob(obj.content), c => c.charCodeAt(0));
|
||||
const audioBlob = new Blob([audioBytes], { type: 'audio/wav' });
|
||||
new Audio(URL.createObjectURL(audioBlob)).play();
|
||||
}
|
||||
} catch (e) {
|
||||
console.error('解析事件失败', e, payload);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} catch (error) {
|
||||
// 移除正在输入指示器
|
||||
@ -226,43 +274,8 @@ async function addMessage(role, content) {
|
||||
|
||||
chatMessages.appendChild(messageElement);
|
||||
|
||||
// 如果是AI回复,调用TTS并播放
|
||||
if (role === 'assistant') {
|
||||
try {
|
||||
const ttsResp = await fetch('/imgsearcherApi/api/tts', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json'
|
||||
},
|
||||
body: JSON.stringify({
|
||||
text: content
|
||||
})
|
||||
});
|
||||
if (ttsResp.ok) {
|
||||
const blob = await ttsResp.blob();
|
||||
const audioUrl = URL.createObjectURL(blob);
|
||||
const audio = document.createElement('audio');
|
||||
audio.src = audioUrl;
|
||||
audio.autoplay = true;
|
||||
// 可选:提供播放控制按钮
|
||||
const controls = document.createElement('div');
|
||||
controls.className = 'audio-controls mt-1';
|
||||
const playBtn = document.createElement('button');
|
||||
playBtn.type = 'button';
|
||||
playBtn.className = 'btn btn-sm btn-outline-secondary';
|
||||
playBtn.textContent = '🔊 播放';
|
||||
playBtn.addEventListener('click', () => {
|
||||
audio.play();
|
||||
});
|
||||
controls.appendChild(playBtn);
|
||||
messageElement.appendChild(controls);
|
||||
} else {
|
||||
console.error('TTS 接口错误', await ttsResp.text());
|
||||
}
|
||||
} catch (err) {
|
||||
console.error('TTS 播放失败', err);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
// 滚动到底部
|
||||
chatMessages.scrollTop = chatMessages.scrollHeight;
|
||||
|
||||
@ -44,6 +44,21 @@
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<!-- 语音音色选择 -->
|
||||
<div class="row">
|
||||
<div class="col-md-6">
|
||||
<div class="mb-3">
|
||||
<label for="voiceSelect" class="form-label">选择音色</label>
|
||||
<select class="form-select" id="voiceSelect" name="voice">
|
||||
<option value="zh-CN-XiaoXiao-Assistant-Audio.wav" selected>小小助手</option>
|
||||
<option value="zh-CN-XiaoXiao-Chat-Audio.wav">小小聊天</option>
|
||||
<option value="zh-CN-YunYang-CustomerService-Audio.wav">云扬客服</option>
|
||||
<option value="zh-CN-Yunxi-Boy-Default-Audio.wav">云希男孩</option>
|
||||
<option value="zh-CN-Yunxi-Narrator-Narrative-Audio.wav">云希旁白</option>
|
||||
</select>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="d-flex justify-content-between">
|
||||
<button type="submit" class="btn btn-primary">上传并开始对话</button>
|
||||
<a href="/imgsearcherApi/robots" class="btn btn-outline-secondary">管理机器人角色</a>
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user