From d808bbfe265212dbe54e1f67523e15caee4f32c3 Mon Sep 17 00:00:00 2001 From: songjvcheng Date: Sun, 27 Jul 2025 12:11:13 +0800 Subject: [PATCH] =?UTF-8?q?realtime=20=E8=AF=AD=E9=9F=B3=E5=BD=95=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/audio_processor.js | 322 +++++++++++++++++++++++++++++++++++++ src/chat_with_audio.js | 114 ++++++++----- src/config.js | 4 +- src/index - 副本.html | 139 ++++++++++++++++ src/index.js | 107 +++++++------ src/llm_stream.js | 34 +++- src/minimaxi_stream.js | 288 +++++++++++++++++++++++++++++++-- src/new_app.js | 346 ++++++++++++++++++++++++++++++++++++++++ 8 files changed, 1251 insertions(+), 103 deletions(-) create mode 100644 src/audio_processor.js create mode 100644 src/index - 副本.html create mode 100644 src/new_app.js diff --git a/src/audio_processor.js b/src/audio_processor.js new file mode 100644 index 0000000..4734432 --- /dev/null +++ b/src/audio_processor.js @@ -0,0 +1,322 @@ +// 音频处理模块 - 提取自 new_app.js 的高级音频处理功能 + +class AudioProcessor { + constructor(options = {}) { + this.audioContext = null; + this.isRecording = false; + this.audioChunks = []; + + // VAD相关属性 + this.isSpeaking = false; + this.silenceThreshold = options.silenceThreshold || 0.01; + this.silenceTimeout = options.silenceTimeout || 1000; + this.minSpeechDuration = options.minSpeechDuration || 300; + this.silenceTimer = null; + this.speechStartTime = null; + this.audioBuffer = []; + + // API配置 + this.apiConfig = { + url: 'https://openspeech.bytedance.com/api/v3/auc/bigmodel/recognize/flash', + headers: { + 'X-Api-App-Key': '1988591469', + 'X-Api-Access-Key': 'mdEyhgZ59on1-NK3GXWAp3L4iLldSG0r', + 'X-Api-Resource-Id': 'volc.bigasr.auc_turbo', + 'X-Api-Request-Id': this.generateUUID(), + 'X-Api-Sequence': '-1', + 'Content-Type': 'application/json' + } + }; + + // 回调函数 + this.onSpeechStart = options.onSpeechStart || (() => {}); + this.onSpeechEnd = options.onSpeechEnd || (() => {}); + this.onRecognitionResult = options.onRecognitionResult || (() => {}); + this.onError = options.onError || (() => {}); + this.onStatusUpdate = options.onStatusUpdate || (() => {}); + } + + // 生成UUID + generateUUID() { + return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function(c) { + const r = Math.random() * 16 | 0; + const v = c == 'x' ? 
r : (r & 0x3 | 0x8); + return v.toString(16); + }); + } + + // 计算音频能量(音量) + calculateAudioLevel(audioData) { + let sum = 0; + for (let i = 0; i < audioData.length; i++) { + sum += audioData[i] * audioData[i]; + } + return Math.sqrt(sum / audioData.length); + } + + // 语音活动检测 + detectVoiceActivity(audioData) { + const audioLevel = this.calculateAudioLevel(audioData); + const currentTime = Date.now(); + + if (audioLevel > this.silenceThreshold) { + if (!this.isSpeaking) { + this.isSpeaking = true; + this.speechStartTime = currentTime; + this.audioBuffer = []; + this.onSpeechStart(); + this.onStatusUpdate('检测到语音,开始录音...', 'speaking'); + console.log('开始说话'); + } + + if (this.silenceTimer) { + clearTimeout(this.silenceTimer); + this.silenceTimer = null; + } + + return true; + } else { + if (this.isSpeaking && !this.silenceTimer) { + this.silenceTimer = setTimeout(() => { + this.handleSpeechEnd(); + }, this.silenceTimeout); + } + + return this.isSpeaking; + } + } + + // 语音结束处理 + async handleSpeechEnd() { + if (this.isSpeaking) { + const speechDuration = Date.now() - this.speechStartTime; + + if (speechDuration >= this.minSpeechDuration) { + console.log(`语音结束,时长: ${speechDuration}ms`); + await this.processAudioBuffer(); + this.onStatusUpdate('语音识别中...', 'processing'); + } else { + console.log('说话时长太短,忽略'); + this.onStatusUpdate('等待语音输入...', 'ready'); + } + + this.isSpeaking = false; + this.speechStartTime = null; + this.audioBuffer = []; + this.onSpeechEnd(); + } + + if (this.silenceTimer) { + clearTimeout(this.silenceTimer); + this.silenceTimer = null; + } + } + + // 处理音频缓冲区并发送到API + async processAudioBuffer() { + if (this.audioBuffer.length === 0) { + return; + } + + try { + // 合并所有音频数据 + const totalLength = this.audioBuffer.reduce((sum, buffer) => sum + buffer.length, 0); + const combinedBuffer = new Float32Array(totalLength); + let offset = 0; + + for (const buffer of this.audioBuffer) { + combinedBuffer.set(buffer, offset); + offset += buffer.length; + } + + // 转换为WAV格式并编码为base64 + const wavBuffer = this.encodeWAV(combinedBuffer, 16000); + const base64Audio = this.arrayBufferToBase64(wavBuffer); + + // 调用ASR API + await this.callASRAPI(base64Audio); + + } catch (error) { + console.error('处理音频数据失败:', error); + this.onError('处理音频数据失败: ' + error.message); + } + } + + // 调用ASR API + async callASRAPI(base64AudioData) { + try { + const requestBody = { + user: { + uid: "1988591469" + }, + audio: { + data: base64AudioData + }, + request: { + model_name: "bigmodel" + } + }; + + const response = await fetch(this.apiConfig.url, { + method: 'POST', + headers: this.apiConfig.headers, + body: JSON.stringify(requestBody) + }); + + if (!response.ok) { + throw new Error(`HTTP error! 
status: ${response.status}`); + } + + const result = await response.json(); + this.handleASRResponse(result); + + } catch (error) { + console.error('ASR API调用失败:', error); + this.onError('ASR API调用失败: ' + error.message); + } + } + + // 处理ASR响应 + handleASRResponse(response) { + console.log('ASR响应:', response); + + if (response && response.result) { + const recognizedText = response.result.text; + this.onRecognitionResult(recognizedText); + this.onStatusUpdate('识别完成', 'completed'); + } else { + console.log('未识别到文字'); + this.onStatusUpdate('未识别到文字', 'ready'); + } + } + + // 编码WAV格式 + encodeWAV(samples, sampleRate) { + const length = samples.length; + const buffer = new ArrayBuffer(44 + length * 2); + const view = new DataView(buffer); + + // WAV文件头 + const writeString = (offset, string) => { + for (let i = 0; i < string.length; i++) { + view.setUint8(offset + i, string.charCodeAt(i)); + } + }; + + writeString(0, 'RIFF'); + view.setUint32(4, 36 + length * 2, true); + writeString(8, 'WAVE'); + writeString(12, 'fmt '); + view.setUint32(16, 16, true); + view.setUint16(20, 1, true); + view.setUint16(22, 1, true); + view.setUint32(24, sampleRate, true); + view.setUint32(28, sampleRate * 2, true); + view.setUint16(32, 2, true); + view.setUint16(34, 16, true); + writeString(36, 'data'); + view.setUint32(40, length * 2, true); + + // 写入音频数据 + let offset = 44; + for (let i = 0; i < length; i++) { + const sample = Math.max(-1, Math.min(1, samples[i])); + view.setInt16(offset, sample * 0x7FFF, true); + offset += 2; + } + + return buffer; + } + + // ArrayBuffer转Base64 + arrayBufferToBase64(buffer) { + let binary = ''; + const bytes = new Uint8Array(buffer); + for (let i = 0; i < bytes.byteLength; i++) { + binary += String.fromCharCode(bytes[i]); + } + return btoa(binary); + } + + // 开始录音 + async startRecording() { + try { + const stream = await navigator.mediaDevices.getUserMedia({ + audio: { + sampleRate: 16000, + channelCount: 1, + echoCancellation: true, + noiseSuppression: true + } + }); + + this.audioContext = new (window.AudioContext || window.webkitAudioContext)({ + sampleRate: 16000 + }); + + const source = this.audioContext.createMediaStreamSource(stream); + const processor = this.audioContext.createScriptProcessor(4096, 1, 1); + + processor.onaudioprocess = (event) => { + const inputBuffer = event.inputBuffer; + const inputData = inputBuffer.getChannelData(0); + + // 语音活动检测 + if (this.detectVoiceActivity(inputData)) { + // 如果检测到语音活动,缓存音频数据 + this.audioBuffer.push(new Float32Array(inputData)); + } + }; + + source.connect(processor); + processor.connect(this.audioContext.destination); + + this.isRecording = true; + this.onStatusUpdate('等待语音输入...', 'ready'); + + return true; + + } catch (error) { + console.error('启动录音失败:', error); + this.onError('启动录音失败: ' + error.message); + return false; + } + } + + // 停止录音 + stopRecording() { + if (this.audioContext) { + this.audioContext.close(); + this.audioContext = null; + } + + if (this.silenceTimer) { + clearTimeout(this.silenceTimer); + this.silenceTimer = null; + } + + // 如果正在说话,处理最后的音频 + if (this.isSpeaking) { + this.handleSpeechEnd(); + } + + this.isRecording = false; + this.isSpeaking = false; + this.audioBuffer = []; + + this.onStatusUpdate('录音已停止', 'stopped'); + console.log('录音已停止'); + } + + // 获取录音状态 + getRecordingStatus() { + return { + isRecording: this.isRecording, + isSpeaking: this.isSpeaking, + hasAudioContext: !!this.audioContext + }; + } +} + +// 导出模块 +export { AudioProcessor }; \ No newline at end of file diff --git 
a/src/chat_with_audio.js b/src/chat_with_audio.js index bbe0d4d..6a0e6f8 100644 --- a/src/chat_with_audio.js +++ b/src/chat_with_audio.js @@ -6,6 +6,9 @@ import { getLLMConfig, getMinimaxiConfig, getAudioConfig, validateConfig } from // 防止重复播放的标志 let isPlaying = false; +// 音频播放队列 +let audioQueue = []; +let isProcessingQueue = false; async function chatWithAudioStream(userInput) { // 验证配置 @@ -20,7 +23,48 @@ async function chatWithAudioStream(userInput) { const minimaxiConfig = getMinimaxiConfig(); const audioConfig = getAudioConfig(); - // 1. 请求大模型回答 + // 清空音频队列 + audioQueue = []; + + // 定义段落处理函数 + const handleSegment = async (segment) => { + console.log('\n=== 处理文本段落 ==='); + console.log('段落内容:', segment); + + try { + // 为每个段落生成音频 + const audioResult = await requestMinimaxi({ + apiKey: minimaxiConfig.apiKey, + groupId: minimaxiConfig.groupId, + body: { + model: audioConfig.model, + text: segment, + stream: audioConfig.stream, + language_boost: audioConfig.language_boost, + output_format: audioConfig.output_format, + voice_setting: audioConfig.voiceSetting, + audio_setting: audioConfig.audioSetting, + }, + stream: true, + }); + + // 将音频添加到播放队列 + if (audioResult && audioResult.data && audioResult.data.audio) { + audioQueue.push({ + text: segment, + audioHex: audioResult.data.audio + }); + console.log('音频已添加到队列,队列长度:', audioQueue.length); + + // 开始处理队列 + processAudioQueue(); + } + } catch (error) { + console.error('生成音频失败:', error); + } + }; + + // 1. 请求大模型回答,并实时处理段落 console.log('\n=== 请求大模型回答 ==='); const llmResponse = await requestLLMStream({ apiKey: llmConfig.apiKey, @@ -29,55 +73,45 @@ async function chatWithAudioStream(userInput) { { role: 'system', content: 'You are a helpful assistant.' }, { role: 'user', content: userInput }, ], + onSegment: handleSegment // 传入段落处理回调 }); - // 提取大模型回答内容(现在直接返回内容) - const llmContent = llmResponse; - - console.log('\n=== 大模型回答 ==='); - console.log("llmResponse: ", llmContent); - - // 2. 合成音频 - console.log('\n=== 开始合成音频 ==='); - const audioResult = await requestMinimaxi({ - apiKey: minimaxiConfig.apiKey, - groupId: minimaxiConfig.groupId, - body: { - model: audioConfig.model, - text: llmContent, - stream: audioConfig.stream, - language_boost: audioConfig.language_boost, - output_format: audioConfig.output_format, - voice_setting: audioConfig.voiceSetting, - audio_setting: audioConfig.audioSetting, - }, - stream: true, - }); - - // 3. 
流式播放音频 - console.log('\n=== 开始流式播放音频 ==='); - // console.log('音频数据长度:', audioResult.data.audio.length); - await playAudioStream(audioResult.data.audio); + console.log('\n=== 大模型完整回答 ==='); + console.log("llmResponse: ", llmResponse); return { userInput, - llmResponse: llmContent, - audioResult, + llmResponse, + audioQueue: audioQueue.map(item => ({ text: item.text, hasAudio: !!item.audioHex })) }; } +// 处理音频播放队列 +async function processAudioQueue() { + if (isProcessingQueue) return; + + isProcessingQueue = true; + + // while (audioQueue.length > 0) { + // const audioItem = audioQueue.shift(); + // console.log('\n=== 播放队列中的音频 ==='); + // console.log('文本:', audioItem.text); + + // try { + // await playAudioStream(audioItem.audioHex); + // } catch (error) { + // console.error('播放音频失败:', error); + // } + // } + + isProcessingQueue = false; +} + // 流式播放音频 async function playAudioStream(audioHex) { - if (isPlaying) { - console.log('音频正在播放中,跳过重复播放'); - return; - } - console.log('=== 开始播放音频 ==='); console.log('音频数据长度:', audioHex.length); - isPlaying = true; - // 将hex转换为ArrayBuffer const audioBuffer = hexToArrayBuffer(audioHex); @@ -102,13 +136,11 @@ async function playAudioStream(audioHex) { return new Promise((resolve) => { source.onended = () => { console.log('音频播放完成'); - isPlaying = false; resolve(); }; }); } catch (error) { console.error('音频播放失败:', error); - isPlaying = false; throw error; } } @@ -175,4 +207,6 @@ async function playAudioStreamNode(audioHex) { } } -export { chatWithAudioStream, playAudioStream, playAudioStreamNode }; \ No newline at end of file + + +export { chatWithAudioStream, playAudioStream, playAudioStreamNode}; \ No newline at end of file diff --git a/src/config.js b/src/config.js index 8cb236c..9d4b477 100644 --- a/src/config.js +++ b/src/config.js @@ -16,11 +16,11 @@ export const config = { audio: { model: 'speech-02-hd', voiceSetting: { - voice_id: 'yantu-qinggang', + voice_id: 'yantu-qinggang-2', speed: 1, vol: 1, pitch: 0, - emotion: 'happy', + // emotion: 'happy', }, audioSetting: { sample_rate: 32000, diff --git a/src/index - 副本.html b/src/index - 副本.html new file mode 100644 index 0000000..c8bb39e --- /dev/null +++ b/src/index - 副本.html @@ -0,0 +1,139 @@ + + + + + + 实时语音识别 + + + +
+ [HTML markup lost in extraction; the recoverable page text of "src/index - 副本.html" follows]
+ 实时语音识别
+ 未连接
+ 使用说明:
+ 1. 点击"开始录音"按钮开启麦克风
+ 2. 系统会自动检测您的语音,只有在检测到说话时才开始录音
+ 3. 说话结束后会自动发送音频进行识别
+ 4. 识别结果会显示在下方区域
+ 识别结果:
+ + + + \ No newline at end of file diff --git a/src/index.js b/src/index.js index 86c0698..ec15aa5 100644 --- a/src/index.js +++ b/src/index.js @@ -1,5 +1,6 @@ // WebRTC 音视频通话应用 import { chatWithAudioStream } from './chat_with_audio.js'; +import { AudioProcessor } from './audio_processor.js'; class WebRTCChat { constructor() { @@ -15,6 +16,30 @@ class WebRTCChat { this.videoStreams = new Map(); // 存储不同视频的MediaStream this.currentVideoStream = null; + // 初始化音频处理器 + this.audioProcessor = new AudioProcessor({ + onSpeechStart: () => { + this.voiceStatus.textContent = '检测到语音,开始录音...'; + this.logMessage('检测到语音,开始录音...', 'info'); + }, + onSpeechEnd: () => { + // 语音结束回调 + }, + onRecognitionResult: (text) => { + // ASRTEXT = text; + this.voiceStatus.textContent = '识别完成'; + this.logMessage(`语音识别结果: ${text}`, 'success'); + this.handleVoiceInput(text); + }, + onError: (error) => { + this.voiceStatus.textContent = '识别失败'; + this.logMessage(error, 'error'); + }, + onStatusUpdate: (message, status) => { + this.voiceStatus.textContent = message; + } + }); + this.initializeElements(); this.initializeSocket(); this.loadVideoMapping(); @@ -627,65 +652,34 @@ class WebRTCChat { }); } + // 修改:使用音频处理器的语音录制功能 async startVoiceRecording() { - try { - const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); - this.mediaRecorder = new MediaRecorder(stream); - this.audioChunks = []; - - this.mediaRecorder.ondataavailable = (event) => { - this.audioChunks.push(event.data); - }; - - this.mediaRecorder.onstop = () => { - const audioBlob = new Blob(this.audioChunks, { type: 'audio/wav' }); - this.processVoiceInput(audioBlob); - }; - - this.mediaRecorder.start(); - this.isRecording = true; - + const success = await this.audioProcessor.startRecording(); + + if (success) { this.startVoiceButton.disabled = true; this.stopVoiceButton.disabled = false; - this.voiceStatus.textContent = '正在录音...'; this.startVoiceButton.classList.add('recording'); - - this.logMessage('开始语音录制', 'info'); - } catch (error) { - this.logMessage('无法访问麦克风: ' + error.message, 'error'); + this.voiceStatus.textContent = '等待语音输入...'; + this.logMessage('高级语音录制已启动', 'success'); + } else { + this.voiceStatus.textContent = '录音启动失败'; } } + // 修改:停止语音录制 stopVoiceRecording() { - if (this.mediaRecorder && this.isRecording) { - this.mediaRecorder.stop(); - this.isRecording = false; - - this.startVoiceButton.disabled = false; - this.stopVoiceButton.disabled = true; - this.voiceStatus.textContent = '点击开始语音输入'; - this.startVoiceButton.classList.remove('recording'); - - this.logMessage('停止语音录制', 'info'); - } - } - - async processVoiceInput(audioBlob) { - // 这里可以集成语音识别API,如Web Speech API或第三方服务 - // 为了演示,我们使用一个简单的模拟识别 - const mockText = this.simulateSpeechRecognition(); - - this.socket.emit('voice-input', { - audioData: audioBlob, - text: mockText - }); - - this.logMessage(`语音识别结果: ${mockText}`, 'info'); - - // 根据语音识别结果切换视频流 - await this.handleVoiceInput(mockText); + this.audioProcessor.stopRecording(); + + this.startVoiceButton.disabled = false; + this.stopVoiceButton.disabled = true; + this.startVoiceButton.classList.remove('recording'); + this.voiceStatus.textContent = '点击开始语音输入'; + + this.logMessage('语音录制已停止', 'info'); } + // 处理语音输入结果 async handleVoiceInput(text) { // 根据文本查找对应视频 let videoFile = this.videoMapping['默认'] || this.defaultVideo; @@ -705,8 +699,21 @@ class WebRTCChat { type: 'voice', text }); + + // 调用大模型处理 + try { + this.logMessage('正在处理语音输入,请稍候...', 'info'); + const result = await chatWithAudioStream(text); + this.logMessage(`大模型回答: 
${result.llmResponse}`, 'success'); + } catch (error) { + this.logMessage(`处理语音输入失败: ${error.message}`, 'error'); + console.error('chatWithAudioStream error:', error); + } } + // 删除原有的简单音频处理方法 + // processVoiceInput() 和 simulateSpeechRecognition() 方法已被移除 + simulateSpeechRecognition() { // 模拟语音识别,随机返回预设的文本 const texts = ['你好', '再见', '谢谢', 'hello', 'goodbye', 'thank you']; @@ -776,4 +783,4 @@ class WebRTCChat { // 页面加载完成后初始化应用 document.addEventListener('DOMContentLoaded', () => { new WebRTCChat(); -}); \ No newline at end of file +}); \ No newline at end of file diff --git a/src/llm_stream.js b/src/llm_stream.js index 0308340..d096134 100644 --- a/src/llm_stream.js +++ b/src/llm_stream.js @@ -1,6 +1,6 @@ // 以流式方式请求LLM大模型接口,并打印流式返回内容 -async function requestLLMStream({ apiKey, model, messages }) { +async function requestLLMStream({ apiKey, model, messages, onSegment }) { const response = await fetch('https://ark.cn-beijing.volces.com/api/v3/bots/chat/completions', { method: 'POST', headers: { @@ -26,6 +26,10 @@ async function requestLLMStream({ apiKey, model, messages }) { let done = false; let buffer = ''; let content = ''; + let pendingText = ''; // 待处理的文本片段 + + // 分段分隔符 + const segmentDelimiters = /[,。:;!?,.:;!?]/; while (!done) { const { value, done: doneReading } = await reader.read(); @@ -47,6 +51,10 @@ async function requestLLMStream({ apiKey, model, messages }) { if (jsonStr === '[DONE]') { console.log('LLM SSE流结束'); + // 处理最后的待处理文本 + if (pendingText.trim() && onSegment) { + await onSegment(pendingText.trim()); + } continue; } @@ -55,7 +63,29 @@ async function requestLLMStream({ apiKey, model, messages }) { if (obj.choices && obj.choices[0] && obj.choices[0].delta && obj.choices[0].delta.content) { const deltaContent = obj.choices[0].delta.content; content += deltaContent; + pendingText += deltaContent; console.log('LLM内容片段:', deltaContent); + + // 检查是否包含分段分隔符 + if (segmentDelimiters.test(pendingText)) { + // 按分隔符分割文本 + const segments = pendingText.split(segmentDelimiters); + + // 处理完整的段落(除了最后一个,因为可能不完整) + for (let i = 0; i < segments.length - 1; i++) { + const segment = segments[i].trim(); + if (segment && onSegment) { + // 找到对应的分隔符 + const delimiterMatch = pendingText.match(segmentDelimiters); + const segmentWithDelimiter = segment + (delimiterMatch ? 
delimiterMatch[0] : ''); + console.log('检测到完整段落:', segmentWithDelimiter); + await onSegment(segmentWithDelimiter); + } + } + + // 保留最后一个不完整的段落 + pendingText = segments[segments.length - 1] || ''; + } } } catch (e) { console.error('解析LLM SSE数据失败:', e, '原始数据:', jsonStr); @@ -72,4 +102,4 @@ async function requestLLMStream({ apiKey, model, messages }) { return content; } -export { requestLLMStream }; \ No newline at end of file +export { requestLLMStream }; \ No newline at end of file diff --git a/src/minimaxi_stream.js b/src/minimaxi_stream.js index cc3b369..a1eac7a 100644 --- a/src/minimaxi_stream.js +++ b/src/minimaxi_stream.js @@ -1,5 +1,135 @@ // 以流式或非流式方式请求 minimaxi 大模型接口,并打印/返回内容 +// 在文件顶部添加音频播放相关的变量和函数 +let audioContext = null; +let audioQueue = []; // 音频队列 +let isPlaying = false; +let isProcessingQueue = false; // 队列处理状态 +let nextStartTime = 0; // 添加这行来声明 nextStartTime 变量 + +// 初始化音频上下文 +function initAudioContext() { + if (!audioContext) { + audioContext = new (window.AudioContext || window.webkitAudioContext)(); + } + return audioContext; +} + +// 将hex字符串转换为ArrayBuffer +function hexToArrayBuffer(hex) { + const bytes = new Uint8Array(hex.length / 2); + for (let i = 0; i < hex.length; i += 2) { + bytes[i / 2] = parseInt(hex.substr(i, 2), 16); + } + return bytes.buffer; +} + +// 将音频添加到队列(不等待播放) +async function addAudioToQueue(audioHex) { + if (!audioHex || audioHex.length === 0) return; + + try { + const ctx = initAudioContext(); + const audioBuffer = hexToArrayBuffer(audioHex); + const audioData = await ctx.decodeAudioData(audioBuffer); + + // 将解码后的音频数据添加到队列 + audioQueue.push({ + audioData, + timestamp: Date.now() + }); + + console.log(`音频已添加到队列,队列长度: ${audioQueue.length}`); + + // 启动队列处理器(如果还没有运行) + if (!isProcessingQueue) { + processAudioQueue(); + } + + } catch (error) { + console.error('音频解码失败:', error); + } +} + +// 队列处理器 - 独立运行,按顺序播放音频 +async function processAudioQueue() { + if (isProcessingQueue) return; + + isProcessingQueue = true; + console.log('开始处理音频队列'); + + while (audioQueue.length > 0 || isPlaying) { + // 如果当前没有音频在播放,且队列中有音频 + if (!isPlaying && audioQueue.length > 0) { + const audioItem = audioQueue.shift(); + await playAudioData(audioItem.audioData); + } else { + // 等待一小段时间再检查 + await new Promise(resolve => setTimeout(resolve, 50)); + } + } + + isProcessingQueue = false; + console.log('音频队列处理完成'); +} + +// 播放单个音频数据 +function playAudioData(audioData) { + return new Promise((resolve) => { + try { + const ctx = initAudioContext(); + const source = ctx.createBufferSource(); + source.buffer = audioData; + source.connect(ctx.destination); + + isPlaying = true; + + source.onended = () => { + console.log('音频片段播放完成'); + isPlaying = false; + resolve(); + }; + + // 超时保护 + setTimeout(() => { + if (isPlaying) { + console.log('音频播放超时,强制结束'); + isPlaying = false; + resolve(); + } + }, (audioData.duration + 0.5) * 1000); + + source.start(0); + console.log(`开始播放音频片段,时长: ${audioData.duration}秒`); + + } catch (error) { + console.error('播放音频失败:', error); + isPlaying = false; + resolve(); + } + }); +} + +// 修改原来的playAudioChunk函数,改为addAudioToQueue +const playAudioChunk = addAudioToQueue; + +// 清空音频队列 +function clearAudioQueue() { + audioQueue.length = 0; + console.log('音频队列已清空'); +} + +// 获取队列状态 +function getQueueStatus() { + return { + queueLength: audioQueue.length, + isPlaying, + isProcessingQueue + }; +} + +// 移除waitForCurrentAudioToFinish函数,不再需要 + async function requestMinimaxi({ apiKey, groupId, body, stream = true }) { const url = `https://api.minimaxi.com/v1/t2a_v2`; const reqBody = 
{ ...body, stream }; @@ -24,13 +154,19 @@ async function requestMinimaxi({ apiKey, groupId, body, stream = true }) { console.log(JSON.stringify(result, null, 2)); return result; } else { - // 流式,解析每个chunk,合并audio + // 流式,解析每个chunk,实时播放音频 const reader = response.body.getReader(); const decoder = new TextDecoder('utf-8'); let done = false; let buffer = ''; let audioHex = ''; let lastFullResult = null; + + // 重置播放状态 + nextStartTime = 0; + if (audioContext) { + nextStartTime = audioContext.currentTime; + } while (!done) { const { value, done: doneReading } = await reader.read(); @@ -38,19 +174,16 @@ async function requestMinimaxi({ apiKey, groupId, body, stream = true }) { if (value) { const chunk = decoder.decode(value, { stream: true }); buffer += chunk; - // console.log('收到原始chunk:', chunk); // 处理SSE格式的数据(以\n分割) let lines = buffer.split('\n'); buffer = lines.pop(); // 最后一行可能是不完整的,留到下次 for (const line of lines) { if (!line.trim()) continue; - // console.log('处理行:', line); // 检查是否是SSE格式的数据行 if (line.startsWith('data:')) { const jsonStr = line.substring(6); // 移除 'data: ' 前缀 - // console.log('提取的JSON字符串:', jsonStr); if (jsonStr.trim() === '[DONE]') { console.log('SSE流结束'); @@ -59,17 +192,19 @@ async function requestMinimaxi({ apiKey, groupId, body, stream = true }) { try { const obj = JSON.parse(jsonStr); - // 流式,解析每个chunk,合并audio - if (obj.data && obj.data.audio) { + // 流式,解析每个chunk,实时播放音频 + if (obj.data && obj.data.audio && obj.data.status === 1) { + console.log('收到音频数据片段!', obj.data.audio.length); audioHex += obj.data.audio; + + // 立即播放这个音频片段 + await playAudioChunk(obj.data.audio); } // status=2为最后一个chunk,记录完整结构 if (obj.data && obj.data.status === 2) { lastFullResult = obj; console.log('收到最终状态'); } - // 实时打印每个chunk - console.log('解析成功:', JSON.stringify(obj)); } catch (e) { console.error('解析SSE数据失败:', e, '原始数据:', jsonStr); } @@ -83,7 +218,11 @@ async function requestMinimaxi({ apiKey, groupId, body, stream = true }) { try { const obj = JSON.parse(line); if (obj.data && obj.data.audio) { + console.log('收到无data:音频数据!', obj.data.audio.length); audioHex += obj.data.audio; + + // 立即播放这个音频片段 + await playAudioChunk(obj.data.audio); } if (obj.data && obj.data.status === 2) { lastFullResult = obj; @@ -109,4 +248,135 @@ async function requestMinimaxi({ apiKey, groupId, body, stream = true }) { } } -export { requestMinimaxi }; \ No newline at end of file +// 火山引擎TTS方法 +async function requestVolcanTTS({ + appId, + accessKey, + resourceId = 'volc.service_type.10029', + appKey = 'aGjiRDfUWi', + body, + stream = true +}) { + const url = 'https://openspeech.bytedance.com/api/v3/tts/unidirectional'; + + // 生成请求ID + const requestId = generateUUID(); + + const response = await fetch(url, { + method: 'POST', + headers: { + 'X-Api-App-Id': appId, + 'X-Api-Access-Key': accessKey, + 'X-Api-Resource-Id': resourceId, + 'X-Api-App-Key': appKey, + 'X-Api-Request-Id': requestId, + 'Content-Type': 'application/json', + 'Accept': stream ? 'text/event-stream' : 'application/json', + 'Cache-Control': 'no-cache', + }, + body: JSON.stringify(body), + }); + + if (!response.ok) { + throw new Error(`HTTP error! 
status: ${response.status}`); + } + + if (!stream) { + // 非流式,直接返回JSON + const result = await response.json(); + console.log('火山引擎TTS非流式结果:', JSON.stringify(result, null, 2)); + return result; + } else { + // 流式,解析每个chunk,合并audio + const reader = response.body.getReader(); + const decoder = new TextDecoder('utf-8'); + let done = false; + let buffer = ''; + let audioBase64 = ''; + let lastFullResult = null; + + while (!done) { + const { value, done: doneReading } = await reader.read(); + done = doneReading; + if (value) { + const chunk = decoder.decode(value, { stream: true }); + buffer += chunk; + + // 处理SSE格式的数据(以\n分割) + let lines = buffer.split('\n'); + buffer = lines.pop(); // 最后一行可能是不完整的,留到下次 + + for (const line of lines) { + if (!line.trim()) continue; + + // 检查是否是SSE格式的数据行 + if (line.startsWith('data:')) { + const jsonStr = line.substring(6); // 移除 'data: ' 前缀 + + if (jsonStr.trim() === '[DONE]') { + console.log('火山引擎TTS流结束'); + continue; + } + + try { + const obj = JSON.parse(jsonStr); + // 流式,解析每个chunk,合并audio base64数据 + if (obj.data) { + audioBase64 += obj.data; + lastFullResult = obj; + } + // 实时打印每个chunk + console.log('火山引擎TTS解析成功:', JSON.stringify(obj)); + } catch (e) { + console.error('解析火山引擎TTS数据失败:', e, '原始数据:', jsonStr); + } + } else if (line.startsWith('event: ') || line.startsWith('id: ') || line.startsWith('retry: ')) { + // 忽略SSE的其他字段 + console.log('忽略SSE字段:', line); + continue; + } else if (line.trim() && !line.startsWith('data:')) { + // 尝试直接解析(兼容非SSE格式) + try { + const obj = JSON.parse(line); + if (obj.data) { + audioBase64 += obj.data; + lastFullResult = obj; + } + console.log('火山引擎TTS直接解析成功:', JSON.stringify(obj)); + } catch (e) { + console.error('解析火山引擎TTS chunk失败:', e, line); + } + } + } + } + } + + // 合成最终结构 + console.log('火山引擎TTS音频数据总长度:', audioBase64.length); + + if (lastFullResult) { + // 更新最终结果的音频数据 + lastFullResult.data = audioBase64; + console.log('火山引擎TTS最终合成结果:', JSON.stringify(lastFullResult, null, 2)); + return lastFullResult; + } else { + // 没有完整结构,返回合成的audio + return { + code: 0, + message: '', + data: audioBase64 + }; + } + } +} + +// 生成UUID的辅助函数 +function generateUUID() { + return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function(c) { + const r = Math.random() * 16 | 0; + const v = c === 'x' ? 
r : (r & 0x3 | 0x8); + return v.toString(16); + }); +} + +export { requestMinimaxi, requestVolcanTTS }; \ No newline at end of file diff --git a/src/new_app.js b/src/new_app.js new file mode 100644 index 0000000..dcb730b --- /dev/null +++ b/src/new_app.js @@ -0,0 +1,346 @@ +let ASRTEXT = '' + +class HttpASRRecognizer { + constructor() { + this.mediaRecorder = null; + this.audioContext = null; + this.isRecording = false; + this.audioChunks = []; + + // VAD相关属性 + this.isSpeaking = false; + this.silenceThreshold = 0.01; + this.silenceTimeout = 1000; + this.minSpeechDuration = 300; + this.silenceTimer = null; + this.speechStartTime = null; + this.audioBuffer = []; + + // API配置 + this.apiConfig = { + url: 'https://openspeech.bytedance.com/api/v3/auc/bigmodel/recognize/flash', + headers: { + 'X-Api-App-Key': '1988591469', + 'X-Api-Access-Key': 'mdEyhgZ59on1-NK3GXWAp3L4iLldSG0r', + 'X-Api-Resource-Id': 'volc.bigasr.auc_turbo', + 'X-Api-Request-Id': this.generateUUID(), + 'X-Api-Sequence': '-1', + 'Content-Type': 'application/json' + } + }; + + this.recordBtn = document.getElementById('startVoiceButton'); + this.statusDiv = document.getElementById('status'); + this.resultsDiv = document.getElementById('results'); + + this.initEventListeners(); + } + + initEventListeners() { + this.recordBtn.addEventListener('click', () => { + if (this.isRecording) { + this.stopRecording(); + } else { + this.startRecording(); + } + }); + } + + // 生成UUID + generateUUID() { + return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function(c) { + const r = Math.random() * 16 | 0; + const v = c == 'x' ? r : (r & 0x3 | 0x8); + return v.toString(16); + }); + } + + // 计算音频能量(音量) + calculateAudioLevel(audioData) { + let sum = 0; + for (let i = 0; i < audioData.length; i++) { + sum += audioData[i] * audioData[i]; + } + return Math.sqrt(sum / audioData.length); + } + + // 语音活动检测 + detectVoiceActivity(audioData) { + const audioLevel = this.calculateAudioLevel(audioData); + const currentTime = Date.now(); + + if (audioLevel > this.silenceThreshold) { + if (!this.isSpeaking) { + this.isSpeaking = true; + this.speechStartTime = currentTime; + this.audioBuffer = []; + this.updateStatus('检测到语音,开始录音...', 'speaking'); + console.log('开始说话'); + } + + if (this.silenceTimer) { + clearTimeout(this.silenceTimer); + this.silenceTimer = null; + } + + return true; + } else { + if (this.isSpeaking && !this.silenceTimer) { + this.silenceTimer = setTimeout(() => { + this.onSpeechEnd(); + }, this.silenceTimeout); + } + + return this.isSpeaking; + } + } + + // 语音结束处理 + async onSpeechEnd() { + if (this.isSpeaking) { + const speechDuration = Date.now() - this.speechStartTime; + + if (speechDuration >= this.minSpeechDuration) { + console.log(`语音结束,时长: ${speechDuration}ms`); + await this.processAudioBuffer(); + // this.updateStatus('语音识别中...', 'processing'); + console.log('语音识别中') + } else { + console.log('说话时长太短,忽略'); + // this.updateStatus('等待语音输入...', 'ready'); + console.log('等待语音输入...') + + } + + this.isSpeaking = false; + this.speechStartTime = null; + this.audioBuffer = []; + } + + if (this.silenceTimer) { + clearTimeout(this.silenceTimer); + this.silenceTimer = null; + } + } + + // 处理音频缓冲区并发送到API + async processAudioBuffer() { + if (this.audioBuffer.length === 0) { + return; + } + + try { + // 合并所有音频数据 + const totalLength = this.audioBuffer.reduce((sum, buffer) => sum + buffer.length, 0); + const combinedBuffer = new Float32Array(totalLength); + let offset = 0; + + for (const buffer of this.audioBuffer) { + combinedBuffer.set(buffer, 
offset); + offset += buffer.length; + } + + // 转换为WAV格式并编码为base64 + const wavBuffer = this.encodeWAV(combinedBuffer, 16000); + const base64Audio = this.arrayBufferToBase64(wavBuffer); + + // 调用ASR API + await this.callASRAPI(base64Audio); + + } catch (error) { + console.error('处理音频数据失败:', error); + this.updateStatus('识别失败', 'error'); + } + } + + // 调用ASR API + async callASRAPI(base64AudioData) { + try { + const requestBody = { + user: { + uid: "1988591469" + }, + audio: { + data: base64AudioData + }, + request: { + model_name: "bigmodel" + } + }; + + const response = await fetch(this.apiConfig.url, { + method: 'POST', + headers: this.apiConfig.headers, + body: JSON.stringify(requestBody) + }); + + if (!response.ok) { + throw new Error(`HTTP error! status: ${response.status}`); + } + + const result = await response.json(); + this.handleASRResponse(result); + + } catch (error) { + console.error('ASR API调用失败:', error); + this.updateStatus('API调用失败', 'error'); + } + } + + // 处理ASR响应 + handleASRResponse(response) { + console.log('ASR响应:', response); + + if (response && response.data && response.data.result) { + ASRTEXT = response.data.result; + // this.displayResult(text); + // this.updateStatus('识别完成', 'completed'); + console.log('识别完成') + } else { + console.log('未识别到文字'); + // this.updateStatus('未识别到文字', 'ready'); + + } + } + + // 显示识别结果 + displayResult(text) { + const resultElement = document.createElement('div'); + resultElement.className = 'result-item'; + resultElement.innerHTML = ` + ${new Date().toLocaleTimeString()} + ${text} + `; + this.resultsDiv.appendChild(resultElement); + this.resultsDiv.scrollTop = this.resultsDiv.scrollHeight; + } + + // 更新状态显示 + updateStatus(message, status) { + this.statusDiv.textContent = message; + this.statusDiv.className = `status ${status}`; + } + + // 编码WAV格式 + encodeWAV(samples, sampleRate) { + const length = samples.length; + const buffer = new ArrayBuffer(44 + length * 2); + const view = new DataView(buffer); + + // WAV文件头 + const writeString = (offset, string) => { + for (let i = 0; i < string.length; i++) { + view.setUint8(offset + i, string.charCodeAt(i)); + } + }; + + writeString(0, 'RIFF'); + view.setUint32(4, 36 + length * 2, true); + writeString(8, 'WAVE'); + writeString(12, 'fmt '); + view.setUint32(16, 16, true); + view.setUint16(20, 1, true); + view.setUint16(22, 1, true); + view.setUint32(24, sampleRate, true); + view.setUint32(28, sampleRate * 2, true); + view.setUint16(32, 2, true); + view.setUint16(34, 16, true); + writeString(36, 'data'); + view.setUint32(40, length * 2, true); + + // 写入音频数据 + let offset = 44; + for (let i = 0; i < length; i++) { + const sample = Math.max(-1, Math.min(1, samples[i])); + view.setInt16(offset, sample * 0x7FFF, true); + offset += 2; + } + + return buffer; + } + + // ArrayBuffer转Base64 + arrayBufferToBase64(buffer) { + let binary = ''; + const bytes = new Uint8Array(buffer); + for (let i = 0; i < bytes.byteLength; i++) { + binary += String.fromCharCode(bytes[i]); + } + return btoa(binary); + } + + async startRecording() { + try { + const stream = await navigator.mediaDevices.getUserMedia({ + audio: { + sampleRate: 16000, + channelCount: 1, + echoCancellation: true, + noiseSuppression: true + } + }); + + this.audioContext = new (window.AudioContext || window.webkitAudioContext)({ + sampleRate: 16000 + }); + + const source = this.audioContext.createMediaStreamSource(stream); + const processor = this.audioContext.createScriptProcessor(4096, 1, 1); + + processor.onaudioprocess = (event) => { + const 
inputBuffer = event.inputBuffer; + const inputData = inputBuffer.getChannelData(0); + + // 语音活动检测 + if (this.detectVoiceActivity(inputData)) { + // 如果检测到语音活动,缓存音频数据 + this.audioBuffer.push(new Float32Array(inputData)); + } + }; + + source.connect(processor); + processor.connect(this.audioContext.destination); + + this.isRecording = true; + this.recordBtn.textContent = '停止录音'; + this.recordBtn.className = 'btn recording'; + // this.updateStatus('等待语音输入...', 'ready'); + + } catch (error) { + console.error('启动录音失败:', error); + // this.updateStatus('录音启动失败', 'error'); + } + } + + stopRecording() { + if (this.audioContext) { + this.audioContext.close(); + this.audioContext = null; + } + + if (this.silenceTimer) { + clearTimeout(this.silenceTimer); + this.silenceTimer = null; + } + + // 如果正在说话,处理最后的音频 + if (this.isSpeaking) { + this.onSpeechEnd(); + } + + this.isRecording = false; + this.isSpeaking = false; + this.audioBuffer = []; + + this.recordBtn.textContent = '开始录音'; + this.recordBtn.className = 'btn'; + console.log('录音已停止'); + // this.updateStatus('录音已停止', 'stopped'); + } +} + +// 初始化应用 +document.addEventListener('DOMContentLoaded', () => { + const asrRecognizer = new HttpASRRecognizer(); + console.log('HTTP ASR识别器已初始化'); +}); \ No newline at end of file
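
For reference, a minimal sketch of how the AudioProcessor module added in src/audio_processor.js is intended to be wired up. It uses only the constructor options, callbacks, and methods this patch defines; the element IDs and the chatWithAudioStream hookup mirror src/index.js, and anything else here is an assumption, not part of the patch.

// Sketch only — wiring AudioProcessor to the LLM + TTS pipeline.
import { AudioProcessor } from './audio_processor.js';
import { chatWithAudioStream } from './chat_with_audio.js';

const statusEl = document.getElementById('status'); // assumed element ID

const processor = new AudioProcessor({
  silenceThreshold: 0.01,  // RMS energy below this counts as silence
  silenceTimeout: 1000,    // ms of silence that ends an utterance
  minSpeechDuration: 300,  // utterances shorter than this are ignored
  onStatusUpdate: (message) => { statusEl.textContent = message; },
  onRecognitionResult: async (text) => {
    // Recognized speech feeds straight into the chat pipeline.
    const result = await chatWithAudioStream(text);
    console.log('LLM answer:', result.llmResponse);
  },
  onError: (error) => console.error(error),
});

document.getElementById('startVoiceButton')
  .addEventListener('click', () => processor.startRecording());
document.getElementById('stopVoiceButton')
  .addEventListener('click', () => processor.stopRecording());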
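
On the clause segmentation added to src/llm_stream.js: the loop there re-attaches the result of a single pendingText.match(...) — the first delimiter found in the buffer — to every completed segment, so chunks containing mixed punctuation can get the wrong delimiter appended. Below is a self-contained sketch of delimiter-preserving splitting using a capturing group; splitPending and SEGMENT_DELIMITERS are illustrative names, not identifiers from the patch.

// Sketch: split streamed text into clauses, keeping each clause's own
// trailing delimiter by capturing it in String.prototype.split().
const SEGMENT_DELIMITERS = /([,。:;!?,.:;!?])/;

// Returns completed clauses plus the unfinished tail to carry forward.
function splitPending(pendingText) {
  const parts = pendingText.split(SEGMENT_DELIMITERS);
  // parts alternates: [text, delimiter, text, delimiter, ..., tail]
  const segments = [];
  for (let i = 0; i + 1 < parts.length; i += 2) {
    const segment = (parts[i] + parts[i + 1]).trim();
    if (segment) segments.push(segment);
  }
  return { segments, rest: parts[parts.length - 1] || '' };
}

// Example: each clause keeps its own punctuation.
// splitPending('你好,世界!还在输')
//   -> { segments: ['你好,', '世界!'], rest: '还在输' }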