@@ -2,8 +2,8 @@
 	import { settings, showCallOverlay } from '$lib/stores';
 	import { onMount, tick, getContext } from 'svelte';
 
-	import { blobToFile, calculateSHA256, findWordIndices } from '$lib/utils';
-	import { transcribeAudio } from '$lib/apis/audio';
+	import { blobToFile, calculateSHA256, extractSentences, findWordIndices } from '$lib/utils';
+	import { synthesizeOpenAISpeech, transcribeAudio } from '$lib/apis/audio';
 	import { toast } from 'svelte-sonner';
 
 	const i18n = getContext('i18n');
@@ -14,7 +14,8 @@
 	let confirmed = false;
 
 	let assistantSpeaking = false;
-	let assistantAudio = null;
+	let assistantAudio = {};
+	let assistantAudioIdx = null;
 
 	let rmsLevel = 0;
 	let hasStartedSpeaking = false;
@@ -26,6 +27,7 @@
 	let animationFrameId;
 
 	let speechRecognition;
+	let currentUtterance = null;
 
 	let mediaRecorder;
 	let audioChunks = [];
@@ -108,14 +110,7 @@
 		// Check if initial speech/noise has started
 		const hasSound = domainData.some((value) => value > 0);
 		if (hasSound) {
-			if (assistantSpeaking) {
-				speechSynthesis.cancel();
-
-				if (assistantAudio) {
-					assistantAudio.pause();
-					assistantAudio.currentTime = 0;
-				}
-			}
+			stopAllAudio();
 			hasStartedSpeaking = true;
 			lastSoundTime = Date.now();
 		}
@@ -140,6 +135,55 @@
 		detectSound();
 	};
 
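+	// Cut off any assistant speech that is still playing: cancel the pending
+	// speechSynthesis utterance and pause the currently selected OpenAI audio clip.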
+	const stopAllAudio = () => {
+		if (currentUtterance) {
+			speechSynthesis.cancel();
+			currentUtterance = null;
+		}
+		if (assistantAudio[assistantAudioIdx]) {
+			assistantAudio[assistantAudioIdx].pause();
+			assistantAudio[assistantAudioIdx].currentTime = 0;
+		}
+		assistantSpeaking = false;
+	};
+
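+	// Play the synthesized clip for sentence `idx` and resolve once it has ended,
+	// so clips can be chained back-to-back; after the last clip the assistant is
+	// marked as no longer speaking.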
+	const playAudio = (idx) => {
+		return new Promise((res) => {
+			assistantAudioIdx = idx;
+			const audio = assistantAudio[idx];
+			audio.play();
+			audio.onended = async (e) => {
+				await new Promise((r) => setTimeout(r, 300));
+
+				if (Object.keys(assistantAudio).length - 1 === idx) {
+					assistantSpeaking = false;
+				}
+
+				res(e);
+			};
+		});
+	};
+
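+	// Request a single TTS clip for `text` from the configured OpenAI speech endpoint
+	// and keep the resulting Audio element as the assistant audio.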
+	const getOpenAISpeech = async (text) => {
+		const res = await synthesizeOpenAISpeech(
+			localStorage.token,
+			$settings?.audio?.speaker ?? 'alloy',
+			text,
+			$settings?.audio?.model ?? 'tts-1'
+		).catch((error) => {
+			toast.error(error);
+			assistantSpeaking = false;
+			return null;
+		});
+
+		if (res) {
+			const blob = await res.blob();
+			const blobUrl = URL.createObjectURL(blob);
+			const audio = new Audio(blobUrl);
+			assistantAudio = audio;
+		}
+	};
+
 	const transcribeHandler = async (audioBlob) => {
 		// Create a blob from the audio chunks
 
@@ -152,21 +196,68 @@
 		});
 
 		if (res) {
-			toast.success(res.text);
+			console.log(res.text);
 
 			const _responses = await submitPrompt(res.text);
 			console.log(_responses);
 
 			if (_responses.at(0)) {
-				const response = _responses[0];
-				if (response) {
-					assistantSpeaking = true;
+				const content = _responses[0];
+				if (content) {
+					assistantSpeakingHandler(content);
+				}
+			}
+		}
+	};
 
-					if ($settings?.audio?.TTSEngine ?? '') {
-						speechSynthesis.speak(new SpeechSynthesisUtterance(response));
+	const assistantSpeakingHandler = async (content) => {
+		assistantSpeaking = true;
+
+		if (($settings?.audio?.TTSEngine ?? '') == '') {
+			currentUtterance = new SpeechSynthesisUtterance(content);
+			speechSynthesis.speak(currentUtterance);
+		} else if ($settings?.audio?.TTSEngine === 'openai') {
+			console.log('openai');
+
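+			// Split the response into sentences, merging very short ones into their
+			// predecessor so each TTS request carries a reasonable chunk of text.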
+			const sentences = extractSentences(content).reduce((mergedTexts, currentText) => {
+				const lastIndex = mergedTexts.length - 1;
+				if (lastIndex >= 0) {
+					const previousText = mergedTexts[lastIndex];
+					const wordCount = previousText.split(/\s+/).length;
+					if (wordCount < 2) {
+						mergedTexts[lastIndex] = previousText + ' ' + currentText;
 					} else {
-						console.log('openai');
+						mergedTexts.push(currentText);
 					}
+				} else {
+					mergedTexts.push(currentText);
+				}
+				return mergedTexts;
+			}, []);
+
+			console.log(sentences);
+
+			let lastPlayedAudioPromise = Promise.resolve(); // Initialize a promise that resolves immediately
+
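+			// Synthesize each sentence in order and chain playback onto the previous
+			// clip's promise, so audio plays sequentially while later requests are fetched.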
+			for (const [idx, sentence] of sentences.entries()) {
+				const res = await synthesizeOpenAISpeech(
+					localStorage.token,
+					$settings?.audio?.speaker,
+					sentence,
+					$settings?.audio?.model
+				).catch((error) => {
+					toast.error(error);
+
+					assistantSpeaking = false;
+					return null;
+				});
+
+				if (res) {
+					const blob = await res.blob();
+					const blobUrl = URL.createObjectURL(blob);
+					const audio = new Audio(blobUrl);
+					assistantAudio[idx] = audio;
+					lastPlayedAudioPromise = lastPlayedAudioPromise.then(() => playAudio(idx));
 				}
 			}
 		}
@@ -311,7 +402,7 @@
 					{#if loading}
 						Thinking...
 					{:else}
-						Listening... {Math.round(rmsLevel * 100)}
+						Listening...
 					{/if}
 				</div>
 			</button>