Timothy J. Baek 10 months ago
commit 5300d2c531

+ 1 - 1
backend/main.py

@@ -887,7 +887,7 @@ async def generate_emoji(form_data: dict, user=Depends(get_verified_user)):
     model = app.state.MODELS[model_id]
 
     template = '''
-You are a perceptive assistant skilled at interpreting emotions from a provided message. Your task is to reflect the speaker's likely facial expression through a fitting emoji. Prioritize using diverse facial expression emojis to convey the nuanced emotions expressed in the text. Please avoid using generic or overly ambiguous emojis like "🤔", and instead, choose ones that vividly represent the speaker's mood or reaction.
+You are a perceptive assistant skilled at interpreting emotions from a provided message. Your task is to reflect the speaker's likely facial expression through a fitting emoji. Prioritize using diverse facial expression emojis to convey the nuanced emotions expressed in the text. Please choose ones that vividly represent the speaker's mood or reaction.
 
 Message: """{{prompt}}"""
 '''
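
For context on how this template's output is consumed: the CallOverlay changes further down call `generateEmoji(...)` and then keep only the first pictographic character of whatever the model returns. A minimal sketch of that filtering step, reusing the same `\p{Extended_Pictographic}` check that appears later in this diff (the `pickEmoji` helper name is ours, for illustration only):

	// Keep only the first facial-expression emoji from a model reply.
	// Mirrors the Extended_Pictographic regex used in CallOverlay.svelte.
	const pickEmoji = (reply: string | null): string | null => {
		if (!reply) return null;
		const match = reply.match(/\p{Extended_Pictographic}/gu);
		return match ? match[0] : null;
	};

	// e.g. a reply of 'Sure! 😄' yields '😄'; plain text yields null.
	console.log(pickEmoji('Sure! 😄'));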

+ 1 - 0
src/lib/components/chat/Chat.svelte

@@ -1209,6 +1209,7 @@
 
 <CallOverlay
 	{submitPrompt}
+	{stopResponse}
 	bind:files
 	modelId={selectedModelIds?.at(0) ?? null}
 	chatId={$chatId}
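
Orientation for the CallOverlay rewrite below: instead of waiting for the full reply and speaking it in one pass, the component now streams the assistant message, splits it into sentences, synthesizes each finished sentence, and drains a queue of Audio clips one at a time, dropping everything as soon as the user starts speaking. A condensed, illustrative sketch of that producer/consumer pattern (the `playSentences` name and the simplified `synthesize` signature are ours; the component itself wires this through `assistantSentences`, `audioQueue`, and `playAudioHandler`):

	// Producer/consumer sketch of sentence-by-sentence TTS playback.
	type Synthesize = (sentence: string) => Promise<Blob>;

	const playSentences = async (
		sentences: string[],
		synthesize: Synthesize, // stand-in for synthesizeOpenAISpeech
		isInterrupted: () => boolean // true once the user starts speaking
	): Promise<void> => {
		const queue: HTMLAudioElement[] = [];
		let producing = true;

		// Producer: synthesize each sentence in order and enqueue the clip.
		const producer = (async () => {
			for (const sentence of sentences) {
				if (isInterrupted()) break;
				const blob = await synthesize(sentence);
				queue.push(new Audio(URL.createObjectURL(blob)));
			}
			producing = false;
		})();

		// Consumer: play clips back to back, bailing out on interruption.
		while (!isInterrupted() && (producing || queue.length > 0)) {
			const clip = queue.shift();
			if (!clip) {
				await new Promise((r) => setTimeout(r, 50)); // wait for the next clip
				continue;
			}
			await new Promise<void>((resolve) => {
				clip.onended = () => resolve();
				clip.play().catch(() => resolve());
			});
		}

		await producer;
	};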

+ 299 - 283
src/lib/components/chat/MessageInput/CallOverlay.svelte

@@ -14,16 +14,18 @@
 	const i18n = getContext('i18n');
 
 	export let eventTarget: EventTarget;
+
 	export let submitPrompt: Function;
+	export let stopResponse: Function;
+
 	export let files;
 
 	export let chatId;
 	export let modelId;
 
-	let message = '';
-
 	let loading = false;
 	let confirmed = false;
+	let interrupted = false;
 
 	let emoji = null;
 
@@ -31,17 +33,141 @@
 	let cameraStream = null;
 
 	let assistantSpeaking = false;
-	let assistantAudio = {};
-	let assistantAudioIdx = null;
 
-	let rmsLevel = 0;
-	let hasStartedSpeaking = false;
+	let chatStreaming = false;
+	let assistantMessage = '';
+	let assistantSentences = [];
+	let assistantSentenceAudios = {};
+	let assistantSentenceIdx = -1;
+
+	let audioQueue = [];
+
+	$: assistantSentences = extractSentences(assistantMessage).reduce((mergedTexts, currentText) => {
+		const lastIndex = mergedTexts.length - 1;
+		if (lastIndex >= 0) {
+			const previousText = mergedTexts[lastIndex];
+			const wordCount = previousText.split(/\s+/).length;
+			if (wordCount < 2) {
+				mergedTexts[lastIndex] = previousText + ' ' + currentText;
+			} else {
+				mergedTexts.push(currentText);
+			}
+		} else {
+			mergedTexts.push(currentText);
+		}
+		return mergedTexts;
+	}, []);
 
 	let currentUtterance = null;
 
+	let rmsLevel = 0;
+	let hasStartedSpeaking = false;
 	let mediaRecorder;
 	let audioChunks = [];
 
+	$: console.log('hasStartedSpeaking', hasStartedSpeaking);
+
+	let videoInputDevices = [];
+	let selectedVideoInputDeviceId = null;
+
+	const getVideoInputDevices = async () => {
+		const devices = await navigator.mediaDevices.enumerateDevices();
+		videoInputDevices = devices.filter((device) => device.kind === 'videoinput');
+
+		if (!!navigator.mediaDevices.getDisplayMedia) {
+			videoInputDevices = [
+				...videoInputDevices,
+				{
+					deviceId: 'screen',
+					label: 'Screen Share'
+				}
+			];
+		}
+
+		console.log(videoInputDevices);
+		if (selectedVideoInputDeviceId === null && videoInputDevices.length > 0) {
+			selectedVideoInputDeviceId = videoInputDevices[0].deviceId;
+		}
+	};
+
+	const startCamera = async () => {
+		await getVideoInputDevices();
+
+		if (cameraStream === null) {
+			camera = true;
+			await tick();
+			try {
+				await startVideoStream();
+			} catch (err) {
+				console.error('Error accessing webcam: ', err);
+			}
+		}
+	};
+
+	const startVideoStream = async () => {
+		const video = document.getElementById('camera-feed');
+		if (video) {
+			if (selectedVideoInputDeviceId === 'screen') {
+				cameraStream = await navigator.mediaDevices.getDisplayMedia({
+					video: {
+						cursor: 'always'
+					},
+					audio: false
+				});
+			} else {
+				cameraStream = await navigator.mediaDevices.getUserMedia({
+					video: {
+						deviceId: selectedVideoInputDeviceId ? { exact: selectedVideoInputDeviceId } : undefined
+					}
+				});
+			}
+
+			if (cameraStream) {
+				await getVideoInputDevices();
+				video.srcObject = cameraStream;
+				await video.play();
+			}
+		}
+	};
+
+	const stopVideoStream = async () => {
+		if (cameraStream) {
+			const tracks = cameraStream.getTracks();
+			tracks.forEach((track) => track.stop());
+		}
+
+		cameraStream = null;
+	};
+
+	const takeScreenshot = () => {
+		const video = document.getElementById('camera-feed');
+		const canvas = document.getElementById('camera-canvas');
+
+		if (!canvas) {
+			return;
+		}
+
+		const context = canvas.getContext('2d');
+
+		// Make the canvas match the video dimensions
+		canvas.width = video.videoWidth;
+		canvas.height = video.videoHeight;
+
+		// Draw the image from the video onto the canvas
+		context.drawImage(video, 0, 0, video.videoWidth, video.videoHeight);
+
+		// Convert the canvas to a data base64 URL and console log it
+		const dataURL = canvas.toDataURL('image/png');
+		console.log(dataURL);
+
+		return dataURL;
+	};
+
+	const stopCamera = async () => {
+		await stopVideoStream();
+		camera = false;
+	};
+
 	const MIN_DECIBELS = -45;
 	const VISUALIZER_BUFFER_LENGTH = 300;
 
@@ -55,15 +181,6 @@
 		return Math.sqrt(sumSquares / data.length);
 	};
 
-	const normalizeRMS = (rms) => {
-		rms = rms * 10;
-		const exp = 1.5; // Adjust exponent value; values greater than 1 expand larger numbers more and compress smaller numbers more
-		const scaledRMS = Math.pow(rms, exp);
-
-		// Scale between 0.01 (1%) and 1.0 (100%)
-		return Math.min(1.0, Math.max(0.01, scaledRMS));
-	};
-
 	const analyseAudio = (stream) => {
 		const audioContext = new AudioContext();
 		const audioStreamSource = audioContext.createMediaStreamSource(stream);
@@ -83,12 +200,9 @@
 		const detectSound = () => {
 			const processFrame = () => {
 				if (!mediaRecorder || !$showCallOverlay) {
-					if (mediaRecorder) {
-						mediaRecorder.stop();
-					}
-
 					return;
 				}
+
 				analyser.getByteTimeDomainData(timeDomainData);
 				analyser.getByteFrequencyData(domainData);
 
@@ -98,9 +212,12 @@
 				// Check if initial speech/noise has started
 				const hasSound = domainData.some((value) => value > 0);
 				if (hasSound) {
-					stopAllAudio();
 					hasStartedSpeaking = true;
 					lastSoundTime = Date.now();
+
+					// BIG RED TEXT
+					console.log('%c%s', 'color: red; font-size: 20px;', '🔊 Sound detected');
+					stopAllAudio();
 				}
 
 				// Start silence detection only after initial speech/noise has been detected
@@ -123,35 +240,94 @@
 		detectSound();
 	};
 
-	const stopAllAudio = () => {
+	const transcribeHandler = async (audioBlob) => {
+		// Create a blob from the audio chunks
+
+		await tick();
+		const file = blobToFile(audioBlob, 'recording.wav');
+
+		const res = await transcribeAudio(localStorage.token, file).catch((error) => {
+			toast.error(error);
+			return null;
+		});
+
+		if (res) {
+			console.log(res.text);
+
+			if (res.text !== '') {
+				const _responses = await submitPrompt(res.text, { _raw: true });
+				console.log(_responses);
+			}
+		}
+	};
+
+	const stopAllAudio = async () => {
+		interrupted = true;
+
+		if (chatStreaming) {
+			stopResponse();
+		}
+
 		if (currentUtterance) {
 			speechSynthesis.cancel();
 			currentUtterance = null;
 		}
-		if (assistantAudio[assistantAudioIdx]) {
-			assistantAudio[assistantAudioIdx].pause();
-			assistantAudio[assistantAudioIdx].currentTime = 0;
-		}
 
-		const audioElement = document.getElementById('audioElement');
+		await tick();
+		audioQueue = [];
+		await tick();
 
+		const audioElement = document.getElementById('audioElement');
 		if (audioElement) {
 			audioElement.pause();
 			audioElement.currentTime = 0;
 		}
+
 		assistantSpeaking = false;
 	};
 
-	const playAudio = (idx) => {
+	const speakSpeechSynthesisHandler = (content) => {
+		if ($showCallOverlay) {
+			return new Promise((resolve) => {
+				let voices = [];
+				const getVoicesLoop = setInterval(async () => {
+					voices = await speechSynthesis.getVoices();
+					if (voices.length > 0) {
+						clearInterval(getVoicesLoop);
+
+						const voice =
+							voices
+								?.filter(
+									(v) => v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
+								)
+								?.at(0) ?? undefined;
+
+						currentUtterance = new SpeechSynthesisUtterance(content);
+
+						if (voice) {
+							currentUtterance.voice = voice;
+						}
+
+						speechSynthesis.speak(currentUtterance);
+						currentUtterance.onend = async (e) => {
+							await new Promise((r) => setTimeout(r, 100));
+							resolve(e);
+						};
+					}
+				}, 100);
+			});
+		} else {
+			return Promise.resolve();
+		}
+	};
+
+	const playAudio = (audio) => {
 		if ($showCallOverlay) {
-			return new Promise((res) => {
-				assistantAudioIdx = idx;
+			return new Promise((resolve) => {
 				const audioElement = document.getElementById('audioElement');
-				const audio = assistantAudio[idx];
 
 				if (audioElement) {
-					audioElement.src = audio.src; // Assume `assistantAudio` has objects with a `src` property
-
+					audioElement.src = audio.src;
 					audioElement.muted = true;
 
 					audioElement
@@ -160,17 +336,12 @@
 							audioElement.muted = false;
 						})
 						.catch((error) => {
-							toast.error(error);
+							console.error(error);
 						});
 
 					audioElement.onended = async (e) => {
-						await new Promise((r) => setTimeout(r, 300));
-
-						if (Object.keys(assistantAudio).length - 1 === idx) {
-							assistantSpeaking = false;
-						}
-
-						res(e);
+						await new Promise((r) => setTimeout(r, 100));
+						resolve(e);
 					};
 				}
 			});
@@ -179,147 +350,57 @@
 		}
 	};
 
-	const getOpenAISpeech = async (text) => {
-		const res = await synthesizeOpenAISpeech(
-			localStorage.token,
-			$settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
-			text
-		).catch((error) => {
-			toast.error(error);
+	const playAudioHandler = async () => {
+		console.log('playAudioHandler', audioQueue, assistantSpeaking, audioQueue.length > 0);
+		if (!assistantSpeaking && !interrupted && audioQueue.length > 0) {
+			assistantSpeaking = true;
+			const audioToPlay = audioQueue.shift(); // Shift the audio out from queue before playing.
+			audioQueue = audioQueue;
+			await playAudio(audioToPlay);
 			assistantSpeaking = false;
-			return null;
-		});
-
-		if (res) {
-			const blob = await res.blob();
-			const blobUrl = URL.createObjectURL(blob);
-			const audio = new Audio(blobUrl);
-			assistantAudio = audio;
 		}
 	};
 
-	const transcribeHandler = async (audioBlob) => {
-		// Create a blob from the audio chunks
-
-		await tick();
-		const file = blobToFile(audioBlob, 'recording.wav');
-
-		const res = await transcribeAudio(localStorage.token, file).catch((error) => {
-			toast.error(error);
-			return null;
-		});
-
-		if (res) {
-			console.log(res.text);
-
-			if (res.text !== '') {
-				const _responses = await submitPrompt(res.text, { _raw: true });
-				console.log(_responses);
-			}
-		}
-	};
-
-	const assistantSpeakingHandler = async (content) => {
-		assistantSpeaking = true;
-
-		if (modelId && ($settings?.showEmojiInCall ?? false)) {
-			console.log('Generating emoji');
-			const res = await generateEmoji(localStorage.token, modelId, content, chatId).catch(
-				(error) => {
-					console.error(error);
-					return null;
-				}
-			);
+	const setContentAudio = async (content, idx) => {
+		if (assistantSentenceAudios[idx] === undefined) {
+			console.log('%c%s', 'color: red; font-size: 20px;', content);
+
+			assistantSentenceAudios[idx] = null;
+			const res = await synthesizeOpenAISpeech(
+				localStorage.token,
+				$settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
+				content
+			).catch((error) => {
+				toast.error(error);
+				assistantSpeaking = false;
+				return null;
+			});
 
 			if (res) {
-				console.log(res);
-				if (/\p{Extended_Pictographic}/u.test(res)) {
-					emoji = res.match(/\p{Extended_Pictographic}/gu)[0];
-				}
+				const blob = await res.blob();
+				const blobUrl = URL.createObjectURL(blob);
+				const audio = new Audio(blobUrl);
+				assistantSentenceAudios[idx] = audio;
+				audioQueue.push(audio);
+				audioQueue = audioQueue;
 			}
 		}
+	};
 
-		if (($config.audio.tts.engine ?? '') == '') {
-			let voices = [];
-			const getVoicesLoop = setInterval(async () => {
-				voices = await speechSynthesis.getVoices();
-				if (voices.length > 0) {
-					clearInterval(getVoicesLoop);
-
-					const voice =
-						voices
-							?.filter(
-								(v) => v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
-							)
-							?.at(0) ?? undefined;
-
-					currentUtterance = new SpeechSynthesisUtterance(content);
-
-					if (voice) {
-						currentUtterance.voice = voice;
-					}
-
-					speechSynthesis.speak(currentUtterance);
-
-					currentUtterance.onend = async () => {
-						assistantSpeaking = false;
-					};
-				}
-			}, 100);
-		} else if ($config.audio.tts.engine === 'openai') {
-			console.log('openai');
-
-			const sentences = extractSentences(content).reduce((mergedTexts, currentText) => {
-				const lastIndex = mergedTexts.length - 1;
-				if (lastIndex >= 0) {
-					const previousText = mergedTexts[lastIndex];
-					const wordCount = previousText.split(/\s+/).length;
-					if (wordCount < 2) {
-						mergedTexts[lastIndex] = previousText + ' ' + currentText;
-					} else {
-						mergedTexts.push(currentText);
-					}
-				} else {
-					mergedTexts.push(currentText);
-				}
-				return mergedTexts;
-			}, []);
-
-			console.log(sentences);
-
-			let lastPlayedAudioPromise = Promise.resolve(); // Initialize a promise that resolves immediately
-
-			for (const [idx, sentence] of sentences.entries()) {
-				const res = await synthesizeOpenAISpeech(
-					localStorage.token,
-					$settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
-					sentence
-				).catch((error) => {
-					toast.error(error);
+	const stopRecordingCallback = async (_continue = true) => {
+		console.log('%c%s', 'color: red; font-size: 20px;', '🚨 stopRecordingCallback 🚨');
 
-					assistantSpeaking = false;
-					return null;
-				});
+		if ($showCallOverlay) {
+			// deep copy the audioChunks array
+			const _audioChunks = audioChunks.slice(0);
 
-				if (res) {
-					const blob = await res.blob();
-					const blobUrl = URL.createObjectURL(blob);
-					const audio = new Audio(blobUrl);
-					assistantAudio[idx] = audio;
-					lastPlayedAudioPromise = lastPlayedAudioPromise.then(() => playAudio(idx));
+			audioChunks = [];
+			mediaRecorder = false;
 
-					if (idx === sentences.length - 1) {
-						lastPlayedAudioPromise.then(() => {
-							assistantSpeaking = false;
-						});
-					}
-				}
+			if (_continue) {
+				startRecording();
 			}
-		}
-	};
 
-	const stopRecordingCallback = async (_continue = true) => {
-		if ($showCallOverlay) {
 			if (confirmed) {
 				loading = true;
 				emoji = null;
@@ -335,18 +416,12 @@
 					];
 				}
 
-				const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
+				const audioBlob = new Blob(_audioChunks, { type: 'audio/wav' });
 				await transcribeHandler(audioBlob);
 
 				confirmed = false;
 				loading = false;
 			}
-			audioChunks = [];
-			mediaRecorder = false;
-
-			if (_continue) {
-				startRecording();
-			}
 		} else {
 			audioChunks = [];
 			mediaRecorder = false;
@@ -368,113 +443,11 @@
 		};
 		mediaRecorder.onstop = async () => {
 			console.log('Recording stopped');
-
 			await stopRecordingCallback();
 		};
 		mediaRecorder.start();
 	};
 
-	let videoInputDevices = [];
-	let selectedVideoInputDeviceId = null;
-
-	const getVideoInputDevices = async () => {
-		const devices = await navigator.mediaDevices.enumerateDevices();
-		videoInputDevices = devices.filter((device) => device.kind === 'videoinput');
-
-		if (!!navigator.mediaDevices.getDisplayMedia) {
-			videoInputDevices = [
-				...videoInputDevices,
-				{
-					deviceId: 'screen',
-					label: 'Screen Share'
-				}
-			];
-		}
-
-		console.log(videoInputDevices);
-		if (selectedVideoInputDeviceId === null && videoInputDevices.length > 0) {
-			selectedVideoInputDeviceId = videoInputDevices[0].deviceId;
-		}
-	};
-
-	const startCamera = async () => {
-		await getVideoInputDevices();
-
-		if (cameraStream === null) {
-			camera = true;
-			await tick();
-			try {
-				await startVideoStream();
-			} catch (err) {
-				console.error('Error accessing webcam: ', err);
-			}
-		}
-	};
-
-	const startVideoStream = async () => {
-		const video = document.getElementById('camera-feed');
-		if (video) {
-			if (selectedVideoInputDeviceId === 'screen') {
-				cameraStream = await navigator.mediaDevices.getDisplayMedia({
-					video: {
-						cursor: 'always'
-					},
-					audio: false
-				});
-			} else {
-				cameraStream = await navigator.mediaDevices.getUserMedia({
-					video: {
-						deviceId: selectedVideoInputDeviceId ? { exact: selectedVideoInputDeviceId } : undefined
-					}
-				});
-			}
-
-			if (cameraStream) {
-				await getVideoInputDevices();
-				video.srcObject = cameraStream;
-				await video.play();
-			}
-		}
-	};
-
-	const stopVideoStream = async () => {
-		if (cameraStream) {
-			const tracks = cameraStream.getTracks();
-			tracks.forEach((track) => track.stop());
-		}
-
-		cameraStream = null;
-	};
-
-	const takeScreenshot = () => {
-		const video = document.getElementById('camera-feed');
-		const canvas = document.getElementById('camera-canvas');
-
-		if (!canvas) {
-			return;
-		}
-
-		const context = canvas.getContext('2d');
-
-		// Make the canvas match the video dimensions
-		canvas.width = video.videoWidth;
-		canvas.height = video.videoHeight;
-
-		// Draw the image from the video onto the canvas
-		context.drawImage(video, 0, 0, video.videoWidth, video.videoHeight);
-
-		// Convert the canvas to a data base64 URL and console log it
-		const dataURL = canvas.toDataURL('image/png');
-		console.log(dataURL);
-
-		return dataURL;
-	};
-
-	const stopCamera = async () => {
-		await stopVideoStream();
-		camera = false;
-	};
-
 	$: if ($showCallOverlay) {
 		startRecording();
 	} else {
@@ -483,30 +456,73 @@
 		stopRecordingCallback(false);
 	}
 
+	$: {
+		if (audioQueue.length > 0 && !assistantSpeaking) {
+			playAudioHandler();
+		}
+	}
+
 	onMount(() => {
 		console.log(eventTarget);
 
 		eventTarget.addEventListener('chat:start', async (e) => {
-			console.log('Chat start event:', e.detail);
-			message = '';
+			console.log('Chat start event:', e);
+			interrupted = false;
+
+			assistantMessage = '';
+			assistantSentenceIdx = -1;
+			assistantSentenceAudios = {}; // Reset audio tracking
+			audioQueue = []; // Clear the audio queue
+
+			chatStreaming = true;
 		});
 
 		eventTarget.addEventListener('chat', async (e) => {
 			const { content } = e.detail;
+			assistantMessage += content;
+			await tick();
+
+			if (!interrupted) {
+				if ($config.audio.tts.engine !== '') {
+					assistantSentenceIdx = assistantSentences.length - 2;
+
+					if (assistantSentenceIdx >= 0 && !assistantSentenceAudios[assistantSentenceIdx]) {
+						await tick();
+						setContentAudio(assistantSentences[assistantSentenceIdx], assistantSentenceIdx);
+					}
+				}
+			}
 
-			message += content;
-			console.log('Chat event:', message);
+			chatStreaming = true;
 		});
 
 		eventTarget.addEventListener('chat:finish', async (e) => {
-			console.log('Chat finish event:', e.detail);
-			message = '';
+			chatStreaming = false;
+			loading = false;
+
+			console.log('Chat finish event:', e);
+			await tick();
+
+			if (!interrupted) {
+				if ($config.audio.tts.engine !== '') {
+					for (const [idx, sentence] of assistantSentences.entries()) {
+						if (!assistantSentenceAudios[idx]) {
+							await tick();
+							setContentAudio(sentence, idx);
+						}
+					}
+				} else {
+					emoji = generateEmoji(localStorage.token, modelId, assistantMessage);
+					speakSpeechSynthesisHandler(assistantMessage);
+				}
+			}
 		});
 	});
 </script>
 
+<audio id="audioElement" src="" style="display: none;" />
+
 {#if $showCallOverlay}
-	<audio id="audioElement" src="" style="display: none;" />
 	<div class=" absolute w-full h-screen max-h-[100dvh] flex z-[999] overflow-hidden">
 		<div
 			class="absolute w-full h-screen max-h-[100dvh] bg-white text-gray-700 dark:bg-black dark:text-gray-300 flex justify-center"