
Merge pull request #3128 from open-webui/voice-enh

enh: voice
Timothy Jaeryang Baek 10 months ago
parent
commit
4727e5cbb1

+ 68 - 0
backend/main.py

@@ -494,6 +494,9 @@ def filter_pipeline(payload, user):
         if "title" in payload:
             del payload["title"]
 
+        if "task" in payload:
+            del payload["task"]
+
     return payload
 
 
@@ -835,6 +838,71 @@ async def generate_search_query(form_data: dict, user=Depends(get_verified_user)
         "messages": [{"role": "user", "content": content}],
         "stream": False,
         "max_tokens": 30,
+        "task": True,
+    }
+
+    print(payload)
+
+    try:
+        payload = filter_pipeline(payload, user)
+    except Exception as e:
+        return JSONResponse(
+            status_code=e.args[0],
+            content={"detail": e.args[1]},
+        )
+
+    if model["owned_by"] == "ollama":
+        return await generate_ollama_chat_completion(
+            OpenAIChatCompletionForm(**payload), user=user
+        )
+    else:
+        return await generate_openai_chat_completion(payload, user=user)
+
+
+@app.post("/api/task/emoji/completions")
+async def generate_emoji(form_data: dict, user=Depends(get_verified_user)):
+    print("generate_emoji")
+
+    model_id = form_data["model"]
+    if model_id not in app.state.MODELS:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail="Model not found",
+        )
+
+    # Check if the user has a custom task model
+    # If the user has a custom task model, use that model
+    if app.state.MODELS[model_id]["owned_by"] == "ollama":
+        if app.state.config.TASK_MODEL:
+            task_model_id = app.state.config.TASK_MODEL
+            if task_model_id in app.state.MODELS:
+                model_id = task_model_id
+    else:
+        if app.state.config.TASK_MODEL_EXTERNAL:
+            task_model_id = app.state.config.TASK_MODEL_EXTERNAL
+            if task_model_id in app.state.MODELS:
+                model_id = task_model_id
+
+    print(model_id)
+    model = app.state.MODELS[model_id]
+
+    template = '''
+Your task is to reflect the speaker's likely facial expression through a fitting emoji. Interpret emotions from the message and reflect their facial expression using fitting, diverse emojis (e.g., 😊, 😢, 😡, 😱).
+
+Message: """{{prompt}}"""
+'''
+
+    content = title_generation_template(
+        template, form_data["prompt"], user.model_dump()
+    )
+
+    payload = {
+        "model": model_id,
+        "messages": [{"role": "user", "content": content}],
+        "stream": False,
+        "max_tokens": 4,
+        "chat_id": form_data.get("chat_id", None),
+        "task": True,
     }
 
     print(payload)

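For orientation, the new emoji endpoint follows the same task-completion contract as the other task routes: the handler above marks the payload with "task": True (which filter_pipeline now strips before forwarding) and returns an ordinary chat completion. A rough sketch of the request/response shapes as seen from the frontend, with field names taken from this diff and types inferred, not authoritative:

// Inferred request body for POST /api/task/emoji/completions; chat_id is optional.
interface EmojiTaskRequest {
	model: string;
	prompt: string;
	chat_id?: string;
}

// The response is a standard chat completion; callers read only choices[0].message.content.
interface EmojiTaskResponse {
	choices: { message: { content: string } }[];
}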
+ 40 - 0
src/lib/apis/index.ts

@@ -205,6 +205,46 @@ export const generateTitle = async (
 	return res?.choices[0]?.message?.content.replace(/["']/g, '') ?? 'New Chat';
 };
 
+export const generateEmoji = async (
+	token: string = '',
+	model: string,
+	prompt: string,
+	chat_id?: string
+) => {
+	let error = null;
+
+	const res = await fetch(`${WEBUI_BASE_URL}/api/task/emoji/completions`, {
+		method: 'POST',
+		headers: {
+			Accept: 'application/json',
+			'Content-Type': 'application/json',
+			Authorization: `Bearer ${token}`
+		},
+		body: JSON.stringify({
+			model: model,
+			prompt: prompt,
+			...(chat_id && { chat_id: chat_id })
+		})
+	})
+		.then(async (res) => {
+			if (!res.ok) throw await res.json();
+			return res.json();
+		})
+		.catch((err) => {
+			console.log(err);
+			if ('detail' in err) {
+				error = err.detail;
+			}
+			return null;
+		});
+
+	if (error) {
+		throw error;
+	}
+
+	return res?.choices[0]?.message?.content.replace(/["']/g, '') ?? null;
+};
+
 export const generateSearchQuery = async (
 	token: string = '',
 	model: string,

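A minimal usage sketch of the new generateEmoji helper. The model id and prompt below are placeholders; CallOverlay.svelte (further down) calls it in essentially this way and keeps only the first pictographic character of the result:

import { generateEmoji } from '$lib/apis';

// Hypothetical caller: ask the task model for an emoji that matches a piece of text.
const reactToMessage = async (text: string): Promise<string | null> => {
	const res = await generateEmoji(localStorage.token, 'llama3', text);

	if (res && /\p{Extended_Pictographic}/u.test(res)) {
		// Keep only the first pictographic character, as CallOverlay does.
		return res.match(/\p{Extended_Pictographic}/gu)?.[0] ?? null;
	}

	return null;
};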
+ 112 - 104
src/lib/components/chat/Chat.svelte

@@ -64,6 +64,8 @@
 	export let chatIdProp = '';
 	let loaded = false;
 
+	const eventTarget = new EventTarget();
+
 	let stopResponseFlag = false;
 	let autoScroll = true;
 	let processing = '';
@@ -300,7 +302,7 @@
 	// Chat functions
 	//////////////////////////
 
-	const submitPrompt = async (userPrompt, _user = null) => {
+	const submitPrompt = async (userPrompt, { _raw = false } = {}) => {
 		let _responses = [];
 		console.log('submitPrompt', $chatId);
 
@@ -344,7 +346,6 @@
 				parentId: messages.length !== 0 ? messages.at(-1).id : null,
 				childrenIds: [],
 				role: 'user',
-				user: _user ?? undefined,
 				content: userPrompt,
 				files: _files.length > 0 ? _files : undefined,
 				timestamp: Math.floor(Date.now() / 1000), // Unix epoch
@@ -362,15 +363,13 @@
 
 			// Wait until history/message have been updated
 			await tick();
-
-			// Send prompt
-			_responses = await sendPrompt(userPrompt, userMessageId);
+			_responses = await sendPrompt(userPrompt, userMessageId, { newChat: true });
 		}
 
 		return _responses;
 	};
 
-	const sendPrompt = async (prompt, parentId, modelId = null, newChat = true) => {
+	const sendPrompt = async (prompt, parentId, { modelId = null, newChat = false } = {}) => {
 		let _responses = [];
 
 		// If modelId is provided, use it, else use selected model
@@ -490,7 +489,6 @@
 					responseMessage.userContext = userContext;
 
 					const chatEventEmitter = await getChatEventEmitter(model.id, _chatId);
-
 					if (webSearchEnabled) {
 						await getWebSearchResults(model.id, parentId, responseMessageId);
 					}
@@ -503,8 +501,6 @@
 					}
 					_responses.push(_response);
 
-					console.log('chatEventEmitter', chatEventEmitter);
-
 					if (chatEventEmitter) clearInterval(chatEventEmitter);
 				} else {
 					toast.error($i18n.t(`Model {{modelId}} not found`, { modelId }));
@@ -513,88 +509,9 @@
 		);
 
 		await chats.set(await getChatList(localStorage.token));
-
 		return _responses;
 	};
 
-	const getWebSearchResults = async (model: string, parentId: string, responseId: string) => {
-		const responseMessage = history.messages[responseId];
-
-		responseMessage.statusHistory = [
-			{
-				done: false,
-				action: 'web_search',
-				description: $i18n.t('Generating search query')
-			}
-		];
-		messages = messages;
-
-		const prompt = history.messages[parentId].content;
-		let searchQuery = await generateSearchQuery(localStorage.token, model, messages, prompt).catch(
-			(error) => {
-				console.log(error);
-				return prompt;
-			}
-		);
-
-		if (!searchQuery) {
-			toast.warning($i18n.t('No search query generated'));
-			responseMessage.statusHistory.push({
-				done: true,
-				error: true,
-				action: 'web_search',
-				description: 'No search query generated'
-			});
-
-			messages = messages;
-		}
-
-		responseMessage.statusHistory.push({
-			done: false,
-			action: 'web_search',
-			description: $i18n.t(`Searching "{{searchQuery}}"`, { searchQuery })
-		});
-		messages = messages;
-
-		const results = await runWebSearch(localStorage.token, searchQuery).catch((error) => {
-			console.log(error);
-			toast.error(error);
-
-			return null;
-		});
-
-		if (results) {
-			responseMessage.statusHistory.push({
-				done: true,
-				action: 'web_search',
-				description: $i18n.t('Searched {{count}} sites', { count: results.filenames.length }),
-				query: searchQuery,
-				urls: results.filenames
-			});
-
-			if (responseMessage?.files ?? undefined === undefined) {
-				responseMessage.files = [];
-			}
-
-			responseMessage.files.push({
-				collection_name: results.collection_name,
-				name: searchQuery,
-				type: 'web_search_results',
-				urls: results.filenames
-			});
-
-			messages = messages;
-		} else {
-			responseMessage.statusHistory.push({
-				done: true,
-				error: true,
-				action: 'web_search',
-				description: 'No search results found'
-			});
-			messages = messages;
-		}
-	};
-
 	const sendPromptOllama = async (model, userPrompt, responseMessageId, _chatId) => {
 		let _response = null;
 
@@ -676,6 +593,8 @@
 				array.findIndex((i) => JSON.stringify(i) === JSON.stringify(item)) === index
 		);
 
+		eventTarget.dispatchEvent(new CustomEvent('chat:start'));
+
 		const [res, controller] = await generateChatCompletion(localStorage.token, {
 			model: model.id,
 			messages: messagesBody,
@@ -745,6 +664,9 @@
 									continue;
 								} else {
 									responseMessage.content += data.message.content;
+									eventTarget.dispatchEvent(
+										new CustomEvent('chat', { detail: { content: data.message.content } })
+									);
 									messages = messages;
 								}
 							} else {
@@ -771,21 +693,13 @@
 								messages = messages;
 
 								if ($settings.notificationEnabled && !document.hasFocus()) {
-									const notification = new Notification(
-										selectedModelfile
-											? `${
-													selectedModelfile.title.charAt(0).toUpperCase() +
-													selectedModelfile.title.slice(1)
-											  }`
-											: `${model.id}`,
-										{
-											body: responseMessage.content,
-											icon: selectedModelfile?.imageUrl ?? `${WEBUI_BASE_URL}/static/favicon.png`
-										}
-									);
+									const notification = new Notification(`${model.id}`, {
+										body: responseMessage.content,
+										icon: `${WEBUI_BASE_URL}/static/favicon.png`
+									});
 								}
 
-								if ($settings.responseAutoCopy) {
+								if ($settings?.responseAutoCopy ?? false) {
 									copyToClipboard(responseMessage.content);
 								}
 
@@ -846,6 +760,7 @@
 
 		stopResponseFlag = false;
 		await tick();
+		eventTarget.dispatchEvent(new CustomEvent('chat:finish'));
 
 		if (autoScroll) {
 			scrollToBottom();
@@ -887,6 +802,8 @@
 
 		scrollToBottom();
 
+		eventTarget.dispatchEvent(new CustomEvent('chat:start'));
+
 		try {
 			const [res, controller] = await generateOpenAIChatCompletion(
 				localStorage.token,
@@ -1007,6 +924,7 @@
 						continue;
 					} else {
 						responseMessage.content += value;
+						eventTarget.dispatchEvent(new CustomEvent('chat', { detail: { content: value } }));
 						messages = messages;
 					}
 
@@ -1057,6 +975,8 @@
 		stopResponseFlag = false;
 		await tick();
 
+		eventTarget.dispatchEvent(new CustomEvent('chat:finish'));
+
 		if (autoScroll) {
 			scrollToBottom();
 		}
@@ -1123,9 +1043,12 @@
 			let userPrompt = userMessage.content;
 
 			if ((userMessage?.models ?? [...selectedModels]).length == 1) {
-				await sendPrompt(userPrompt, userMessage.id, undefined, false);
+				// If user message has only one model selected, sendPrompt automatically selects it for regeneration
+				await sendPrompt(userPrompt, userMessage.id);
 			} else {
-				await sendPrompt(userPrompt, userMessage.id, message.model, false);
+				// If there are multiple models selected, use the model of the response message for regeneration
+				// e.g. many model chat
+				await sendPrompt(userPrompt, userMessage.id, { modelId: message.model });
 			}
 		}
 	};
@@ -1191,6 +1114,84 @@
 		}
 	};
 
+	const getWebSearchResults = async (model: string, parentId: string, responseId: string) => {
+		const responseMessage = history.messages[responseId];
+
+		responseMessage.statusHistory = [
+			{
+				done: false,
+				action: 'web_search',
+				description: $i18n.t('Generating search query')
+			}
+		];
+		messages = messages;
+
+		const prompt = history.messages[parentId].content;
+		let searchQuery = await generateSearchQuery(localStorage.token, model, messages, prompt).catch(
+			(error) => {
+				console.log(error);
+				return prompt;
+			}
+		);
+
+		if (!searchQuery) {
+			toast.warning($i18n.t('No search query generated'));
+			responseMessage.statusHistory.push({
+				done: true,
+				error: true,
+				action: 'web_search',
+				description: 'No search query generated'
+			});
+
+			messages = messages;
+		}
+
+		responseMessage.statusHistory.push({
+			done: false,
+			action: 'web_search',
+			description: $i18n.t(`Searching "{{searchQuery}}"`, { searchQuery })
+		});
+		messages = messages;
+
+		const results = await runWebSearch(localStorage.token, searchQuery).catch((error) => {
+			console.log(error);
+			toast.error(error);
+
+			return null;
+		});
+
+		if (results) {
+			responseMessage.statusHistory.push({
+				done: true,
+				action: 'web_search',
+				description: $i18n.t('Searched {{count}} sites', { count: results.filenames.length }),
+				query: searchQuery,
+				urls: results.filenames
+			});
+
+			if (responseMessage?.files ?? undefined === undefined) {
+				responseMessage.files = [];
+			}
+
+			responseMessage.files.push({
+				collection_name: results.collection_name,
+				name: searchQuery,
+				type: 'web_search_results',
+				urls: results.filenames
+			});
+
+			messages = messages;
+		} else {
+			responseMessage.statusHistory.push({
+				done: true,
+				error: true,
+				action: 'web_search',
+				description: 'No search results found'
+			});
+			messages = messages;
+		}
+	};
+
 	const getTags = async () => {
 		return await getTagsById(localStorage.token, $chatId).catch(async (error) => {
 			return [];
@@ -1206,7 +1207,14 @@
 	</title>
 </svelte:head>
 
-<CallOverlay {submitPrompt} bind:files />
+<CallOverlay
+	{submitPrompt}
+	{stopResponse}
+	bind:files
+	modelId={selectedModelIds?.at(0) ?? null}
+	chatId={$chatId}
+	{eventTarget}
+/>
 
 {#if !chatIdProp || (loaded && chatIdProp)}
 	<div

+ 3 - 4
src/lib/components/chat/MessageInput.svelte

@@ -348,7 +348,6 @@
 				<Models
 					bind:this={modelsElement}
 					bind:prompt
-					bind:user
 					bind:chatInputPlaceholder
 					{messages}
 					on:select={(e) => {
@@ -467,7 +466,7 @@
 							document.getElementById('chat-textarea')?.focus();
 
 							if ($settings?.speechAutoSend ?? false) {
-								submitPrompt(prompt, user);
+								submitPrompt(prompt);
 							}
 						}}
 					/>
@@ -476,7 +475,7 @@
 						class="w-full flex gap-1.5"
 						on:submit|preventDefault={() => {
 							// check if selectedModels support image input
-							submitPrompt(prompt, user);
+							submitPrompt(prompt);
 						}}
 					>
 						<div
@@ -718,7 +717,7 @@
 
 											// Submit the prompt when Enter key is pressed
 											if (prompt !== '' && e.key === 'Enter' && !e.shiftKey) {
-												submitPrompt(prompt, user);
+												submitPrompt(prompt);
 											}
 										}
 									}}

+ 413 - 244
src/lib/components/chat/MessageInput/CallOverlay.svelte

@@ -3,36 +3,170 @@
 	import { onMount, tick, getContext } from 'svelte';
 
 	import { blobToFile, calculateSHA256, extractSentences, findWordIndices } from '$lib/utils';
+	import { generateEmoji } from '$lib/apis';
 	import { synthesizeOpenAISpeech, transcribeAudio } from '$lib/apis/audio';
+
 	import { toast } from 'svelte-sonner';
 
 	import Tooltip from '$lib/components/common/Tooltip.svelte';
 	import VideoInputMenu from './CallOverlay/VideoInputMenu.svelte';
-	import { get } from 'svelte/store';
 
 	const i18n = getContext('i18n');
 
+	export let eventTarget: EventTarget;
+
 	export let submitPrompt: Function;
+	export let stopResponse: Function;
+
 	export let files;
 
+	export let chatId;
+	export let modelId;
+
 	let loading = false;
 	let confirmed = false;
+	let interrupted = false;
+
+	let emoji = null;
 
 	let camera = false;
 	let cameraStream = null;
 
 	let assistantSpeaking = false;
-	let assistantAudio = {};
-	let assistantAudioIdx = null;
 
-	let rmsLevel = 0;
-	let hasStartedSpeaking = false;
+	let chatStreaming = false;
+	let assistantMessage = '';
+	let assistantSentences = [];
+	let assistantSentenceAudios = {};
+	let assistantSentenceIdx = -1;
+
+	let audioQueue = [];
+	let emojiQueue = [];
+
+	$: assistantSentences = extractSentences(assistantMessage).reduce((mergedTexts, currentText) => {
+		const lastIndex = mergedTexts.length - 1;
+		if (lastIndex >= 0) {
+			const previousText = mergedTexts[lastIndex];
+			const wordCount = previousText.split(/\s+/).length;
+			if (wordCount < 2) {
+				mergedTexts[lastIndex] = previousText + ' ' + currentText;
+			} else {
+				mergedTexts.push(currentText);
+			}
+		} else {
+			mergedTexts.push(currentText);
+		}
+		return mergedTexts;
+	}, []);
 
 	let currentUtterance = null;
 
+	let rmsLevel = 0;
+	let hasStartedSpeaking = false;
 	let mediaRecorder;
 	let audioChunks = [];
 
+	let videoInputDevices = [];
+	let selectedVideoInputDeviceId = null;
+
+	const getVideoInputDevices = async () => {
+		const devices = await navigator.mediaDevices.enumerateDevices();
+		videoInputDevices = devices.filter((device) => device.kind === 'videoinput');
+
+		if (!!navigator.mediaDevices.getDisplayMedia) {
+			videoInputDevices = [
+				...videoInputDevices,
+				{
+					deviceId: 'screen',
+					label: 'Screen Share'
+				}
+			];
+		}
+
+		console.log(videoInputDevices);
+		if (selectedVideoInputDeviceId === null && videoInputDevices.length > 0) {
+			selectedVideoInputDeviceId = videoInputDevices[0].deviceId;
+		}
+	};
+
+	const startCamera = async () => {
+		await getVideoInputDevices();
+
+		if (cameraStream === null) {
+			camera = true;
+			await tick();
+			try {
+				await startVideoStream();
+			} catch (err) {
+				console.error('Error accessing webcam: ', err);
+			}
+		}
+	};
+
+	const startVideoStream = async () => {
+		const video = document.getElementById('camera-feed');
+		if (video) {
+			if (selectedVideoInputDeviceId === 'screen') {
+				cameraStream = await navigator.mediaDevices.getDisplayMedia({
+					video: {
+						cursor: 'always'
+					},
+					audio: false
+				});
+			} else {
+				cameraStream = await navigator.mediaDevices.getUserMedia({
+					video: {
+						deviceId: selectedVideoInputDeviceId ? { exact: selectedVideoInputDeviceId } : undefined
+					}
+				});
+			}
+
+			if (cameraStream) {
+				await getVideoInputDevices();
+				video.srcObject = cameraStream;
+				await video.play();
+			}
+		}
+	};
+
+	const stopVideoStream = async () => {
+		if (cameraStream) {
+			const tracks = cameraStream.getTracks();
+			tracks.forEach((track) => track.stop());
+		}
+
+		cameraStream = null;
+	};
+
+	const takeScreenshot = () => {
+		const video = document.getElementById('camera-feed');
+		const canvas = document.getElementById('camera-canvas');
+
+		if (!canvas) {
+			return;
+		}
+
+		const context = canvas.getContext('2d');
+
+		// Make the canvas match the video dimensions
+		canvas.width = video.videoWidth;
+		canvas.height = video.videoHeight;
+
+		// Draw the image from the video onto the canvas
+		context.drawImage(video, 0, 0, video.videoWidth, video.videoHeight);
+
+		// Convert the canvas to a data base64 URL and console log it
+		const dataURL = canvas.toDataURL('image/png');
+		console.log(dataURL);
+
+		return dataURL;
+	};
+
+	const stopCamera = async () => {
+		await stopVideoStream();
+		camera = false;
+	};
+
 	const MIN_DECIBELS = -45;
 	const VISUALIZER_BUFFER_LENGTH = 300;
 
@@ -46,15 +180,6 @@
 		return Math.sqrt(sumSquares / data.length);
 	};
 
-	const normalizeRMS = (rms) => {
-		rms = rms * 10;
-		const exp = 1.5; // Adjust exponent value; values greater than 1 expand larger numbers more and compress smaller numbers more
-		const scaledRMS = Math.pow(rms, exp);
-
-		// Scale between 0.01 (1%) and 1.0 (100%)
-		return Math.min(1.0, Math.max(0.01, scaledRMS));
-	};
-
 	const analyseAudio = (stream) => {
 		const audioContext = new AudioContext();
 		const audioStreamSource = audioContext.createMediaStreamSource(stream);
@@ -74,12 +199,9 @@
 		const detectSound = () => {
 			const processFrame = () => {
 				if (!mediaRecorder || !$showCallOverlay) {
-					if (mediaRecorder) {
-						mediaRecorder.stop();
-					}
-
 					return;
 				}
+
 				analyser.getByteTimeDomainData(timeDomainData);
 				analyser.getByteFrequencyData(domainData);
 
@@ -89,9 +211,12 @@
 				// Check if initial speech/noise has started
 				const hasSound = domainData.some((value) => value > 0);
 				if (hasSound) {
-					stopAllAudio();
 					hasStartedSpeaking = true;
 					lastSoundTime = Date.now();
+
+					// BIG RED TEXT
+					console.log('%c%s', 'color: red; font-size: 20px;', '🔊 Sound detected');
+					stopAllAudio();
 				}
 
 				// Start silence detection only after initial speech/noise has been detected
@@ -114,181 +239,212 @@
 		detectSound();
 	};
 
-	const stopAllAudio = () => {
+	const transcribeHandler = async (audioBlob) => {
+		// Create a blob from the audio chunks
+
+		await tick();
+		const file = blobToFile(audioBlob, 'recording.wav');
+
+		const res = await transcribeAudio(localStorage.token, file).catch((error) => {
+			toast.error(error);
+			return null;
+		});
+
+		if (res) {
+			console.log(res.text);
+
+			if (res.text !== '') {
+				const _responses = await submitPrompt(res.text, { _raw: true });
+				console.log(_responses);
+			}
+		}
+	};
+
+	const stopAllAudio = async () => {
+		interrupted = true;
+
+		if (chatStreaming) {
+			stopResponse();
+		}
+
 		if (currentUtterance) {
 			speechSynthesis.cancel();
 			currentUtterance = null;
 		}
-		if (assistantAudio[assistantAudioIdx]) {
-			assistantAudio[assistantAudioIdx].pause();
-			assistantAudio[assistantAudioIdx].currentTime = 0;
-		}
+
+		await tick();
+		emojiQueue = [];
+		audioQueue = [];
+		await tick();
 
 		const audioElement = document.getElementById('audioElement');
-		audioElement.pause();
-		audioElement.currentTime = 0;
+		if (audioElement) {
+			audioElement.pause();
+			audioElement.currentTime = 0;
+		}
 
 		assistantSpeaking = false;
 	};
 
-	const playAudio = (idx) => {
+	const speakSpeechSynthesisHandler = (content) => {
 		if ($showCallOverlay) {
-			return new Promise((res) => {
-				assistantAudioIdx = idx;
-				const audioElement = document.getElementById('audioElement');
-				const audio = assistantAudio[idx];
-
-				audioElement.src = audio.src; // Assume `assistantAudio` has objects with a `src` property
-
-				audioElement.muted = true;
-
-				audioElement
-					.play()
-					.then(() => {
-						audioElement.muted = false;
-					})
-					.catch((error) => {
-						toast.error(error);
-					});
-
-				audioElement.onended = async (e) => {
-					await new Promise((r) => setTimeout(r, 300));
+			return new Promise((resolve) => {
+				let voices = [];
+				const getVoicesLoop = setInterval(async () => {
+					voices = await speechSynthesis.getVoices();
+					if (voices.length > 0) {
+						clearInterval(getVoicesLoop);
+
+						const voice =
+							voices
+								?.filter(
+									(v) => v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
+								)
+								?.at(0) ?? undefined;
+
+						currentUtterance = new SpeechSynthesisUtterance(content);
+
+						if (voice) {
+							currentUtterance.voice = voice;
+						}
 
-					if (Object.keys(assistantAudio).length - 1 === idx) {
-						assistantSpeaking = false;
+						speechSynthesis.speak(currentUtterance);
+						currentUtterance.onend = async (e) => {
+							await new Promise((r) => setTimeout(r, 100));
+							resolve(e);
+						};
 					}
-
-					res(e);
-				};
+				}, 100);
 			});
 		} else {
 			return Promise.resolve();
 		}
 	};
 
-	const getOpenAISpeech = async (text) => {
-		const res = await synthesizeOpenAISpeech(
-			localStorage.token,
-			$settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
-			text
-		).catch((error) => {
-			toast.error(error);
-			assistantSpeaking = false;
-			return null;
-		});
+	const playAudio = (audio) => {
+		if ($showCallOverlay) {
+			return new Promise((resolve) => {
+				const audioElement = document.getElementById('audioElement');
 
-		if (res) {
-			const blob = await res.blob();
-			const blobUrl = URL.createObjectURL(blob);
-			const audio = new Audio(blobUrl);
-			assistantAudio = audio;
+				if (audioElement) {
+					audioElement.src = audio.src;
+					audioElement.muted = true;
+
+					audioElement
+						.play()
+						.then(() => {
+							audioElement.muted = false;
+						})
+						.catch((error) => {
+							console.error(error);
+						});
+
+					audioElement.onended = async (e) => {
+						await new Promise((r) => setTimeout(r, 100));
+						resolve(e);
+					};
+				}
+			});
+		} else {
+			return Promise.resolve();
 		}
 	};
 
-	const transcribeHandler = async (audioBlob) => {
-		// Create a blob from the audio chunks
-
-		await tick();
-		const file = blobToFile(audioBlob, 'recording.wav');
+	const playAudioHandler = async () => {
+		console.log('playAudioHandler', audioQueue, assistantSpeaking, audioQueue.length > 0);
+		if (!assistantSpeaking && !interrupted && audioQueue.length > 0) {
+			assistantSpeaking = true;
 
-		const res = await transcribeAudio(localStorage.token, file).catch((error) => {
-			toast.error(error);
-			return null;
-		});
-
-		if (res) {
-			console.log(res.text);
-
-			if (res.text !== '') {
-				const _responses = await submitPrompt(res.text);
-				console.log(_responses);
-
-				if (_responses.at(0)) {
-					const content = _responses[0];
-					if ((content ?? '').trim() !== '') {
-						assistantSpeakingHandler(content);
-					}
+			if ($settings?.showEmojiInCall ?? false) {
+				if (emojiQueue.length > 0) {
+					emoji = emojiQueue.shift();
+					emojiQueue = emojiQueue;
 				}
 			}
+
+			const audioToPlay = audioQueue.shift(); // Shift the audio out from queue before playing.
+			audioQueue = audioQueue;
+			await playAudio(audioToPlay);
+			assistantSpeaking = false;
 		}
 	};
 
-	const assistantSpeakingHandler = async (content) => {
-		assistantSpeaking = true;
-
-		if (($config.audio.tts.engine ?? '') == '') {
-			let voices = [];
-			const getVoicesLoop = setInterval(async () => {
-				voices = await speechSynthesis.getVoices();
-				if (voices.length > 0) {
-					clearInterval(getVoicesLoop);
+	const setContentAudio = async (content, idx) => {
+		if (assistantSentenceAudios[idx] === undefined) {
+			// Wait for the previous audio to be loaded
+			if (idx > 0) {
+				await new Promise((resolve) => {
+					const check = setInterval(() => {
+						if (
+							assistantSentenceAudios[idx - 1] !== undefined &&
+							assistantSentenceAudios[idx - 1] !== null
+						) {
+							clearInterval(check);
+							resolve();
+						}
+					}, 100);
+				});
+			}
 
-					const voice =
-						voices
-							?.filter(
-								(v) => v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
-							)
-							?.at(0) ?? undefined;
+			assistantSentenceAudios[idx] = null;
 
-					currentUtterance = new SpeechSynthesisUtterance(content);
+			if ($settings?.showEmojiInCall ?? false) {
+				const sentenceEmoji = await generateEmoji(localStorage.token, modelId, content);
 
-					if (voice) {
-						currentUtterance.voice = voice;
-					}
+				if (sentenceEmoji) {
+					// Big red text with content and emoji
+					console.log('%c%s', 'color: blue; font-size: 10px;', `${sentenceEmoji}: ${content}`);
 
-					speechSynthesis.speak(currentUtterance);
-				}
-			}, 100);
-		} else if ($config.audio.tts.engine === 'openai') {
-			console.log('openai');
-
-			const sentences = extractSentences(content).reduce((mergedTexts, currentText) => {
-				const lastIndex = mergedTexts.length - 1;
-				if (lastIndex >= 0) {
-					const previousText = mergedTexts[lastIndex];
-					const wordCount = previousText.split(/\s+/).length;
-					if (wordCount < 2) {
-						mergedTexts[lastIndex] = previousText + ' ' + currentText;
-					} else {
-						mergedTexts.push(currentText);
+					if (/\p{Extended_Pictographic}/u.test(sentenceEmoji)) {
+						emojiQueue.push(sentenceEmoji.match(/\p{Extended_Pictographic}/gu)[0]);
+						emojiQueue = emojiQueue;
 					}
-				} else {
-					mergedTexts.push(currentText);
 				}
-				return mergedTexts;
-			}, []);
 
-			console.log(sentences);
+				await tick();
+			}
 
-			let lastPlayedAudioPromise = Promise.resolve(); // Initialize a promise that resolves immediately
+			const res = await synthesizeOpenAISpeech(
+				localStorage.token,
+				$settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
+				content
+			).catch((error) => {
+				toast.error(error);
+				assistantSpeaking = false;
+				return null;
+			});
 
-			for (const [idx, sentence] of sentences.entries()) {
-				const res = await synthesizeOpenAISpeech(
-					localStorage.token,
-					$settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
-					sentence
-				).catch((error) => {
-					toast.error(error);
+			if (res) {
+				const blob = await res.blob();
+				const blobUrl = URL.createObjectURL(blob);
+				const audio = new Audio(blobUrl);
+				assistantSentenceAudios[idx] = audio;
 
-					assistantSpeaking = false;
-					return null;
-				});
+				console.log('%c%s', 'color: red; font-size: 20px;', content);
 
-				if (res) {
-					const blob = await res.blob();
-					const blobUrl = URL.createObjectURL(blob);
-					const audio = new Audio(blobUrl);
-					assistantAudio[idx] = audio;
-					lastPlayedAudioPromise = lastPlayedAudioPromise.then(() => playAudio(idx));
-				}
+				audioQueue.push(audio);
+				audioQueue = audioQueue;
 			}
 		}
 	};
 
-	const stopRecordingCallback = async () => {
+	const stopRecordingCallback = async (_continue = true) => {
 		if ($showCallOverlay) {
+			console.log('%c%s', 'color: red; font-size: 20px;', '🚨 stopRecordingCallback 🚨');
+
+			// deep copy the audioChunks array
+			const _audioChunks = audioChunks.slice(0);
+
+			audioChunks = [];
+			mediaRecorder = false;
+
+			if (_continue) {
+				startRecording();
+			}
+
 			if (confirmed) {
 				loading = true;
+				emoji = null;
 
 				if (cameraStream) {
 					const imageUrl = takeScreenshot();
@@ -301,16 +457,12 @@
 					];
 				}
 
-				const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
+				const audioBlob = new Blob(_audioChunks, { type: 'audio/wav' });
 				await transcribeHandler(audioBlob);
 
 				confirmed = false;
 				loading = false;
 			}
-			audioChunks = [];
-			mediaRecorder = false;
-
-			startRecording();
 		} else {
 			audioChunks = [];
 			mediaRecorder = false;
@@ -332,129 +484,120 @@
 		};
 		mediaRecorder.onstop = async () => {
 			console.log('Recording stopped');
-
 			await stopRecordingCallback();
 		};
 		mediaRecorder.start();
 	};
 
-	let videoInputDevices = [];
-	let selectedVideoInputDeviceId = null;
+	const resetAssistantMessage = async () => {
+		interrupted = false;
 
-	const getVideoInputDevices = async () => {
-		const devices = await navigator.mediaDevices.enumerateDevices();
-		videoInputDevices = devices.filter((device) => device.kind === 'videoinput');
+		assistantMessage = '';
+		assistantSentenceIdx = -1;
+		assistantSentenceAudios = {}; // Reset audio tracking
+		audioQueue = []; // Clear the audio queue
+		audioQueue = audioQueue;
 
-		if (!!navigator.mediaDevices.getDisplayMedia) {
-			videoInputDevices = [
-				...videoInputDevices,
-				{
-					deviceId: 'screen',
-					label: 'Screen Share'
-				}
-			];
-		}
-
-		console.log(videoInputDevices);
-		if (selectedVideoInputDeviceId === null && videoInputDevices.length > 0) {
-			selectedVideoInputDeviceId = videoInputDevices[0].deviceId;
-		}
+		emoji = null;
+		emojiQueue = [];
+		emojiQueue = emojiQueue;
 	};
 
-	const startCamera = async () => {
-		await getVideoInputDevices();
-
-		if (cameraStream === null) {
-			camera = true;
+	$: (async () => {
+		if ($showCallOverlay) {
+			await resetAssistantMessage();
 			await tick();
-			try {
-				await startVideoStream();
-			} catch (err) {
-				console.error('Error accessing webcam: ', err);
-			}
-		}
-	};
-
-	const startVideoStream = async () => {
-		const video = document.getElementById('camera-feed');
-		if (video) {
-			if (selectedVideoInputDeviceId === 'screen') {
-				cameraStream = await navigator.mediaDevices.getDisplayMedia({
-					video: {
-						cursor: 'always'
-					},
-					audio: false
-				});
-			} else {
-				cameraStream = await navigator.mediaDevices.getUserMedia({
-					video: {
-						deviceId: selectedVideoInputDeviceId ? { exact: selectedVideoInputDeviceId } : undefined
-					}
-				});
-			}
-
-			if (cameraStream) {
-				await getVideoInputDevices();
-				video.srcObject = cameraStream;
-				await video.play();
-			}
+			startRecording();
+		} else {
+			stopCamera();
+			stopAllAudio();
+			stopRecordingCallback(false);
 		}
-	};
+	})();
 
-	const stopVideoStream = async () => {
-		if (cameraStream) {
-			const tracks = cameraStream.getTracks();
-			tracks.forEach((track) => track.stop());
+	$: {
+		if (audioQueue.length > 0 && !assistantSpeaking) {
+			playAudioHandler();
 		}
+	}
 
-		cameraStream = null;
-	};
+	onMount(() => {
+		eventTarget.addEventListener('chat:start', async (e) => {
+			if ($showCallOverlay) {
+				console.log('Chat start event:', e);
+				await resetAssistantMessage();
+				await tick();
+				chatStreaming = true;
+			}
+		});
 
-	const takeScreenshot = () => {
-		const video = document.getElementById('camera-feed');
-		const canvas = document.getElementById('camera-canvas');
+		eventTarget.addEventListener('chat', async (e) => {
+			if ($showCallOverlay) {
+				const { content } = e.detail;
+				assistantMessage += content;
+				await tick();
 
-		if (!canvas) {
-			return;
-		}
+				if (!interrupted) {
+					if ($config.audio.tts.engine !== '') {
+						assistantSentenceIdx = assistantSentences.length - 2;
 
-		const context = canvas.getContext('2d');
-
-		// Make the canvas match the video dimensions
-		canvas.width = video.videoWidth;
-		canvas.height = video.videoHeight;
+						if (assistantSentenceIdx >= 0 && !assistantSentenceAudios[assistantSentenceIdx]) {
+							await tick();
+							setContentAudio(assistantSentences[assistantSentenceIdx], assistantSentenceIdx);
+						}
+					}
+				}
 
-		// Draw the image from the video onto the canvas
-		context.drawImage(video, 0, 0, video.videoWidth, video.videoHeight);
+				chatStreaming = true;
+			}
+		});
 
-		// Convert the canvas to a data base64 URL and console log it
-		const dataURL = canvas.toDataURL('image/png');
-		console.log(dataURL);
+		eventTarget.addEventListener('chat:finish', async (e) => {
+			if ($showCallOverlay) {
+				chatStreaming = false;
+				loading = false;
 
-		return dataURL;
-	};
+				console.log('Chat finish event:', e);
+				await tick();
 
-	const stopCamera = async () => {
-		await stopVideoStream();
-		camera = false;
-	};
+				if (!interrupted) {
+					if ($config.audio.tts.engine !== '') {
+						for (const [idx, sentence] of assistantSentences.entries()) {
+							if (!assistantSentenceAudios[idx]) {
+								await tick();
+								setContentAudio(sentence, idx);
+							}
+						}
+					} else {
+						if ($settings?.showEmojiInCall ?? false) {
+							const res = await generateEmoji(localStorage.token, modelId, assistantMessage);
+
+							if (res) {
+								console.log(res);
+								if (/\p{Extended_Pictographic}/u.test(res)) {
+									emoji = res.match(/\p{Extended_Pictographic}/gu)[0];
+								}
+							}
+						}
 
-	$: if ($showCallOverlay) {
-		startRecording();
-	} else {
-		stopCamera();
-	}
+						speakSpeechSynthesisHandler(assistantMessage);
+					}
+				}
+			}
+		});
+	});
 </script>
 
+<audio id="audioElement" src="" style="display: none;" />
+
 {#if $showCallOverlay}
-	<audio id="audioElement" src="" style="display: none;" />
 	<div class=" absolute w-full h-screen max-h-[100dvh] flex z-[999] overflow-hidden">
 		<div
 			class="absolute w-full h-screen max-h-[100dvh] bg-white text-gray-700 dark:bg-black dark:text-gray-300 flex justify-center"
 		>
 			<div class="max-w-lg w-full h-screen max-h-[100dvh] flex flex-col justify-between p-3 md:p-6">
 				{#if camera}
-					<div class="flex justify-center items-center w-full min-h-20">
+					<div class="flex justify-center items-center w-full h-20 min-h-20">
 						{#if loading}
 							<svg
 								class="size-12 text-gray-900 dark:text-gray-400"
@@ -492,6 +635,19 @@
 									r="3"
 								/><circle class="spinner_qM83 spinner_ZTLf" cx="20" cy="12" r="3" /></svg
 							>
+						{:else if emoji}
+							<div
+								class="  transition-all rounded-full"
+								style="font-size:{rmsLevel * 100 > 4
+									? '4.5'
+									: rmsLevel * 100 > 2
+									? '4.25'
+									: rmsLevel * 100 > 1
+									? '3.75'
+									: '3.5'}rem;width: 100%; text-align:center;"
+							>
+								{emoji}
+							</div>
 						{:else}
 							<div
 								class=" {rmsLevel * 100 > 4
@@ -546,6 +702,19 @@
 									r="3"
 								/><circle class="spinner_qM83 spinner_ZTLf" cx="20" cy="12" r="3" /></svg
 							>
+						{:else if emoji}
+							<div
+								class="  transition-all rounded-full"
+								style="font-size:{rmsLevel * 100 > 4
+									? '13'
+									: rmsLevel * 100 > 2
+									? '12'
+									: rmsLevel * 100 > 1
+									? '11.5'
+									: '11'}rem;width:100%;text-align:center;"
+							>
+								{emoji}
+							</div>
 						{:else}
 							<div
 								class=" {rmsLevel * 100 > 4

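In short, Chat.svelte and CallOverlay.svelte now communicate through the shared EventTarget instead of the return value of submitPrompt. A sketch of that contract, using the event names and payload shape from the diffs above (the handler bodies are illustrative only, not the component code):

// One shared target; Chat.svelte is the producer, CallOverlay.svelte the consumer.
const eventTarget = new EventTarget();

// Consumer side: accumulate streamed text as it arrives.
let assistantMessage = '';

eventTarget.addEventListener('chat:start', () => {
	assistantMessage = '';
});
eventTarget.addEventListener('chat', (e) => {
	assistantMessage += (e as CustomEvent<{ content: string }>).detail.content;
});
eventTarget.addEventListener('chat:finish', () => {
	console.log('full response:', assistantMessage);
});

// Producer side: dispatched once per response, once per streamed chunk, and once at the end.
eventTarget.dispatchEvent(new CustomEvent('chat:start'));
eventTarget.dispatchEvent(new CustomEvent('chat', { detail: { content: 'Hello there.' } }));
eventTarget.dispatchEvent(new CustomEvent('chat:finish'));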
+ 1 - 1
src/lib/components/chat/Messages.svelte

@@ -79,7 +79,7 @@
 		history.currentId = userMessageId;
 
 		await tick();
-		await sendPrompt(userPrompt, userMessageId, undefined, false);
+		await sendPrompt(userPrompt, userMessageId);
 	};
 
 	const updateChatMessages = async () => {

+ 32 - 0
src/lib/components/chat/Settings/Interface.svelte

@@ -20,9 +20,12 @@
 	// Interface
 	let defaultModelId = '';
 	let showUsername = false;
+
 	let chatBubble = true;
 	let chatDirection: 'LTR' | 'RTL' = 'LTR';
 
+	let showEmojiInCall = false;
+
 	const toggleSplitLargeChunks = async () => {
 		splitLargeChunks = !splitLargeChunks;
 		saveSettings({ splitLargeChunks: splitLargeChunks });
@@ -43,6 +46,11 @@
 		saveSettings({ showUsername: showUsername });
 	};
 
+	const toggleEmojiInCall = async () => {
+		showEmojiInCall = !showEmojiInCall;
+		saveSettings({ showEmojiInCall: showEmojiInCall });
+	};
+
 	const toggleTitleAutoGenerate = async () => {
 		titleAutoGenerate = !titleAutoGenerate;
 		saveSettings({
@@ -88,8 +96,12 @@
 
 	onMount(async () => {
 		titleAutoGenerate = $settings?.title?.auto ?? true;
+
 		responseAutoCopy = $settings.responseAutoCopy ?? false;
 		showUsername = $settings.showUsername ?? false;
+
+		showEmojiInCall = $settings.showEmojiInCall ?? false;
+
 		chatBubble = $settings.chatBubble ?? true;
 		widescreenMode = $settings.widescreenMode ?? false;
 		splitLargeChunks = $settings.splitLargeChunks ?? false;
@@ -192,6 +204,26 @@
 				</div>
 			</div>
 
+			<div>
+				<div class=" py-0.5 flex w-full justify-between">
+					<div class=" self-center text-xs font-medium">{$i18n.t('Display Emoji in Call')}</div>
+
+					<button
+						class="p-1 px-3 text-xs flex rounded transition"
+						on:click={() => {
+							toggleEmojiInCall();
+						}}
+						type="button"
+					>
+						{#if showEmojiInCall === true}
+							<span class="ml-2 self-center">{$i18n.t('On')}</span>
+						{:else}
+							<span class="ml-2 self-center">{$i18n.t('Off')}</span>
+						{/if}
+					</button>
+				</div>
+			</div>
+
 			{#if !$settings.chatBubble}
 				<div>
 					<div class=" py-0.5 flex w-full justify-between">

+ 1 - 1
src/lib/utils/index.ts

@@ -436,7 +436,7 @@ export const removeEmojis = (str) => {
 
 export const extractSentences = (text) => {
 	// Split the paragraph into sentences based on common punctuation marks
-	const sentences = text.split(/(?<=[.!?])/);
+	const sentences = text.split(/(?<=[.!?])\s+/);
 
 	return sentences
 		.map((sentence) => removeEmojis(sentence.trim()))
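The tightened regex in extractSentences only splits where sentence-ending punctuation is followed by whitespace, so a decimal like "3.14" is no longer broken across fragments. A quick before/after illustration of the raw split, worked by hand from the two patterns:

const text = 'Pi is roughly 3.14. Impressive! What next?';

// Before: split after every ., !, ? regardless of what follows.
console.log(text.split(/(?<=[.!?])/));
// -> ['Pi is roughly 3.', '14.', ' Impressive!', ' What next?']

// After: split only where the punctuation is followed by whitespace.
console.log(text.split(/(?<=[.!?])\s+/));
// -> ['Pi is roughly 3.14.', 'Impressive!', 'What next?']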