
feat: Add control for how message content is split for TTS generation requests

kiosion · 8 months ago
commit 3967c34261

+ 6 - 0
backend/apps/audio/main.py

@@ -37,6 +37,7 @@ from config import (
     AUDIO_TTS_ENGINE,
     AUDIO_TTS_MODEL,
     AUDIO_TTS_VOICE,
+    AUDIO_TTS_SPLIT_ON,
     AppConfig,
     CORS_ALLOW_ORIGIN,
 )
@@ -72,6 +73,7 @@ app.state.config.TTS_ENGINE = AUDIO_TTS_ENGINE
 app.state.config.TTS_MODEL = AUDIO_TTS_MODEL
 app.state.config.TTS_VOICE = AUDIO_TTS_VOICE
 app.state.config.TTS_API_KEY = AUDIO_TTS_API_KEY
+app.state.config.TTS_SPLIT_ON = AUDIO_TTS_SPLIT_ON
 
 # setting device type for whisper model
 whisper_device_type = DEVICE_TYPE if DEVICE_TYPE and DEVICE_TYPE == "cuda" else "cpu"
@@ -88,6 +90,7 @@ class TTSConfigForm(BaseModel):
     ENGINE: str
     MODEL: str
     VOICE: str
+    SPLIT_ON: str
 
 
 class STTConfigForm(BaseModel):
@@ -139,6 +142,7 @@ async def get_audio_config(user=Depends(get_admin_user)):
             "ENGINE": app.state.config.TTS_ENGINE,
             "MODEL": app.state.config.TTS_MODEL,
             "VOICE": app.state.config.TTS_VOICE,
+            "SPLIT_ON": app.state.config.TTS_SPLIT_ON,
         },
         "stt": {
             "OPENAI_API_BASE_URL": app.state.config.STT_OPENAI_API_BASE_URL,
@@ -159,6 +163,7 @@ async def update_audio_config(
     app.state.config.TTS_ENGINE = form_data.tts.ENGINE
     app.state.config.TTS_MODEL = form_data.tts.MODEL
     app.state.config.TTS_VOICE = form_data.tts.VOICE
+    app.state.config.TTS_SPLIT_ON = form_data.tts.SPLIT_ON
 
     app.state.config.STT_OPENAI_API_BASE_URL = form_data.stt.OPENAI_API_BASE_URL
     app.state.config.STT_OPENAI_API_KEY = form_data.stt.OPENAI_API_KEY
@@ -173,6 +178,7 @@ async def update_audio_config(
             "ENGINE": app.state.config.TTS_ENGINE,
             "MODEL": app.state.config.TTS_MODEL,
             "VOICE": app.state.config.TTS_VOICE,
+            "SPLIT_ON": app.state.config.TTS_SPLIT_ON,
         },
         "stt": {
             "OPENAI_API_BASE_URL": app.state.config.STT_OPENAI_API_BASE_URL,

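For reference, a minimal sketch of how a client might exercise the extended config endpoint. The payload mirrors `TTSConfigForm` above, but the exact route, the `AUDIO_API_BASE_URL` import, and the token handling are assumptions, not part of this commit:

```typescript
import { AUDIO_API_BASE_URL } from '$lib/constants'; // assumption: constant lives here

const token = localStorage.token; // assumption: same auth token used elsewhere in the app

// Hypothetical request against the updated audio config endpoint;
// the `/config/update` path is an assumption.
const res = await fetch(`${AUDIO_API_BASE_URL}/config/update`, {
	method: 'POST',
	headers: {
		'Content-Type': 'application/json',
		Authorization: `Bearer ${token}`
	},
	body: JSON.stringify({
		tts: {
			ENGINE: 'openai',
			MODEL: 'tts-1',
			VOICE: 'alloy',
			API_KEY: '',
			// New field introduced by this commit; accepted values come from
			// the TTS_RESPONSE_SPLIT enum: 'punctuation' | 'paragraphs' | 'none'.
			SPLIT_ON: 'punctuation'
		},
		stt: {
			OPENAI_API_BASE_URL: 'https://api.openai.com/v1',
			OPENAI_API_KEY: '',
			ENGINE: '',
			MODEL: 'whisper-1'
		}
	})
});
```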
+ 6 - 0
backend/config.py

@@ -1484,3 +1484,9 @@ AUDIO_TTS_VOICE = PersistentConfig(
     "audio.tts.voice",
     os.getenv("AUDIO_TTS_VOICE", "alloy"),  # OpenAI default voice
 )
+
+AUDIO_TTS_SPLIT_ON = PersistentConfig(
+    "AUDIO_TTS_SPLIT_ON",
+    "audio.tts.split_on",
+    os.getenv("AUDIO_TTS_SPLIT_ON", "punctuation"),
+)

+ 1 - 0
backend/main.py

@@ -1924,6 +1924,7 @@ async def get_app_config(request: Request):
                     "tts": {
                         "engine": audio_app.state.config.TTS_ENGINE,
                         "voice": audio_app.state.config.TTS_VOICE,
+                        "split_on": audio_app.state.config.TTS_SPLIT_ON,
                     },
                     "stt": {
                         "engine": audio_app.state.config.STT_ENGINE,

+ 5 - 1
src/lib/apis/audio/index.ts

@@ -132,7 +132,11 @@ export const synthesizeOpenAISpeech = async (
 	return res;
 };
 
-export const getModels = async (token: string = '') => {
+interface AvailableModelsResponse {
+	models: { name: string; id: string }[] | { id: string }[];
+}
+
+export const getModels = async (token: string = ''): Promise<AvailableModelsResponse> => {
 	let error = null;
 
 	const res = await fetch(`${AUDIO_API_BASE_URL}/models`, {

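Since the new return type is a union, `name` is not guaranteed on every model entry. A small consumer sketch (illustrative, not part of the commit) showing a safe fallback to `id`:

```typescript
import { getModels } from '$lib/apis/audio';

const { models } = await getModels(localStorage.token);

// `name` only exists on one branch of the union, so narrow before using it.
const labels = models.map((m) => ('name' in m ? m.name : m.id));
```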
+ 45 - 16
src/lib/components/admin/Settings/Audio.svelte

@@ -10,31 +10,36 @@
 		getModels as _getModels,
 		getVoices as _getVoices
 	} from '$lib/apis/audio';
-	import { user, settings, config } from '$lib/stores';
+	import { config } from '$lib/stores';
 
 	import SensitiveInput from '$lib/components/common/SensitiveInput.svelte';
 
-	const i18n = getContext('i18n');
+	import { TTS_RESPONSE_SPLIT } from '$lib/types';
 
-	export let saveHandler: Function;
+	import type { Writable } from 'svelte/store';
+	import type { i18n as i18nType } from 'i18next';
 
-	// Audio
+	const i18n = getContext<Writable<i18nType>>('i18n');
+
+	export let saveHandler: () => void;
 
+	// Audio
 	let TTS_OPENAI_API_BASE_URL = '';
 	let TTS_OPENAI_API_KEY = '';
 	let TTS_API_KEY = '';
 	let TTS_ENGINE = '';
 	let TTS_MODEL = '';
 	let TTS_VOICE = '';
+	let TTS_SPLIT_ON: TTS_RESPONSE_SPLIT = TTS_RESPONSE_SPLIT.PUNCTUATION;
 
 	let STT_OPENAI_API_BASE_URL = '';
 	let STT_OPENAI_API_KEY = '';
 	let STT_ENGINE = '';
 	let STT_MODEL = '';
 
-	let voices = [];
-	let models = [];
-	let nonLocalVoices = false;
+	// eslint-disable-next-line no-undef
+	let voices: SpeechSynthesisVoice[] = [];
+	let models: Awaited<ReturnType<typeof _getModels>>['models'] = [];
 
 	const getModels = async () => {
 		if (TTS_ENGINE === '') {
@@ -53,8 +58,8 @@
 
 	const getVoices = async () => {
 		if (TTS_ENGINE === '') {
-			const getVoicesLoop = setInterval(async () => {
-				voices = await speechSynthesis.getVoices();
+			const getVoicesLoop = setInterval(() => {
+				voices = speechSynthesis.getVoices();
 
 				// do your loop
 				if (voices.length > 0) {
@@ -81,7 +86,8 @@
 				API_KEY: TTS_API_KEY,
 				ENGINE: TTS_ENGINE,
 				MODEL: TTS_MODEL,
-				VOICE: TTS_VOICE
+				VOICE: TTS_VOICE,
+				SPLIT_ON: TTS_SPLIT_ON
 			},
 			stt: {
 				OPENAI_API_BASE_URL: STT_OPENAI_API_BASE_URL,
@@ -92,9 +98,8 @@
 		});
 
 		if (res) {
-			toast.success($i18n.t('Audio settings updated successfully'));
-
-			config.set(await getBackendConfig());
+			saveHandler();
+			getBackendConfig().then(config.set).catch(() => {});
 		}
 	};
 
@@ -111,6 +116,8 @@
 			TTS_MODEL = res.tts.MODEL;
 			TTS_VOICE = res.tts.VOICE;
 
+			TTS_SPLIT_ON = res.tts.SPLIT_ON || TTS_RESPONSE_SPLIT.PUNCTUATION;
+
 			STT_OPENAI_API_BASE_URL = res.stt.OPENAI_API_BASE_URL;
 			STT_OPENAI_API_KEY = res.stt.OPENAI_API_KEY;
 
@@ -139,7 +146,7 @@
 					<div class=" self-center text-xs font-medium">{$i18n.t('Speech-to-Text Engine')}</div>
 					<div class="flex items-center relative">
 						<select
-							class="dark:bg-gray-900 w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
+							class="dark:bg-gray-900 cursor-pointer w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
 							bind:value={STT_ENGINE}
 							placeholder="Select an engine"
 						>
@@ -195,7 +202,7 @@
 					<div class=" self-center text-xs font-medium">{$i18n.t('Text-to-Speech Engine')}</div>
 					<div class="flex items-center relative">
 						<select
-							class=" dark:bg-gray-900 w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
+							class=" dark:bg-gray-900 w-fit pr-8 cursor-pointer rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
 							bind:value={TTS_ENGINE}
 							placeholder="Select a mode"
 							on:change={async (e) => {
@@ -203,7 +210,7 @@
 								await getVoices();
 								await getModels();
 
-								if (e.target.value === 'openai') {
+								if (e.target?.value === 'openai') {
 									TTS_VOICE = 'alloy';
 									TTS_MODEL = 'tts-1';
 								} else {
@@ -351,6 +358,28 @@
 						</div>
 					</div>
 				{/if}
+
+				<hr class="dark:border-gray-850 my-2" />
+
+				<div class="pt-0.5 flex w-full justify-between">
+					<div class="self-center text-xs font-medium">{$i18n.t('Response splitting')}</div>
+					<div class="flex items-center relative">
+						<select
+							class="dark:bg-gray-900 w-fit pr-8 cursor-pointer rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
+							placeholder="Select how to split response text"
+							bind:value={TTS_SPLIT_ON}
+						>
+							{#each Object.values(TTS_RESPONSE_SPLIT) as split}
+								<option value={split}>{$i18n.t(split.charAt(0).toUpperCase() + split.slice(1))}</option>
+							{/each}
+						</select>
+					</div>
+				</div>
+				<div class="mt-2 mb-1 text-xs text-gray-400 dark:text-gray-500">
+					{$i18n.t(
+						"Choose how to split response text for speech synthesis. 'Punctuation' splits by sentences, 'paragraphs' splits by paragraphs, and 'none' sends the response as a single string."
+					)}
+				</div>
 			</div>
 		</div>
 	</div>

+ 165 - 126
src/lib/components/chat/Messages/ResponseMessage.svelte

@@ -2,11 +2,10 @@
 	import { toast } from 'svelte-sonner';
 	import dayjs from 'dayjs';
 
-	import { fade } from 'svelte/transition';
 	import { createEventDispatcher } from 'svelte';
 	import { onMount, tick, getContext } from 'svelte';
 
-	const i18n = getContext('i18n');
+	const i18n = getContext<Writable<i18nType>>('i18n');
 
 	const dispatch = createEventDispatcher();
 
@@ -15,20 +14,18 @@
 	import { imageGenerations } from '$lib/apis/images';
 	import {
 		approximateToHumanReadable,
-		extractSentences,
-		replaceTokens,
-		processResponseContent
+		extractParagraphsForAudio,
+		extractSentencesForAudio,
+		prepareTextForTTS,
 	} from '$lib/utils';
 	import { WEBUI_BASE_URL } from '$lib/constants';
 
 	import Name from './Name.svelte';
 	import ProfileImage from './ProfileImage.svelte';
 	import Skeleton from './Skeleton.svelte';
-	import CodeBlock from './CodeBlock.svelte';
 	import Image from '$lib/components/common/Image.svelte';
 	import Tooltip from '$lib/components/common/Tooltip.svelte';
 	import RateComment from './RateComment.svelte';
-	import CitationsModal from '$lib/components/chat/Messages/CitationsModal.svelte';
 	import Spinner from '$lib/components/common/Spinner.svelte';
 	import WebSearchResults from './ResponseMessage/WebSearchResults.svelte';
 	import Sparkles from '$lib/components/icons/Sparkles.svelte';
@@ -36,7 +33,38 @@
 	import Error from './Error.svelte';
 	import Citations from './Citations.svelte';
 
-	export let message;
+	import type { Writable } from 'svelte/store';
+	import type { i18n as i18nType } from 'i18next';
+	import { TTS_RESPONSE_SPLIT } from '$lib/types';
+
+	interface MessageType {
+		id: string;
+		model: string;
+		content: string;
+		files?: { type: string; url: string }[];
+		timestamp: number;
+		role: string;
+		statusHistory?: { done: boolean; action: string; description: string; urls?: string[]; query?: string; }[];
+		status?: { done: boolean; action: string; description: string; urls?: string[]; query?: string; };
+		done: boolean;
+		error?: boolean | { content: string };
+		citations?: string[];
+		info?: {
+			openai?: boolean;
+			prompt_tokens?: number;
+			completion_tokens?: number;
+			total_tokens?: number;
+			eval_count?: number;
+			eval_duration?: number;
+			prompt_eval_count?: number;
+			prompt_eval_duration?: number;
+			total_duration?: number;
+			load_duration?: number;
+		};
+		annotation?: { type: string; rating: number; };
+	}
+
+	export let message: MessageType;
 	export let siblings;
 
 	export let isLastMessage = true;
@@ -60,28 +88,33 @@
 	let editedContent = '';
 	let editTextAreaElement: HTMLTextAreaElement;
 
-	let sentencesAudio = {};
-	let speaking = null;
-	let speakingIdx = null;
+	let audioParts: Record<number, HTMLAudioElement | null> = {};
+	let speaking = false;
+	let speakingIdx: number | undefined;
 
 	let loadingSpeech = false;
 	let generatingImage = false;
 
 	let showRateComment = false;
 
-	const playAudio = (idx) => {
-		return new Promise((res) => {
+	const playAudio = (idx: number) => {
+		return new Promise<void>((res) => {
 			speakingIdx = idx;
-			const audio = sentencesAudio[idx];
+			const audio = audioParts[idx];
+
+			if (!audio) {
+				return res();
+			}
+
 			audio.play();
-			audio.onended = async (e) => {
+			audio.onended = async () => {
 				await new Promise((r) => setTimeout(r, 300));
 
-				if (Object.keys(sentencesAudio).length - 1 === idx) {
-					speaking = null;
+				if (Object.keys(audioParts).length - 1 === idx) {
+					speaking = false;
 				}
 
-				res(e);
+				res();
 			};
 		});
 	};
@@ -91,113 +124,119 @@
 			try {
 				speechSynthesis.cancel();
 
-				sentencesAudio[speakingIdx].pause();
-				sentencesAudio[speakingIdx].currentTime = 0;
+				if (speakingIdx !== undefined && audioParts[speakingIdx]) {
+					audioParts[speakingIdx]!.pause();
+					audioParts[speakingIdx]!.currentTime = 0;
+				}
 			} catch {}
 
-			speaking = null;
-			speakingIdx = null;
+			speaking = false;
+			speakingIdx = undefined;
+			return;
+		}
+
+		if (!(message?.content ?? '').trim().length) {
+			toast.info($i18n.t('No content to speak'));
+			return;
+		}
+
+		speaking = true;
+
+		if ($config.audio.tts.engine !== '') {
+			loadingSpeech = true;
+
+			const preparedMessageContent: string[] = [];
+
+			switch ($config.audio.tts.split_on) {
+				default:
+				case TTS_RESPONSE_SPLIT.PUNCTUATION:
+					preparedMessageContent.push(...extractSentencesForAudio(message.content));
+					break;
+				case TTS_RESPONSE_SPLIT.PARAGRAPHS:
+					preparedMessageContent.push(...extractParagraphsForAudio(message.content));
+					break;
+				case TTS_RESPONSE_SPLIT.NONE:
+					preparedMessageContent.push(prepareTextForTTS(message.content));
+					break;
+			}
+
+			if (!preparedMessageContent.length) {
+				console.log('No content to speak');
+				toast.info($i18n.t('No content to speak'));
+
+				speaking = false;
+				loadingSpeech = false;
+				return;
+			}
+
+			console.debug('Prepared message content for TTS', preparedMessageContent);
+
+			audioParts = preparedMessageContent.reduce((acc, _sentence, idx) => {
+				acc[idx] = null;
+				return acc;
+			}, {} as typeof audioParts);
+
+			let lastPlayedAudioPromise = Promise.resolve(); // Initialize a promise that resolves immediately
+
+			for (const [idx, sentence] of preparedMessageContent.entries()) {
+				const res = await synthesizeOpenAISpeech(
+					localStorage.token,
+					$settings?.audio?.tts?.defaultVoice === $config.audio.tts.voice
+						? ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
+						: $config?.audio?.tts?.voice,
+					sentence
+				).catch((error) => {
+					console.error(error);
+					toast.error(error);
+
+					speaking = false;
+					loadingSpeech = false;
+				});
+
+				if (res) {
+					const blob = await res.blob();
+					const blobUrl = URL.createObjectURL(blob);
+					const audio = new Audio(blobUrl);
+					audioParts[idx] = audio;
+					loadingSpeech = false;
+					lastPlayedAudioPromise = lastPlayedAudioPromise.then(() => playAudio(idx));
+				}
+			}
 		} else {
-			if ((message?.content ?? '').trim() !== '') {
-				speaking = true;
-
-				if ($config.audio.tts.engine !== '') {
-					loadingSpeech = true;
-
-					const sentences = extractSentences(message.content).reduce((mergedTexts, currentText) => {
-						const lastIndex = mergedTexts.length - 1;
-						if (lastIndex >= 0) {
-							const previousText = mergedTexts[lastIndex];
-							const wordCount = previousText.split(/\s+/).length;
-							if (wordCount < 2) {
-								mergedTexts[lastIndex] = previousText + ' ' + currentText;
-							} else {
-								mergedTexts.push(currentText);
-							}
-						} else {
-							mergedTexts.push(currentText);
-						}
-						return mergedTexts;
-					}, []);
-
-					console.log(sentences);
-
-					if (sentences.length > 0) {
-						sentencesAudio = sentences.reduce((a, e, i, arr) => {
-							a[i] = null;
-							return a;
-						}, {});
-
-						let lastPlayedAudioPromise = Promise.resolve(); // Initialize a promise that resolves immediately
-
-						for (const [idx, sentence] of sentences.entries()) {
-							const res = await synthesizeOpenAISpeech(
-								localStorage.token,
-								$settings?.audio?.tts?.defaultVoice === $config.audio.tts.voice
-									? ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
-									: $config?.audio?.tts?.voice,
-								sentence
-							).catch((error) => {
-								toast.error(error);
-
-								speaking = null;
-								loadingSpeech = false;
-
-								return null;
-							});
-
-							if (res) {
-								const blob = await res.blob();
-								const blobUrl = URL.createObjectURL(blob);
-								const audio = new Audio(blobUrl);
-								sentencesAudio[idx] = audio;
-								loadingSpeech = false;
-								lastPlayedAudioPromise = lastPlayedAudioPromise.then(() => playAudio(idx));
-							}
+			let voices = [];
+			const getVoicesLoop = setInterval(() => {
+				voices = speechSynthesis.getVoices();
+				if (voices.length > 0) {
+					clearInterval(getVoicesLoop);
+
+					const voice =
+						voices
+							?.filter(
+								(v) =>
+									v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
+							)
+							?.at(0) ?? undefined;
+
+					console.log(voice);
+
+					const speak = new SpeechSynthesisUtterance(message.content);
+
+					console.log(speak);
+
+					speak.onend = () => {
+						speaking = false;
+						if ($settings.conversationMode) {
+							document.getElementById('voice-input-button')?.click();
 						}
-					} else {
-						speaking = null;
-						loadingSpeech = false;
+					};
+
+					if (voice) {
+						speak.voice = voice;
 					}
-				} else {
-					let voices = [];
-					const getVoicesLoop = setInterval(async () => {
-						voices = await speechSynthesis.getVoices();
-						if (voices.length > 0) {
-							clearInterval(getVoicesLoop);
-
-							const voice =
-								voices
-									?.filter(
-										(v) =>
-											v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
-									)
-									?.at(0) ?? undefined;
-
-							console.log(voice);
-
-							const speak = new SpeechSynthesisUtterance(message.content);
-
-							console.log(speak);
-
-							speak.onend = () => {
-								speaking = null;
-								if ($settings.conversationMode) {
-									document.getElementById('voice-input-button')?.click();
-								}
-							};
-
-							if (voice) {
-								speak.voice = voice;
-							}
-
-							speechSynthesis.speak(speak);
-						}
-					}, 100);
+
+					speechSynthesis.speak(speak);
 				}
-			} else {
-				toast.error($i18n.t('No content to speak'));
-			}
+			}, 100);
 		}
 	};
 
@@ -230,7 +269,7 @@
 		await tick();
 	};
 
-	const generateImage = async (message) => {
+	const generateImage = async (message: MessageType) => {
 		generatingImage = true;
 		const res = await imageGenerations(localStorage.token, message.content).catch((error) => {
 			toast.error(error);
@@ -285,7 +324,7 @@
 			</Name>
 
 			<div>
-				{#if (message?.files ?? []).filter((f) => f.type === 'image').length > 0}
+				{#if message?.files && message.files?.filter((f) => f.type === 'image').length > 0}
 					<div class="my-2.5 w-full flex overflow-x-auto gap-2 flex-wrap">
 						{#each message.files as file}
 							<div>
@@ -304,7 +343,7 @@
 								message?.statusHistory ?? [...(message?.status ? [message?.status] : [])]
 							).at(-1)}
 							<div class="flex items-center gap-2 pt-0.5 pb-1">
-								{#if status.done === false}
+								{#if status?.done === false}
 									<div class="">
 										<Spinner className="size-4" />
 									</div>
@@ -521,7 +560,7 @@
 											: 'invisible group-hover:visible'} p-1.5 hover:bg-black/5 dark:hover:bg-white/5 rounded-lg dark:hover:text-white hover:text-black transition"
 										on:click={() => {
 											if (!loadingSpeech) {
-												toggleSpeakMessage(message);
+												toggleSpeakMessage();
 											}
 										}}
 									>
@@ -661,7 +700,7 @@
 													`${
 														Math.round(
 															((message.info.eval_count ?? 0) /
-																(message.info.eval_duration / 1000000000)) *
+																((message.info.eval_duration ?? 0) / 1000000000)) *
 																100
 														) / 100
 													} tokens` ?? 'N/A'
@@ -669,7 +708,7 @@
 					prompt_token/s: ${
 						Math.round(
 							((message.info.prompt_eval_count ?? 0) /
-								(message.info.prompt_eval_duration / 1000000000)) *
+								((message.info.prompt_eval_duration ?? 0) / 1000000000)) *
 								100
 						) / 100 ?? 'N/A'
 					} tokens<br/>
@@ -688,7 +727,7 @@
 		            eval_duration: ${
 									Math.round(((message.info.eval_duration ?? 0) / 1000000) * 100) / 100 ?? 'N/A'
 								}ms<br/>
-		            approximate_total: ${approximateToHumanReadable(message.info.total_duration)}`}
+		            approximate_total: ${approximateToHumanReadable((message.info.total_duration ?? 0))}`}
 										placement="top"
 									>
 										<Tooltip content={$i18n.t('Generation Info')} placement="bottom">

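The rewritten `toggleSpeakMessage` keeps synthesis sequential while queueing playback on a promise chain, so part n+1 never starts before part n has ended, even though synthesis of later parts continues in the meantime. A standalone sketch of that pattern, with `synthesize` as an illustrative stand-in for `synthesizeOpenAISpeech` plus the blob-to-`Audio` handling:

```typescript
// Sketch of the sequential-playback pattern above; names are illustrative.
async function speakInOrder(
	parts: string[],
	synthesize: (text: string) => Promise<HTMLAudioElement>
): Promise<void> {
	let lastPlayed: Promise<void> = Promise.resolve();
	for (const part of parts) {
		const audio = await synthesize(part); // synthesis stays sequential
		lastPlayed = lastPlayed.then(
			() =>
				new Promise<void>((resolve) => {
					audio.onended = () => resolve(); // hand off to the next part
					audio.play();
				})
		);
	}
	return lastPlayed; // settles once the final part has finished playing
}
```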
+ 6 - 0
src/lib/types/index.ts

@@ -7,3 +7,9 @@ export type Banner = {
 	dismissible?: boolean;
 	timestamp: number;
 };
+
+export enum TTS_RESPONSE_SPLIT {
+	PUNCTUATION = 'punctuation',
+	PARAGRAPHS = 'paragraphs',
+	NONE = 'none',
+}

+ 45 - 16
src/lib/utils/index.ts

@@ -408,7 +408,7 @@ const convertOpenAIMessages = (convo) => {
 	let currentId = '';
 	let lastId = null;
 
-	for (let message_id in mapping) {
+	for (const message_id in mapping) {
 		const message = mapping[message_id];
 		currentId = message_id;
 		try {
@@ -442,7 +442,7 @@ const convertOpenAIMessages = (convo) => {
 		}
 	}
 
-	let history = {};
+	const history: Record<PropertyKey, (typeof messages)[number]> = {};
 	messages.forEach((obj) => (history[obj.id] = obj));
 
 	const chat = {
@@ -481,7 +481,7 @@ const validateChat = (chat) => {
 	}
 
 	// Every message's content should be a string
-	for (let message of messages) {
+	for (const message of messages) {
 		if (typeof message.content !== 'string') {
 			return false;
 		}
@@ -494,7 +494,7 @@ export const convertOpenAIChats = (_chats) => {
 	// Create a list of dictionaries with each conversation from import
 	const chats = [];
 	let failed = 0;
-	for (let convo of _chats) {
+	for (const convo of _chats) {
 		const chat = convertOpenAIMessages(convo);
 
 		if (validateChat(chat)) {
@@ -513,7 +513,7 @@ export const convertOpenAIChats = (_chats) => {
 	return chats;
 };
 
-export const isValidHttpUrl = (string) => {
+export const isValidHttpUrl = (string: string) => {
 	let url;
 
 	try {
@@ -525,7 +525,7 @@ export const isValidHttpUrl = (string) => {
 	return url.protocol === 'http:' || url.protocol === 'https:';
 };
 
-export const removeEmojis = (str) => {
+export const removeEmojis = (str: string) => {
 	// Regular expression to match emojis
 	const emojiRegex = /[\uD800-\uDBFF][\uDC00-\uDFFF]|\uD83C[\uDC00-\uDFFF]|\uD83D[\uDC00-\uDE4F]/g;
 
@@ -533,20 +533,24 @@ export const removeEmojis = (str) => {
 	return str.replace(emojiRegex, '');
 };
 
-export const removeFormattings = (str) => {
+export const removeFormattings = (str: string) => {
 	return str.replace(/(\*)(.*?)\1/g, '').replace(/(```)(.*?)\1/gs, '');
 };
 
-export const extractSentences = (text) => {
-	// This regular expression matches code blocks marked by triple backticks
-	const codeBlockRegex = /```[\s\S]*?```/g;
+export const prepareTextForTTS = (content: string) => {
+	return removeFormattings(removeEmojis(content.trim()));
+};
+
+// This regular expression matches code blocks marked by triple backticks
+const codeBlockRegex = /```[\s\S]*?```/g;
 
-	let codeBlocks = [];
+export const extractSentences = (text: string) => {
+	const codeBlocks: string[] = [];
 	let index = 0;
 
 	// Temporarily replace code blocks with placeholders and store the blocks separately
 	text = text.replace(codeBlockRegex, (match) => {
-		let placeholder = `\u0000${index}\u0000`; // Use a unique placeholder
+		const placeholder = `\u0000${index}\u0000`; // Use a unique placeholder
 		codeBlocks[index++] = match;
 		return placeholder;
 	});
@@ -561,11 +565,36 @@ export const extractSentences = (text) => {
 	});
 
 	return sentences
-		.map((sentence) => removeFormattings(removeEmojis(sentence.trim())))
-		.filter((sentence) => sentence);
+		.map(prepareTextForTTS)
+		.filter(Boolean);
+};
+
+export const extractParagraphsForAudio = (text: string) => {
+	const codeBlocks: string[] = [];
+	let index = 0;
+
+	// Temporarily replace code blocks with placeholders and store the blocks separately
+	text = text.replace(codeBlockRegex, (match) => {
+		const placeholder = `\u0000${index}\u0000`; // Use a unique placeholder
+		codeBlocks[index++] = match;
+		return placeholder;
+	});
+
+	// Split the modified text into paragraphs based on newlines, avoiding these blocks
+	let paragraphs = text.split(/\n+/);
+
+	// Restore code blocks and process paragraphs
+	paragraphs = paragraphs.map((paragraph) => {
+		// Check if the paragraph includes a placeholder for a code block
+		return paragraph.replace(/\u0000(\d+)\u0000/g, (_, idx) => codeBlocks[idx]);
+	});
+
+	return paragraphs
+		.map(prepareTextForTTS)
+		.filter(Boolean);
 };
 
-export const extractSentencesForAudio = (text) => {
+export const extractSentencesForAudio = (text: string) => {
 	return extractSentences(text).reduce((mergedTexts, currentText) => {
 		const lastIndex = mergedTexts.length - 1;
 		if (lastIndex >= 0) {
@@ -580,7 +609,7 @@ export const extractSentencesForAudio = (text) => {
 			mergedTexts.push(currentText);
 		}
 		return mergedTexts;
-	}, []);
+	}, [] as string[]);
 };
 
 export const blobToFile = (blob, fileName) => {
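Taken together, the three helpers map directly onto the `TTS_RESPONSE_SPLIT` values. A rough usage sketch; the commented outputs are what the code above should roughly produce, not captured test output:

```typescript
import {
	extractSentencesForAudio,
	extractParagraphsForAudio,
	prepareTextForTTS
} from '$lib/utils';

const text = 'First sentence. Second sentence!\n\nA new paragraph here.';

extractSentencesForAudio(text);
// -> ['First sentence.', 'Second sentence!', 'A new paragraph here.']
//    (short fragments under two words get merged into the previous sentence)

extractParagraphsForAudio(text);
// -> ['First sentence. Second sentence!', 'A new paragraph here.']

prepareTextForTTS(text);
// -> the whole response as one trimmed string, with emojis and
//    */``` formatting stripped but newlines kept
```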