Sfoglia il codice sorgente

Merge pull request #4886 from kiosion/dev

feat: Add control for how message content is split for TTS generation requests
Timothy Jaeryang Baek 8 mesi fa
parent
commit
b148865ee8

+ 6 - 0
backend/apps/audio/main.py

@@ -37,6 +37,7 @@ from config import (
     AUDIO_TTS_ENGINE,
     AUDIO_TTS_MODEL,
     AUDIO_TTS_VOICE,
+    AUDIO_TTS_SPLIT_ON,
     AppConfig,
     CORS_ALLOW_ORIGIN,
 )
@@ -72,6 +73,7 @@ app.state.config.TTS_ENGINE = AUDIO_TTS_ENGINE
 app.state.config.TTS_MODEL = AUDIO_TTS_MODEL
 app.state.config.TTS_VOICE = AUDIO_TTS_VOICE
 app.state.config.TTS_API_KEY = AUDIO_TTS_API_KEY
+app.state.config.TTS_SPLIT_ON = AUDIO_TTS_SPLIT_ON
 
 # setting device type for whisper model
 whisper_device_type = DEVICE_TYPE if DEVICE_TYPE and DEVICE_TYPE == "cuda" else "cpu"
@@ -88,6 +90,7 @@ class TTSConfigForm(BaseModel):
     ENGINE: str
     MODEL: str
     VOICE: str
+    SPLIT_ON: str
 
 
 class STTConfigForm(BaseModel):
@@ -139,6 +142,7 @@ async def get_audio_config(user=Depends(get_admin_user)):
             "ENGINE": app.state.config.TTS_ENGINE,
             "MODEL": app.state.config.TTS_MODEL,
             "VOICE": app.state.config.TTS_VOICE,
+            "SPLIT_ON": app.state.config.TTS_SPLIT_ON,
         },
         "stt": {
             "OPENAI_API_BASE_URL": app.state.config.STT_OPENAI_API_BASE_URL,
@@ -159,6 +163,7 @@ async def update_audio_config(
     app.state.config.TTS_ENGINE = form_data.tts.ENGINE
     app.state.config.TTS_MODEL = form_data.tts.MODEL
     app.state.config.TTS_VOICE = form_data.tts.VOICE
+    app.state.config.TTS_SPLIT_ON = form_data.tts.SPLIT_ON
 
     app.state.config.STT_OPENAI_API_BASE_URL = form_data.stt.OPENAI_API_BASE_URL
     app.state.config.STT_OPENAI_API_KEY = form_data.stt.OPENAI_API_KEY
@@ -173,6 +178,7 @@ async def update_audio_config(
             "ENGINE": app.state.config.TTS_ENGINE,
             "MODEL": app.state.config.TTS_MODEL,
             "VOICE": app.state.config.TTS_VOICE,
+            "SPLIT_ON": app.state.config.TTS_SPLIT_ON,
         },
         "stt": {
             "OPENAI_API_BASE_URL": app.state.config.STT_OPENAI_API_BASE_URL,

+ 6 - 0
backend/config.py

@@ -1484,3 +1484,9 @@ AUDIO_TTS_VOICE = PersistentConfig(
     "audio.tts.voice",
     os.getenv("AUDIO_TTS_VOICE", "alloy"),  # OpenAI default voice
 )
+
+AUDIO_TTS_SPLIT_ON = PersistentConfig(
+    "AUDIO_TTS_SPLIT_ON",
+    "audio.tts.split_on",
+    os.getenv("AUDIO_TTS_SPLIT_ON", "punctuation"),
+)

+ 1 - 0
backend/main.py

@@ -1933,6 +1933,7 @@ async def get_app_config(request: Request):
                     "tts": {
                         "engine": audio_app.state.config.TTS_ENGINE,
                         "voice": audio_app.state.config.TTS_VOICE,
+                        "split_on": audio_app.state.config.TTS_SPLIT_ON,
                     },
                     "stt": {
                         "engine": audio_app.state.config.STT_ENGINE,

+ 5 - 1
src/lib/apis/audio/index.ts

@@ -132,7 +132,11 @@ export const synthesizeOpenAISpeech = async (
 	return res;
 };
 
-export const getModels = async (token: string = '') => {
+interface AvailableModelsResponse {
+	models: { name: string; id: string }[] | { id: string }[];
+}
+
+export const getModels = async (token: string = ''): Promise<AvailableModelsResponse> => {
 	let error = null;
 
 	const res = await fetch(`${AUDIO_API_BASE_URL}/models`, {

+ 45 - 16
src/lib/components/admin/Settings/Audio.svelte

@@ -10,31 +10,36 @@
 		getModels as _getModels,
 		getVoices as _getVoices
 	} from '$lib/apis/audio';
-	import { user, settings, config } from '$lib/stores';
+	import { config } from '$lib/stores';
 
 	import SensitiveInput from '$lib/components/common/SensitiveInput.svelte';
 
-	const i18n = getContext('i18n');
+	import { TTS_RESPONSE_SPLIT } from '$lib/types';
 
-	export let saveHandler: Function;
+	import type { Writable } from 'svelte/store';
+	import type { i18n as i18nType } from 'i18next';
 
-	// Audio
+	const i18n = getContext<Writable<i18nType>>('i18n');
+
+	export let saveHandler: () => void;
 
+	// Audio
 	let TTS_OPENAI_API_BASE_URL = '';
 	let TTS_OPENAI_API_KEY = '';
 	let TTS_API_KEY = '';
 	let TTS_ENGINE = '';
 	let TTS_MODEL = '';
 	let TTS_VOICE = '';
+	let TTS_SPLIT_ON: TTS_RESPONSE_SPLIT = TTS_RESPONSE_SPLIT.PUNCTUATION;
 
 	let STT_OPENAI_API_BASE_URL = '';
 	let STT_OPENAI_API_KEY = '';
 	let STT_ENGINE = '';
 	let STT_MODEL = '';
 
-	let voices = [];
-	let models = [];
-	let nonLocalVoices = false;
+	// eslint-disable-next-line no-undef
+	let voices: SpeechSynthesisVoice[] = [];
+	let models: Awaited<ReturnType<typeof _getModels>>['models'] = [];
 
 	const getModels = async () => {
 		if (TTS_ENGINE === '') {
@@ -53,8 +58,8 @@
 
 	const getVoices = async () => {
 		if (TTS_ENGINE === '') {
-			const getVoicesLoop = setInterval(async () => {
-				voices = await speechSynthesis.getVoices();
+			const getVoicesLoop = setInterval(() => {
+				voices = speechSynthesis.getVoices();
 
 				// do your loop
 				if (voices.length > 0) {
@@ -81,7 +86,8 @@
 				API_KEY: TTS_API_KEY,
 				ENGINE: TTS_ENGINE,
 				MODEL: TTS_MODEL,
-				VOICE: TTS_VOICE
+				VOICE: TTS_VOICE,
+				SPLIT_ON: TTS_SPLIT_ON
 			},
 			stt: {
 				OPENAI_API_BASE_URL: STT_OPENAI_API_BASE_URL,
@@ -92,9 +98,8 @@
 		});
 
 		if (res) {
-			toast.success($i18n.t('Audio settings updated successfully'));
-
-			config.set(await getBackendConfig());
+			saveHandler();
+			getBackendConfig().then(config.set).catch(() => {});
 		}
 	};
 
@@ -111,6 +116,8 @@
 			TTS_MODEL = res.tts.MODEL;
 			TTS_VOICE = res.tts.VOICE;
 
+			TTS_SPLIT_ON = res.tts.SPLIT_ON || TTS_RESPONSE_SPLIT.PUNCTUATION;
+
 			STT_OPENAI_API_BASE_URL = res.stt.OPENAI_API_BASE_URL;
 			STT_OPENAI_API_KEY = res.stt.OPENAI_API_KEY;
 
@@ -139,7 +146,7 @@
 					<div class=" self-center text-xs font-medium">{$i18n.t('Speech-to-Text Engine')}</div>
 					<div class="flex items-center relative">
 						<select
-							class="dark:bg-gray-900 w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
+							class="dark:bg-gray-900 cursor-pointer w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
 							bind:value={STT_ENGINE}
 							placeholder="Select an engine"
 						>
@@ -195,7 +202,7 @@
 					<div class=" self-center text-xs font-medium">{$i18n.t('Text-to-Speech Engine')}</div>
 					<div class="flex items-center relative">
 						<select
-							class=" dark:bg-gray-900 w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
+							class=" dark:bg-gray-900 w-fit pr-8 cursor-pointer rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
 							bind:value={TTS_ENGINE}
 							placeholder="Select a mode"
 							on:change={async (e) => {
@@ -203,7 +210,7 @@
 								await getVoices();
 								await getModels();
 
-								if (e.target.value === 'openai') {
+								if (e.target?.value === 'openai') {
 									TTS_VOICE = 'alloy';
 									TTS_MODEL = 'tts-1';
 								} else {
@@ -351,6 +358,28 @@
 						</div>
 					</div>
 				{/if}
+
+				<hr class="dark:border-gray-850 my-2" />
+
+				<div class="pt-0.5 flex w-full justify-between">
+					<div class="self-center text-xs font-medium">{$i18n.t('Response splitting')}</div>
+					<div class="flex items-center relative">
+						<select
+							class="dark:bg-gray-900 w-fit pr-8 cursor-pointer rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
+							aria-label="Select how to split message text for TTS requests"
+							bind:value={TTS_SPLIT_ON}
+						>
+						{#each Object.values(TTS_RESPONSE_SPLIT) as split}
+							<option value={split}>{$i18n.t(split.charAt(0).toUpperCase() + split.slice(1))}</option>
+						{/each}
+						</select>
+					</div>
+				</div>
+				<div class="mt-2 mb-1 text-xs text-gray-400 dark:text-gray-500">
+					{$i18n.t(
+						"Control how message text is split for TTS requests. 'Punctuation' splits into sentences, 'paragraphs' splits into paragraphs, and 'none' keeps the message as a single string."
+					)}
+				</div>
 			</div>
 		</div>
 	</div>

+ 96 - 74
src/lib/components/chat/Chat.svelte

@@ -3,13 +3,13 @@
 	import { toast } from 'svelte-sonner';
 	import mermaid from 'mermaid';
 
-	import { getContext, onMount, tick } from 'svelte';
+	import { getContext, onDestroy, onMount, tick } from 'svelte';
 	import { goto } from '$app/navigation';
 	import { page } from '$app/stores';
 
-	import type { Writable } from 'svelte/store';
+	import type { Unsubscriber, Writable } from 'svelte/store';
 	import type { i18n as i18nType } from 'i18next';
-	import { OLLAMA_API_BASE_URL, OPENAI_API_BASE_URL, WEBUI_BASE_URL } from '$lib/constants';
+	import { WEBUI_BASE_URL } from '$lib/constants';
 
 	import {
 		chatId,
@@ -19,31 +19,26 @@
 		models,
 		settings,
 		showSidebar,
-		tags as _tags,
 		WEBUI_NAME,
 		banners,
 		user,
 		socket,
 		showCallOverlay,
-		tools,
 		currentChatPage,
 		temporaryChatEnabled
 	} from '$lib/stores';
 	import {
 		convertMessagesToHistory,
 		copyToClipboard,
+		getMessageContentParts,
 		extractSentencesForAudio,
-		getUserPosition,
 		promptTemplate,
 		splitStream
 	} from '$lib/utils';
 
 	import { generateChatCompletion } from '$lib/apis/ollama';
 	import {
-		addTagById,
 		createNewChat,
-		deleteTagById,
-		getAllChatTags,
 		getChatById,
 		getChatList,
 		getTagsById,
@@ -66,8 +61,6 @@
 	import MessageInput from '$lib/components/chat/MessageInput.svelte';
 	import Messages from '$lib/components/chat/Messages.svelte';
 	import Navbar from '$lib/components/layout/Navbar.svelte';
-	import CallOverlay from './MessageInput/CallOverlay.svelte';
-	import { error } from '@sveltejs/kit';
 	import ChatControls from './ChatControls.svelte';
 	import EventConfirmDialog from '../common/ConfirmDialog.svelte';
 
@@ -118,6 +111,8 @@
 
 	let params = {};
 
+	let chatIdUnsubscriber: Unsubscriber | undefined;
+
 	$: if (history.currentId !== null) {
 		let _messages = [];
 
@@ -207,47 +202,51 @@
 		}
 	};
 
-	onMount(async () => {
-		const onMessageHandler = async (event) => {
-			if (event.origin === window.origin) {
-				// Replace with your iframe's origin
-				console.log('Message received from iframe:', event.data);
-				if (event.data.type === 'input:prompt') {
-					console.log(event.data.text);
-
-					const inputElement = document.getElementById('chat-textarea');
-
-					if (inputElement) {
-						prompt = event.data.text;
-						inputElement.focus();
-					}
-				}
+	const onMessageHandler = async (event: {
+		origin: string;
+		data: { type: string; text: string };
+	}) => {
+		if (event.origin !== window.origin) {
+			return;
+		}
 
-				if (event.data.type === 'action:submit') {
-					console.log(event.data.text);
+		// Replace with your iframe's origin
+		if (event.data.type === 'input:prompt') {
+			console.debug(event.data.text);
 
-					if (prompt !== '') {
-						await tick();
-						submitPrompt(prompt);
-					}
-				}
+			const inputElement = document.getElementById('chat-textarea');
 
-				if (event.data.type === 'input:prompt:submit') {
-					console.log(event.data.text);
+			if (inputElement) {
+				prompt = event.data.text;
+				inputElement.focus();
+			}
+		}
 
-					if (prompt !== '') {
-						await tick();
-						submitPrompt(event.data.text);
-					}
-				}
+		if (event.data.type === 'action:submit') {
+			console.debug(event.data.text);
+
+			if (prompt !== '') {
+				await tick();
+				submitPrompt(prompt);
 			}
-		};
-		window.addEventListener('message', onMessageHandler);
+		}
 
-		$socket.on('chat-events', chatEventHandler);
+		if (event.data.type === 'input:prompt:submit') {
+			console.debug(event.data.text);
+
+			if (prompt !== '') {
+				await tick();
+				submitPrompt(event.data.text);
+			}
+		}
+	};
+
+	onMount(async () => {
+		window.addEventListener('message', onMessageHandler);
+		$socket?.on('chat-events', chatEventHandler);
 
 		if (!$chatId) {
-			chatId.subscribe(async (value) => {
+			chatIdUnsubscriber = chatId.subscribe(async (value) => {
 				if (!value) {
 					await initNewChat();
 				}
@@ -257,12 +256,12 @@
 				await goto('/');
 			}
 		}
+	});
 
-		return () => {
-			window.removeEventListener('message', onMessageHandler);
-
-			$socket.off('chat-events');
-		};
+	onDestroy(() => {
+		chatIdUnsubscriber?.();
+		window.removeEventListener('message', onMessageHandler);
+		$socket?.off('chat-events');
 	});
 
 	//////////////////////////
@@ -595,11 +594,11 @@
 	};
 
 	const sendPrompt = async (
-		prompt,
-		parentId,
+		prompt: string,
+		parentId: string,
 		{ modelId = null, modelIdx = null, newChat = false } = {}
 	) => {
-		let _responses = [];
+		let _responses: string[] = [];
 
 		// If modelId is provided, use it, else use selected model
 		let selectedModelIds = modelId
@@ -609,7 +608,7 @@
 				: selectedModels;
 
 		// Create response messages for each selected model
-		const responseMessageIds = {};
+		const responseMessageIds: Record<PropertyKey, string> = {};
 		for (const [_modelIdx, modelId] of selectedModelIds.entries()) {
 			const model = $models.filter((m) => m.id === modelId).at(0);
 
@@ -739,13 +738,13 @@
 		);
 
 		currentChatPage.set(1);
-		await chats.set(await getChatList(localStorage.token, $currentChatPage));
+		chats.set(await getChatList(localStorage.token, $currentChatPage));
 
 		return _responses;
 	};
 
 	const sendPromptOllama = async (model, userPrompt, responseMessageId, _chatId) => {
-		let _response = null;
+		let _response: string | null = null;
 
 		const responseMessage = history.messages[responseMessageId];
 		const userMessage = history.messages[responseMessage.parentId];
@@ -776,7 +775,7 @@
 			...messages
 		]
 			.filter((message) => message?.content?.trim())
-			.map((message, idx, arr) => {
+			.map((message) => {
 				// Prepare the base message object
 				const baseMessage = {
 					role: message.role,
@@ -928,18 +927,26 @@
 										navigator.vibrate(5);
 									}
 
-									const sentences = extractSentencesForAudio(responseMessage.content);
-									sentences.pop();
+									const messageContentParts = getMessageContentParts(
+										responseMessage.content,
+										$config?.audio?.tts?.split_on ?? 'punctuation'
+									);
+									messageContentParts.pop();
 
 									// dispatch only last sentence and make sure it hasn't been dispatched before
 									if (
-										sentences.length > 0 &&
-										sentences[sentences.length - 1] !== responseMessage.lastSentence
+										messageContentParts.length > 0 &&
+										messageContentParts[messageContentParts.length - 1] !==
+											responseMessage.lastSentence
 									) {
-										responseMessage.lastSentence = sentences[sentences.length - 1];
+										responseMessage.lastSentence =
+											messageContentParts[messageContentParts.length - 1];
 										eventTarget.dispatchEvent(
 											new CustomEvent('chat', {
-												detail: { id: responseMessageId, content: sentences[sentences.length - 1] }
+												detail: {
+													id: responseMessageId,
+													content: messageContentParts[messageContentParts.length - 1]
+												}
 											})
 										);
 									}
@@ -1042,14 +1049,19 @@
 		stopResponseFlag = false;
 		await tick();
 
-		let lastSentence = extractSentencesForAudio(responseMessage.content)?.at(-1) ?? '';
-		if (lastSentence) {
+		let lastMessageContentPart =
+			getMessageContentParts(
+				responseMessage.content,
+				$config?.audio?.tts?.split_on ?? 'punctuation'
+			)?.at(-1) ?? '';
+		if (lastMessageContentPart) {
 			eventTarget.dispatchEvent(
 				new CustomEvent('chat', {
-					detail: { id: responseMessageId, content: lastSentence }
+					detail: { id: responseMessageId, content: lastMessageContentPart }
 				})
 			);
 		}
+
 		eventTarget.dispatchEvent(
 			new CustomEvent('chat:finish', {
 				detail: {
@@ -1249,18 +1261,24 @@
 							navigator.vibrate(5);
 						}
 
-						const sentences = extractSentencesForAudio(responseMessage.content);
-						sentences.pop();
+						const messageContentParts = getMessageContentParts(
+							responseMessage.content,
+							$config?.audio?.tts?.split_on ?? 'punctuation'
+						);
+						messageContentParts.pop();
 
 						// dispatch only last sentence and make sure it hasn't been dispatched before
 						if (
-							sentences.length > 0 &&
-							sentences[sentences.length - 1] !== responseMessage.lastSentence
+							messageContentParts.length > 0 &&
+							messageContentParts[messageContentParts.length - 1] !== responseMessage.lastSentence
 						) {
-							responseMessage.lastSentence = sentences[sentences.length - 1];
+							responseMessage.lastSentence = messageContentParts[messageContentParts.length - 1];
 							eventTarget.dispatchEvent(
 								new CustomEvent('chat', {
-									detail: { id: responseMessageId, content: sentences[sentences.length - 1] }
+									detail: {
+										id: responseMessageId,
+										content: messageContentParts[messageContentParts.length - 1]
+									}
 								})
 							);
 						}
@@ -1315,11 +1333,15 @@
 		stopResponseFlag = false;
 		await tick();
 
-		let lastSentence = extractSentencesForAudio(responseMessage.content)?.at(-1) ?? '';
-		if (lastSentence) {
+		let lastMessageContentPart =
+			getMessageContentParts(
+				responseMessage.content,
+				$config?.audio?.tts?.split_on ?? 'punctuation'
+			)?.at(-1) ?? '';
+		if (lastMessageContentPart) {
 			eventTarget.dispatchEvent(
 				new CustomEvent('chat', {
-					detail: { id: responseMessageId, content: lastSentence }
+					detail: { id: responseMessageId, content: lastMessageContentPart }
 				})
 			);
 		}

+ 169 - 126
src/lib/components/chat/Messages/ResponseMessage.svelte

@@ -2,11 +2,10 @@
 	import { toast } from 'svelte-sonner';
 	import dayjs from 'dayjs';
 
-	import { fade } from 'svelte/transition';
 	import { createEventDispatcher } from 'svelte';
 	import { onMount, tick, getContext } from 'svelte';
 
-	const i18n = getContext('i18n');
+	const i18n = getContext<Writable<i18nType>>('i18n');
 
 	const dispatch = createEventDispatcher();
 
@@ -15,20 +14,19 @@
 	import { imageGenerations } from '$lib/apis/images';
 	import {
 		approximateToHumanReadable,
-		extractSentences,
-		replaceTokens,
-		processResponseContent
+		extractParagraphsForAudio,
+		extractSentencesForAudio,
+		cleanText,
+		getMessageContentParts
 	} from '$lib/utils';
 	import { WEBUI_BASE_URL } from '$lib/constants';
 
 	import Name from './Name.svelte';
 	import ProfileImage from './ProfileImage.svelte';
 	import Skeleton from './Skeleton.svelte';
-	import CodeBlock from './CodeBlock.svelte';
 	import Image from '$lib/components/common/Image.svelte';
 	import Tooltip from '$lib/components/common/Tooltip.svelte';
 	import RateComment from './RateComment.svelte';
-	import CitationsModal from '$lib/components/chat/Messages/CitationsModal.svelte';
 	import Spinner from '$lib/components/common/Spinner.svelte';
 	import WebSearchResults from './ResponseMessage/WebSearchResults.svelte';
 	import Sparkles from '$lib/components/icons/Sparkles.svelte';
@@ -36,7 +34,49 @@
 	import Error from './Error.svelte';
 	import Citations from './Citations.svelte';
 
-	export let message;
+	import type { Writable } from 'svelte/store';
+	import type { i18n as i18nType } from 'i18next';
+
+	interface MessageType {
+		id: string;
+		model: string;
+		content: string;
+		files?: { type: string; url: string }[];
+		timestamp: number;
+		role: string;
+		statusHistory?: {
+			done: boolean;
+			action: string;
+			description: string;
+			urls?: string[];
+			query?: string;
+		}[];
+		status?: {
+			done: boolean;
+			action: string;
+			description: string;
+			urls?: string[];
+			query?: string;
+		};
+		done: boolean;
+		error?: boolean | { content: string };
+		citations?: string[];
+		info?: {
+			openai?: boolean;
+			prompt_tokens?: number;
+			completion_tokens?: number;
+			total_tokens?: number;
+			eval_count?: number;
+			eval_duration?: number;
+			prompt_eval_count?: number;
+			prompt_eval_duration?: number;
+			total_duration?: number;
+			load_duration?: number;
+		};
+		annotation?: { type: string; rating: number };
+	}
+
+	export let message: MessageType;
 	export let siblings;
 
 	export let isLastMessage = true;
@@ -60,28 +100,33 @@
 	let editedContent = '';
 	let editTextAreaElement: HTMLTextAreaElement;
 
-	let sentencesAudio = {};
-	let speaking = null;
-	let speakingIdx = null;
+	let audioParts: Record<number, HTMLAudioElement | null> = {};
+	let speaking = false;
+	let speakingIdx: number | undefined;
 
 	let loadingSpeech = false;
 	let generatingImage = false;
 
 	let showRateComment = false;
 
-	const playAudio = (idx) => {
-		return new Promise((res) => {
+	const playAudio = (idx: number) => {
+		return new Promise<void>((res) => {
 			speakingIdx = idx;
-			const audio = sentencesAudio[idx];
+			const audio = audioParts[idx];
+
+			if (!audio) {
+				return res();
+			}
+
 			audio.play();
-			audio.onended = async (e) => {
+			audio.onended = async () => {
 				await new Promise((r) => setTimeout(r, 300));
 
-				if (Object.keys(sentencesAudio).length - 1 === idx) {
-					speaking = null;
+				if (Object.keys(audioParts).length - 1 === idx) {
+					speaking = false;
 				}
 
-				res(e);
+				res();
 			};
 		});
 	};
@@ -91,113 +136,111 @@
 			try {
 				speechSynthesis.cancel();
 
-				sentencesAudio[speakingIdx].pause();
-				sentencesAudio[speakingIdx].currentTime = 0;
+				if (speakingIdx !== undefined && audioParts[speakingIdx]) {
+					audioParts[speakingIdx]!.pause();
+					audioParts[speakingIdx]!.currentTime = 0;
+				}
 			} catch {}
 
-			speaking = null;
-			speakingIdx = null;
+			speaking = false;
+			speakingIdx = undefined;
+			return;
+		}
+
+		if (!(message?.content ?? '').trim().length) {
+			toast.info($i18n.t('No content to speak'));
+			return;
+		}
+
+		speaking = true;
+
+		if ($config.audio.tts.engine !== '') {
+			loadingSpeech = true;
+
+			const messageContentParts: string[] = getMessageContentParts(
+				message.content,
+				$config?.audio?.tts?.split_on ?? 'punctuation'
+			);
+
+			if (!messageContentParts.length) {
+				console.log('No content to speak');
+				toast.info($i18n.t('No content to speak'));
+
+				speaking = false;
+				loadingSpeech = false;
+				return;
+			}
+
+			console.debug('Prepared message content for TTS', messageContentParts);
+
+			audioParts = messageContentParts.reduce(
+				(acc, _sentence, idx) => {
+					acc[idx] = null;
+					return acc;
+				},
+				{} as typeof audioParts
+			);
+
+			let lastPlayedAudioPromise = Promise.resolve(); // Initialize a promise that resolves immediately
+
+			for (const [idx, sentence] of messageContentParts.entries()) {
+				const res = await synthesizeOpenAISpeech(
+					localStorage.token,
+					$settings?.audio?.tts?.defaultVoice === $config.audio.tts.voice
+						? ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
+						: $config?.audio?.tts?.voice,
+					sentence
+				).catch((error) => {
+					console.error(error);
+					toast.error(error);
+
+					speaking = false;
+					loadingSpeech = false;
+				});
+
+				if (res) {
+					const blob = await res.blob();
+					const blobUrl = URL.createObjectURL(blob);
+					const audio = new Audio(blobUrl);
+					audioParts[idx] = audio;
+					loadingSpeech = false;
+					lastPlayedAudioPromise = lastPlayedAudioPromise.then(() => playAudio(idx));
+				}
+			}
 		} else {
-			if ((message?.content ?? '').trim() !== '') {
-				speaking = true;
-
-				if ($config.audio.tts.engine !== '') {
-					loadingSpeech = true;
-
-					const sentences = extractSentences(message.content).reduce((mergedTexts, currentText) => {
-						const lastIndex = mergedTexts.length - 1;
-						if (lastIndex >= 0) {
-							const previousText = mergedTexts[lastIndex];
-							const wordCount = previousText.split(/\s+/).length;
-							if (wordCount < 2) {
-								mergedTexts[lastIndex] = previousText + ' ' + currentText;
-							} else {
-								mergedTexts.push(currentText);
-							}
-						} else {
-							mergedTexts.push(currentText);
-						}
-						return mergedTexts;
-					}, []);
-
-					console.log(sentences);
-
-					if (sentences.length > 0) {
-						sentencesAudio = sentences.reduce((a, e, i, arr) => {
-							a[i] = null;
-							return a;
-						}, {});
-
-						let lastPlayedAudioPromise = Promise.resolve(); // Initialize a promise that resolves immediately
-
-						for (const [idx, sentence] of sentences.entries()) {
-							const res = await synthesizeOpenAISpeech(
-								localStorage.token,
-								$settings?.audio?.tts?.defaultVoice === $config.audio.tts.voice
-									? ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
-									: $config?.audio?.tts?.voice,
-								sentence
-							).catch((error) => {
-								toast.error(error);
-
-								speaking = null;
-								loadingSpeech = false;
-
-								return null;
-							});
-
-							if (res) {
-								const blob = await res.blob();
-								const blobUrl = URL.createObjectURL(blob);
-								const audio = new Audio(blobUrl);
-								sentencesAudio[idx] = audio;
-								loadingSpeech = false;
-								lastPlayedAudioPromise = lastPlayedAudioPromise.then(() => playAudio(idx));
-							}
+			let voices = [];
+			const getVoicesLoop = setInterval(() => {
+				voices = speechSynthesis.getVoices();
+				if (voices.length > 0) {
+					clearInterval(getVoicesLoop);
+
+					const voice =
+						voices
+							?.filter(
+								(v) => v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
+							)
+							?.at(0) ?? undefined;
+
+					console.log(voice);
+
+					const speak = new SpeechSynthesisUtterance(message.content);
+
+					console.log(speak);
+
+					speak.onend = () => {
+						speaking = false;
+						if ($settings.conversationMode) {
+							document.getElementById('voice-input-button')?.click();
 						}
-					} else {
-						speaking = null;
-						loadingSpeech = false;
+					};
+
+					if (voice) {
+						speak.voice = voice;
 					}
-				} else {
-					let voices = [];
-					const getVoicesLoop = setInterval(async () => {
-						voices = await speechSynthesis.getVoices();
-						if (voices.length > 0) {
-							clearInterval(getVoicesLoop);
-
-							const voice =
-								voices
-									?.filter(
-										(v) =>
-											v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
-									)
-									?.at(0) ?? undefined;
-
-							console.log(voice);
-
-							const speak = new SpeechSynthesisUtterance(message.content);
-
-							console.log(speak);
-
-							speak.onend = () => {
-								speaking = null;
-								if ($settings.conversationMode) {
-									document.getElementById('voice-input-button')?.click();
-								}
-							};
-
-							if (voice) {
-								speak.voice = voice;
-							}
-
-							speechSynthesis.speak(speak);
-						}
-					}, 100);
+
+					speechSynthesis.speak(speak);
 				}
-			} else {
-				toast.error($i18n.t('No content to speak'));
-			}
+			}, 100);
 		}
 	};
 
@@ -230,7 +273,7 @@
 		await tick();
 	};
 
-	const generateImage = async (message) => {
+	const generateImage = async (message: MessageType) => {
 		generatingImage = true;
 		const res = await imageGenerations(localStorage.token, message.content).catch((error) => {
 			toast.error(error);
@@ -285,7 +328,7 @@
 			</Name>
 
 			<div>
-				{#if (message?.files ?? []).filter((f) => f.type === 'image').length > 0}
+				{#if message?.files && message.files?.filter((f) => f.type === 'image').length > 0}
 					<div class="my-2.5 w-full flex overflow-x-auto gap-2 flex-wrap">
 						{#each message.files as file}
 							<div>
@@ -304,7 +347,7 @@
 								message?.statusHistory ?? [...(message?.status ? [message?.status] : [])]
 							).at(-1)}
 							<div class="flex items-center gap-2 pt-0.5 pb-1">
-								{#if status.done === false}
+								{#if status?.done === false}
 									<div class="">
 										<Spinner className="size-4" />
 									</div>
@@ -521,7 +564,7 @@
 											: 'invisible group-hover:visible'} p-1.5 hover:bg-black/5 dark:hover:bg-white/5 rounded-lg dark:hover:text-white hover:text-black transition"
 										on:click={() => {
 											if (!loadingSpeech) {
-												toggleSpeakMessage(message);
+												toggleSpeakMessage();
 											}
 										}}
 									>
@@ -661,7 +704,7 @@
 													`${
 														Math.round(
 															((message.info.eval_count ?? 0) /
-																(message.info.eval_duration / 1000000000)) *
+																((message.info.eval_duration ?? 0) / 1000000000)) *
 																100
 														) / 100
 													} tokens` ?? 'N/A'
@@ -669,7 +712,7 @@
 					prompt_token/s: ${
 						Math.round(
 							((message.info.prompt_eval_count ?? 0) /
-								(message.info.prompt_eval_duration / 1000000000)) *
+								((message.info.prompt_eval_duration ?? 0) / 1000000000)) *
 								100
 						) / 100 ?? 'N/A'
 					} tokens<br/>
@@ -688,7 +731,7 @@
 		            eval_duration: ${
 									Math.round(((message.info.eval_duration ?? 0) / 1000000) * 100) / 100 ?? 'N/A'
 								}ms<br/>
-		            approximate_total: ${approximateToHumanReadable(message.info.total_duration)}`}
+		            approximate_total: ${approximateToHumanReadable(message.info.total_duration ?? 0)}`}
 										placement="top"
 									>
 										<Tooltip content={$i18n.t('Generation Info')} placement="bottom">

+ 4 - 0
src/lib/i18n/locales/en-GB/translation.json

@@ -138,6 +138,7 @@
 	"Continue Response": "",
 	"Continue with {{provider}}": "",
 	"Controls": "",
+	"Control how message text is split for TTS requests. 'Punctuation' splits into sentences, 'paragraphs' splits into paragraphs, and 'none' keeps the message as a single string.": "",
 	"Copied": "",
 	"Copied shared chat URL to clipboard!": "",
 	"Copied to clipboard": "",
@@ -455,6 +456,7 @@
 	"or": "",
 	"Other": "",
 	"Password": "",
+	"Paragraphs": "",
 	"PDF document (.pdf)": "",
 	"PDF Extract Images (OCR)": "",
 	"pending": "",
@@ -483,6 +485,7 @@
 	"Prompts": "",
 	"Pull \"{{searchValue}}\" from Ollama.com": "",
 	"Pull a model from Ollama.com": "",
+	"Punctuation": "",
 	"Query Params": "",
 	"RAG Template": "",
 	"Read Aloud": "",
@@ -504,6 +507,7 @@
 	"Reset Upload Directory": "",
 	"Reset Vector Storage": "",
 	"Response AutoCopy to Clipboard": "",
+	"Response splitting": "",
 	"Response notifications cannot be activated as the website permissions have been denied. Please visit your browser settings to grant the necessary access.": "",
 	"Role": "",
 	"Rosé Pine": "",

+ 4 - 0
src/lib/i18n/locales/en-US/translation.json

@@ -138,6 +138,7 @@
 	"Continue Response": "",
 	"Continue with {{provider}}": "",
 	"Controls": "",
+	"Control how message text is split for TTS requests. 'Punctuation' splits into sentences, 'paragraphs' splits into paragraphs, and 'none' keeps the message as a single string.": "",
 	"Copied": "",
 	"Copied shared chat URL to clipboard!": "",
 	"Copied to clipboard": "",
@@ -455,6 +456,7 @@
 	"or": "",
 	"Other": "",
 	"Password": "",
+	"Paragraphs": "",
 	"PDF document (.pdf)": "",
 	"PDF Extract Images (OCR)": "",
 	"pending": "",
@@ -483,6 +485,7 @@
 	"Prompts": "",
 	"Pull \"{{searchValue}}\" from Ollama.com": "",
 	"Pull a model from Ollama.com": "",
+	"Punctuation": "",
 	"Query Params": "",
 	"RAG Template": "",
 	"Read Aloud": "",
@@ -504,6 +507,7 @@
 	"Reset Upload Directory": "",
 	"Reset Vector Storage": "",
 	"Response AutoCopy to Clipboard": "",
+	"Response splitting": "",
 	"Response notifications cannot be activated as the website permissions have been denied. Please visit your browser settings to grant the necessary access.": "",
 	"Role": "",
 	"Rosé Pine": "",

+ 5 - 1
src/lib/i18n/locales/fr-CA/translation.json

@@ -137,7 +137,8 @@
 	"Context Length": "Longueur du contexte",
 	"Continue Response": "Continuer la réponse",
 	"Continue with {{provider}}": "Continuer avec {{provider}}",
-	"Controls": "",
+	"Controls": "Contrôles",
+	"Control how message text is split for TTS requests. 'Punctuation' splits into sentences, 'paragraphs' splits into paragraphs, and 'none' keeps the message as a single string.": "Contrôle comment le texte des messages est divisé pour les demandes de TTS. 'Ponctuation' divise en phrases, 'paragraphes' divise en paragraphes et 'aucun' garde le message comme une seule chaîne.",
 	"Copied": "",
 	"Copied shared chat URL to clipboard!": "URL du chat copiée dans le presse-papiers\u00a0!",
 	"Copied to clipboard": "",
@@ -455,6 +456,7 @@
 	"or": "ou",
 	"Other": "Autre",
 	"Password": "Mot de passe",
+	"Paragraphs": "Paragraphes",
 	"PDF document (.pdf)": "Document au format PDF  (.pdf)",
 	"PDF Extract Images (OCR)": "Extraction d'images PDF (OCR)",
 	"pending": "en attente",
@@ -483,6 +485,7 @@
 	"Prompts": "Prompts",
 	"Pull \"{{searchValue}}\" from Ollama.com": "Récupérer « {{searchValue}} » depuis Ollama.com",
 	"Pull a model from Ollama.com": "Télécharger un modèle depuis Ollama.com",
+	"Punctuation": "Ponctuation",
 	"Query Params": "Paramètres de requête",
 	"RAG Template": "Modèle RAG",
 	"Read Aloud": "Lire à haute voix",
@@ -504,6 +507,7 @@
 	"Reset Upload Directory": "Répertoire de téléchargement réinitialisé",
 	"Reset Vector Storage": "Réinitialiser le stockage des vecteurs",
 	"Response AutoCopy to Clipboard": "Copie automatique de la réponse vers le presse-papiers",
+	"Response splitting": "Fractionnement de la réponse",
 	"Response notifications cannot be activated as the website permissions have been denied. Please visit your browser settings to grant the necessary access.": "Les notifications de réponse ne peuvent pas être activées car les autorisations du site web ont été refusées. Veuillez visiter les paramètres de votre navigateur pour accorder l'accès nécessaire.",
 	"Role": "Rôle",
 	"Rosé Pine": "Pin rosé",

+ 4 - 0
src/lib/i18n/locales/fr-FR/translation.json

@@ -138,6 +138,7 @@
 	"Continue Response": "Continuer la réponse",
 	"Continue with {{provider}}": "Continuer avec {{provider}}",
 	"Controls": "Contrôles",
+	"Control how message text is split for TTS requests. 'Punctuation' splits into sentences, 'paragraphs' splits into paragraphs, and 'none' keeps the message as a single string.": "Contrôle la façon dont le texte des messages est divisé pour les demandes de TTS. 'Ponctuation' divise en phrases, 'paragraphes' divise en paragraphes et 'aucun' garde le message en tant que chaîne unique.",
 	"Copied": "Copié",
 	"Copied shared chat URL to clipboard!": "URL du chat copiée dans le presse-papiers\u00a0!",
 	"Copied to clipboard": "",
@@ -455,6 +456,7 @@
 	"or": "ou",
 	"Other": "Autre",
 	"Password": "Mot de passe",
+	"Paragraphs": "Paragraphes",
 	"PDF document (.pdf)": "Document au format PDF  (.pdf)",
 	"PDF Extract Images (OCR)": "Extraction d'images PDF (OCR)",
 	"pending": "en attente",
@@ -483,6 +485,7 @@
 	"Prompts": "Prompts",
 	"Pull \"{{searchValue}}\" from Ollama.com": "Récupérer « {{searchValue}} » depuis Ollama.com",
 	"Pull a model from Ollama.com": "Télécharger un modèle depuis Ollama.com",
+	"Punctuation": "Ponctuation",
 	"Query Params": "Paramètres de requête",
 	"RAG Template": "Modèle RAG",
 	"Read Aloud": "Lire à haute voix",
@@ -504,6 +507,7 @@
 	"Reset Upload Directory": "Répertoire de téléchargement réinitialisé",
 	"Reset Vector Storage": "Réinitialiser le stockage des vecteurs",
 	"Response AutoCopy to Clipboard": "Copie automatique de la réponse vers le presse-papiers",
+	"Response splitting": "Fractionnement de la réponse",
 	"Response notifications cannot be activated as the website permissions have been denied. Please visit your browser settings to grant the necessary access.": "Les notifications de réponse ne peuvent pas être activées car les autorisations du site web ont été refusées. Veuillez visiter les paramètres de votre navigateur pour accorder l'accès nécessaire.",
 	"Role": "Rôle",
 	"Rosé Pine": "Pin rosé",

+ 6 - 0
src/lib/types/index.ts

@@ -7,3 +7,9 @@ export type Banner = {
 	dismissible?: boolean;
 	timestamp: number;
 };
+
+export enum TTS_RESPONSE_SPLIT { // How message text is split up for TTS requests
+	PUNCTUATION = 'punctuation', // split into sentences
+	PARAGRAPHS = 'paragraphs', // split into paragraphs
+	NONE = 'none', // keep the message as a single string
+}

+ 65 - 18
src/lib/utils/index.ts

@@ -1,6 +1,8 @@
 import { v4 as uuidv4 } from 'uuid';
 import sha256 from 'js-sha256';
+
 import { WEBUI_BASE_URL } from '$lib/constants';
+import { TTS_RESPONSE_SPLIT } from '$lib/types';
 
 //////////////////////////
 // Helper functions
@@ -408,7 +410,7 @@ const convertOpenAIMessages = (convo) => {
 	let currentId = '';
 	let lastId = null;
 
-	for (let message_id in mapping) {
+	for (const message_id in mapping) {
 		const message = mapping[message_id];
 		currentId = message_id;
 		try {
@@ -442,7 +444,7 @@ const convertOpenAIMessages = (convo) => {
 		}
 	}
 
-	let history = {};
+	const history: Record<PropertyKey, (typeof messages)[number]> = {};
 	messages.forEach((obj) => (history[obj.id] = obj));
 
 	const chat = {
@@ -481,7 +483,7 @@ const validateChat = (chat) => {
 	}
 
 	// Every message's content should be a string
-	for (let message of messages) {
+	for (const message of messages) {
 		if (typeof message.content !== 'string') {
 			return false;
 		}
@@ -494,7 +496,7 @@ export const convertOpenAIChats = (_chats) => {
 	// Create a list of dictionaries with each conversation from import
 	const chats = [];
 	let failed = 0;
-	for (let convo of _chats) {
+	for (const convo of _chats) {
 		const chat = convertOpenAIMessages(convo);
 
 		if (validateChat(chat)) {
@@ -513,7 +515,7 @@ export const convertOpenAIChats = (_chats) => {
 	return chats;
 };
 
-export const isValidHttpUrl = (string) => {
+export const isValidHttpUrl = (string: string) => {
 	let url;
 
 	try {
@@ -525,7 +527,7 @@ export const isValidHttpUrl = (string) => {
 	return url.protocol === 'http:' || url.protocol === 'https:';
 };
 
-export const removeEmojis = (str) => {
+export const removeEmojis = (str: string) => {
 	// Regular expression to match emojis
 	const emojiRegex = /[\uD800-\uDBFF][\uDC00-\uDFFF]|\uD83C[\uDC00-\uDFFF]|\uD83D[\uDC00-\uDE4F]/g;
 
@@ -533,20 +535,24 @@ export const removeEmojis = (str) => {
 	return str.replace(emojiRegex, '');
 };
 
-export const removeFormattings = (str) => {
+export const removeFormattings = (str: string) => { // Deletes every *...* span and every triple-backtick span, delimiters and enclosed text alike. NOTE(review): emphasised words are dropped entirely, not just their markers - confirm this is intended.
 	return str.replace(/(\*)(.*?)\1/g, '').replace(/(```)(.*?)\1/gs, '');
 };
 
-export const extractSentences = (text) => {
-	// This regular expression matches code blocks marked by triple backticks
-	const codeBlockRegex = /```[\s\S]*?```/g;
+export const cleanText = (content: string) => { // Shared clean-up for one piece of text: trim, strip emojis, strip formatting
+	return removeFormattings(removeEmojis(content.trim())); // applied inside-out: trim, then removeEmojis, then removeFormattings
+};
 
-	let codeBlocks = [];
+// This regular expression matches code blocks marked by triple backticks
+const codeBlockRegex = /```[\s\S]*?```/g;
+
+export const extractSentences = (text: string) => {
+	const codeBlocks: string[] = [];
 	let index = 0;
 
 	// Temporarily replace code blocks with placeholders and store the blocks separately
 	text = text.replace(codeBlockRegex, (match) => {
-		let placeholder = `\u0000${index}\u0000`; // Use a unique placeholder
+		const placeholder = `\u0000${index}\u0000`; // Use a unique placeholder
 		codeBlocks[index++] = match;
 		return placeholder;
 	});
@@ -560,18 +566,40 @@ export const extractSentences = (text) => {
 		return sentence.replace(/\u0000(\d+)\u0000/g, (_, idx) => codeBlocks[idx]);
 	});
 
-	return sentences
-		.map((sentence) => removeFormattings(removeEmojis(sentence.trim())))
-		.filter((sentence) => sentence);
+	return sentences.map(cleanText).filter(Boolean);
 };
 
-export const extractSentencesForAudio = (text) => {
+export const extractParagraphsForAudio = (text: string) => { // Splits text on newlines into cleaned, non-empty parts; returns string[]
+	const codeBlocks: string[] = []; // original fenced blocks, indexed by placeholder number
+	let index = 0; // number of the next placeholder to hand out
+
+	// Swap each fenced code block for a placeholder so the newlines inside it do not cause a split
+	text = text.replace(codeBlockRegex, (match) => {
+		const placeholder = `\u0000${index}\u0000`; // NUL-delimited index, matched again by the restore regex below
+		codeBlocks[index++] = match;
+		return placeholder;
+	});
+
+	// Split on runs of one or more newlines, so every line break (not only a blank line) starts a new part
+	let paragraphs = text.split(/\n+/);
+
+	// Put the original code blocks back in place of their placeholders
+	paragraphs = paragraphs.map((paragraph) => {
+		// No check is needed: replace() is a no-op when the paragraph holds no placeholder
+		return paragraph.replace(/\u0000(\d+)\u0000/g, (_, idx) => codeBlocks[idx]);
+	});
+
+	return paragraphs.map(cleanText).filter(Boolean); // clean each part, then drop the ones that end up empty
+};
+
+export const extractSentencesForAudio = (text: string) => {
 	return extractSentences(text).reduce((mergedTexts, currentText) => {
 		const lastIndex = mergedTexts.length - 1;
 		if (lastIndex >= 0) {
 			const previousText = mergedTexts[lastIndex];
 			const wordCount = previousText.split(/\s+/).length;
-			if (wordCount < 2) {
+			const charCount = previousText.length;
+			if (wordCount < 4 || charCount < 50) {
 				mergedTexts[lastIndex] = previousText + ' ' + currentText;
 			} else {
 				mergedTexts.push(currentText);
@@ -580,7 +608,26 @@ export const extractSentencesForAudio = (text) => {
 			mergedTexts.push(currentText);
 		}
 		return mergedTexts;
-	}, []);
+	}, [] as string[]);
+};
+
+export const getMessageContentParts = (content: string, split_on: string = 'punctuation') => { // Returns content as a list of cleaned text parts, split according to split_on
+	const messageContentParts: string[] = [];
+
+	switch (split_on) {
+		default: // any unrecognised split_on value falls through to punctuation splitting
+		case TTS_RESPONSE_SPLIT.PUNCTUATION:
+			messageContentParts.push(...extractSentencesForAudio(content)); // sentence-based parts
+			break;
+		case TTS_RESPONSE_SPLIT.PARAGRAPHS:
+			messageContentParts.push(...extractParagraphsForAudio(content)); // newline-based parts
+			break;
+		case TTS_RESPONSE_SPLIT.NONE:
+			messageContentParts.push(cleanText(content)); // whole message as one part. NOTE(review): pushed even if it cleans to '', unlike the paragraph mode, which filters empties
+			break;
+	}
+
+	return messageContentParts;
 };
 
 export const blobToFile = (blob, fileName) => {

+ 10 - 2
src/routes/(app)/+layout.svelte

@@ -81,9 +81,17 @@
 			});
 
 			if (userSettings) {
-				await settings.set(userSettings.ui);
+				settings.set(userSettings.ui);
 			} else {
-				await settings.set(JSON.parse(localStorage.getItem('settings') ?? '{}'));
+				let localStorageSettings = {} as Parameters<(typeof settings)['set']>[0];
+
+				try {
+					localStorageSettings = JSON.parse(localStorage.getItem('settings') ?? '{}');
+				} catch (e: unknown) {
+					console.error('Failed to parse settings from localStorage', e);
+				}
+
+				settings.set(localStorageSettings);
 			}
 
 			await Promise.all([