commit f4f7adb377 by Timothy J. Baek, 8 months ago

src/lib/components/chat/Chat.svelte (+42 -18)

@@ -30,6 +30,7 @@
 	import {
 		convertMessagesToHistory,
 		copyToClipboard,
+		getMessageContentParts,
 		extractSentencesForAudio,
 		promptTemplate,
 		splitStream
@@ -926,18 +927,26 @@
 										navigator.vibrate(5);
 									}
 
-									const sentences = extractSentencesForAudio(responseMessage.content);
-									sentences.pop();
+									const messageContentParts = getMessageContentParts(
+										responseMessage.content,
+										$config?.audio?.tts?.split_on ?? 'punctuation'
+									);
+									messageContentParts.pop();
 
 									// dispatch only the last complete content part and make sure it hasn't been dispatched before
 									if (
-										sentences.length > 0 &&
-										sentences[sentences.length - 1] !== responseMessage.lastSentence
+										messageContentParts.length > 0 &&
+										messageContentParts[messageContentParts.length - 1] !==
+											responseMessage.lastSentence
 									) {
-										responseMessage.lastSentence = sentences[sentences.length - 1];
+										responseMessage.lastSentence =
+											messageContentParts[messageContentParts.length - 1];
 										eventTarget.dispatchEvent(
 											new CustomEvent('chat', {
-												detail: { id: responseMessageId, content: sentences[sentences.length - 1] }
+												detail: {
+													id: responseMessageId,
+													content: messageContentParts[messageContentParts.length - 1]
+												}
 											})
 										);
 									}
@@ -1040,14 +1049,19 @@
 		stopResponseFlag = false;
 		await tick();
 
-		let lastSentence = extractSentencesForAudio(responseMessage.content)?.at(-1) ?? '';
-		if (lastSentence) {
+		let lastMessageContentPart =
+			getMessageContentParts(
+				responseMessage.content,
+				$config?.audio?.tts?.split_on ?? 'punctuation'
+			)?.at(-1) ?? '';
+		if (lastMessageContentPart) {
 			eventTarget.dispatchEvent(
 				new CustomEvent('chat', {
-					detail: { id: responseMessageId, content: lastSentence }
+					detail: { id: responseMessageId, content: lastMessageContentPart }
 				})
 			);
 		}
+
 		eventTarget.dispatchEvent(
 			new CustomEvent('chat:finish', {
 				detail: {
@@ -1247,18 +1261,24 @@
 							navigator.vibrate(5);
 						}
 
-						const sentences = extractSentencesForAudio(responseMessage.content);
-						sentences.pop();
+						const messageContentParts = getMessageContentParts(
+							responseMessage.content,
+							$config?.audio?.tts?.split_on ?? 'punctuation'
+						);
+						messageContentParts.pop();
 
 						// dispatch only the last complete content part and make sure it hasn't been dispatched before
 						if (
-							sentences.length > 0 &&
-							sentences[sentences.length - 1] !== responseMessage.lastSentence
+							messageContentParts.length > 0 &&
+							messageContentParts[messageContentParts.length - 1] !== responseMessage.lastSentence
 						) {
-							responseMessage.lastSentence = sentences[sentences.length - 1];
+							responseMessage.lastSentence = messageContentParts[messageContentParts.length - 1];
 							eventTarget.dispatchEvent(
 								new CustomEvent('chat', {
-									detail: { id: responseMessageId, content: sentences[sentences.length - 1] }
+									detail: {
+										id: responseMessageId,
+										content: messageContentParts[messageContentParts.length - 1]
+									}
 								})
 							);
 						}
@@ -1313,11 +1333,15 @@
 		stopResponseFlag = false;
 		await tick();
 
-		let lastSentence = extractSentencesForAudio(responseMessage.content)?.at(-1) ?? '';
-		if (lastSentence) {
+		let lastMessageContentPart =
+			getMessageContentParts(
+				responseMessage.content,
+				$config?.audio?.tts?.split_on ?? 'punctuation'
+			)?.at(-1) ?? '';
+		if (lastMessageContentPart) {
 			eventTarget.dispatchEvent(
 				new CustomEvent('chat', {
-					detail: { id: responseMessageId, content: lastSentence }
+					detail: { id: responseMessageId, content: lastMessageContentPart }
 				})
 			);
 		}
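
A note on the pattern repeated in both streaming handlers above: on every chunk, the accumulated content is re-split, the trailing (possibly incomplete) part is popped, and the newest complete part is dispatched only if it differs from the one dispatched before. A minimal sketch of that logic, where `dispatch` is a hypothetical stand-in for the `eventTarget.dispatchEvent(new CustomEvent('chat', ...))` plumbing:

```ts
// Sketch of the dedup-and-dispatch pattern used while streaming.
// `dispatch` is a hypothetical callback; getMessageContentParts is the new util.
import { getMessageContentParts } from '$lib/utils';

const dispatchCompletedPart = (
	responseMessage: { content: string; lastSentence?: string },
	splitOn: string,
	dispatch: (content: string) => void
) => {
	const parts = getMessageContentParts(responseMessage.content, splitOn);
	parts.pop(); // the final part may still be mid-sentence while streaming

	const last = parts.at(-1);
	if (last && last !== responseMessage.lastSentence) {
		responseMessage.lastSentence = last; // guard against dispatching twice
		dispatch(last);
	}
};
```

Once the stream ends, the handlers instead take the true last part via `?.at(-1) ?? ''` and dispatch it before `chat:finish`, which is what the two smaller hunks above do.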

src/lib/components/chat/Messages/ResponseMessage.svelte (+33 -29)

@@ -16,7 +16,8 @@
 		approximateToHumanReadable,
 		extractParagraphsForAudio,
 		extractSentencesForAudio,
-		prepareTextForTTS,
+		cleanText,
+		getMessageContentParts
 	} from '$lib/utils';
 	import { WEBUI_BASE_URL } from '$lib/constants';
 
@@ -35,7 +36,6 @@
 
 	import type { Writable } from 'svelte/store';
 	import type { i18n as i18nType } from 'i18next';
-	import { TTS_RESPONSE_SPLIT } from '$lib/types';
 
 	interface MessageType {
 		id: string;
@@ -44,8 +44,20 @@
 		files?: { type: string; url: string }[];
 		timestamp: number;
 		role: string;
-		statusHistory?: { done: boolean; action: string; description: string; urls?: string[]; query?: string; }[];
-		status?: { done: boolean; action: string; description: string; urls?: string[]; query?: string; };
+		statusHistory?: {
+			done: boolean;
+			action: string;
+			description: string;
+			urls?: string[];
+			query?: string;
+		}[];
+		status?: {
+			done: boolean;
+			action: string;
+			description: string;
+			urls?: string[];
+			query?: string;
+		};
 		done: boolean;
 		error?: boolean | { content: string };
 		citations?: string[];
@@ -61,7 +73,7 @@
 			total_duration?: number;
 			load_duration?: number;
 		};
-		annotation?: { type: string; rating: number; };
+		annotation?: { type: string; rating: number };
 	}
 
 	export let message: MessageType;
@@ -145,22 +157,12 @@
 		if ($config.audio.tts.engine !== '') {
 			loadingSpeech = true;
 
-			const preparedMessageContent: string[] = [];
-
-			switch ($config.audio.tts.split_on) {
-				default:
-				case TTS_RESPONSE_SPLIT.PUNCTUATION:
-				preparedMessageContent.push(...extractSentencesForAudio(message.content));
-					break;
-				case TTS_RESPONSE_SPLIT.PARAGRAPHS:
-				preparedMessageContent.push(...extractParagraphsForAudio(message.content));
-					break;
-				case TTS_RESPONSE_SPLIT.NONE:
-				preparedMessageContent.push(prepareTextForTTS(message.content));
-					break;
-			}
+			const messageContentParts: string[] = getMessageContentParts(
+				message.content,
+				$config?.audio?.tts?.split_on ?? 'punctuation'
+			);
 
-			if (!preparedMessageContent.length) {
+			if (!messageContentParts.length) {
 				console.log('No content to speak');
 				toast.info($i18n.t('No content to speak'));
 
@@ -169,16 +171,19 @@
 				return;
 			}
 
-			console.debug('Prepared message content for TTS', preparedMessageContent);
+			console.debug('Prepared message content for TTS', messageContentParts);
 
-			audioParts = preparedMessageContent.reduce((acc, _sentence, idx) => {
-				acc[idx] = null;
-				return acc;
-			}, {} as typeof audioParts);
+			audioParts = messageContentParts.reduce(
+				(acc, _sentence, idx) => {
+					acc[idx] = null;
+					return acc;
+				},
+				{} as typeof audioParts
+			);
 
 			let lastPlayedAudioPromise = Promise.resolve(); // Initialize a promise that resolves immediately
 
-			for (const [idx, sentence] of preparedMessageContent.entries()) {
+			for (const [idx, sentence] of messageContentParts.entries()) {
 				const res = await synthesizeOpenAISpeech(
 					localStorage.token,
 					$settings?.audio?.tts?.defaultVoice === $config.audio.tts.voice
@@ -212,8 +217,7 @@
 					const voice =
 						voices
 							?.filter(
-								(v) =>
-									v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
+								(v) => v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
 							)
 							?.at(0) ?? undefined;
 
@@ -727,7 +731,7 @@
 		            eval_duration: ${
 									Math.round(((message.info.eval_duration ?? 0) / 1000000) * 100) / 100 ?? 'N/A'
 								}ms<br/>
-		            approximate_total: ${approximateToHumanReadable((message.info.total_duration ?? 0))}`}
+		            approximate_total: ${approximateToHumanReadable(message.info.total_duration ?? 0)}`}
 										placement="top"
 									>
 										<Tooltip content={$i18n.t('Generation Info')} placement="bottom">
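
For context on the `audioParts` map and `lastPlayedAudioPromise` above: synthesis of each part can finish at different speeds, but playback has to stay in order, so each clip is chained onto the promise of the previous one. A rough sketch under those assumptions, with hypothetical `synthesize` and `playAudio` helpers standing in for `synthesizeOpenAISpeech` and the component's audio-element handling:

```ts
// Sketch of sequential TTS playback; `synthesize` and `playAudio` are
// hypothetical helpers, not the component's real API.
async function speakParts(
	parts: string[],
	synthesize: (text: string) => Promise<string>, // resolves to an audio URL
	playAudio: (url: string) => Promise<void> // resolves when the clip ends
): Promise<void> {
	// Reserve one slot per part so results can land out of order.
	const audioParts = parts.reduce(
		(acc, _part, idx) => {
			acc[idx] = null;
			return acc;
		},
		{} as Record<number, string | null>
	);

	let lastPlayed = Promise.resolve(); // resolves immediately, as in the component

	for (const [idx, part] of parts.entries()) {
		const url = await synthesize(part);
		audioParts[idx] = url;
		// Chain playback: each clip starts only after the previous one finishes.
		lastPlayed = lastPlayed.then(() => playAudio(url));
	}

	await lastPlayed;
}
```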

src/lib/utils/index.ts (+24 -7)

@@ -1,6 +1,8 @@
 import { v4 as uuidv4 } from 'uuid';
 import sha256 from 'js-sha256';
+
 import { WEBUI_BASE_URL } from '$lib/constants';
+import { TTS_RESPONSE_SPLIT } from '$lib/types';
 
 //////////////////////////
 // Helper functions
@@ -537,7 +539,7 @@ export const removeFormattings = (str: string) => {
 	return str.replace(/(\*)(.*?)\1/g, '').replace(/(```)(.*?)\1/gs, '');
 };
 
-export const prepareTextForTTS = (content: string) => {
+export const cleanText = (content: string) => {
 	return removeFormattings(removeEmojis(content.trim()));
 };
 
@@ -564,9 +566,7 @@ export const extractSentences = (text: string) => {
 		return sentence.replace(/\u0000(\d+)\u0000/g, (_, idx) => codeBlocks[idx]);
 	});
 
-	return sentences
-		.map(prepareTextForTTS)
-		.filter(Boolean);
+	return sentences.map(cleanText).filter(Boolean);
 };
 
 export const extractParagraphsForAudio = (text: string) => {
@@ -589,9 +589,7 @@ export const extractParagraphsForAudio = (text: string) => {
 		return paragraph.replace(/\u0000(\d+)\u0000/g, (_, idx) => codeBlocks[idx]);
 	});
 
-	return paragraphs
-		.map(prepareTextForTTS)
-		.filter(Boolean);
+	return paragraphs.map(cleanText).filter(Boolean);
 };
 
 export const extractSentencesForAudio = (text: string) => {
@@ -613,6 +611,25 @@ export const extractSentencesForAudio = (text: string) => {
 	}, [] as string[]);
 };
 
+export const getMessageContentParts = (content: string, split_on: string = 'punctuation') => {
+	const messageContentParts: string[] = [];
+
+	switch (split_on) {
+		default:
+		case TTS_RESPONSE_SPLIT.PUNCTUATION:
+			messageContentParts.push(...extractSentencesForAudio(content));
+			break;
+		case TTS_RESPONSE_SPLIT.PARAGRAPHS:
+			messageContentParts.push(...extractParagraphsForAudio(content));
+			break;
+		case TTS_RESPONSE_SPLIT.NONE:
+			messageContentParts.push(cleanText(content));
+			break;
+	}
+
+	return messageContentParts;
+};
+
 export const blobToFile = (blob, fileName) => {
 	// Create a new File object from the Blob
 	const file = new File([blob], fileName, { type: blob.type });
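
A quick usage sketch of the new util. The mode strings are assumed to match the `TTS_RESPONSE_SPLIT` enum values ('punctuation', 'paragraphs', 'none'), consistent with the `'punctuation'` default above:

```ts
// Assumed behavior per split_on mode; the enum string values are inferred
// from the 'punctuation' default above, not confirmed from $lib/types.
import { getMessageContentParts } from '$lib/utils';

const content = 'First sentence. Second one!\n\nA new paragraph.';

getMessageContentParts(content, 'punctuation');
// -> cleaned sentences, via extractSentencesForAudio

getMessageContentParts(content, 'paragraphs');
// -> cleaned paragraphs, via extractParagraphsForAudio

getMessageContentParts(content, 'none');
// -> [cleanText(content)]: a single part with emojis and formatting stripped

getMessageContentParts(content, 'anything-else');
// -> punctuation behavior: the leading `default:` falls through to PUNCTUATION
```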