123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637 |
- <script lang="ts">
- import { toast } from 'svelte-sonner';
- import { createEventDispatcher, onMount, getContext } from 'svelte';
- const dispatch = createEventDispatcher();
- import { getBackendConfig } from '$lib/apis';
- import {
- getAudioConfig,
- updateAudioConfig,
- getModels as _getModels,
- getVoices as _getVoices
- } from '$lib/apis/audio';
- import { config } from '$lib/stores';
- import SensitiveInput from '$lib/components/common/SensitiveInput.svelte';
- import { TTS_RESPONSE_SPLIT } from '$lib/types';
- import type { Writable } from 'svelte/store';
- import type { i18n as i18nType } from 'i18next';
/** i18next store read from the Svelte context key `i18n`; used as `$i18n` for all UI strings. */
const i18n = getContext<Writable<i18nType>>('i18n');

/** Parent-supplied callback, invoked by `updateConfigHandler` after a successful update. */
export let saveHandler: () => void;

// --- Text-to-speech form state (sent as the `tts` section of the audio config) ---
// An empty TTS_ENGINE selects the browser "Web API" option in the engine dropdown.
let TTS_ENGINE = '';
let TTS_MODEL = '';
let TTS_VOICE = '';
let TTS_SPLIT_ON: TTS_RESPONSE_SPLIT = TTS_RESPONSE_SPLIT.PUNCTUATION;
let TTS_OPENAI_API_BASE_URL = '';
let TTS_OPENAI_API_KEY = '';
// Shared API key field for the ElevenLabs and Azure engines.
let TTS_API_KEY = '';
let TTS_AZURE_SPEECH_REGION = '';
let TTS_AZURE_SPEECH_OUTPUT_FORMAT = '';

// --- Speech-to-text form state (sent as the `stt` section of the audio config) ---
// An empty STT_ENGINE selects the "Whisper (Local)" option in the engine dropdown.
let STT_ENGINE = '';
let STT_MODEL = '';
let STT_WHISPER_MODEL = '';
let STT_OPENAI_API_BASE_URL = '';
let STT_OPENAI_API_KEY = '';
let STT_DEEPGRAM_API_KEY = '';
// True while `sttModelUpdateHandler` is running; disables the whisper update button.
let STT_WHISPER_MODEL_LOADING = false;

// Voices offered in the TTS voice pickers (browser voices or the server-provided list).
// eslint-disable-next-line no-undef
let voices: Array<SpeechSynthesisVoice> = [];

// Models offered in the TTS model pickers, as returned by the audio API.
let models: Awaited<ReturnType<typeof _getModels>>['models'] = [];
/**
 * Refreshes `models` for the currently selected TTS engine.
 *
 * The browser "Web API" engine (empty `TTS_ENGINE`) has no model list, so the
 * list is simply cleared. For every other engine the list is requested from the
 * audio API; a failed request is reported with a toast and also clears the list,
 * so models belonging to a previously selected engine are never left on screen.
 */
const getModels = async () => {
	if (TTS_ENGINE === '') {
		models = [];
		return;
	}

	try {
		const res = await _getModels(localStorage.token);
		models = res?.models ?? [];
	} catch (e) {
		toast.error(`${e}`);
		models = [];
	}
};
/** Delay between attempts to read the browser's voice list. */
const VOICES_POLL_INTERVAL_MS = 100;
/** Upper bound on polling attempts (about 10 s) so the timer cannot run forever. */
const VOICES_POLL_MAX_ATTEMPTS = 100;

/** Handle of the browser-voice polling timer that is currently running, if any. */
let voicesPollTimer: ReturnType<typeof setInterval> | undefined;

/** Returns a copy of `list` ordered by voice name using the active UI language. */
const sortVoicesByName = <T extends { name: string }>(list: T[]): T[] =>
	[...list].sort((a, b) => a.name.localeCompare(b.name, $i18n.resolvedLanguage));

/**
 * Refreshes `voices` for the currently selected TTS engine.
 *
 * - Browser "Web API" engine (empty `TTS_ENGINE`): reads `speechSynthesis.getVoices()`.
 *   Browsers may populate that list asynchronously, so it is polled until voices
 *   appear or the attempt limit is reached.
 * - Any other engine: requests the list from the audio API; failures are reported
 *   with a toast and clear the list.
 *
 * Each call cancels polling started by an earlier call, so a late browser-voice
 * tick cannot overwrite a list loaded for a different engine.
 */
const getVoices = async () => {
	if (voicesPollTimer !== undefined) {
		clearInterval(voicesPollTimer);
		voicesPollTimer = undefined;
	}

	if (TTS_ENGINE === '') {
		// Speech synthesis is not available in every environment.
		if (typeof speechSynthesis === 'undefined') {
			voices = [];
			return;
		}

		// Publishes whatever the browser reports; true once at least one voice exists.
		const loadBrowserVoices = (): boolean => {
			const available = speechSynthesis.getVoices();
			voices = sortVoicesByName(available);
			return available.length > 0;
		};

		if (loadBrowserVoices()) {
			return;
		}

		let attempts = 0;
		const timer = setInterval(() => {
			attempts += 1;
			if (loadBrowserVoices() || attempts >= VOICES_POLL_MAX_ATTEMPTS) {
				clearInterval(timer);
				if (voicesPollTimer === timer) {
					voicesPollTimer = undefined;
				}
			}
		}, VOICES_POLL_INTERVAL_MS);
		voicesPollTimer = timer;
		return;
	}

	try {
		const res = await _getVoices(localStorage.token);
		if (res) {
			voices = sortVoicesByName(res.voices);
		}
	} catch (e) {
		toast.error(`${e}`);
		voices = [];
	}
};
/**
 * Sends the current form state to the audio config endpoint.
 *
 * When the update returns a truthy result, the parent's `saveHandler` is invoked
 * and the global `config` store is replaced with a freshly fetched backend config.
 * Errors thrown by the API helpers propagate to the caller.
 */
const updateConfigHandler = async () => {
	const ttsSettings = {
		OPENAI_API_BASE_URL: TTS_OPENAI_API_BASE_URL,
		OPENAI_API_KEY: TTS_OPENAI_API_KEY,
		API_KEY: TTS_API_KEY,
		ENGINE: TTS_ENGINE,
		MODEL: TTS_MODEL,
		VOICE: TTS_VOICE,
		SPLIT_ON: TTS_SPLIT_ON,
		AZURE_SPEECH_REGION: TTS_AZURE_SPEECH_REGION,
		AZURE_SPEECH_OUTPUT_FORMAT: TTS_AZURE_SPEECH_OUTPUT_FORMAT
	};

	const sttSettings = {
		OPENAI_API_BASE_URL: STT_OPENAI_API_BASE_URL,
		OPENAI_API_KEY: STT_OPENAI_API_KEY,
		ENGINE: STT_ENGINE,
		MODEL: STT_MODEL,
		WHISPER_MODEL: STT_WHISPER_MODEL,
		DEEPGRAM_API_KEY: STT_DEEPGRAM_API_KEY
	};

	const updated = await updateAudioConfig(localStorage.token, {
		tts: ttsSettings,
		stt: sttSettings
	});

	if (!updated) {
		return;
	}

	saveHandler();
	const backendConfig = await getBackendConfig();
	config.set(backendConfig);
};
/**
 * Persists the current config while showing the whisper-model spinner.
 *
 * The loading flag is reset in `finally`, so the update button is re-enabled
 * even when `updateConfigHandler` rejects; the rejection still propagates.
 */
const sttModelUpdateHandler = async () => {
	STT_WHISPER_MODEL_LOADING = true;
	try {
		await updateConfigHandler();
	} finally {
		STT_WHISPER_MODEL_LOADING = false;
	}
};
/**
 * On mount: load the stored audio config into the form state, then populate
 * the voice and model pickers for the configured TTS engine.
 *
 * A failed config request is reported with a toast instead of aborting the
 * callback, so the pickers are still populated using the default form state.
 */
onMount(async () => {
	try {
		const res = await getAudioConfig(localStorage.token);

		if (res) {
			TTS_OPENAI_API_BASE_URL = res.tts.OPENAI_API_BASE_URL;
			TTS_OPENAI_API_KEY = res.tts.OPENAI_API_KEY;
			TTS_API_KEY = res.tts.API_KEY;

			TTS_ENGINE = res.tts.ENGINE;
			TTS_MODEL = res.tts.MODEL;
			TTS_VOICE = res.tts.VOICE;

			// `||` on purpose: an empty string should also fall back to the default.
			TTS_SPLIT_ON = res.tts.SPLIT_ON || TTS_RESPONSE_SPLIT.PUNCTUATION;

			TTS_AZURE_SPEECH_OUTPUT_FORMAT = res.tts.AZURE_SPEECH_OUTPUT_FORMAT;
			TTS_AZURE_SPEECH_REGION = res.tts.AZURE_SPEECH_REGION;

			STT_OPENAI_API_BASE_URL = res.stt.OPENAI_API_BASE_URL;
			STT_OPENAI_API_KEY = res.stt.OPENAI_API_KEY;

			STT_ENGINE = res.stt.ENGINE;
			STT_MODEL = res.stt.MODEL;
			STT_WHISPER_MODEL = res.stt.WHISPER_MODEL;
			STT_DEEPGRAM_API_KEY = res.stt.DEEPGRAM_API_KEY;
		}
	} catch (e) {
		toast.error(`${e}`);
	}

	// The two loaders write to different variables, so they can run concurrently.
	await Promise.all([getVoices(), getModels()]);
});
- </script>
- <form
- class="flex flex-col h-full justify-between space-y-3 text-sm"
- on:submit|preventDefault={async () => {
- await updateConfigHandler();
- dispatch('save');
- }}
- >
- <div class=" space-y-3 overflow-y-scroll scrollbar-hidden h-full">
- <div class="flex flex-col gap-3">
- <div>
- <div class=" mb-1 text-sm font-medium">{$i18n.t('STT Settings')}</div>
- <div class=" py-0.5 flex w-full justify-between">
- <div class=" self-center text-xs font-medium">{$i18n.t('Speech-to-Text Engine')}</div>
- <div class="flex items-center relative">
<!-- STT engine picker. The empty value selects the local Whisper engine. -->
<select
	class="dark:bg-gray-900 cursor-pointer w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
	aria-label={$i18n.t('Speech-to-Text Engine')}
	bind:value={STT_ENGINE}
>
	<option value="">{$i18n.t('Whisper (Local)')}</option>
	<option value="openai">OpenAI</option>
	<option value="web">{$i18n.t('Web API')}</option>
	<option value="deepgram">Deepgram</option>
</select>
- </div>
- </div>
<!--
	Engine-specific STT options.
	  openai   : API base URL + key, and a model field with suggestions
	  deepgram : API key and an optional model field
	  ''       : local Whisper model name plus a button that applies it
	The 'web' engine has no extra options.
-->
{#if STT_ENGINE === 'openai'}
	<div>
		<div class="mt-1 flex gap-2 mb-1">
			<input
				class="flex-1 w-full bg-transparent outline-none"
				placeholder={$i18n.t('API Base URL')}
				bind:value={STT_OPENAI_API_BASE_URL}
				required
			/>

			<SensitiveInput placeholder={$i18n.t('API Key')} bind:value={STT_OPENAI_API_KEY} />
		</div>
	</div>

	<hr class=" dark:border-gray-850 my-2" />

	<div>
		<div class=" mb-1.5 text-sm font-medium">{$i18n.t('STT Model')}</div>
		<div class="flex w-full">
			<div class="flex-1">
				<!-- The datalist id is STT-specific so it cannot collide with the TTS model datalists. -->
				<input
					list="stt-model-list"
					class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
					bind:value={STT_MODEL}
					placeholder={$i18n.t('Select a model')}
				/>

				<datalist id="stt-model-list">
					<option value="whisper-1" />
				</datalist>
			</div>
		</div>
	</div>
{:else if STT_ENGINE === 'deepgram'}
	<div>
		<div class="mt-1 flex gap-2 mb-1">
			<SensitiveInput placeholder={$i18n.t('API Key')} bind:value={STT_DEEPGRAM_API_KEY} />
		</div>
	</div>

	<hr class=" dark:border-gray-850 my-2" />

	<div>
		<div class=" mb-1.5 text-sm font-medium">{$i18n.t('STT Model')}</div>
		<div class="flex w-full">
			<div class="flex-1">
				<input
					class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
					bind:value={STT_MODEL}
					placeholder={$i18n.t('Select a model (optional)')}
				/>
			</div>
		</div>
		<div class="mt-2 mb-1 text-xs text-gray-400 dark:text-gray-500">
			{$i18n.t('Leave model field empty to use the default model.')}
			<a
				class=" hover:underline dark:text-gray-200 text-gray-800"
				href="https://developers.deepgram.com/docs/models"
				target="_blank"
				rel="noopener noreferrer"
			>
				{$i18n.t('Click here to see available models.')}
			</a>
		</div>
	</div>
{:else if STT_ENGINE === ''}
	<div>
		<div class=" mb-1.5 text-sm font-medium">{$i18n.t('STT Model')}</div>

		<div class="flex w-full">
			<div class="flex-1 mr-2">
				<input
					class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
					placeholder={$i18n.t('Set whisper model')}
					bind:value={STT_WHISPER_MODEL}
				/>
			</div>

			<!--
				type="button" is required: a button inside a form defaults to type="submit",
				which would also fire the form's submit handler and save the config twice.
			-->
			<button
				type="button"
				class="px-2.5 bg-gray-50 hover:bg-gray-200 text-gray-800 dark:bg-gray-850 dark:hover:bg-gray-800 dark:text-gray-100 rounded-lg transition"
				on:click={() => {
					sttModelUpdateHandler();
				}}
				disabled={STT_WHISPER_MODEL_LOADING}
			>
				{#if STT_WHISPER_MODEL_LOADING}
					<!-- Spinner shown while the update is in flight. -->
					<div class="self-center">
						<svg
							class=" w-4 h-4"
							viewBox="0 0 24 24"
							fill="currentColor"
							xmlns="http://www.w3.org/2000/svg"
						>
							<style>
								.spinner_ajPY {
									transform-origin: center;
									animation: spinner_AtaB 0.75s infinite linear;
								}

								@keyframes spinner_AtaB {
									100% {
										transform: rotate(360deg);
									}
								}
							</style>
							<path
								d="M12,1A11,11,0,1,0,23,12,11,11,0,0,0,12,1Zm0,19a8,8,0,1,1,8-8A8,8,0,0,1,12,20Z"
								opacity=".25"
							/>
							<path
								d="M10.14,1.16a11,11,0,0,0-9,8.92A1.59,1.59,0,0,0,2.46,12,1.52,1.52,0,0,0,4.11,10.7a8,8,0,0,1,6.66-6.61A1.42,1.42,0,0,0,12,2.69h0A1.57,1.57,0,0,0,10.14,1.16Z"
								class="spinner_ajPY"
							/>
						</svg>
					</div>
				{:else}
					<!-- Download icon. -->
					<svg
						xmlns="http://www.w3.org/2000/svg"
						viewBox="0 0 16 16"
						fill="currentColor"
						class="w-4 h-4"
					>
						<path
							d="M8.75 2.75a.75.75 0 0 0-1.5 0v5.69L5.03 6.22a.75.75 0 0 0-1.06 1.06l3.5 3.5a.75.75 0 0 0 1.06 0l3.5-3.5a.75.75 0 0 0-1.06-1.06L8.75 8.44V2.75Z"
						/>
						<path
							d="M3.5 9.75a.75.75 0 0 0-1.5 0v1.5A2.75 2.75 0 0 0 4.75 14h6.5A2.75 2.75 0 0 0 14 11.25v-1.5a.75.75 0 0 0-1.5 0v1.5c0 .69-.56 1.25-1.25 1.25h-6.5c-.69 0-1.25-.56-1.25-1.25v-1.5Z"
						/>
					</svg>
				{/if}
			</button>
		</div>

		<div class="mt-2 mb-1 text-xs text-gray-400 dark:text-gray-500">
			{$i18n.t(`Open WebUI uses faster-whisper internally.`)}

			<a
				class=" hover:underline dark:text-gray-200 text-gray-800"
				href="https://github.com/SYSTRAN/faster-whisper"
				target="_blank"
				rel="noopener noreferrer"
			>
				{$i18n.t(
					`Click here to learn more about faster-whisper and see the available models.`
				)}
			</a>
		</div>
	</div>
{/if}
- </div>
- <hr class=" dark:border-gray-800" />
- <div>
- <div class=" mb-1 text-sm font-medium">{$i18n.t('TTS Settings')}</div>
- <div class=" py-0.5 flex w-full justify-between">
- <div class=" self-center text-xs font-medium">{$i18n.t('Text-to-Speech Engine')}</div>
- <div class="flex items-center relative">
<!--
	TTS engine picker. The empty value selects the browser Web API engine.
	bind:value is declared before on:change so TTS_ENGINE already holds the new
	engine when the change handler runs.
-->
<select
	class=" dark:bg-gray-900 w-fit pr-8 cursor-pointer rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
	aria-label={$i18n.t('Text-to-Speech Engine')}
	bind:value={TTS_ENGINE}
	on:change={async () => {
		// Reset voice/model BEFORE persisting, so the saved config never pairs the
		// new engine with a voice or model that belonged to the previous one.
		if (TTS_ENGINE === 'openai') {
			TTS_VOICE = 'alloy';
			TTS_MODEL = 'tts-1';
		} else {
			TTS_VOICE = '';
			TTS_MODEL = '';
		}

		// Persist first, then refresh the pickers.
		// NOTE(review): presumably the server resolves voices/models against the
		// saved engine, which is why the save precedes the refresh - confirm.
		await updateConfigHandler();
		await Promise.all([getVoices(), getModels()]);
	}}
>
	<option value="">{$i18n.t('Web API')}</option>
	<option value="transformers">{$i18n.t('Transformers')} ({$i18n.t('Local')})</option>
	<option value="openai">{$i18n.t('OpenAI')}</option>
	<option value="elevenlabs">{$i18n.t('ElevenLabs')}</option>
	<option value="azure">{$i18n.t('Azure AI Speech')}</option>
</select>
- </div>
- </div>
<!--
	Credentials for the selected TTS engine.
	  openai             : API base URL + key
	  elevenlabs / azure : shared API key field; Azure additionally needs a region
	Other engines need no credentials.
-->
{#if TTS_ENGINE === 'openai'}
	<div>
		<div class="mt-1 flex gap-2 mb-1">
			<input
				class="flex-1 w-full bg-transparent outline-none"
				placeholder={$i18n.t('API Base URL')}
				bind:value={TTS_OPENAI_API_BASE_URL}
				required
			/>

			<SensitiveInput placeholder={$i18n.t('API Key')} bind:value={TTS_OPENAI_API_KEY} />
		</div>
	</div>
{:else if TTS_ENGINE === 'elevenlabs' || TTS_ENGINE === 'azure'}
	<div>
		<div class="mt-1 flex gap-2 mb-1">
			<input
				class="flex-1 w-full rounded-lg py-2 pl-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
				placeholder={$i18n.t('API Key')}
				bind:value={TTS_API_KEY}
				required
			/>
			{#if TTS_ENGINE === 'azure'}
				<input
					class="flex-1 w-full rounded-lg py-2 pl-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
					placeholder={$i18n.t('Azure Region')}
					bind:value={TTS_AZURE_SPEECH_REGION}
					required
				/>
			{/if}
		</div>
	</div>
{/if}
- <hr class=" dark:border-gray-850 my-2" />
<!--
	Voice / model pickers for the selected TTS engine.
	  ''                  : browser voices in a <select>
	  transformers        : free-text speaker embedding name
	  openai / elevenlabs : voice + model inputs with suggestions from `voices` / `models`
	  azure               : voice input + free-text output format
-->
{#if TTS_ENGINE === ''}
	<div>
		<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div>
		<div class="flex w-full">
			<div class="flex-1">
				<select
					class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
					bind:value={TTS_VOICE}
				>
					<!-- "Default" is the selected entry exactly when no voice has been chosen. -->
					<option value="" selected={TTS_VOICE === ''}>{$i18n.t('Default')}</option>
					{#each voices as voice}
						<option
							value={voice.voiceURI}
							class="bg-gray-100 dark:bg-gray-700"
							selected={TTS_VOICE === voice.voiceURI}
							>{voice.name.replace('+', ', ')}</option
						>
					{/each}
				</select>
			</div>
		</div>
	</div>
{:else if TTS_ENGINE === 'transformers'}
	<div>
		<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Model')}</div>
		<div class="flex w-full">
			<div class="flex-1">
				<!-- Unique datalist id: "model-list" is also used by the STT model input. -->
				<input
					list="tts-transformers-model-list"
					class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
					bind:value={TTS_MODEL}
					placeholder="CMU ARCTIC speaker embedding name"
				/>

				<datalist id="tts-transformers-model-list">
					<option value="tts-1" />
				</datalist>
			</div>
		</div>
		<div class="mt-2 mb-1 text-xs text-gray-400 dark:text-gray-500">
			{$i18n.t(`Open WebUI uses SpeechT5 and CMU Arctic speaker embeddings.`)}

			{$i18n.t('To learn more about SpeechT5,')}
			<a
				class=" hover:underline dark:text-gray-200 text-gray-800"
				href="https://github.com/microsoft/SpeechT5"
				target="_blank"
				rel="noopener noreferrer"
			>
				{$i18n.t(`click here`, {
					name: 'SpeechT5'
				})}.
			</a>
			{$i18n.t('To see the available CMU Arctic speaker embeddings,')}
			<a
				class=" hover:underline dark:text-gray-200 text-gray-800"
				href="https://huggingface.co/datasets/Matthijs/cmu-arctic-xvectors"
				target="_blank"
				rel="noopener noreferrer"
			>
				{$i18n.t(`click here`)}.
			</a>
		</div>
	</div>
{:else if TTS_ENGINE === 'openai' || TTS_ENGINE === 'elevenlabs'}
	<!-- OpenAI and ElevenLabs share the same voice + model layout. -->
	<div class=" flex gap-2">
		<div class="w-full">
			<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div>
			<div class="flex w-full">
				<div class="flex-1">
					<input
						list="voice-list"
						class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
						bind:value={TTS_VOICE}
						placeholder={$i18n.t('Select a voice')}
					/>

					<datalist id="voice-list">
						{#each voices as voice}
							<option value={voice.id}>{voice.name}</option>
						{/each}
					</datalist>
				</div>
			</div>
		</div>
		<div class="w-full">
			<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Model')}</div>
			<div class="flex w-full">
				<div class="flex-1">
					<input
						list="tts-model-list"
						class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
						bind:value={TTS_MODEL}
						placeholder={$i18n.t('Select a model')}
					/>

					<datalist id="tts-model-list">
						{#each models as model}
							<option value={model.id} class="bg-gray-50 dark:bg-gray-700" />
						{/each}
					</datalist>
				</div>
			</div>
		</div>
	</div>
{:else if TTS_ENGINE === 'azure'}
	<div class=" flex gap-2">
		<div class="w-full">
			<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div>
			<div class="flex w-full">
				<div class="flex-1">
					<input
						list="voice-list"
						class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
						bind:value={TTS_VOICE}
						placeholder={$i18n.t('Select a voice')}
					/>

					<datalist id="voice-list">
						{#each voices as voice}
							<option value={voice.id}>{voice.name}</option>
						{/each}
					</datalist>
				</div>
			</div>
		</div>
		<div class="w-full">
			<div class=" mb-1.5 text-sm font-medium">
				{$i18n.t('Output format')}
				<a
					href="https://learn.microsoft.com/en-us/azure/ai-services/speech-service/rest-text-to-speech?tabs=streaming#audio-outputs"
					target="_blank"
					rel="noopener noreferrer"
				>
					<small>{$i18n.t('Available list')}</small>
				</a>
			</div>
			<div class="flex w-full">
				<div class="flex-1">
					<!-- Free-text field: this branch renders no datalist, so no `list` attribute. -->
					<input
						class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
						bind:value={TTS_AZURE_SPEECH_OUTPUT_FORMAT}
						placeholder={$i18n.t('Select an output format')}
					/>
				</div>
			</div>
		</div>
	</div>
{/if}
- <hr class="dark:border-gray-850 my-2" />
- <div class="pt-0.5 flex w-full justify-between">
- <div class="self-center text-xs font-medium">{$i18n.t('Response splitting')}</div>
- <div class="flex items-center relative">
<!-- One option per TTS_RESPONSE_SPLIT value; the label is the translated, capitalised value. -->
<select
	class="dark:bg-gray-900 w-fit pr-8 cursor-pointer rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
	aria-label="Select how to split message text for TTS requests"
	bind:value={TTS_SPLIT_ON}
>
	{#each Object.values(TTS_RESPONSE_SPLIT) as mode}
		<option value={mode}>
			{$i18n.t(mode.charAt(0).toUpperCase().concat(mode.slice(1)))}
		</option>
	{/each}
</select>
- </div>
- </div>
- <div class="mt-2 mb-1 text-xs text-gray-400 dark:text-gray-500">
- {$i18n.t(
- "Control how message text is split for TTS requests. 'Punctuation' splits into sentences, 'paragraphs' splits into paragraphs, and 'none' keeps the message as a single string."
- )}
- </div>
- </div>
- </div>
- </div>
- <div class="flex justify-end text-sm font-medium">
- <button
- class="px-3.5 py-1.5 text-sm font-medium bg-black hover:bg-gray-900 text-white dark:bg-white dark:text-black dark:hover:bg-gray-100 transition rounded-full"
- type="submit"
- >
- {$i18n.t('Save')}
- </button>
- </div>
- </form>
|