Audio.svelte 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560
  1. <script lang="ts">
  2. import { toast } from 'svelte-sonner';
  3. import { createEventDispatcher, onMount, getContext } from 'svelte';
  4. const dispatch = createEventDispatcher();
  5. import { getBackendConfig } from '$lib/apis';
  6. import {
  7. getAudioConfig,
  8. updateAudioConfig,
  9. getModels as _getModels,
  10. getVoices as _getVoices
  11. } from '$lib/apis/audio';
  12. import { config } from '$lib/stores';
  13. import SensitiveInput from '$lib/components/common/SensitiveInput.svelte';
  14. import { TTS_RESPONSE_SPLIT } from '$lib/types';
  15. import type { Writable } from 'svelte/store';
  16. import type { i18n as i18nType } from 'i18next';
  17. const i18n = getContext<Writable<i18nType>>('i18n');
  /** Callback prop; `updateConfigHandler` calls it once an update has been accepted. */
  export let saveHandler: () => void;

  // Text-to-speech form state (sent as the `tts` section of the audio config).
  let TTS_OPENAI_API_BASE_URL = '',
    TTS_OPENAI_API_KEY = '',
    TTS_API_KEY = '',
    TTS_ENGINE = '',
    TTS_MODEL = '',
    TTS_VOICE = '',
    TTS_AZURE_SPEECH_REGION = '',
    TTS_AZURE_SPEECH_OUTPUT_FORMAT = '';
  let TTS_SPLIT_ON: TTS_RESPONSE_SPLIT = TTS_RESPONSE_SPLIT.PUNCTUATION;

  // Speech-to-text form state (sent as the `stt` section of the audio config).
  let STT_OPENAI_API_BASE_URL = '',
    STT_OPENAI_API_KEY = '',
    STT_ENGINE = '',
    STT_MODEL = '',
    STT_WHISPER_MODEL = '';

  // True while `sttModelUpdateHandler` is waiting for its save to finish.
  let STT_WHISPER_MODEL_LOADING = false;

  // Entries offered by the TTS voice and TTS model inputs.
  // eslint-disable-next-line no-undef
  let voices: SpeechSynthesisVoice[] = [];
  let models: Awaited<ReturnType<typeof _getModels>>['models'] = [];
  /**
   * Refreshes `models` for the currently selected TTS engine.
   *
   * With the empty engine value (the "Web API" option) the list is cleared.
   * For any other engine the list is requested through `_getModels`; a failed
   * request is shown as an error toast and leaves `models` unchanged.
   */
  const getModels = async () => {
    if (TTS_ENGINE === '') {
      models = [];
      return;
    }

    const res = await _getModels(localStorage.token).catch((e) => {
      // The rejection value is not guaranteed to be a string, so stringify it
      // before handing it to the toast.
      toast.error(`${e}`);
    });

    if (res) {
      models = res.models;
    }
  };
  // Handle of the browser-voice poll that is currently running, if any.
  let voicesPollTimer: ReturnType<typeof setInterval> | undefined;

  /**
   * Refreshes `voices` for the currently selected TTS engine.
   *
   * With the empty engine value (the "Web API" option) the voices come from
   * `speechSynthesis.getVoices()`, polled every 100 ms until at least one
   * voice is reported or the retry budget (about 10 seconds) is used up.
   * For any other engine the list is requested through `_getVoices`; a failed
   * request is shown as an error toast and leaves `voices` unchanged.
   * In both cases the list is sorted by name using the active i18n language.
   */
  const getVoices = async () => {
    // Cancel a poll left over from an earlier call so it cannot overwrite the
    // result of this one.
    if (voicesPollTimer !== undefined) {
      clearInterval(voicesPollTimer);
      voicesPollTimer = undefined;
    }

    if (TTS_ENGINE === '') {
      if (typeof speechSynthesis === 'undefined') {
        // No speech synthesis support in this environment: nothing to list.
        voices = [];
        return;
      }

      // Bounded so the timer cannot run forever when no voice ever shows up.
      let attemptsLeft = 100;
      const timer = setInterval(() => {
        attemptsLeft -= 1;
        const available = speechSynthesis.getVoices();
        const found = available.length > 0;

        voices = found
          ? [...available].sort((a, b) => a.name.localeCompare(b.name, $i18n.resolvedLanguage))
          : available;

        if (found || attemptsLeft <= 0) {
          clearInterval(timer);
          if (voicesPollTimer === timer) {
            voicesPollTimer = undefined;
          }
        }
      }, 100);
      voicesPollTimer = timer;
      return;
    }

    const res = await _getVoices(localStorage.token).catch((e) => {
      // The rejection value is not guaranteed to be a string, so stringify it
      // before handing it to the toast.
      toast.error(`${e}`);
    });

    if (res) {
      voices = res.voices;
      voices.sort((a, b) => a.name.localeCompare(b.name, $i18n.resolvedLanguage));
    }
  };
  /**
   * Sends the current TTS and STT form values to `updateAudioConfig`.
   *
   * When the call returns a truthy result, `saveHandler` is invoked and the
   * `config` store is replaced with a fresh `getBackendConfig()` result.
   * A falsy result leaves everything untouched.
   */
  const updateConfigHandler = async () => {
    const ttsSection = {
      OPENAI_API_BASE_URL: TTS_OPENAI_API_BASE_URL,
      OPENAI_API_KEY: TTS_OPENAI_API_KEY,
      API_KEY: TTS_API_KEY,
      ENGINE: TTS_ENGINE,
      MODEL: TTS_MODEL,
      VOICE: TTS_VOICE,
      SPLIT_ON: TTS_SPLIT_ON,
      AZURE_SPEECH_REGION: TTS_AZURE_SPEECH_REGION,
      AZURE_SPEECH_OUTPUT_FORMAT: TTS_AZURE_SPEECH_OUTPUT_FORMAT
    };
    const sttSection = {
      OPENAI_API_BASE_URL: STT_OPENAI_API_BASE_URL,
      OPENAI_API_KEY: STT_OPENAI_API_KEY,
      ENGINE: STT_ENGINE,
      MODEL: STT_MODEL,
      WHISPER_MODEL: STT_WHISPER_MODEL
    };

    const saved = await updateAudioConfig(localStorage.token, {
      tts: ttsSection,
      stt: sttSection
    });
    if (!saved) {
      return;
    }

    // Notify the parent first, then refresh the shared backend config store.
    saveHandler();
    const backendConfig = await getBackendConfig();
    config.set(backendConfig);
  };
  /**
   * Saves the configuration while flagging `STT_WHISPER_MODEL_LOADING`, which
   * disables the whisper model button and switches it to its spinner.
   *
   * The flag is cleared in `finally` so a rejected save cannot leave the
   * button disabled; the rejection itself still propagates to the caller.
   */
  const sttModelUpdateHandler = async () => {
    STT_WHISPER_MODEL_LOADING = true;
    try {
      await updateConfigHandler();
    } finally {
      STT_WHISPER_MODEL_LOADING = false;
    }
  };
  /**
   * On mount: copy the stored audio configuration into the form state, then
   * load the voice and model lists for the configured TTS engine.
   *
   * A failed configuration request is shown as an error toast; the form then
   * keeps its initial values and the lists are still loaded.
   */
  onMount(async () => {
    const res = await getAudioConfig(localStorage.token).catch((e) => {
      toast.error(`${e}`);
      return null;
    });

    if (res) {
      // The response is intentionally not logged: it contains the API keys.
      const { tts, stt } = res;

      TTS_OPENAI_API_BASE_URL = tts.OPENAI_API_BASE_URL;
      TTS_OPENAI_API_KEY = tts.OPENAI_API_KEY;
      TTS_API_KEY = tts.API_KEY;
      TTS_ENGINE = tts.ENGINE;
      TTS_MODEL = tts.MODEL;
      TTS_VOICE = tts.VOICE;
      // `||` on purpose: an empty string also falls back to the default.
      TTS_SPLIT_ON = tts.SPLIT_ON || TTS_RESPONSE_SPLIT.PUNCTUATION;
      TTS_AZURE_SPEECH_OUTPUT_FORMAT = tts.AZURE_SPEECH_OUTPUT_FORMAT;
      TTS_AZURE_SPEECH_REGION = tts.AZURE_SPEECH_REGION;

      STT_OPENAI_API_BASE_URL = stt.OPENAI_API_BASE_URL;
      STT_OPENAI_API_KEY = stt.OPENAI_API_KEY;
      STT_ENGINE = stt.ENGINE;
      STT_MODEL = stt.MODEL;
      STT_WHISPER_MODEL = stt.WHISPER_MODEL;
    }

    // The two lists are independent of each other, so load them together.
    await Promise.all([getVoices(), getModels()]);
  });
  125. </script>
  126. <form
  127. class="flex flex-col h-full justify-between space-y-3 text-sm"
  128. on:submit|preventDefault={async () => {
  129. await updateConfigHandler();
  130. dispatch('save');
  131. }}
  132. >
  133. <div class=" space-y-3 overflow-y-scroll scrollbar-hidden h-full">
  134. <div class="flex flex-col gap-3">
  135. <div>
  136. <div class=" mb-1 text-sm font-medium">{$i18n.t('STT Settings')}</div>
  137. <div class=" py-0.5 flex w-full justify-between">
  138. <div class=" self-center text-xs font-medium">{$i18n.t('Speech-to-Text Engine')}</div>
  139. <div class="flex items-center relative">
  140. <select
  141. class="dark:bg-gray-900 cursor-pointer w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
  142. bind:value={STT_ENGINE}
  143. placeholder="Select an engine"
  144. >
  145. <option value="">{$i18n.t('Whisper (Local)')}</option>
  146. <option value="openai">OpenAI</option>
  147. <option value="web">{$i18n.t('Web API')}</option>
  148. </select>
  149. </div>
  150. </div>
  151. {#if STT_ENGINE === 'openai'}
  152. <div>
  153. <div class="mt-1 flex gap-2 mb-1">
  154. <input
  155. class="flex-1 w-full rounded-lg py-2 pl-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
  156. placeholder={$i18n.t('API Base URL')}
  157. bind:value={STT_OPENAI_API_BASE_URL}
  158. required
  159. />
  160. <SensitiveInput placeholder={$i18n.t('API Key')} bind:value={STT_OPENAI_API_KEY} />
  161. </div>
  162. </div>
  163. <hr class=" dark:border-gray-850 my-2" />
  164. <div>
  165. <div class=" mb-1.5 text-sm font-medium">{$i18n.t('STT Model')}</div>
  166. <div class="flex w-full">
  167. <div class="flex-1">
  168. <input
  169. list="model-list"
  170. class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
  171. bind:value={STT_MODEL}
  172. placeholder="Select a model"
  173. />
  174. <datalist id="model-list">
  175. <option value="whisper-1" />
  176. </datalist>
  177. </div>
  178. </div>
  179. </div>
  180. {:else if STT_ENGINE === ''}
  181. <div>
  182. <div class=" mb-1.5 text-sm font-medium">{$i18n.t('STT Model')}</div>
  183. <div class="flex w-full">
  184. <div class="flex-1 mr-2">
  185. <input
  186. class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
  187. placeholder={$i18n.t('Set whisper model')}
  188. bind:value={STT_WHISPER_MODEL}
  189. />
  190. </div>
  <!--
    Saves the configuration through sttModelUpdateHandler and shows a spinner
    while STT_WHISPER_MODEL_LOADING is set. type="button" keeps the click from
    also acting as the submit button of the surrounding form.
  -->
  <button
    type="button"
    class="px-2.5 bg-gray-50 hover:bg-gray-200 text-gray-800 dark:bg-gray-850 dark:hover:bg-gray-800 dark:text-gray-100 rounded-lg transition"
    on:click={() => {
      sttModelUpdateHandler();
    }}
    disabled={STT_WHISPER_MODEL_LOADING}
  >
    {#if STT_WHISPER_MODEL_LOADING}
      <div class="self-center">
        <svg
          class=" w-4 h-4"
          viewBox="0 0 24 24"
          fill="currentColor"
          xmlns="http://www.w3.org/2000/svg"
        >
          <style>
            .spinner_ajPY {
              transform-origin: center;
              animation: spinner_AtaB 0.75s infinite linear;
            }
            @keyframes spinner_AtaB {
              100% {
                transform: rotate(360deg);
              }
            }
          </style>
          <path
            d="M12,1A11,11,0,1,0,23,12,11,11,0,0,0,12,1Zm0,19a8,8,0,1,1,8-8A8,8,0,0,1,12,20Z"
            opacity=".25"
          />
          <path
            d="M10.14,1.16a11,11,0,0,0-9,8.92A1.59,1.59,0,0,0,2.46,12,1.52,1.52,0,0,0,4.11,10.7a8,8,0,0,1,6.66-6.61A1.42,1.42,0,0,0,12,2.69h0A1.57,1.57,0,0,0,10.14,1.16Z"
            class="spinner_ajPY"
          />
        </svg>
      </div>
    {:else}
      <svg
        xmlns="http://www.w3.org/2000/svg"
        viewBox="0 0 16 16"
        fill="currentColor"
        class="w-4 h-4"
      >
        <path
          d="M8.75 2.75a.75.75 0 0 0-1.5 0v5.69L5.03 6.22a.75.75 0 0 0-1.06 1.06l3.5 3.5a.75.75 0 0 0 1.06 0l3.5-3.5a.75.75 0 0 0-1.06-1.06L8.75 8.44V2.75Z"
        />
        <path
          d="M3.5 9.75a.75.75 0 0 0-1.5 0v1.5A2.75 2.75 0 0 0 4.75 14h6.5A2.75 2.75 0 0 0 14 11.25v-1.5a.75.75 0 0 0-1.5 0v1.5c0 .69-.56 1.25-1.25 1.25h-6.5c-.69 0-1.25-.56-1.25-1.25v-1.5Z"
        />
      </svg>
    {/if}
  </button>
  243. </div>
  244. <div class="mt-2 mb-1 text-xs text-gray-400 dark:text-gray-500">
  245. {$i18n.t(`Open WebUI uses faster-whisper internally.`)}
  246. <a
  247. class=" hover:underline dark:text-gray-200 text-gray-800"
  248. href="https://github.com/SYSTRAN/faster-whisper"
  249. target="_blank"
  250. >
  251. {$i18n.t(
  252. `Click here to learn more about faster-whisper and see the available models.`
  253. )}
  254. </a>
  255. </div>
  256. </div>
  257. {/if}
  258. </div>
  259. <hr class=" dark:border-gray-800" />
  260. <div>
  261. <div class=" mb-1 text-sm font-medium">{$i18n.t('TTS Settings')}</div>
  262. <div class=" py-0.5 flex w-full justify-between">
  263. <div class=" self-center text-xs font-medium">{$i18n.t('Text-to-Speech Engine')}</div>
  264. <div class="flex items-center relative">
  <!--
    TTS engine picker. Changing the engine resets voice and model to that
    engine's defaults, saves the configuration, and then reloads the voice
    and model lists.
  -->
  <select
    class=" dark:bg-gray-900 w-fit pr-8 cursor-pointer rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
    bind:value={TTS_ENGINE}
    placeholder="Select a mode"
    on:change={async (e) => {
      // Read the selection before the first await: currentTarget is only
      // populated while the event is being dispatched.
      const engine = e.currentTarget.value;

      // Apply the defaults before saving, so the stored configuration does
      // not pair the new engine with the previous engine's voice and model.
      if (engine === 'openai') {
        TTS_VOICE = 'alloy';
        TTS_MODEL = 'tts-1';
      } else {
        TTS_VOICE = '';
        TTS_MODEL = '';
      }

      await updateConfigHandler();
      await getVoices();
      await getModels();
    }}
  >
    <option value="">{$i18n.t('Web API')}</option>
    <option value="openai">{$i18n.t('OpenAI')}</option>
    <option value="elevenlabs">{$i18n.t('ElevenLabs')}</option>
    <option value="azure">{$i18n.t('Azure AI Speech')}</option>
  </select>
  287. </div>
  288. </div>
  289. {#if TTS_ENGINE === 'openai'}
  290. <div>
  291. <div class="mt-1 flex gap-2 mb-1">
  292. <input
  293. class="flex-1 w-full rounded-lg py-2 pl-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
  294. placeholder={$i18n.t('API Base URL')}
  295. bind:value={TTS_OPENAI_API_BASE_URL}
  296. required
  297. />
  298. <SensitiveInput placeholder={$i18n.t('API Key')} bind:value={TTS_OPENAI_API_KEY} />
  299. </div>
  300. </div>
  301. {:else if TTS_ENGINE === 'elevenlabs'}
  302. <div>
  303. <div class="mt-1 flex gap-2 mb-1">
  304. <input
  305. class="flex-1 w-full rounded-lg py-2 pl-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
  306. placeholder={$i18n.t('API Key')}
  307. bind:value={TTS_API_KEY}
  308. required
  309. />
  310. </div>
  311. </div>
  312. {:else if TTS_ENGINE === 'azure'}
  313. <div>
  314. <div class="mt-1 flex gap-2 mb-1">
  315. <input
  316. class="flex-1 w-full rounded-lg py-2 pl-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
  317. placeholder={$i18n.t('API Key')}
  318. bind:value={TTS_API_KEY}
  319. required
  320. />
  321. <input
  322. class="flex-1 w-full rounded-lg py-2 pl-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
  323. placeholder={$i18n.t('Azure Region')}
  324. bind:value={TTS_AZURE_SPEECH_REGION}
  325. required
  326. />
  327. </div>
  328. </div>
  329. {/if}
  330. <hr class=" dark:border-gray-850 my-2" />
  331. {#if TTS_ENGINE === ''}
  332. <div>
  333. <div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div>
  334. <div class="flex w-full">
  335. <div class="flex-1">
  <!--
    Voice picker for the "Web API" engine. The empty value is the "Default"
    entry and is the selected one exactly when no voice has been chosen.
  -->
  <select
    class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
    bind:value={TTS_VOICE}
  >
    <option value="" selected={TTS_VOICE === ''}>{$i18n.t('Default')}</option>
    {#each voices as voice}
      <option
        value={voice.voiceURI}
        class="bg-gray-100 dark:bg-gray-700"
        selected={TTS_VOICE === voice.voiceURI}
        >{voice.name.replace('+', ', ')}</option
      >
    {/each}
  </select>
  350. </div>
  351. </div>
  352. </div>
  353. {:else if TTS_ENGINE === 'openai'}
  354. <div class=" flex gap-2">
  355. <div class="w-full">
  356. <div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div>
  357. <div class="flex w-full">
  358. <div class="flex-1">
  359. <input
  360. list="voice-list"
  361. class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
  362. bind:value={TTS_VOICE}
  363. placeholder="Select a voice"
  364. />
  365. <datalist id="voice-list">
  366. {#each voices as voice}
  367. <option value={voice.id}>{voice.name}</option>
  368. {/each}
  369. </datalist>
  370. </div>
  371. </div>
  372. </div>
  373. <div class="w-full">
  374. <div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Model')}</div>
  375. <div class="flex w-full">
  376. <div class="flex-1">
  377. <input
  378. list="tts-model-list"
  379. class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
  380. bind:value={TTS_MODEL}
  381. placeholder="Select a model"
  382. />
  383. <datalist id="tts-model-list">
  384. {#each models as model}
  385. <option value={model.id} />
  386. {/each}
  387. </datalist>
  388. </div>
  389. </div>
  390. </div>
  391. </div>
  392. {:else if TTS_ENGINE === 'elevenlabs'}
  393. <div class=" flex gap-2">
  394. <div class="w-full">
  395. <div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div>
  396. <div class="flex w-full">
  397. <div class="flex-1">
  398. <input
  399. list="voice-list"
  400. class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
  401. bind:value={TTS_VOICE}
  402. placeholder="Select a voice"
  403. />
  404. <datalist id="voice-list">
  405. {#each voices as voice}
  406. <option value={voice.id}>{voice.name}</option>
  407. {/each}
  408. </datalist>
  409. </div>
  410. </div>
  411. </div>
  412. <div class="w-full">
  413. <div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Model')}</div>
  414. <div class="flex w-full">
  415. <div class="flex-1">
  416. <input
  417. list="tts-model-list"
  418. class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
  419. bind:value={TTS_MODEL}
  420. placeholder="Select a model"
  421. />
  422. <datalist id="tts-model-list">
  423. {#each models as model}
  424. <option value={model.id} />
  425. {/each}
  426. </datalist>
  427. </div>
  428. </div>
  429. </div>
  430. </div>
  431. {:else if TTS_ENGINE === 'azure'}
  432. <div class=" flex gap-2">
  433. <div class="w-full">
  434. <div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div>
  435. <div class="flex w-full">
  436. <div class="flex-1">
  437. <input
  438. list="voice-list"
  439. class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
  440. bind:value={TTS_VOICE}
  441. placeholder="Select a voice"
  442. />
  443. <datalist id="voice-list">
  444. {#each voices as voice}
  445. <option value={voice.id}>{voice.name}</option>
  446. {/each}
  447. </datalist>
  448. </div>
  449. </div>
  450. </div>
  451. <div class="w-full">
  452. <div class=" mb-1.5 text-sm font-medium">
  453. {$i18n.t('Output format')}
  454. <a
  455. href="https://learn.microsoft.com/en-us/azure/ai-services/speech-service/rest-text-to-speech?tabs=streaming#audio-outputs"
  456. target="_blank"
  457. >
  458. <small>{$i18n.t('Available list')}</small>
  459. </a>
  460. </div>
  461. <div class="flex w-full">
  462. <div class="flex-1">
  <!--
    Free-text Azure output format. It carries no `list` attribute because the
    Azure branch renders no datalist of formats to attach it to.
  -->
  <input
    class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
    bind:value={TTS_AZURE_SPEECH_OUTPUT_FORMAT}
    placeholder="Select a output format"
  />
  469. </div>
  470. </div>
  471. </div>
  472. </div>
  473. {/if}
  474. <hr class="dark:border-gray-850 my-2" />
  475. <div class="pt-0.5 flex w-full justify-between">
  476. <div class="self-center text-xs font-medium">{$i18n.t('Response splitting')}</div>
  477. <div class="flex items-center relative">
  478. <select
  479. class="dark:bg-gray-900 w-fit pr-8 cursor-pointer rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
  480. aria-label="Select how to split message text for TTS requests"
  481. bind:value={TTS_SPLIT_ON}
  482. >
  483. {#each Object.values(TTS_RESPONSE_SPLIT) as split}
  484. <option value={split}
  485. >{$i18n.t(split.charAt(0).toUpperCase() + split.slice(1))}</option
  486. >
  487. {/each}
  488. </select>
  489. </div>
  490. </div>
  491. <div class="mt-2 mb-1 text-xs text-gray-400 dark:text-gray-500">
  492. {$i18n.t(
  493. "Control how message text is split for TTS requests. 'Punctuation' splits into sentences, 'paragraphs' splits into paragraphs, and 'none' keeps the message as a single string."
  494. )}
  495. </div>
  496. </div>
  497. </div>
  498. </div>
  499. <div class="flex justify-end text-sm font-medium">
  500. <button
  501. class="px-3 py-1.5 text-sm font-medium bg-black hover:bg-gray-900 text-white dark:bg-white dark:text-black dark:hover:bg-gray-100 transition rounded-full"
  502. type="submit"
  503. >
  504. {$i18n.t('Save')}
  505. </button>
  506. </div>
  507. </form>