Audio.svelte 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469
  1. <script lang="ts">
  2. import { toast } from 'svelte-sonner';
  3. import { createEventDispatcher, onMount, getContext } from 'svelte';
  4. const dispatch = createEventDispatcher();
  5. import { getBackendConfig } from '$lib/apis';
  6. import {
  7. getAudioConfig,
  8. updateAudioConfig,
  9. getModels as _getModels,
  10. getVoices as _getVoices
  11. } from '$lib/apis/audio';
  12. import { config } from '$lib/stores';
  13. import SensitiveInput from '$lib/components/common/SensitiveInput.svelte';
  14. import { TTS_RESPONSE_SPLIT } from '$lib/types';
  15. import type { Writable } from 'svelte/store';
  16. import type { i18n as i18nType } from 'i18next';
  17. const i18n = getContext<Writable<i18nType>>('i18n');
  18. export let saveHandler: () => void;
  19. // Audio
  20. let TTS_OPENAI_API_BASE_URL = '';
  21. let TTS_OPENAI_API_KEY = '';
  22. let TTS_API_KEY = '';
  23. let TTS_ENGINE = '';
  24. let TTS_MODEL = '';
  25. let TTS_VOICE = '';
  26. let TTS_SPLIT_ON: TTS_RESPONSE_SPLIT = TTS_RESPONSE_SPLIT.PUNCTUATION;
  27. let TTS_AZURE_SPEECH_REGION = '';
  28. let TTS_AZURE_SPEECH_OUTPUT_FORMAT = '';
  29. let STT_OPENAI_API_BASE_URL = '';
  30. let STT_OPENAI_API_KEY = '';
  31. let STT_ENGINE = '';
  32. let STT_MODEL = '';
  33. // eslint-disable-next-line no-undef
  34. let voices: SpeechSynthesisVoice[] = [];
  35. let models: Awaited<ReturnType<typeof _getModels>>['models'] = [];
  36. const getModels = async () => {
  37. if (TTS_ENGINE === '') {
  38. models = [];
  39. } else {
  40. const res = await _getModels(localStorage.token).catch((e) => {
  41. toast.error(e);
  42. });
  43. if (res) {
  44. console.log(res);
  45. models = res.models;
  46. }
  47. }
  48. };
  49. const getVoices = async () => {
  50. if (TTS_ENGINE === '') {
  51. const getVoicesLoop = setInterval(() => {
  52. voices = speechSynthesis.getVoices();
  53. // do your loop
  54. if (voices.length > 0) {
  55. clearInterval(getVoicesLoop);
  56. voices.sort((a, b) => a.name.localeCompare(b.name, $i18n.resolvedLanguage));
  57. }
  58. }, 100);
  59. } else {
  60. const res = await _getVoices(localStorage.token).catch((e) => {
  61. toast.error(e);
  62. });
  63. if (res) {
  64. console.log(res);
  65. voices = res.voices;
  66. voices.sort((a, b) => a.name.localeCompare(b.name, $i18n.resolvedLanguage));
  67. }
  68. }
  69. };
  70. const updateConfigHandler = async () => {
  71. const res = await updateAudioConfig(localStorage.token, {
  72. tts: {
  73. OPENAI_API_BASE_URL: TTS_OPENAI_API_BASE_URL,
  74. OPENAI_API_KEY: TTS_OPENAI_API_KEY,
  75. API_KEY: TTS_API_KEY,
  76. ENGINE: TTS_ENGINE,
  77. MODEL: TTS_MODEL,
  78. VOICE: TTS_VOICE,
  79. SPLIT_ON: TTS_SPLIT_ON,
  80. AZURE_SPEECH_REGION: TTS_AZURE_SPEECH_REGION,
  81. AZURE_SPEECH_OUTPUT_FORMAT: TTS_AZURE_SPEECH_OUTPUT_FORMAT
  82. },
  83. stt: {
  84. OPENAI_API_BASE_URL: STT_OPENAI_API_BASE_URL,
  85. OPENAI_API_KEY: STT_OPENAI_API_KEY,
  86. ENGINE: STT_ENGINE,
  87. MODEL: STT_MODEL
  88. }
  89. });
  90. if (res) {
  91. saveHandler();
  92. getBackendConfig()
  93. .then(config.set)
  94. .catch(() => {});
  95. }
  96. };
  97. onMount(async () => {
  98. const res = await getAudioConfig(localStorage.token);
  99. if (res) {
  100. console.log(res);
  101. TTS_OPENAI_API_BASE_URL = res.tts.OPENAI_API_BASE_URL;
  102. TTS_OPENAI_API_KEY = res.tts.OPENAI_API_KEY;
  103. TTS_API_KEY = res.tts.API_KEY;
  104. TTS_ENGINE = res.tts.ENGINE;
  105. TTS_MODEL = res.tts.MODEL;
  106. TTS_VOICE = res.tts.VOICE;
  107. TTS_SPLIT_ON = res.tts.SPLIT_ON || TTS_RESPONSE_SPLIT.PUNCTUATION;
  108. TTS_AZURE_SPEECH_OUTPUT_FORMAT = res.tts.AZURE_SPEECH_OUTPUT_FORMAT;
  109. TTS_AZURE_SPEECH_REGION = res.tts.AZURE_SPEECH_REGION;
  110. STT_OPENAI_API_BASE_URL = res.stt.OPENAI_API_BASE_URL;
  111. STT_OPENAI_API_KEY = res.stt.OPENAI_API_KEY;
  112. STT_ENGINE = res.stt.ENGINE;
  113. STT_MODEL = res.stt.MODEL;
  114. }
  115. await getVoices();
  116. await getModels();
  117. });
  118. </script>
  119. <form
  120. class="flex flex-col h-full justify-between space-y-3 text-sm"
  121. on:submit|preventDefault={async () => {
  122. await updateConfigHandler();
  123. dispatch('save');
  124. }}
  125. >
  126. <div class=" space-y-3 overflow-y-scroll scrollbar-hidden h-full">
  127. <div class="flex flex-col gap-3">
  128. <div>
  129. <div class=" mb-1 text-sm font-medium">{$i18n.t('STT Settings')}</div>
  130. <div class=" py-0.5 flex w-full justify-between">
  131. <div class=" self-center text-xs font-medium">{$i18n.t('Speech-to-Text Engine')}</div>
  132. <div class="flex items-center relative">
  133. <select
  134. class="dark:bg-gray-900 cursor-pointer w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
  135. bind:value={STT_ENGINE}
  136. placeholder="Select an engine"
  137. >
  138. <option value="">{$i18n.t('Whisper (Local)')}</option>
  139. <option value="openai">OpenAI</option>
  140. <option value="web">{$i18n.t('Web API')}</option>
  141. </select>
  142. </div>
  143. </div>
  144. {#if STT_ENGINE === 'openai'}
  145. <div>
  146. <div class="mt-1 flex gap-2 mb-1">
  147. <input
  148. class="flex-1 w-full rounded-lg py-2 pl-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
  149. placeholder={$i18n.t('API Base URL')}
  150. bind:value={STT_OPENAI_API_BASE_URL}
  151. required
  152. />
  153. <SensitiveInput placeholder={$i18n.t('API Key')} bind:value={STT_OPENAI_API_KEY} />
  154. </div>
  155. </div>
  156. <hr class=" dark:border-gray-850 my-2" />
  157. <div>
  158. <div class=" mb-1.5 text-sm font-medium">{$i18n.t('STT Model')}</div>
  159. <div class="flex w-full">
  160. <div class="flex-1">
  161. <input
  162. list="model-list"
  163. class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
  164. bind:value={STT_MODEL}
  165. placeholder="Select a model"
  166. />
  167. <datalist id="model-list">
  168. <option value="whisper-1" />
  169. </datalist>
  170. </div>
  171. </div>
  172. </div>
  173. {/if}
  174. </div>
  175. <hr class=" dark:border-gray-800" />
  176. <div>
  177. <div class=" mb-1 text-sm font-medium">{$i18n.t('TTS Settings')}</div>
  178. <div class=" py-0.5 flex w-full justify-between">
  179. <div class=" self-center text-xs font-medium">{$i18n.t('Text-to-Speech Engine')}</div>
  180. <div class="flex items-center relative">
  181. <select
  182. class=" dark:bg-gray-900 w-fit pr-8 cursor-pointer rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
  183. bind:value={TTS_ENGINE}
  184. placeholder="Select a mode"
  185. on:change={async (e) => {
  186. await updateConfigHandler();
  187. await getVoices();
  188. await getModels();
  189. if (e.target?.value === 'openai') {
  190. TTS_VOICE = 'alloy';
  191. TTS_MODEL = 'tts-1';
  192. } else {
  193. TTS_VOICE = '';
  194. TTS_MODEL = '';
  195. }
  196. }}
  197. >
  198. <option value="">{$i18n.t('Web API')}</option>
  199. <option value="openai">{$i18n.t('OpenAI')}</option>
  200. <option value="elevenlabs">{$i18n.t('ElevenLabs')}</option>
  201. <option value="azure">{$i18n.t('Azure AI Speech')}</option>
  202. </select>
  203. </div>
  204. </div>
  205. {#if TTS_ENGINE === 'openai'}
  206. <div>
  207. <div class="mt-1 flex gap-2 mb-1">
  208. <input
  209. class="flex-1 w-full rounded-lg py-2 pl-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
  210. placeholder={$i18n.t('API Base URL')}
  211. bind:value={TTS_OPENAI_API_BASE_URL}
  212. required
  213. />
  214. <SensitiveInput placeholder={$i18n.t('API Key')} bind:value={TTS_OPENAI_API_KEY} />
  215. </div>
  216. </div>
  217. {:else if TTS_ENGINE === 'elevenlabs'}
  218. <div>
  219. <div class="mt-1 flex gap-2 mb-1">
  220. <input
  221. class="flex-1 w-full rounded-lg py-2 pl-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
  222. placeholder={$i18n.t('API Key')}
  223. bind:value={TTS_API_KEY}
  224. required
  225. />
  226. </div>
  227. </div>
  228. {:else if TTS_ENGINE === 'azure'}
  229. <div>
  230. <div class="mt-1 flex gap-2 mb-1">
  231. <input
  232. class="flex-1 w-full rounded-lg py-2 pl-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
  233. placeholder={$i18n.t('API Key')}
  234. bind:value={TTS_API_KEY}
  235. required
  236. />
  237. <input
  238. class="flex-1 w-full rounded-lg py-2 pl-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
  239. placeholder={$i18n.t('Azure Region')}
  240. bind:value={TTS_AZURE_SPEECH_REGION}
  241. required
  242. />
  243. </div>
  244. </div>
  245. {/if}
  246. <hr class=" dark:border-gray-850 my-2" />
  247. {#if TTS_ENGINE === ''}
  248. <div>
  249. <div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div>
  250. <div class="flex w-full">
  251. <div class="flex-1">
  252. <select
  253. class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
  254. bind:value={TTS_VOICE}
  255. >
  256. <option value="" selected={TTS_VOICE !== ''}>{$i18n.t('Default')}</option>
  257. {#each voices as voice}
  258. <option
  259. value={voice.voiceURI}
  260. class="bg-gray-100 dark:bg-gray-700"
  261. selected={TTS_VOICE === voice.voiceURI}
  262. >{voice.name.replace('+', ', ')}</option
  263. >
  264. {/each}
  265. </select>
  266. </div>
  267. </div>
  268. </div>
  269. {:else if TTS_ENGINE === 'openai'}
  270. <div class=" flex gap-2">
  271. <div class="w-full">
  272. <div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div>
  273. <div class="flex w-full">
  274. <div class="flex-1">
  275. <input
  276. list="voice-list"
  277. class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
  278. bind:value={TTS_VOICE}
  279. placeholder="Select a voice"
  280. />
  281. <datalist id="voice-list">
  282. {#each voices as voice}
  283. <option value={voice.id}>{voice.name}</option>
  284. {/each}
  285. </datalist>
  286. </div>
  287. </div>
  288. </div>
  289. <div class="w-full">
  290. <div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Model')}</div>
  291. <div class="flex w-full">
  292. <div class="flex-1">
  293. <input
  294. list="tts-model-list"
  295. class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
  296. bind:value={TTS_MODEL}
  297. placeholder="Select a model"
  298. />
  299. <datalist id="tts-model-list">
  300. {#each models as model}
  301. <option value={model.id} />
  302. {/each}
  303. </datalist>
  304. </div>
  305. </div>
  306. </div>
  307. </div>
  308. {:else if TTS_ENGINE === 'elevenlabs'}
  309. <div class=" flex gap-2">
  310. <div class="w-full">
  311. <div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div>
  312. <div class="flex w-full">
  313. <div class="flex-1">
  314. <input
  315. list="voice-list"
  316. class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
  317. bind:value={TTS_VOICE}
  318. placeholder="Select a voice"
  319. />
  320. <datalist id="voice-list">
  321. {#each voices as voice}
  322. <option value={voice.id}>{voice.name}</option>
  323. {/each}
  324. </datalist>
  325. </div>
  326. </div>
  327. </div>
  328. <div class="w-full">
  329. <div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Model')}</div>
  330. <div class="flex w-full">
  331. <div class="flex-1">
  332. <input
  333. list="tts-model-list"
  334. class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
  335. bind:value={TTS_MODEL}
  336. placeholder="Select a model"
  337. />
  338. <datalist id="tts-model-list">
  339. {#each models as model}
  340. <option value={model.id} />
  341. {/each}
  342. </datalist>
  343. </div>
  344. </div>
  345. </div>
  346. </div>
  347. {:else if TTS_ENGINE === 'azure'}
  348. <div class=" flex gap-2">
  349. <div class="w-full">
  350. <div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div>
  351. <div class="flex w-full">
  352. <div class="flex-1">
  353. <input
  354. list="voice-list"
  355. class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
  356. bind:value={TTS_VOICE}
  357. placeholder="Select a voice"
  358. />
  359. <datalist id="voice-list">
  360. {#each voices as voice}
  361. <option value={voice.id}>{voice.name}</option>
  362. {/each}
  363. </datalist>
  364. </div>
  365. </div>
  366. </div>
  367. <div class="w-full">
  368. <div class=" mb-1.5 text-sm font-medium">
  369. {$i18n.t('Output format')}
  370. <a
  371. href="https://learn.microsoft.com/en-us/azure/ai-services/speech-service/rest-text-to-speech?tabs=streaming#audio-outputs"
  372. target="_blank"
  373. >
  374. <small>{$i18n.t('Available list')}</small>
  375. </a>
  376. </div>
  377. <div class="flex w-full">
  378. <div class="flex-1">
  379. <input
  380. list="tts-model-list"
  381. class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
  382. bind:value={TTS_AZURE_SPEECH_OUTPUT_FORMAT}
  383. placeholder="Select a output format"
  384. />
  385. </div>
  386. </div>
  387. </div>
  388. </div>
  389. {/if}
  390. <hr class="dark:border-gray-850 my-2" />
  391. <div class="pt-0.5 flex w-full justify-between">
  392. <div class="self-center text-xs font-medium">{$i18n.t('Response splitting')}</div>
  393. <div class="flex items-center relative">
  394. <select
  395. class="dark:bg-gray-900 w-fit pr-8 cursor-pointer rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
  396. aria-label="Select how to split message text for TTS requests"
  397. bind:value={TTS_SPLIT_ON}
  398. >
  399. {#each Object.values(TTS_RESPONSE_SPLIT) as split}
  400. <option value={split}
  401. >{$i18n.t(split.charAt(0).toUpperCase() + split.slice(1))}</option
  402. >
  403. {/each}
  404. </select>
  405. </div>
  406. </div>
  407. <div class="mt-2 mb-1 text-xs text-gray-400 dark:text-gray-500">
  408. {$i18n.t(
  409. "Control how message text is split for TTS requests. 'Punctuation' splits into sentences, 'paragraphs' splits into paragraphs, and 'none' keeps the message as a single string."
  410. )}
  411. </div>
  412. </div>
  413. </div>
  414. </div>
  415. <div class="flex justify-end text-sm font-medium">
  416. <button
  417. class=" px-4 py-2 bg-emerald-700 hover:bg-emerald-800 text-gray-100 transition rounded-lg"
  418. type="submit"
  419. >
  420. {$i18n.t('Save')}
  421. </button>
  422. </div>
  423. </form>