<!-- CallOverlay.svelte -->
  1. <script lang="ts">
  2. import { settings, showCallOverlay } from '$lib/stores';
  3. import { onMount, tick, getContext } from 'svelte';
  4. import { blobToFile, calculateSHA256, extractSentences, findWordIndices } from '$lib/utils';
  5. import { synthesizeOpenAISpeech, transcribeAudio } from '$lib/apis/audio';
  6. import { toast } from 'svelte-sonner';
  7. import Tooltip from '$lib/components/common/Tooltip.svelte';
  8. const i18n = getContext('i18n');
  9. export let submitPrompt: Function;
  10. let loading = false;
  11. let confirmed = false;
  12. let assistantSpeaking = false;
  13. let assistantAudio = {};
  14. let assistantAudioIdx = null;
  15. let rmsLevel = 0;
  16. let hasStartedSpeaking = false;
  17. let audioContext;
  18. let analyser;
  19. let dataArray;
  20. let audioElement;
  21. let animationFrameId;
  22. let speechRecognition;
  23. let currentUtterance = null;
  24. let mediaRecorder;
  25. let audioChunks = [];
  26. const MIN_DECIBELS = -45;
  27. const VISUALIZER_BUFFER_LENGTH = 300;
  28. let visualizerData = Array(VISUALIZER_BUFFER_LENGTH).fill(0);
  29. const startAudio = () => {
  30. audioContext = new (window.AudioContext || window.webkitAudioContext)();
  31. analyser = audioContext.createAnalyser();
  32. const source = audioContext.createMediaElementSource(audioElement);
  33. source.connect(analyser);
  34. analyser.connect(audioContext.destination);
  35. analyser.fftSize = 32; // Adjust the fftSize
  36. dataArray = new Uint8Array(analyser.frequencyBinCount);
  37. visualize();
  38. };
  39. const visualize = () => {
  40. analyser.getByteFrequencyData(dataArray);
  41. div1Height = dataArray[1] / 2;
  42. div2Height = dataArray[3] / 2;
  43. div3Height = dataArray[5] / 2;
  44. div4Height = dataArray[7] / 2;
  45. animationFrameId = requestAnimationFrame(visualize);
  46. };
  47. // Function to calculate the RMS level from time domain data
  48. const calculateRMS = (data: Uint8Array) => {
  49. let sumSquares = 0;
  50. for (let i = 0; i < data.length; i++) {
  51. const normalizedValue = (data[i] - 128) / 128; // Normalize the data
  52. sumSquares += normalizedValue * normalizedValue;
  53. }
  54. return Math.sqrt(sumSquares / data.length);
  55. };
  56. const normalizeRMS = (rms) => {
  57. rms = rms * 10;
  58. const exp = 1.5; // Adjust exponent value; values greater than 1 expand larger numbers more and compress smaller numbers more
  59. const scaledRMS = Math.pow(rms, exp);
  60. // Scale between 0.01 (1%) and 1.0 (100%)
  61. return Math.min(1.0, Math.max(0.01, scaledRMS));
  62. };
  63. const analyseAudio = (stream) => {
  64. const audioContext = new AudioContext();
  65. const audioStreamSource = audioContext.createMediaStreamSource(stream);
  66. const analyser = audioContext.createAnalyser();
  67. analyser.minDecibels = MIN_DECIBELS;
  68. audioStreamSource.connect(analyser);
  69. const bufferLength = analyser.frequencyBinCount;
  70. const domainData = new Uint8Array(bufferLength);
  71. const timeDomainData = new Uint8Array(analyser.fftSize);
  72. let lastSoundTime = Date.now();
  73. hasStartedSpeaking = false;
  74. const detectSound = () => {
  75. const processFrame = () => {
  76. if (!mediaRecorder || !$showCallOverlay) {
  77. if (mediaRecorder) {
  78. mediaRecorder.stop();
  79. }
  80. return;
  81. }
  82. analyser.getByteTimeDomainData(timeDomainData);
  83. analyser.getByteFrequencyData(domainData);
  84. // Calculate RMS level from time domain data
  85. rmsLevel = calculateRMS(timeDomainData);
  86. // Check if initial speech/noise has started
  87. const hasSound = domainData.some((value) => value > 0);
  88. if (hasSound) {
  89. stopAllAudio();
  90. hasStartedSpeaking = true;
  91. lastSoundTime = Date.now();
  92. }
  93. // Start silence detection only after initial speech/noise has been detected
  94. if (hasStartedSpeaking) {
  95. if (Date.now() - lastSoundTime > 2000) {
  96. confirmed = true;
  97. if (mediaRecorder) {
  98. mediaRecorder.stop();
  99. }
  100. }
  101. }
  102. window.requestAnimationFrame(processFrame);
  103. };
  104. window.requestAnimationFrame(processFrame);
  105. };
  106. detectSound();
  107. };
  108. const stopAllAudio = () => {
  109. if (currentUtterance) {
  110. speechSynthesis.cancel();
  111. currentUtterance = null;
  112. }
  113. if (assistantAudio[assistantAudioIdx]) {
  114. assistantAudio[assistantAudioIdx].pause();
  115. assistantAudio[assistantAudioIdx].currentTime = 0;
  116. }
  117. const audioElement = document.getElementById('audioElement');
  118. audioElement.pause();
  119. audioElement.currentTime = 0;
  120. assistantSpeaking = false;
  121. };
  122. const playAudio = (idx) => {
  123. return new Promise((res) => {
  124. assistantAudioIdx = idx;
  125. const audioElement = document.getElementById('audioElement');
  126. const audio = assistantAudio[idx];
  127. audioElement.src = audio.src; // Assume `assistantAudio` has objects with a `src` property
  128. audioElement.play();
  129. audioElement.onended = async (e) => {
  130. await new Promise((r) => setTimeout(r, 300));
  131. if (Object.keys(assistantAudio).length - 1 === idx) {
  132. assistantSpeaking = false;
  133. }
  134. res(e);
  135. };
  136. });
  137. };
  138. const getOpenAISpeech = async (text) => {
  139. const res = await synthesizeOpenAISpeech(
  140. localStorage.token,
  141. $settings?.audio?.speaker ?? 'alloy',
  142. text,
  143. $settings?.audio?.model ?? 'tts-1'
  144. ).catch((error) => {
  145. toast.error(error);
  146. assistantSpeaking = false;
  147. return null;
  148. });
  149. if (res) {
  150. const blob = await res.blob();
  151. const blobUrl = URL.createObjectURL(blob);
  152. const audio = new Audio(blobUrl);
  153. assistantAudio = audio;
  154. }
  155. };
  156. const transcribeHandler = async (audioBlob) => {
  157. // Create a blob from the audio chunks
  158. await tick();
  159. const file = blobToFile(audioBlob, 'recording.wav');
  160. const res = await transcribeAudio(localStorage.token, file).catch((error) => {
  161. toast.error(error);
  162. return null;
  163. });
  164. if (res) {
  165. console.log(res.text);
  166. if (res.text !== '') {
  167. const _responses = await submitPrompt(res.text);
  168. console.log(_responses);
  169. if (_responses.at(0)) {
  170. const content = _responses[0];
  171. if (content) {
  172. assistantSpeakingHandler(content);
  173. }
  174. }
  175. }
  176. }
  177. };
  178. const assistantSpeakingHandler = async (content) => {
  179. assistantSpeaking = true;
  180. if (($settings?.audio?.TTSEngine ?? '') == '') {
  181. currentUtterance = new SpeechSynthesisUtterance(content);
  182. speechSynthesis.speak(currentUtterance);
  183. } else if ($settings?.audio?.TTSEngine === 'openai') {
  184. console.log('openai');
  185. const sentences = extractSentences(content).reduce((mergedTexts, currentText) => {
  186. const lastIndex = mergedTexts.length - 1;
  187. if (lastIndex >= 0) {
  188. const previousText = mergedTexts[lastIndex];
  189. const wordCount = previousText.split(/\s+/).length;
  190. if (wordCount < 2) {
  191. mergedTexts[lastIndex] = previousText + ' ' + currentText;
  192. } else {
  193. mergedTexts.push(currentText);
  194. }
  195. } else {
  196. mergedTexts.push(currentText);
  197. }
  198. return mergedTexts;
  199. }, []);
  200. console.log(sentences);
  201. let lastPlayedAudioPromise = Promise.resolve(); // Initialize a promise that resolves immediately
  202. for (const [idx, sentence] of sentences.entries()) {
  203. const res = await synthesizeOpenAISpeech(
  204. localStorage.token,
  205. $settings?.audio?.speaker,
  206. sentence,
  207. $settings?.audio?.model
  208. ).catch((error) => {
  209. toast.error(error);
  210. assistantSpeaking = false;
  211. return null;
  212. });
  213. if (res) {
  214. const blob = await res.blob();
  215. const blobUrl = URL.createObjectURL(blob);
  216. const audio = new Audio(blobUrl);
  217. assistantAudio[idx] = audio;
  218. lastPlayedAudioPromise = lastPlayedAudioPromise.then(() => playAudio(idx));
  219. }
  220. }
  221. }
  222. };
  223. const stopRecordingCallback = async () => {
  224. if ($showCallOverlay) {
  225. if (confirmed) {
  226. loading = true;
  227. const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
  228. await transcribeHandler(audioBlob);
  229. confirmed = false;
  230. loading = false;
  231. }
  232. audioChunks = [];
  233. mediaRecorder = false;
  234. startRecording();
  235. } else {
  236. audioChunks = [];
  237. mediaRecorder = false;
  238. }
  239. };
  240. const startRecording = async () => {
  241. const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
  242. mediaRecorder = new MediaRecorder(stream);
  243. mediaRecorder.onstart = () => {
  244. console.log('Recording started');
  245. audioChunks = [];
  246. analyseAudio(stream);
  247. };
  248. mediaRecorder.ondataavailable = (event) => {
  249. if (hasStartedSpeaking) {
  250. audioChunks.push(event.data);
  251. }
  252. };
  253. mediaRecorder.onstop = async () => {
  254. console.log('Recording stopped');
  255. await stopRecordingCallback();
  256. };
  257. mediaRecorder.start();
  258. };
  259. $: if ($showCallOverlay) {
  260. startRecording();
  261. }
  262. </script>
  263. {#if $showCallOverlay}
  264. <audio id="audioElement" src="" style="display: none;" />
  265. <div class=" absolute w-full h-full flex z-[999]">
  266. <div
  267. class="absolute w-full h-full bg-white text-gray-700 dark:bg-black dark:text-gray-300 flex justify-center"
  268. >
  269. <div class="max-w-lg w-full h-screen flex flex-col justify-between p-6">
  270. <div>
  271. <!-- navbar -->
  272. </div>
  273. <div class="flex justify-center items-center w-ull">
  274. {#if loading}
  275. <svg
  276. class="size-44 text-gray-900 dark:text-gray-400"
  277. viewBox="0 0 24 24"
  278. fill="currentColor"
  279. xmlns="http://www.w3.org/2000/svg"
  280. ><style>
  281. .spinner_qM83 {
  282. animation: spinner_8HQG 1.05s infinite;
  283. }
  284. .spinner_oXPr {
  285. animation-delay: 0.1s;
  286. }
  287. .spinner_ZTLf {
  288. animation-delay: 0.2s;
  289. }
  290. @keyframes spinner_8HQG {
  291. 0%,
  292. 57.14% {
  293. animation-timing-function: cubic-bezier(0.33, 0.66, 0.66, 1);
  294. transform: translate(0);
  295. }
  296. 28.57% {
  297. animation-timing-function: cubic-bezier(0.33, 0, 0.66, 0.33);
  298. transform: translateY(-6px);
  299. }
  300. 100% {
  301. transform: translate(0);
  302. }
  303. }
  304. </style><circle class="spinner_qM83" cx="4" cy="12" r="3" /><circle
  305. class="spinner_qM83 spinner_oXPr"
  306. cx="12"
  307. cy="12"
  308. r="3"
  309. /><circle class="spinner_qM83 spinner_ZTLf" cx="20" cy="12" r="3" /></svg
  310. >
  311. {:else}
  312. <div
  313. class=" {rmsLevel * 100 > 4
  314. ? ' size-52'
  315. : rmsLevel * 100 > 2
  316. ? 'size-48'
  317. : rmsLevel * 100 > 1
  318. ? 'size-[11.5rem]'
  319. : 'size-44'} transition-all bg-black dark:bg-white rounded-full"
  320. />
  321. {/if}
  322. </div>
  323. <div class="flex justify-between items-center pb-2 w-full">
  324. <div>
  325. <Tooltip content="WIP 🚧">
  326. <button class=" p-3 rounded-full bg-gray-50 dark:bg-gray-900">
  327. <svg
  328. xmlns="http://www.w3.org/2000/svg"
  329. fill="none"
  330. viewBox="0 0 24 24"
  331. stroke-width="1.5"
  332. stroke="currentColor"
  333. class="size-5"
  334. >
  335. <path
  336. stroke-linecap="round"
  337. stroke-linejoin="round"
  338. d="M6.827 6.175A2.31 2.31 0 0 1 5.186 7.23c-.38.054-.757.112-1.134.175C2.999 7.58 2.25 8.507 2.25 9.574V18a2.25 2.25 0 0 0 2.25 2.25h15A2.25 2.25 0 0 0 21.75 18V9.574c0-1.067-.75-1.994-1.802-2.169a47.865 47.865 0 0 0-1.134-.175 2.31 2.31 0 0 1-1.64-1.055l-.822-1.316a2.192 2.192 0 0 0-1.736-1.039 48.774 48.774 0 0 0-5.232 0 2.192 2.192 0 0 0-1.736 1.039l-.821 1.316Z"
  339. />
  340. <path
  341. stroke-linecap="round"
  342. stroke-linejoin="round"
  343. d="M16.5 12.75a4.5 4.5 0 1 1-9 0 4.5 4.5 0 0 1 9 0ZM18.75 10.5h.008v.008h-.008V10.5Z"
  344. />
  345. </svg>
  346. </button>
  347. </Tooltip>
  348. </div>
  349. <div>
  350. <button type="button">
  351. <div class=" line-clamp-1 text-sm font-medium">
  352. {#if loading}
  353. Thinking...
  354. {:else}
  355. Listening...
  356. {/if}
  357. </div>
  358. </button>
  359. </div>
  360. <div>
  361. <button
  362. class=" p-3 rounded-full bg-gray-50 dark:bg-gray-900"
  363. on:click={async () => {
  364. showCallOverlay.set(false);
  365. }}
  366. type="button"
  367. >
  368. <svg
  369. xmlns="http://www.w3.org/2000/svg"
  370. viewBox="0 0 20 20"
  371. fill="currentColor"
  372. class="size-5"
  373. >
  374. <path
  375. d="M6.28 5.22a.75.75 0 0 0-1.06 1.06L8.94 10l-3.72 3.72a.75.75 0 1 0 1.06 1.06L10 11.06l3.72 3.72a.75.75 0 1 0 1.06-1.06L11.06 10l3.72-3.72a.75.75 0 0 0-1.06-1.06L10 8.94 6.28 5.22Z"
  376. />
  377. </svg>
  378. </button>
  379. </div>
  380. </div>
  381. </div>
  382. </div>
  383. </div>
  384. {/if}