Преглед изворни кода

Merge pull request #9242 from NachoNoCheese/dev

feat: Add Deepgram STT Support
Timothy Jaeryang Baek пре 2 месеци
родитељ
комит
8c2164928a

+ 6 - 0
backend/open_webui/config.py

@@ -2012,6 +2012,12 @@ WHISPER_MODEL_AUTO_UPDATE = (
     and os.environ.get("WHISPER_MODEL_AUTO_UPDATE", "").lower() == "true"
     and os.environ.get("WHISPER_MODEL_AUTO_UPDATE", "").lower() == "true"
 )
 )
 
 
+# Add Deepgram configuration
+DEEPGRAM_API_KEY = PersistentConfig(
+    "DEEPGRAM_API_KEY",
+    "audio.stt.deepgram.api_key",
+    os.getenv("DEEPGRAM_API_KEY", ""),
+)
 
 
 AUDIO_STT_OPENAI_API_BASE_URL = PersistentConfig(
 AUDIO_STT_OPENAI_API_BASE_URL = PersistentConfig(
     "AUDIO_STT_OPENAI_API_BASE_URL",
     "AUDIO_STT_OPENAI_API_BASE_URL",

+ 2 - 0
backend/open_webui/main.py

@@ -130,6 +130,7 @@ from open_webui.config import (
     AUDIO_TTS_AZURE_SPEECH_REGION,
     AUDIO_TTS_AZURE_SPEECH_REGION,
     AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT,
     AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT,
     WHISPER_MODEL,
     WHISPER_MODEL,
+    DEEPGRAM_API_KEY,
     WHISPER_MODEL_AUTO_UPDATE,
     WHISPER_MODEL_AUTO_UPDATE,
     WHISPER_MODEL_DIR,
     WHISPER_MODEL_DIR,
     # Retrieval
     # Retrieval
@@ -611,6 +612,7 @@ app.state.config.STT_ENGINE = AUDIO_STT_ENGINE
 app.state.config.STT_MODEL = AUDIO_STT_MODEL
 app.state.config.STT_MODEL = AUDIO_STT_MODEL
 
 
 app.state.config.WHISPER_MODEL = WHISPER_MODEL
 app.state.config.WHISPER_MODEL = WHISPER_MODEL
+app.state.config.DEEPGRAM_API_KEY = DEEPGRAM_API_KEY
 
 
 app.state.config.TTS_OPENAI_API_BASE_URL = AUDIO_TTS_OPENAI_API_BASE_URL
 app.state.config.TTS_OPENAI_API_BASE_URL = AUDIO_TTS_OPENAI_API_BASE_URL
 app.state.config.TTS_OPENAI_API_KEY = AUDIO_TTS_OPENAI_API_KEY
 app.state.config.TTS_OPENAI_API_KEY = AUDIO_TTS_OPENAI_API_KEY

+ 64 - 0
backend/open_webui/routers/audio.py

@@ -11,6 +11,7 @@ from pydub.silence import split_on_silence
 import aiohttp
 import aiohttp
 import aiofiles
 import aiofiles
 import requests
 import requests
+import mimetypes
 
 
 from fastapi import (
 from fastapi import (
     Depends,
     Depends,
@@ -138,6 +139,7 @@ class STTConfigForm(BaseModel):
     ENGINE: str
     ENGINE: str
     MODEL: str
     MODEL: str
     WHISPER_MODEL: str
     WHISPER_MODEL: str
+    DEEPGRAM_API_KEY: str
 
 
 
 
 class AudioConfigUpdateForm(BaseModel):
 class AudioConfigUpdateForm(BaseModel):
@@ -165,6 +167,7 @@ async def get_audio_config(request: Request, user=Depends(get_admin_user)):
             "ENGINE": request.app.state.config.STT_ENGINE,
             "ENGINE": request.app.state.config.STT_ENGINE,
             "MODEL": request.app.state.config.STT_MODEL,
             "MODEL": request.app.state.config.STT_MODEL,
             "WHISPER_MODEL": request.app.state.config.WHISPER_MODEL,
             "WHISPER_MODEL": request.app.state.config.WHISPER_MODEL,
+            "DEEPGRAM_API_KEY": request.app.state.config.DEEPGRAM_API_KEY,
         },
         },
     }
     }
 
 
@@ -190,6 +193,7 @@ async def update_audio_config(
     request.app.state.config.STT_ENGINE = form_data.stt.ENGINE
     request.app.state.config.STT_ENGINE = form_data.stt.ENGINE
     request.app.state.config.STT_MODEL = form_data.stt.MODEL
     request.app.state.config.STT_MODEL = form_data.stt.MODEL
     request.app.state.config.WHISPER_MODEL = form_data.stt.WHISPER_MODEL
     request.app.state.config.WHISPER_MODEL = form_data.stt.WHISPER_MODEL
+    request.app.state.config.DEEPGRAM_API_KEY = form_data.stt.DEEPGRAM_API_KEY
 
 
     if request.app.state.config.STT_ENGINE == "":
     if request.app.state.config.STT_ENGINE == "":
         request.app.state.faster_whisper_model = set_faster_whisper_model(
         request.app.state.faster_whisper_model = set_faster_whisper_model(
@@ -214,6 +218,7 @@ async def update_audio_config(
             "ENGINE": request.app.state.config.STT_ENGINE,
             "ENGINE": request.app.state.config.STT_ENGINE,
             "MODEL": request.app.state.config.STT_MODEL,
             "MODEL": request.app.state.config.STT_MODEL,
             "WHISPER_MODEL": request.app.state.config.WHISPER_MODEL,
             "WHISPER_MODEL": request.app.state.config.WHISPER_MODEL,
+            "DEEPGRAM_API_KEY": request.app.state.config.DEEPGRAM_API_KEY,
         },
         },
     }
     }
 
 
@@ -521,6 +526,65 @@ def transcribe(request: Request, file_path):
 
 
             raise Exception(detail if detail else "Open WebUI: Server Connection Error")
             raise Exception(detail if detail else "Open WebUI: Server Connection Error")
 
 
+    elif request.app.state.config.STT_ENGINE == "deepgram":
+        try:
+            # Determine the MIME type of the file
+            mime, _ = mimetypes.guess_type(file_path)
+            if not mime:
+                mime = "audio/wav"  # fallback to wav if undetectable
+
+            # Read the audio file
+            with open(file_path, "rb") as f:
+                file_data = f.read()
+
+            # Build headers and parameters
+            headers = {
+                "Authorization": f"Token {request.app.state.config.DEEPGRAM_API_KEY}",
+                "Content-Type": mime,
+            }
+
+            # Add model if specified
+            params = {}
+            if request.app.state.config.STT_MODEL:
+                params["model"] = request.app.state.config.STT_MODEL
+
+            # Make request to Deepgram API
+            r = requests.post(
+                "https://api.deepgram.com/v1/listen",
+                headers=headers,
+                params=params,
+                data=file_data,
+            )
+            r.raise_for_status()
+            response_data = r.json()
+
+            # Extract transcript from Deepgram response
+            try:
+                transcript = response_data["results"]["channels"][0]["alternatives"][0].get("transcript", "")
+            except (KeyError, IndexError) as e:
+                log.error(f"Malformed response from Deepgram: {str(e)}")
+                raise Exception("Failed to parse Deepgram response - unexpected response format")
+            data = {"text": transcript.strip()}
+
+            # Save transcript
+            transcript_file = f"{file_dir}/{id}.json"
+            with open(transcript_file, "w") as f:
+                json.dump(data, f)
+
+            return data
+
+        except Exception as e:
+            log.exception(e)
+            detail = None
+            if r is not None:
+                try:
+                    res = r.json()
+                    if "error" in res:
+                        detail = f"External: {res['error'].get('message', '')}"
+                except Exception:
+                    detail = f"External: {e}"
+            raise Exception(detail if detail else "Open WebUI: Server Connection Error")
+
 
 
 def compress_audio(file_path):
 def compress_audio(file_path):
     if os.path.getsize(file_path) > MAX_FILE_SIZE:
     if os.path.getsize(file_path) > MAX_FILE_SIZE:

+ 36 - 1
src/lib/components/admin/Settings/Audio.svelte

@@ -39,6 +39,7 @@
 	let STT_ENGINE = '';
 	let STT_ENGINE = '';
 	let STT_MODEL = '';
 	let STT_MODEL = '';
 	let STT_WHISPER_MODEL = '';
 	let STT_WHISPER_MODEL = '';
+	let STT_DEEPGRAM_API_KEY = '';
 
 
 	let STT_WHISPER_MODEL_LOADING = false;
 	let STT_WHISPER_MODEL_LOADING = false;
 
 
@@ -103,7 +104,8 @@
 				OPENAI_API_KEY: STT_OPENAI_API_KEY,
 				OPENAI_API_KEY: STT_OPENAI_API_KEY,
 				ENGINE: STT_ENGINE,
 				ENGINE: STT_ENGINE,
 				MODEL: STT_MODEL,
 				MODEL: STT_MODEL,
-				WHISPER_MODEL: STT_WHISPER_MODEL
+				WHISPER_MODEL: STT_WHISPER_MODEL,
+				DEEPGRAM_API_KEY: STT_DEEPGRAM_API_KEY
 			}
 			}
 		});
 		});
 
 
@@ -143,6 +145,7 @@
 			STT_ENGINE = res.stt.ENGINE;
 			STT_ENGINE = res.stt.ENGINE;
 			STT_MODEL = res.stt.MODEL;
 			STT_MODEL = res.stt.MODEL;
 			STT_WHISPER_MODEL = res.stt.WHISPER_MODEL;
 			STT_WHISPER_MODEL = res.stt.WHISPER_MODEL;
+			STT_DEEPGRAM_API_KEY = res.stt.DEEPGRAM_API_KEY;
 		}
 		}
 
 
 		await getVoices();
 		await getVoices();
@@ -173,6 +176,7 @@
 							<option value="">{$i18n.t('Whisper (Local)')}</option>
 							<option value="">{$i18n.t('Whisper (Local)')}</option>
 							<option value="openai">OpenAI</option>
 							<option value="openai">OpenAI</option>
 							<option value="web">{$i18n.t('Web API')}</option>
 							<option value="web">{$i18n.t('Web API')}</option>
+							<option value="deepgram">Deepgram</option>
 						</select>
 						</select>
 					</div>
 					</div>
 				</div>
 				</div>
@@ -210,6 +214,37 @@
 							</div>
 							</div>
 						</div>
 						</div>
 					</div>
 					</div>
+				{:else if STT_ENGINE === 'deepgram'}
+					<div>
+						<div class="mt-1 flex gap-2 mb-1">
+							<SensitiveInput placeholder={$i18n.t('API Key')} bind:value={STT_DEEPGRAM_API_KEY} />
+						</div>
+					</div>
+
+					<hr class=" dark:border-gray-850 my-2" />
+
+					<div>
+						<div class=" mb-1.5 text-sm font-medium">{$i18n.t('STT Model')}</div>
+						<div class="flex w-full">
+							<div class="flex-1">
+								<input
+									class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
+									bind:value={STT_MODEL}
+									placeholder="Select a model (optional)"
+								/>
+							</div>
+						</div>
+						<div class="mt-2 mb-1 text-xs text-gray-400 dark:text-gray-500">
+							{$i18n.t('Leave model field empty to use the default model.')}
+							<a
+								class=" hover:underline dark:text-gray-200 text-gray-800"
+								href="https://developers.deepgram.com/docs/models"
+								target="_blank"
+							>
+								{$i18n.t('Click here to see available models.')}
+							</a>
+						</div>
+					</div>
 				{:else if STT_ENGINE === ''}
 				{:else if STT_ENGINE === ''}
 					<div>
 					<div>
 						<div class=" mb-1.5 text-sm font-medium">{$i18n.t('STT Model')}</div>
 						<div class=" mb-1.5 text-sm font-medium">{$i18n.t('STT Model')}</div>