Browse Source

Merge pull request #7422 from alpha-pet/feat-youtube-transscript-proxy

feat: Optional proxy setting for downloading YouTube transcripts
Timothy Jaeryang Baek 5 months ago
parent
commit
9f981db0b9

+ 19 - 2
backend/open_webui/apps/retrieval/loaders/youtube.py

@@ -1,7 +1,12 @@
+import logging
+
 from typing import Any, Dict, Generator, List, Optional, Sequence, Union
 from urllib.parse import parse_qs, urlparse
 from langchain_core.documents import Document
+from open_webui.env import SRC_LOG_LEVELS
 
+log = logging.getLogger(__name__)
+log.setLevel(SRC_LOG_LEVELS["RAG"])
 
 ALLOWED_SCHEMES = {"http", "https"}
 ALLOWED_NETLOCS = {
@@ -51,12 +56,14 @@ class YoutubeLoader:
         self,
         video_id: str,
         language: Union[str, Sequence[str]] = "en",
+        proxy_url: Optional[str] = None,
     ):
         """Initialize with YouTube video ID."""
         _video_id = _parse_video_id(video_id)
         self.video_id = _video_id if _video_id is not None else video_id
         self._metadata = {"source": video_id}
         self.language = language
+        self.proxy_url = proxy_url
         if isinstance(language, str):
             self.language = [language]
         else:
@@ -76,10 +83,20 @@ class YoutubeLoader:
                 "Please install it with `pip install youtube-transcript-api`."
             )
 
+        if self.proxy_url:
+            youtube_proxies = {
+                'http': self.proxy_url,
+                'https': self.proxy_url,
+            }
+            # Don't log complete URL because it might contain secrets
+            log.debug(f"Using proxy URL: {self.proxy_url[:14]}...")
+        else:
+            youtube_proxies = None
+
         try:
-            transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_id)
+            transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_id, proxies=youtube_proxies)
         except Exception as e:
-            print(e)
+            log.exception("Loading YouTube transcript failed")
             return []
 
         try:

+ 9 - 1
backend/open_webui/apps/retrieval/main.py

@@ -105,6 +105,7 @@ from open_webui.config import (
     TIKA_SERVER_URL,
     UPLOAD_DIR,
     YOUTUBE_LOADER_LANGUAGE,
+    YOUTUBE_LOADER_PROXY_URL,
     DEFAULT_LOCALE,
     AppConfig,
 )
@@ -171,6 +172,7 @@ app.state.config.OLLAMA_API_KEY = RAG_OLLAMA_API_KEY
 app.state.config.PDF_EXTRACT_IMAGES = PDF_EXTRACT_IMAGES
 
 app.state.config.YOUTUBE_LOADER_LANGUAGE = YOUTUBE_LOADER_LANGUAGE
+app.state.config.YOUTUBE_LOADER_PROXY_URL = YOUTUBE_LOADER_PROXY_URL
 app.state.YOUTUBE_LOADER_TRANSLATION = None
 
 
@@ -471,6 +473,7 @@ async def get_rag_config(user=Depends(get_admin_user)):
         "youtube": {
             "language": app.state.config.YOUTUBE_LOADER_LANGUAGE,
             "translation": app.state.YOUTUBE_LOADER_TRANSLATION,
+            "proxy_url": app.state.config.YOUTUBE_LOADER_PROXY_URL,
         },
         "web": {
             "web_loader_ssl_verification": app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION,
@@ -518,6 +521,7 @@ class ChunkParamUpdateForm(BaseModel):
 class YoutubeLoaderConfig(BaseModel):
     language: list[str]
     translation: Optional[str] = None
+    proxy_url: str = ""
 
 
 class WebSearchConfig(BaseModel):
@@ -580,6 +584,7 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_
 
     if form_data.youtube is not None:
         app.state.config.YOUTUBE_LOADER_LANGUAGE = form_data.youtube.language
+        app.state.config.YOUTUBE_LOADER_PROXY_URL = form_data.youtube.proxy_url
         app.state.YOUTUBE_LOADER_TRANSLATION = form_data.youtube.translation
 
     if form_data.web is not None:
@@ -640,6 +645,7 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_
         },
         "youtube": {
             "language": app.state.config.YOUTUBE_LOADER_LANGUAGE,
+            "proxy_url": app.state.config.YOUTUBE_LOADER_PROXY_URL,
             "translation": app.state.YOUTUBE_LOADER_TRANSLATION,
         },
         "web": {
@@ -1081,7 +1087,9 @@ def process_youtube_video(form_data: ProcessUrlForm, user=Depends(get_verified_u
             collection_name = calculate_sha256_string(form_data.url)[:63]
 
         loader = YoutubeLoader(
-            form_data.url, language=app.state.config.YOUTUBE_LOADER_LANGUAGE
+            form_data.url,
+            language=app.state.config.YOUTUBE_LOADER_LANGUAGE,
+            proxy_url=app.state.config.YOUTUBE_LOADER_PROXY_URL,
         )
 
         docs = loader.load()

+ 6 - 0
backend/open_webui/config.py

@@ -1305,6 +1305,12 @@ YOUTUBE_LOADER_LANGUAGE = PersistentConfig(
     os.getenv("YOUTUBE_LOADER_LANGUAGE", "en").split(","),
 )
 
+YOUTUBE_LOADER_PROXY_URL = PersistentConfig(
+    "YOUTUBE_LOADER_PROXY_URL",
+    "rag.youtube_loader_proxy_url",
+    os.getenv("YOUTUBE_LOADER_PROXY_URL", ""),
+)
+
 
 ENABLE_RAG_WEB_SEARCH = PersistentConfig(
     "ENABLE_RAG_WEB_SEARCH",

+ 1 - 0
src/lib/apis/retrieval/index.ts

@@ -40,6 +40,7 @@ type ContentExtractConfigForm = {
 type YoutubeConfigForm = {
 	language: string[];
 	translation?: string | null;
+	proxy_url: string;
 };
 
 type RAGConfigForm = {

+ 19 - 1
src/lib/components/admin/Settings/WebSearch.svelte

@@ -29,13 +29,15 @@
 
 	let youtubeLanguage = 'en';
 	let youtubeTranslation = null;
+	let youtubeProxyUrl = '';
 
 	const submitHandler = async () => {
 		const res = await updateRAGConfig(localStorage.token, {
 			web: webConfig,
 			youtube: {
 				language: youtubeLanguage.split(',').map((lang) => lang.trim()),
-				translation: youtubeTranslation
+				translation: youtubeTranslation,
+				proxy_url: youtubeProxyUrl
 			}
 		});
 	};
@@ -48,6 +50,7 @@
 
 			youtubeLanguage = res.youtube.language.join(',');
 			youtubeTranslation = res.youtube.translation;
+			youtubeProxyUrl = res.youtube.proxy_url;
 		}
 	});
 </script>
@@ -358,6 +361,21 @@
 						</div>
 					</div>
 				</div>
+
+				<div>
+					<div class=" py-0.5 flex w-full justify-between">
+						<div class=" w-20 text-xs font-medium self-center">{$i18n.t('Proxy URL')}</div>
+						<div class=" flex-1 self-center">
+							<input
+								class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
+								type="text"
+								placeholder={$i18n.t('Enter proxy URL (e.g. https://user:password@host:port)')}
+								bind:value={youtubeProxyUrl}
+								autocomplete="off"
+							/>
+						</div>
+					</div>
+				</div>
 			</div>
 		{/if}
 	</div>