浏览代码

feat: non-english youtube support

Timothy J. Baek 1 年之前
父节点
当前提交
d3822f782c

+ 36 - 1
backend/apps/rag/main.py

@@ -124,6 +124,10 @@ app.state.OPENAI_API_KEY = RAG_OPENAI_API_KEY
 app.state.PDF_EXTRACT_IMAGES = PDF_EXTRACT_IMAGES
 
 
+app.state.YOUTUBE_LOADER_LANGUAGE = ["en"]
+app.state.YOUTUBE_LOADER_TRANSLATION = None
+
+
 def update_embedding_model(
     embedding_model: str,
     update_model: bool = False,
@@ -314,6 +318,10 @@ async def get_rag_config(user=Depends(get_admin_user)):
             "chunk_overlap": app.state.CHUNK_OVERLAP,
         },
         "web_loader_ssl_verification": app.state.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION,
+        "youtube": {
+            "language": app.state.YOUTUBE_LOADER_LANGUAGE,
+            "translation": app.state.YOUTUBE_LOADER_TRANSLATION,
+        },
     }
 
 
@@ -322,10 +330,16 @@ class ChunkParamUpdateForm(BaseModel):
     chunk_overlap: int
 
 
+class YoutubeLoaderConfig(BaseModel):
+    language: List[str]
+    translation: Optional[str] = None
+
+
 class ConfigUpdateForm(BaseModel):
     pdf_extract_images: Optional[bool] = None
     chunk: Optional[ChunkParamUpdateForm] = None
     web_loader_ssl_verification: Optional[bool] = None
+    youtube: Optional[YoutubeLoaderConfig] = None
 
 
 @app.post("/config/update")
@@ -352,6 +366,18 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_
         else app.state.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION
     )
 
+    app.state.YOUTUBE_LOADER_LANGUAGE = (
+        form_data.youtube.language
+        if form_data.youtube != None
+        else app.state.YOUTUBE_LOADER_LANGUAGE
+    )
+
+    app.state.YOUTUBE_LOADER_TRANSLATION = (
+        form_data.youtube.translation
+        if form_data.youtube != None
+        else app.state.YOUTUBE_LOADER_TRANSLATION
+    )
+
     return {
         "status": True,
         "pdf_extract_images": app.state.PDF_EXTRACT_IMAGES,
@@ -360,6 +386,10 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_
             "chunk_overlap": app.state.CHUNK_OVERLAP,
         },
         "web_loader_ssl_verification": app.state.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION,
+        "youtube": {
+            "language": app.state.YOUTUBE_LOADER_LANGUAGE,
+            "translation": app.state.YOUTUBE_LOADER_TRANSLATION,
+        },
     }
 
 
@@ -486,7 +516,12 @@ def query_collection_handler(
 @app.post("/youtube")
 def store_youtube_video(form_data: UrlForm, user=Depends(get_current_user)):
     try:
-        loader = YoutubeLoader.from_youtube_url(form_data.url, add_video_info=False)
+        loader = YoutubeLoader.from_youtube_url(
+            form_data.url,
+            add_video_info=True,
+            language=app.state.YOUTUBE_LOADER_LANGUAGE,
+            translation=app.state.YOUTUBE_LOADER_TRANSLATION,
+        )
         data = loader.load()
 
         collection_name = form_data.collection_name

+ 1 - 0
backend/requirements.txt

@@ -57,3 +57,4 @@ PyJWT[crypto]==2.8.0
 black==24.4.2
 langfuse==2.27.3
 youtube-transcript-api==0.6.2
+pytube

+ 6 - 0
src/lib/apis/rag/index.ts

@@ -32,10 +32,16 @@ type ChunkConfigForm = {
 	chunk_overlap: number;
 };
 
+type YoutubeConfigForm = {
+	language: string[];
+	translation?: string | null;
+};
+
 type RAGConfigForm = {
 	pdf_extract_images?: boolean;
 	chunk?: ChunkConfigForm;
 	web_loader_ssl_verification?: boolean;
+	youtube?: YoutubeConfigForm;
 };
 
 export const updateRAGConfig = async (token: string, payload: RAGConfigForm) => {

+ 30 - 2
src/lib/components/documents/Settings/WebParams.svelte

@@ -11,9 +11,16 @@
 
 	let webLoaderSSLVerification = true;
 
+	let youtubeLanguage = 'en';
+	let youtubeTranslation = null;
+
 	const submitHandler = async () => {
 		const res = await updateRAGConfig(localStorage.token, {
-			web_loader_ssl_verification: webLoaderSSLVerification
+			web_loader_ssl_verification: webLoaderSSLVerification,
+			youtube: {
+				language: youtubeLanguage.split(',').map((lang) => lang.trim()),
+				translation: youtubeTranslation
+			}
 		});
 	};
 
@@ -22,6 +29,8 @@
 
 		if (res) {
 			webLoaderSSLVerification = res.web_loader_ssl_verification;
+			youtubeLanguage = res.youtube.language.join(',');
+			youtubeTranslation = res.youtube.translation;
 		}
 	});
 </script>
@@ -36,7 +45,7 @@
 	<div class=" space-y-3 pr-1.5 overflow-y-scroll h-full max-h-[22rem]">
 		<div>
 			<div class=" mb-1 text-sm font-medium">
-				{$i18n.t('Retrieval Augmented Generation Settings')}
+				{$i18n.t('Web Loader Settings')}
 			</div>
 
 			<div>
@@ -61,6 +70,25 @@
 					</button>
 				</div>
 			</div>
+
+			<div class=" mt-2 mb-1 text-sm font-medium">
+				{$i18n.t('Youtube Loader Settings')}
+			</div>
+
+			<div>
+				<div class=" py-0.5 flex w-full justify-between">
+					<div class=" w-20 text-xs font-medium self-center">{$i18n.t('Language')}</div>
+					<div class=" flex-1 self-center">
+						<input
+							class="w-full rounded-lg py-2 px-4 text-sm dark:text-gray-300 dark:bg-gray-850 outline-none"
+							type="text"
+							placeholder={$i18n.t('Enter language codes')}
+							bind:value={youtubeLanguage}
+							autocomplete="off"
+						/>
+					</div>
+				</div>
+			</div>
 		</div>
 	</div>
 	<div class="flex justify-end pt-3 text-sm font-medium">