소스 검색

feat: toggle pdf ocr

Timothy J. Baek 1년 전
부모
커밋
98948814fd
3개의 변경된 파일, 138개의 추가 그리고 91개의 삭제
  1. +24 -13
      backend/apps/rag/main.py
  2. +15 -6
      src/lib/apis/rag/index.ts
  3. +99 -72
      src/lib/components/documents/Settings/General.svelte

+ 24 - 13
backend/apps/rag/main.py

@@ -77,6 +77,7 @@ from constants import ERROR_MESSAGES
 
 app = FastAPI()
 
+app.state.PDF_EXTRACT_IMAGES = False
 app.state.CHUNK_SIZE = CHUNK_SIZE
 app.state.CHUNK_OVERLAP = CHUNK_OVERLAP
 app.state.RAG_TEMPLATE = RAG_TEMPLATE
@@ -184,12 +185,15 @@ async def update_embedding_model(
     }
 
 
-@app.get("/chunk")
-async def get_chunk_params(user=Depends(get_admin_user)):
+@app.get("/config")
+async def get_rag_config(user=Depends(get_admin_user)):
     return {
         "status": True,
-        "chunk_size": app.state.CHUNK_SIZE,
-        "chunk_overlap": app.state.CHUNK_OVERLAP,
+        "pdf_extract_images": app.state.PDF_EXTRACT_IMAGES,
+        "chunk": {
+            "chunk_size": app.state.CHUNK_SIZE,
+            "chunk_overlap": app.state.CHUNK_OVERLAP,
+        },
     }
 
 
@@ -198,17 +202,24 @@ class ChunkParamUpdateForm(BaseModel):
     chunk_overlap: int
 
 
-@app.post("/chunk/update")
-async def update_chunk_params(
-    form_data: ChunkParamUpdateForm, user=Depends(get_admin_user)
-):
-    app.state.CHUNK_SIZE = form_data.chunk_size
-    app.state.CHUNK_OVERLAP = form_data.chunk_overlap
+class ConfigUpdateForm(BaseModel):
+    pdf_extract_images: bool
+    chunk: ChunkParamUpdateForm
+
+
+@app.post("/config/update")
+async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_user)):
+    app.state.PDF_EXTRACT_IMAGES = form_data.pdf_extract_images
+    app.state.CHUNK_SIZE = form_data.chunk.chunk_size
+    app.state.CHUNK_OVERLAP = form_data.chunk.chunk_overlap
 
     return {
         "status": True,
-        "chunk_size": app.state.CHUNK_SIZE,
-        "chunk_overlap": app.state.CHUNK_OVERLAP,
+        "pdf_extract_images": app.state.PDF_EXTRACT_IMAGES,
+        "chunk": {
+            "chunk_size": app.state.CHUNK_SIZE,
+            "chunk_overlap": app.state.CHUNK_OVERLAP,
+        },
     }
 
 
@@ -364,7 +375,7 @@ def get_loader(filename: str, file_content_type: str, file_path: str):
     ]
 
     if file_ext == "pdf":
-        loader = PyPDFLoader(file_path, extract_images=True)
+        loader = PyPDFLoader(file_path, extract_images=app.state.PDF_EXTRACT_IMAGES)
     elif file_ext == "csv":
         loader = CSVLoader(file_path)
     elif file_ext == "rst":

+ 15 - 6
src/lib/apis/rag/index.ts

@@ -1,9 +1,9 @@
 import { RAG_API_BASE_URL } from '$lib/constants';
 
-export const getChunkParams = async (token: string) => {
+export const getRAGConfig = async (token: string) => {
 	let error = null;
 
-	const res = await fetch(`${RAG_API_BASE_URL}/chunk`, {
+	const res = await fetch(`${RAG_API_BASE_URL}/config`, {
 		method: 'GET',
 		headers: {
 			'Content-Type': 'application/json',
@@ -27,18 +27,27 @@ export const getChunkParams = async (token: string) => {
 	return res;
 };
 
-export const updateChunkParams = async (token: string, size: number, overlap: number) => {
+type ChunkConfigForm = {
+	chunk_size: number;
+	chunk_overlap: number;
+};
+
+type RAGConfigForm = {
+	pdf_extract_images: boolean;
+	chunk: ChunkConfigForm;
+};
+
+export const updateRAGConfig = async (token: string, payload: RAGConfigForm) => {
 	let error = null;
 
-	const res = await fetch(`${RAG_API_BASE_URL}/chunk/update`, {
+	const res = await fetch(`${RAG_API_BASE_URL}/config/update`, {
 		method: 'POST',
 		headers: {
 			'Content-Type': 'application/json',
 			Authorization: `Bearer ${token}`
 		},
 		body: JSON.stringify({
-			chunk_size: size,
-			chunk_overlap: overlap
+			...payload
 		})
 	})
 		.then(async (res) => {

+ 99 - 72
src/lib/components/documents/Settings/General.svelte

@@ -1,10 +1,10 @@
 <script lang="ts">
 	import { getDocs } from '$lib/apis/documents';
 	import {
-		getChunkParams,
+		getRAGConfig,
+		updateRAGConfig,
 		getQuerySettings,
 		scanDocs,
-		updateChunkParams,
 		updateQuerySettings
 	} from '$lib/apis/rag';
 	import { documents } from '$lib/stores';
@@ -17,6 +17,7 @@
 
 	let chunkSize = 0;
 	let chunkOverlap = 0;
+	let pdfExtractImages = true;
 
 	let querySettings = {
 		template: '',
@@ -35,16 +36,24 @@
 	};
 
 	const submitHandler = async () => {
-		const res = await updateChunkParams(localStorage.token, chunkSize, chunkOverlap);
+		const res = await updateRAGConfig(localStorage.token, {
+			pdf_extract_images: pdfExtractImages,
+			chunk: {
+				chunk_overlap: chunkOverlap,
+				chunk_size: chunkSize
+			}
+		});
 		querySettings = await updateQuerySettings(localStorage.token, querySettings);
 	};
 
 	onMount(async () => {
-		const res = await getChunkParams(localStorage.token);
+		const res = await getRAGConfig(localStorage.token);
 
 		if (res) {
-			chunkSize = res.chunk_size;
-			chunkOverlap = res.chunk_overlap;
+			pdfExtractImages = res.pdf_extract_images;
+
+			chunkSize = res.chunk.chunk_size;
+			chunkOverlap = res.chunk.chunk_overlap;
 		}
 
 		querySettings = await getQuerySettings(localStorage.token);
@@ -124,82 +133,100 @@
 
 		<hr class=" dark:border-gray-700" />
 
-		<div class=" ">
-			<div class=" text-sm font-medium">Chunk Params</div>
-
-			<div class=" flex">
-				<div class="  flex w-full justify-between">
-					<div class="self-center text-xs font-medium min-w-fit">Chunk Size</div>
-
-					<div class="self-center p-3">
-						<input
-							class=" w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
-							type="number"
-							placeholder="Enter Chunk Size"
-							bind:value={chunkSize}
-							autocomplete="off"
-							min="0"
-						/>
+		<div class=" space-y-3">
+			<div class=" space-y-3">
+				<div class=" text-sm font-medium">Chunk Params</div>
+
+				<div class=" flex gap-2">
+					<div class="  flex w-full justify-between gap-2">
+						<div class="self-center text-xs font-medium min-w-fit">Chunk Size</div>
+
+						<div class="self-center">
+							<input
+								class=" w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
+								type="number"
+								placeholder="Enter Chunk Size"
+								bind:value={chunkSize}
+								autocomplete="off"
+								min="0"
+							/>
+						</div>
 					</div>
-				</div>
 
-				<div class="flex w-full">
-					<div class=" self-center text-xs font-medium min-w-fit">Chunk Overlap</div>
-
-					<div class="self-center p-3">
-						<input
-							class="w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
-							type="number"
-							placeholder="Enter Chunk Overlap"
-							bind:value={chunkOverlap}
-							autocomplete="off"
-							min="0"
-						/>
+					<div class="flex w-full gap-2">
+						<div class=" self-center text-xs font-medium min-w-fit">Chunk Overlap</div>
+
+						<div class="self-center">
+							<input
+								class="w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
+								type="number"
+								placeholder="Enter Chunk Overlap"
+								bind:value={chunkOverlap}
+								autocomplete="off"
+								min="0"
+							/>
+						</div>
 					</div>
 				</div>
-			</div>
-
-			<div class=" text-sm font-medium">Query Params</div>
 
-			<div class=" flex">
-				<div class="  flex w-full justify-between">
-					<div class="self-center text-xs font-medium flex-1">Top K</div>
-
-					<div class="self-center p-3">
-						<input
-							class=" w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
-							type="number"
-							placeholder="Enter Top K"
-							bind:value={querySettings.k}
-							autocomplete="off"
-							min="0"
-						/>
+				<div>
+					<div class="flex justify-between items-center text-xs">
+						<div class=" text-xs font-medium">PDF Extract Images (OCR)</div>
+
+						<button
+							class=" text-xs font-medium text-gray-500"
+							type="button"
+							on:click={() => {
+								pdfExtractImages = !pdfExtractImages;
+							}}>{pdfExtractImages ? 'On' : 'Off'}</button
+						>
 					</div>
 				</div>
-
-				<!-- <div class="flex w-full">
-					<div class=" self-center text-xs font-medium min-w-fit">Chunk Overlap</div>
-
-					<div class="self-center p-3">
-						<input
-							class="w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
-							type="number"
-							placeholder="Enter Chunk Overlap"
-							bind:value={chunkOverlap}
-							autocomplete="off"
-							min="0"
-						/>
-					</div>
-				</div> -->
 			</div>
 
 			<div>
-				<div class=" mb-2.5 text-sm font-medium">RAG Template</div>
-				<textarea
-					bind:value={querySettings.template}
-					class="w-full rounded p-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none resize-none"
-					rows="4"
-				/>
+				<div class=" text-sm font-medium">Query Params</div>
+
+				<div class=" flex py-2">
+					<div class="  flex w-full justify-between gap-2">
+						<div class="self-center text-xs font-medium flex-1">Top K</div>
+
+						<div class="self-center">
+							<input
+								class=" w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
+								type="number"
+								placeholder="Enter Top K"
+								bind:value={querySettings.k}
+								autocomplete="off"
+								min="0"
+							/>
+						</div>
+					</div>
+
+					<!-- <div class="flex w-full">
+						<div class=" self-center text-xs font-medium min-w-fit">Chunk Overlap</div>
+	
+						<div class="self-center p-3">
+							<input
+								class="w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
+								type="number"
+								placeholder="Enter Chunk Overlap"
+								bind:value={chunkOverlap}
+								autocomplete="off"
+								min="0"
+							/>
+						</div>
+					</div> -->
+				</div>
+
+				<div>
+					<div class=" mb-2.5 text-sm font-medium">RAG Template</div>
+					<textarea
+						bind:value={querySettings.template}
+						class="w-full rounded p-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none resize-none"
+						rows="4"
+					/>
+				</div>
 			</div>
 		</div>
 	</div>