Ver Fonte

Merge pull request #11464 from FabioPolito24/docling_context_extraction_engine

feat: Docling context extraction engine
Timothy Jaeryang Baek há 2 meses atrás
pai
commit
f8ac44cfbd

+ 6 - 0
backend/open_webui/config.py

@@ -1654,6 +1654,12 @@ TIKA_SERVER_URL = PersistentConfig(
     os.getenv("TIKA_SERVER_URL", "http://tika:9998"),  # Default for sidecar deployment
 )
 
+DOCLING_SERVER_URL = PersistentConfig(
+    "DOCLING_SERVER_URL",
+    "rag.docling_server_url",
+    os.getenv("DOCLING_SERVER_URL", "http://docling:5001"),
+)
+
 DOCUMENT_INTELLIGENCE_ENDPOINT = PersistentConfig(
     "DOCUMENT_INTELLIGENCE_ENDPOINT",
     "rag.document_intelligence_endpoint",

+ 2 - 0
backend/open_webui/main.py

@@ -186,6 +186,7 @@ from open_webui.config import (
     CHUNK_SIZE,
     CONTENT_EXTRACTION_ENGINE,
     TIKA_SERVER_URL,
+    DOCLING_SERVER_URL,
     DOCUMENT_INTELLIGENCE_ENDPOINT,
     DOCUMENT_INTELLIGENCE_KEY,
     RAG_TOP_K,
@@ -551,6 +552,7 @@ app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION = (
 
 app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE
 app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL
+app.state.config.DOCLING_SERVER_URL = DOCLING_SERVER_URL
 app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = DOCUMENT_INTELLIGENCE_ENDPOINT
 app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY
 

+ 52 - 0
backend/open_webui/retrieval/loaders/main.py

@@ -117,6 +117,52 @@ class TikaLoader:
             raise Exception(f"Error calling Tika: {r.reason}")
 
 
+class DoclingLoader:
+    """Extract a file's text by uploading it to a Docling server."""
+
+    def __init__(self, url, file_path=None, mime_type=None):
+        self.url = url.rstrip("/")  # endpoint path below brings its own "/"
+        self.file_path = file_path
+        self.mime_type = mime_type
+
+    def load(self) -> list[Document]:
+        """Upload the file for conversion and wrap the returned text in one Document.
+
+        Returns:
+            A one-item list holding the response's "md_content" text.
+        Raises:
+            Exception: on a non-OK response; the message carries the HTTP
+                reason plus the body's "detail" field (or the raw body).
+        """
+        from os.path import basename
+
+        params = {"image_export_mode": "placeholder", "table_mode": "accurate"}
+        with open(self.file_path, "rb") as f:
+            # Send only the base name so the local storage path is not disclosed.
+            mime_type = self.mime_type or "application/octet-stream"
+            files = {"files": (basename(self.file_path), f, mime_type)}
+            r = requests.post(
+                f"{self.url}/v1alpha/convert/file", files=files, data=params
+            )
+
+        if not r.ok:
+            error_msg = f"Error calling Docling: {r.reason}"
+            if r.text:
+                try:
+                    error_data = r.json()
+                    if isinstance(error_data, dict) and "detail" in error_data:
+                        error_msg += f" - {error_data['detail']}"
+                except ValueError:  # body is not JSON; fall back to the raw text
+                    error_msg += f" - {r.text}"
+            raise Exception(error_msg)
+
+        document_data = r.json().get("document", {})
+        text = document_data.get("md_content", "<No text content found>")
+        metadata = {"Content-Type": self.mime_type} if self.mime_type else {}
+        log.debug("Docling extracted text: %s", text)
+        return [Document(page_content=text, metadata=metadata)]
+
+
 class Loader:
     def __init__(self, engine: str = "", **kwargs):
         self.engine = engine
@@ -149,6 +195,12 @@ class Loader:
                     file_path=file_path,
                     mime_type=file_content_type,
                 )
+        elif self.engine == "docling" and self.kwargs.get("DOCLING_SERVER_URL"):
+            loader = DoclingLoader(
+                url=self.kwargs.get("DOCLING_SERVER_URL"),
+                file_path=file_path,
+                mime_type=file_content_type,
+            )
         elif (
             self.engine == "document_intelligence"
             and self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT") != ""

+ 7 - 0
backend/open_webui/routers/retrieval.py

@@ -358,6 +358,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
         "content_extraction": {
             "engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
             "tika_server_url": request.app.state.config.TIKA_SERVER_URL,
+            "docling_server_url": request.app.state.config.DOCLING_SERVER_URL,
             "document_intelligence_config": {
                 "endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
                 "key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
@@ -428,6 +429,7 @@ class DocumentIntelligenceConfigForm(BaseModel):
 class ContentExtractionConfig(BaseModel):
     engine: str = ""
     tika_server_url: Optional[str] = None
+    docling_server_url: Optional[str] = None
     document_intelligence_config: Optional[DocumentIntelligenceConfigForm] = None
 
 
@@ -540,6 +542,9 @@ async def update_rag_config(
         request.app.state.config.TIKA_SERVER_URL = (
             form_data.content_extraction.tika_server_url
         )
+        request.app.state.config.DOCLING_SERVER_URL = (
+            form_data.content_extraction.docling_server_url
+        )
         if form_data.content_extraction.document_intelligence_config is not None:
             request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = (
                 form_data.content_extraction.document_intelligence_config.endpoint
@@ -648,6 +653,7 @@ async def update_rag_config(
         "content_extraction": {
             "engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
             "tika_server_url": request.app.state.config.TIKA_SERVER_URL,
+            "docling_server_url": request.app.state.config.DOCLING_SERVER_URL,
             "document_intelligence_config": {
                 "endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
                 "key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
@@ -990,6 +996,7 @@ def process_file(
                 loader = Loader(
                     engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE,
                     TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL,
+                    DOCLING_SERVER_URL=request.app.state.config.DOCLING_SERVER_URL,
                     PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES,
                     DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
                     DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,

+ 19 - 0
src/lib/components/admin/Settings/Documents.svelte

@@ -49,6 +49,8 @@
 	let contentExtractionEngine = 'default';
 	let tikaServerUrl = '';
 	let showTikaServerUrl = false;
+	let doclingServerUrl = '';
+	let showDoclingServerUrl = false;
 	let documentIntelligenceEndpoint = '';
 	let documentIntelligenceKey = '';
 	let showDocumentIntelligenceConfig = false;
@@ -175,6 +177,10 @@
 			toast.error($i18n.t('Tika Server URL required.'));
 			return;
 		}
+		if (contentExtractionEngine === 'docling' && doclingServerUrl === '') {
+			toast.error($i18n.t('Docling Server URL required.'));
+			return;
+		}
 		if (
 			contentExtractionEngine === 'document_intelligence' &&
 			(documentIntelligenceEndpoint === '' || documentIntelligenceKey === '')
@@ -209,6 +215,7 @@
 			content_extraction: {
 				engine: contentExtractionEngine,
 				tika_server_url: tikaServerUrl,
+				docling_server_url: doclingServerUrl,
 				document_intelligence_config: {
 					key: documentIntelligenceKey,
 					endpoint: documentIntelligenceEndpoint
@@ -269,7 +276,10 @@
 
 			contentExtractionEngine = res.content_extraction.engine;
 			tikaServerUrl = res.content_extraction.tika_server_url;
+			doclingServerUrl = res.content_extraction.docling_server_url;
+
 			showTikaServerUrl = contentExtractionEngine === 'tika';
+			showDoclingServerUrl = contentExtractionEngine === 'docling';
 			documentIntelligenceEndpoint = res.content_extraction.document_intelligence_config.endpoint;
 			documentIntelligenceKey = res.content_extraction.document_intelligence_config.key;
 			showDocumentIntelligenceConfig = contentExtractionEngine === 'document_intelligence';
@@ -337,6 +347,7 @@
 							>
 								<option value="">{$i18n.t('Default')} </option>
 								<option value="tika">{$i18n.t('Tika')}</option>
+								<option value="docling">{ $i18n.t('Docling') }</option>
 								<option value="document_intelligence">{$i18n.t('Document Intelligence')}</option>
 							</select>
 						</div>
@@ -351,6 +362,14 @@
 								/>
 							</div>
 						</div>
+					{:else if contentExtractionEngine === 'docling'}
+						<div class="flex w-full mt-1">
+							<input
+								class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden"
+								placeholder={$i18n.t('Enter Docling Server URL')}
+								bind:value={doclingServerUrl}
+							/>
+						</div>
 					{:else if contentExtractionEngine === 'document_intelligence'}
 						<div class="my-0.5 flex gap-2 pr-2">
 							<input