Timothy J. Baek 7 months ago
parent
commit
b8b994a820

+ 10 - 1
backend/open_webui/apps/retrieval/loader/main.py

@@ -1,5 +1,7 @@
 import requests
 import logging
+import ftfy
+
 
 from langchain_community.document_loaders import (
     BSHTMLLoader,
@@ -122,7 +124,14 @@ class Loader:
         self, filename: str, file_content_type: str, file_path: str
     ) -> list[Document]:
         loader = self._get_loader(filename, file_content_type, file_path)
-        return loader.load()
+        docs = loader.load()
+
+        return [
+            Document(
+                page_content=ftfy.fix_text(doc.page_content), metadata=doc.metadata
+            )
+            for doc in docs
+        ]
 
     def _get_loader(self, filename: str, file_content_type: str, file_path: str):
         file_ext = filename.split(".")[-1].lower()

+ 0 - 2
backend/open_webui/apps/retrieval/main.py

@@ -725,7 +725,6 @@ def process_file(
             PDF_EXTRACT_IMAGES=app.state.config.PDF_EXTRACT_IMAGES,
         )
         docs = loader.load(file.filename, file.meta.get("content_type"), file_path)
-
         raw_content = " ".join([doc.page_content for doc in docs])
         print(raw_content)
 
@@ -872,7 +871,6 @@ def process_youtube_video(form_data: ProcessUrlForm, user=Depends(get_verified_u
             translation=app.state.YOUTUBE_LOADER_TRANSLATION,
         )
         docs = loader.load()
-
         save_docs_to_vector_db(docs, collection_name, overwrite=True)
 
         return {

+ 2 - 0
backend/requirements.txt

@@ -46,6 +46,8 @@ sentence-transformers==3.0.1
 colbert-ai==0.2.21
 einops==0.8.0
 
+
+ftfy==6.2.3
 pypdf==4.3.1
 docx2txt==0.8
 python-pptx==1.0.0

+ 2 - 0
pyproject.toml

@@ -53,6 +53,8 @@ dependencies = [
     "colbert-ai==0.2.21",
     "einops==0.8.0",
     
+
+    "ftfy==6.2.3",
     "pypdf==4.3.1",
     "docx2txt==0.8",
     "python-pptx==1.0.0",