|
@@ -1,143 +1,118 @@
|
|
|
-from fastapi import (
|
|
|
- FastAPI,
|
|
|
- Depends,
|
|
|
- HTTPException,
|
|
|
- status,
|
|
|
- UploadFile,
|
|
|
- File,
|
|
|
- Form,
|
|
|
-)
|
|
|
-from fastapi.middleware.cors import CORSMiddleware
|
|
|
-import requests
|
|
|
-import os, shutil, logging, re
|
|
|
+import json
|
|
|
+import logging
|
|
|
+import mimetypes
|
|
|
+import os
|
|
|
+import shutil
|
|
|
+import socket
|
|
|
+import urllib.parse
|
|
|
+import uuid
|
|
|
from datetime import datetime
|
|
|
-
|
|
|
from pathlib import Path
|
|
|
-from typing import Union, Sequence, Iterator, Any
|
|
|
-
|
|
|
-from chromadb.utils.batch_utils import create_batches
|
|
|
-from langchain_core.documents import Document
|
|
|
-
|
|
|
-from langchain_community.document_loaders import (
|
|
|
- WebBaseLoader,
|
|
|
- TextLoader,
|
|
|
- PyPDFLoader,
|
|
|
- CSVLoader,
|
|
|
- BSHTMLLoader,
|
|
|
- Docx2txtLoader,
|
|
|
- UnstructuredEPubLoader,
|
|
|
- UnstructuredWordDocumentLoader,
|
|
|
- UnstructuredMarkdownLoader,
|
|
|
- UnstructuredXMLLoader,
|
|
|
- UnstructuredRSTLoader,
|
|
|
- UnstructuredExcelLoader,
|
|
|
- UnstructuredPowerPointLoader,
|
|
|
- YoutubeLoader,
|
|
|
- OutlookMessageLoader,
|
|
|
-)
|
|
|
-from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
|
+from typing import Iterator, Optional, Sequence, Union
|
|
|
|
|
|
+import requests
|
|
|
import validators
|
|
|
-import urllib.parse
|
|
|
-import socket
|
|
|
-
|
|
|
-
|
|
|
-from pydantic import BaseModel
|
|
|
-from typing import Optional
|
|
|
-import mimetypes
|
|
|
-import uuid
|
|
|
-import json
|
|
|
-
|
|
|
-from apps.webui.models.documents import (
|
|
|
- Documents,
|
|
|
- DocumentForm,
|
|
|
- DocumentResponse,
|
|
|
-)
|
|
|
-from apps.webui.models.files import (
|
|
|
- Files,
|
|
|
-)
|
|
|
-
|
|
|
-from apps.rag.utils import (
|
|
|
- get_model_path,
|
|
|
- get_embedding_function,
|
|
|
- query_doc,
|
|
|
- query_doc_with_hybrid_search,
|
|
|
- query_collection,
|
|
|
- query_collection_with_hybrid_search,
|
|
|
-)
|
|
|
-
|
|
|
from apps.rag.search.brave import search_brave
|
|
|
+from apps.rag.search.duckduckgo import search_duckduckgo
|
|
|
from apps.rag.search.google_pse import search_google_pse
|
|
|
+from apps.rag.search.jina_search import search_jina
|
|
|
from apps.rag.search.main import SearchResult
|
|
|
+from apps.rag.search.searchapi import search_searchapi
|
|
|
from apps.rag.search.searxng import search_searxng
|
|
|
from apps.rag.search.serper import search_serper
|
|
|
-from apps.rag.search.serpstack import search_serpstack
|
|
|
from apps.rag.search.serply import search_serply
|
|
|
-from apps.rag.search.duckduckgo import search_duckduckgo
|
|
|
+from apps.rag.search.serpstack import search_serpstack
|
|
|
from apps.rag.search.tavily import search_tavily
|
|
|
-from apps.rag.search.jina_search import search_jina
|
|
|
-from apps.rag.search.searchapi import search_searchapi
|
|
|
-
|
|
|
-from utils.misc import (
|
|
|
- calculate_sha256,
|
|
|
- calculate_sha256_string,
|
|
|
- sanitize_filename,
|
|
|
- extract_folders_after_data_docs,
|
|
|
+from apps.rag.utils import (
|
|
|
+ get_embedding_function,
|
|
|
+ get_model_path,
|
|
|
+ query_collection,
|
|
|
+ query_collection_with_hybrid_search,
|
|
|
+ query_doc,
|
|
|
+ query_doc_with_hybrid_search,
|
|
|
)
|
|
|
-from utils.utils import get_verified_user, get_admin_user
|
|
|
-
|
|
|
+from apps.webui.models.documents import DocumentForm, Documents
|
|
|
+from apps.webui.models.files import Files
|
|
|
+from chromadb.utils.batch_utils import create_batches
|
|
|
from config import (
|
|
|
- AppConfig,
|
|
|
- ENV,
|
|
|
- SRC_LOG_LEVELS,
|
|
|
- UPLOAD_DIR,
|
|
|
- DOCS_DIR,
|
|
|
+ BRAVE_SEARCH_API_KEY,
|
|
|
+ CHROMA_CLIENT,
|
|
|
+ CHUNK_OVERLAP,
|
|
|
+ CHUNK_SIZE,
|
|
|
CONTENT_EXTRACTION_ENGINE,
|
|
|
- TIKA_SERVER_URL,
|
|
|
- RAG_TOP_K,
|
|
|
- RAG_RELEVANCE_THRESHOLD,
|
|
|
- RAG_FILE_MAX_SIZE,
|
|
|
- RAG_FILE_MAX_COUNT,
|
|
|
+ CORS_ALLOW_ORIGIN,
|
|
|
+ DEVICE_TYPE,
|
|
|
+ DOCS_DIR,
|
|
|
+ ENABLE_RAG_HYBRID_SEARCH,
|
|
|
+ ENABLE_RAG_LOCAL_WEB_FETCH,
|
|
|
+ ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION,
|
|
|
+ ENABLE_RAG_WEB_SEARCH,
|
|
|
+ ENV,
|
|
|
+ GOOGLE_PSE_API_KEY,
|
|
|
+ GOOGLE_PSE_ENGINE_ID,
|
|
|
+ PDF_EXTRACT_IMAGES,
|
|
|
RAG_EMBEDDING_ENGINE,
|
|
|
RAG_EMBEDDING_MODEL,
|
|
|
RAG_EMBEDDING_MODEL_AUTO_UPDATE,
|
|
|
RAG_EMBEDDING_MODEL_TRUST_REMOTE_CODE,
|
|
|
- ENABLE_RAG_HYBRID_SEARCH,
|
|
|
- ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION,
|
|
|
+ RAG_EMBEDDING_OPENAI_BATCH_SIZE,
|
|
|
+ RAG_FILE_MAX_COUNT,
|
|
|
+ RAG_FILE_MAX_SIZE,
|
|
|
+ RAG_OPENAI_API_BASE_URL,
|
|
|
+ RAG_OPENAI_API_KEY,
|
|
|
+ RAG_RELEVANCE_THRESHOLD,
|
|
|
RAG_RERANKING_MODEL,
|
|
|
- PDF_EXTRACT_IMAGES,
|
|
|
RAG_RERANKING_MODEL_AUTO_UPDATE,
|
|
|
RAG_RERANKING_MODEL_TRUST_REMOTE_CODE,
|
|
|
- RAG_OPENAI_API_BASE_URL,
|
|
|
- RAG_OPENAI_API_KEY,
|
|
|
- DEVICE_TYPE,
|
|
|
- CHROMA_CLIENT,
|
|
|
- CHUNK_SIZE,
|
|
|
- CHUNK_OVERLAP,
|
|
|
RAG_TEMPLATE,
|
|
|
- ENABLE_RAG_LOCAL_WEB_FETCH,
|
|
|
- YOUTUBE_LOADER_LANGUAGE,
|
|
|
- ENABLE_RAG_WEB_SEARCH,
|
|
|
- RAG_WEB_SEARCH_ENGINE,
|
|
|
+ RAG_TOP_K,
|
|
|
+ RAG_WEB_SEARCH_CONCURRENT_REQUESTS,
|
|
|
RAG_WEB_SEARCH_DOMAIN_FILTER_LIST,
|
|
|
+ RAG_WEB_SEARCH_ENGINE,
|
|
|
+ RAG_WEB_SEARCH_RESULT_COUNT,
|
|
|
+ SEARCHAPI_API_KEY,
|
|
|
+ SEARCHAPI_ENGINE,
|
|
|
SEARXNG_QUERY_URL,
|
|
|
- GOOGLE_PSE_API_KEY,
|
|
|
- GOOGLE_PSE_ENGINE_ID,
|
|
|
- BRAVE_SEARCH_API_KEY,
|
|
|
- SERPSTACK_API_KEY,
|
|
|
- SERPSTACK_HTTPS,
|
|
|
SERPER_API_KEY,
|
|
|
SERPLY_API_KEY,
|
|
|
+ SERPSTACK_API_KEY,
|
|
|
+ SERPSTACK_HTTPS,
|
|
|
TAVILY_API_KEY,
|
|
|
- SEARCHAPI_API_KEY,
|
|
|
- SEARCHAPI_ENGINE,
|
|
|
- RAG_WEB_SEARCH_RESULT_COUNT,
|
|
|
- RAG_WEB_SEARCH_CONCURRENT_REQUESTS,
|
|
|
- RAG_EMBEDDING_OPENAI_BATCH_SIZE,
|
|
|
- CORS_ALLOW_ORIGIN,
|
|
|
+ TIKA_SERVER_URL,
|
|
|
+ UPLOAD_DIR,
|
|
|
+ YOUTUBE_LOADER_LANGUAGE,
|
|
|
+ AppConfig,
|
|
|
)
|
|
|
-
|
|
|
from constants import ERROR_MESSAGES
|
|
|
+from env import SRC_LOG_LEVELS
|
|
|
+from fastapi import Depends, FastAPI, File, Form, HTTPException, UploadFile, status
|
|
|
+from fastapi.middleware.cors import CORSMiddleware
|
|
|
+from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
|
+from langchain_community.document_loaders import (
|
|
|
+ BSHTMLLoader,
|
|
|
+ CSVLoader,
|
|
|
+ Docx2txtLoader,
|
|
|
+ OutlookMessageLoader,
|
|
|
+ PyPDFLoader,
|
|
|
+ TextLoader,
|
|
|
+ UnstructuredEPubLoader,
|
|
|
+ UnstructuredExcelLoader,
|
|
|
+ UnstructuredMarkdownLoader,
|
|
|
+ UnstructuredPowerPointLoader,
|
|
|
+ UnstructuredRSTLoader,
|
|
|
+ UnstructuredXMLLoader,
|
|
|
+ WebBaseLoader,
|
|
|
+ YoutubeLoader,
|
|
|
+)
|
|
|
+from langchain_core.documents import Document
|
|
|
+from pydantic import BaseModel
|
|
|
+from utils.misc import (
|
|
|
+ calculate_sha256,
|
|
|
+ calculate_sha256_string,
|
|
|
+ extract_folders_after_data_docs,
|
|
|
+ sanitize_filename,
|
|
|
+)
|
|
|
+from utils.utils import get_admin_user, get_verified_user
|
|
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
log.setLevel(SRC_LOG_LEVELS["RAG"])
|
|
@@ -539,9 +514,7 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_
|
|
|
app.state.config.SERPLY_API_KEY = form_data.web.search.serply_api_key
|
|
|
app.state.config.TAVILY_API_KEY = form_data.web.search.tavily_api_key
|
|
|
app.state.config.SEARCHAPI_API_KEY = form_data.web.search.searchapi_api_key
|
|
|
- app.state.config.SEARCHAPI_ENGINE = (
|
|
|
- form_data.web.search.searchapi_engine
|
|
|
- )
|
|
|
+ app.state.config.SEARCHAPI_ENGINE = form_data.web.search.searchapi_engine
|
|
|
app.state.config.RAG_WEB_SEARCH_RESULT_COUNT = form_data.web.search.result_count
|
|
|
app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS = (
|
|
|
form_data.web.search.concurrent_requests
|
|
@@ -981,7 +954,6 @@ def store_web_search(form_data: SearchForm, user=Depends(get_verified_user)):
|
|
|
def store_data_in_vector_db(
|
|
|
data, collection_name, metadata: Optional[dict] = None, overwrite: bool = False
|
|
|
) -> bool:
|
|
|
-
|
|
|
text_splitter = RecursiveCharacterTextSplitter(
|
|
|
chunk_size=app.state.config.CHUNK_SIZE,
|
|
|
chunk_overlap=app.state.config.CHUNK_OVERLAP,
|
|
@@ -1342,7 +1314,6 @@ def store_text(
|
|
|
form_data: TextRAGForm,
|
|
|
user=Depends(get_verified_user),
|
|
|
):
|
|
|
-
|
|
|
collection_name = form_data.collection_name
|
|
|
if collection_name is None:
|
|
|
collection_name = calculate_sha256_string(form_data.content)
|