|
@@ -21,6 +21,7 @@ from langchain_community.document_loaders import (
|
|
|
TextLoader,
|
|
|
PyPDFLoader,
|
|
|
CSVLoader,
|
|
|
+ BSHTMLLoader,
|
|
|
Docx2txtLoader,
|
|
|
UnstructuredEPubLoader,
|
|
|
UnstructuredWordDocumentLoader,
|
|
@@ -114,6 +115,7 @@ class CollectionNameForm(BaseModel):
|
|
|
class StoreWebForm(CollectionNameForm):
|
|
|
url: str
|
|
|
|
|
|
+
|
|
|
@app.get("/")
|
|
|
async def get_status():
|
|
|
return {
|
|
@@ -296,13 +298,18 @@ def store_web(form_data: StoreWebForm, user=Depends(get_current_user)):
|
|
|
|
|
|
|
|
|
def store_data_in_vector_db(data, collection_name, overwrite: bool = False) -> bool:
|
|
|
+
|
|
|
text_splitter = RecursiveCharacterTextSplitter(
|
|
|
chunk_size=app.state.CHUNK_SIZE,
|
|
|
chunk_overlap=app.state.CHUNK_OVERLAP,
|
|
|
add_start_index=True,
|
|
|
)
|
|
|
docs = text_splitter.split_documents(data)
|
|
|
- return store_docs_in_vector_db(docs, collection_name, overwrite)
|
|
|
+
|
|
|
+ if len(docs) > 0:
|
|
|
+ return store_docs_in_vector_db(docs, collection_name, overwrite), None
|
|
|
+ else:
|
|
|
+ raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT)
|
|
|
|
|
|
|
|
|
def store_text_in_vector_db(
|
|
@@ -318,6 +325,7 @@ def store_text_in_vector_db(
|
|
|
|
|
|
|
|
|
def store_docs_in_vector_db(docs, collection_name, overwrite: bool = False) -> bool:
|
|
|
+
|
|
|
texts = [doc.page_content for doc in docs]
|
|
|
metadatas = [doc.metadata for doc in docs]
|
|
|
|
|
@@ -402,6 +410,8 @@ def get_loader(filename: str, file_content_type: str, file_path: str):
|
|
|
loader = UnstructuredRSTLoader(file_path, mode="elements")
|
|
|
elif file_ext == "xml":
|
|
|
loader = UnstructuredXMLLoader(file_path)
|
|
|
+ elif file_ext in ["htm", "html"]:
|
|
|
+ loader = BSHTMLLoader(file_path, open_encoding="unicode_escape")
|
|
|
elif file_ext == "md":
|
|
|
loader = UnstructuredMarkdownLoader(file_path)
|
|
|
elif file_content_type == "application/epub+zip":
|
|
@@ -452,19 +462,21 @@ def store_doc(
|
|
|
|
|
|
loader, known_type = get_loader(file.filename, file.content_type, file_path)
|
|
|
data = loader.load()
|
|
|
- result = store_data_in_vector_db(data, collection_name)
|
|
|
-
|
|
|
- if result:
|
|
|
- return {
|
|
|
- "status": True,
|
|
|
- "collection_name": collection_name,
|
|
|
- "filename": filename,
|
|
|
- "known_type": known_type,
|
|
|
- }
|
|
|
- else:
|
|
|
+
|
|
|
+ try:
|
|
|
+ result = store_data_in_vector_db(data, collection_name)
|
|
|
+
|
|
|
+ if result:
|
|
|
+ return {
|
|
|
+ "status": True,
|
|
|
+ "collection_name": collection_name,
|
|
|
+ "filename": filename,
|
|
|
+ "known_type": known_type,
|
|
|
+ }
|
|
|
+ except Exception as e:
|
|
|
raise HTTPException(
|
|
|
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
|
|
- detail=ERROR_MESSAGES.DEFAULT(),
|
|
|
+ detail=e,
|
|
|
)
|
|
|
except Exception as e:
|
|
|
log.exception(e)
|
|
@@ -529,38 +541,42 @@ def scan_docs_dir(user=Depends(get_admin_user)):
|
|
|
)
|
|
|
data = loader.load()
|
|
|
|
|
|
- result = store_data_in_vector_db(data, collection_name)
|
|
|
-
|
|
|
- if result:
|
|
|
- sanitized_filename = sanitize_filename(filename)
|
|
|
- doc = Documents.get_doc_by_name(sanitized_filename)
|
|
|
-
|
|
|
- if doc == None:
|
|
|
- doc = Documents.insert_new_doc(
|
|
|
- user.id,
|
|
|
- DocumentForm(
|
|
|
- **{
|
|
|
- "name": sanitized_filename,
|
|
|
- "title": filename,
|
|
|
- "collection_name": collection_name,
|
|
|
- "filename": filename,
|
|
|
- "content": (
|
|
|
- json.dumps(
|
|
|
- {
|
|
|
- "tags": list(
|
|
|
- map(
|
|
|
- lambda name: {"name": name},
|
|
|
- tags,
|
|
|
+ try:
|
|
|
+ result = store_data_in_vector_db(data, collection_name)
|
|
|
+
|
|
|
+ if result:
|
|
|
+ sanitized_filename = sanitize_filename(filename)
|
|
|
+ doc = Documents.get_doc_by_name(sanitized_filename)
|
|
|
+
|
|
|
+ if doc == None:
|
|
|
+ doc = Documents.insert_new_doc(
|
|
|
+ user.id,
|
|
|
+ DocumentForm(
|
|
|
+ **{
|
|
|
+ "name": sanitized_filename,
|
|
|
+ "title": filename,
|
|
|
+ "collection_name": collection_name,
|
|
|
+ "filename": filename,
|
|
|
+ "content": (
|
|
|
+ json.dumps(
|
|
|
+ {
|
|
|
+ "tags": list(
|
|
|
+ map(
|
|
|
+ lambda name: {"name": name},
|
|
|
+ tags,
|
|
|
+ )
|
|
|
)
|
|
|
- )
|
|
|
- }
|
|
|
- )
|
|
|
- if len(tags)
|
|
|
- else "{}"
|
|
|
- ),
|
|
|
- }
|
|
|
- ),
|
|
|
- )
|
|
|
+ }
|
|
|
+ )
|
|
|
+ if len(tags)
|
|
|
+ else "{}"
|
|
|
+ ),
|
|
|
+ }
|
|
|
+ ),
|
|
|
+ )
|
|
|
+ except Exception as e:
|
|
|
+ print(e)
|
|
|
+ pass
|
|
|
|
|
|
except Exception as e:
|
|
|
log.exception(e)
|