main.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766
  1. from fastapi import (
  2. FastAPI,
  3. Depends,
  4. HTTPException,
  5. status,
  6. UploadFile,
  7. File,
  8. Form,
  9. )
  10. from fastapi.middleware.cors import CORSMiddleware
  11. import os, shutil, logging, re
  12. from pathlib import Path
  13. from typing import List
  14. from chromadb.utils.batch_utils import create_batches
  15. from langchain_community.document_loaders import (
  16. WebBaseLoader,
  17. TextLoader,
  18. PyPDFLoader,
  19. CSVLoader,
  20. BSHTMLLoader,
  21. Docx2txtLoader,
  22. UnstructuredEPubLoader,
  23. UnstructuredWordDocumentLoader,
  24. UnstructuredMarkdownLoader,
  25. UnstructuredXMLLoader,
  26. UnstructuredRSTLoader,
  27. UnstructuredExcelLoader,
  28. )
  29. from langchain.text_splitter import RecursiveCharacterTextSplitter
  30. from pydantic import BaseModel
  31. from typing import Optional
  32. import mimetypes
  33. import uuid
  34. import json
  35. import sentence_transformers
  36. from apps.ollama.main import generate_ollama_embeddings, GenerateEmbeddingsForm
  37. from apps.web.models.documents import (
  38. Documents,
  39. DocumentForm,
  40. DocumentResponse,
  41. )
  42. from apps.rag.utils import (
  43. query_embeddings_doc,
  44. query_embeddings_function,
  45. query_embeddings_collection,
  46. )
  47. from utils.misc import (
  48. calculate_sha256,
  49. calculate_sha256_string,
  50. sanitize_filename,
  51. extract_folders_after_data_docs,
  52. )
  53. from utils.utils import get_current_user, get_admin_user
  54. from config import (
  55. SRC_LOG_LEVELS,
  56. UPLOAD_DIR,
  57. DOCS_DIR,
  58. RAG_TOP_K,
  59. RAG_RELEVANCE_THRESHOLD,
  60. RAG_EMBEDDING_ENGINE,
  61. RAG_EMBEDDING_MODEL,
  62. RAG_EMBEDDING_MODEL_TRUST_REMOTE_CODE,
  63. RAG_RERANKING_MODEL,
  64. RAG_RERANKING_MODEL_TRUST_REMOTE_CODE,
  65. RAG_OPENAI_API_BASE_URL,
  66. RAG_OPENAI_API_KEY,
  67. DEVICE_TYPE,
  68. CHROMA_CLIENT,
  69. CHUNK_SIZE,
  70. CHUNK_OVERLAP,
  71. RAG_TEMPLATE,
  72. )
  73. from constants import ERROR_MESSAGES
  74. log = logging.getLogger(__name__)
  75. log.setLevel(SRC_LOG_LEVELS["RAG"])
  76. app = FastAPI()
  77. app.state.TOP_K = RAG_TOP_K
  78. app.state.RELEVANCE_THRESHOLD = RAG_RELEVANCE_THRESHOLD
  79. app.state.CHUNK_SIZE = CHUNK_SIZE
  80. app.state.CHUNK_OVERLAP = CHUNK_OVERLAP
  81. app.state.RAG_EMBEDDING_ENGINE = RAG_EMBEDDING_ENGINE
  82. app.state.RAG_EMBEDDING_MODEL = RAG_EMBEDDING_MODEL
  83. app.state.RAG_RERANKING_MODEL = RAG_RERANKING_MODEL
  84. app.state.RAG_TEMPLATE = RAG_TEMPLATE
  85. app.state.OPENAI_API_BASE_URL = RAG_OPENAI_API_BASE_URL
  86. app.state.OPENAI_API_KEY = RAG_OPENAI_API_KEY
  87. app.state.PDF_EXTRACT_IMAGES = False
  88. if app.state.RAG_EMBEDDING_ENGINE == "":
  89. app.state.sentence_transformer_ef = sentence_transformers.SentenceTransformer(
  90. app.state.RAG_EMBEDDING_MODEL,
  91. device=DEVICE_TYPE,
  92. trust_remote_code=RAG_EMBEDDING_MODEL_TRUST_REMOTE_CODE,
  93. )
  94. else:
  95. app.state.sentence_transformer_ef = None
  96. if not app.state.RAG_RERANKING_MODEL == "":
  97. app.state.sentence_transformer_rf = sentence_transformers.CrossEncoder(
  98. app.state.RAG_RERANKING_MODEL,
  99. device=DEVICE_TYPE,
  100. trust_remote_code=RAG_RERANKING_MODEL_TRUST_REMOTE_CODE,
  101. )
  102. else:
  103. app.state.sentence_transformer_rf = None
  104. origins = ["*"]
  105. app.add_middleware(
  106. CORSMiddleware,
  107. allow_origins=origins,
  108. allow_credentials=True,
  109. allow_methods=["*"],
  110. allow_headers=["*"],
  111. )
  112. class CollectionNameForm(BaseModel):
  113. collection_name: Optional[str] = "test"
  114. class StoreWebForm(CollectionNameForm):
  115. url: str
  116. @app.get("/")
  117. async def get_status():
  118. return {
  119. "status": True,
  120. "chunk_size": app.state.CHUNK_SIZE,
  121. "chunk_overlap": app.state.CHUNK_OVERLAP,
  122. "template": app.state.RAG_TEMPLATE,
  123. "embedding_engine": app.state.RAG_EMBEDDING_ENGINE,
  124. "embedding_model": app.state.RAG_EMBEDDING_MODEL,
  125. "reranking_model": app.state.RAG_RERANKING_MODEL,
  126. }
  127. @app.get("/embedding")
  128. async def get_embedding_config(user=Depends(get_admin_user)):
  129. return {
  130. "status": True,
  131. "embedding_engine": app.state.RAG_EMBEDDING_ENGINE,
  132. "embedding_model": app.state.RAG_EMBEDDING_MODEL,
  133. "openai_config": {
  134. "url": app.state.OPENAI_API_BASE_URL,
  135. "key": app.state.OPENAI_API_KEY,
  136. },
  137. }
  138. @app.get("/reranking")
  139. async def get_reraanking_config(user=Depends(get_admin_user)):
  140. return {"status": True, "reranking_model": app.state.RAG_RERANKING_MODEL}
  141. class OpenAIConfigForm(BaseModel):
  142. url: str
  143. key: str
  144. class EmbeddingModelUpdateForm(BaseModel):
  145. openai_config: Optional[OpenAIConfigForm] = None
  146. embedding_engine: str
  147. embedding_model: str
  148. @app.post("/embedding/update")
  149. async def update_embedding_config(
  150. form_data: EmbeddingModelUpdateForm, user=Depends(get_admin_user)
  151. ):
  152. log.info(
  153. f"Updating embedding model: {app.state.RAG_EMBEDDING_MODEL} to {form_data.embedding_model}"
  154. )
  155. try:
  156. app.state.RAG_EMBEDDING_ENGINE = form_data.embedding_engine
  157. app.state.RAG_EMBEDDING_MODEL = form_data.embedding_model
  158. if app.state.RAG_EMBEDDING_ENGINE in ["ollama", "openai"]:
  159. if form_data.openai_config != None:
  160. app.state.OPENAI_API_BASE_URL = form_data.openai_config.url
  161. app.state.OPENAI_API_KEY = form_data.openai_config.key
  162. app.state.sentence_transformer_ef = None
  163. else:
  164. app.state.sentence_transformer_ef = (
  165. sentence_transformers.SentenceTransformer(
  166. app.state.RAG_EMBEDDING_MODEL,
  167. device=DEVICE_TYPE,
  168. trust_remote_code=RAG_EMBEDDING_MODEL_TRUST_REMOTE_CODE,
  169. )
  170. )
  171. return {
  172. "status": True,
  173. "embedding_engine": app.state.RAG_EMBEDDING_ENGINE,
  174. "embedding_model": app.state.RAG_EMBEDDING_MODEL,
  175. "openai_config": {
  176. "url": app.state.OPENAI_API_BASE_URL,
  177. "key": app.state.OPENAI_API_KEY,
  178. },
  179. }
  180. except Exception as e:
  181. log.exception(f"Problem updating embedding model: {e}")
  182. raise HTTPException(
  183. status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
  184. detail=ERROR_MESSAGES.DEFAULT(e),
  185. )
  186. class RerankingModelUpdateForm(BaseModel):
  187. reranking_model: str
  188. @app.post("/reranking/update")
  189. async def update_reranking_config(
  190. form_data: RerankingModelUpdateForm, user=Depends(get_admin_user)
  191. ):
  192. log.info(
  193. f"Updating reranking model: {app.state.RAG_RERANKING_MODEL} to {form_data.reranking_model}"
  194. )
  195. try:
  196. app.state.RAG_RERANKING_MODEL = form_data.reranking_model
  197. if app.state.RAG_RERANKING_MODEL == "":
  198. app.state.sentence_transformer_rf = None
  199. else:
  200. app.state.sentence_transformer_rf = sentence_transformers.CrossEncoder(
  201. app.state.RAG_RERANKING_MODEL,
  202. device=DEVICE_TYPE,
  203. )
  204. return {
  205. "status": True,
  206. "reranking_model": app.state.RAG_RERANKING_MODEL,
  207. }
  208. except Exception as e:
  209. log.exception(f"Problem updating reranking model: {e}")
  210. raise HTTPException(
  211. status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
  212. detail=ERROR_MESSAGES.DEFAULT(e),
  213. )
  214. @app.get("/config")
  215. async def get_rag_config(user=Depends(get_admin_user)):
  216. return {
  217. "status": True,
  218. "pdf_extract_images": app.state.PDF_EXTRACT_IMAGES,
  219. "chunk": {
  220. "chunk_size": app.state.CHUNK_SIZE,
  221. "chunk_overlap": app.state.CHUNK_OVERLAP,
  222. },
  223. }
  224. class ChunkParamUpdateForm(BaseModel):
  225. chunk_size: int
  226. chunk_overlap: int
  227. class ConfigUpdateForm(BaseModel):
  228. pdf_extract_images: bool
  229. chunk: ChunkParamUpdateForm
  230. @app.post("/config/update")
  231. async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_user)):
  232. app.state.PDF_EXTRACT_IMAGES = form_data.pdf_extract_images
  233. app.state.CHUNK_SIZE = form_data.chunk.chunk_size
  234. app.state.CHUNK_OVERLAP = form_data.chunk.chunk_overlap
  235. return {
  236. "status": True,
  237. "pdf_extract_images": app.state.PDF_EXTRACT_IMAGES,
  238. "chunk": {
  239. "chunk_size": app.state.CHUNK_SIZE,
  240. "chunk_overlap": app.state.CHUNK_OVERLAP,
  241. },
  242. }
  243. @app.get("/template")
  244. async def get_rag_template(user=Depends(get_current_user)):
  245. return {
  246. "status": True,
  247. "template": app.state.RAG_TEMPLATE,
  248. }
  249. @app.get("/query/settings")
  250. async def get_query_settings(user=Depends(get_admin_user)):
  251. return {
  252. "status": True,
  253. "template": app.state.RAG_TEMPLATE,
  254. "k": app.state.TOP_K,
  255. "r": app.state.RELEVANCE_THRESHOLD,
  256. }
  257. class QuerySettingsForm(BaseModel):
  258. k: Optional[int] = None
  259. r: Optional[float] = None
  260. template: Optional[str] = None
  261. @app.post("/query/settings/update")
  262. async def update_query_settings(
  263. form_data: QuerySettingsForm, user=Depends(get_admin_user)
  264. ):
  265. app.state.RAG_TEMPLATE = form_data.template if form_data.template else RAG_TEMPLATE
  266. app.state.TOP_K = form_data.k if form_data.k else 4
  267. app.state.RELEVANCE_THRESHOLD = form_data.r if form_data.r else 0.0
  268. return {"status": True, "template": app.state.RAG_TEMPLATE}
  269. class QueryDocForm(BaseModel):
  270. collection_name: str
  271. query: str
  272. k: Optional[int] = None
  273. r: Optional[float] = None
  274. @app.post("/query/doc")
  275. def query_doc_handler(
  276. form_data: QueryDocForm,
  277. user=Depends(get_current_user),
  278. ):
  279. try:
  280. embeddings_function = query_embeddings_function(
  281. app.state.RAG_EMBEDDING_ENGINE,
  282. app.state.RAG_EMBEDDING_MODEL,
  283. app.state.sentence_transformer_ef,
  284. app.state.OPENAI_API_KEY,
  285. app.state.OPENAI_API_BASE_URL,
  286. )
  287. return query_embeddings_doc(
  288. collection_name=form_data.collection_name,
  289. query=form_data.query,
  290. k=form_data.k if form_data.k else app.state.TOP_K,
  291. r=form_data.r if form_data.r else app.state.RELEVANCE_THRESHOLD,
  292. embeddings_function=embeddings_function,
  293. reranking_function=app.state.sentence_transformer_rf,
  294. )
  295. except Exception as e:
  296. log.exception(e)
  297. raise HTTPException(
  298. status_code=status.HTTP_400_BAD_REQUEST,
  299. detail=ERROR_MESSAGES.DEFAULT(e),
  300. )
  301. class QueryCollectionsForm(BaseModel):
  302. collection_names: List[str]
  303. query: str
  304. k: Optional[int] = None
  305. r: Optional[float] = None
  306. @app.post("/query/collection")
  307. def query_collection_handler(
  308. form_data: QueryCollectionsForm,
  309. user=Depends(get_current_user),
  310. ):
  311. try:
  312. embeddings_function = embeddings_function(
  313. app.state.RAG_EMBEDDING_ENGINE,
  314. app.state.RAG_EMBEDDING_MODEL,
  315. app.state.sentence_transformer_ef,
  316. app.state.OPENAI_API_KEY,
  317. app.state.OPENAI_API_BASE_URL,
  318. )
  319. return query_embeddings_collection(
  320. collection_names=form_data.collection_names,
  321. query=form_data.query,
  322. k=form_data.k if form_data.k else app.state.TOP_K,
  323. r=form_data.r if form_data.r else app.state.RELEVANCE_THRESHOLD,
  324. embeddings_function=embeddings_function,
  325. reranking_function=app.state.sentence_transformer_rf,
  326. )
  327. except Exception as e:
  328. log.exception(e)
  329. raise HTTPException(
  330. status_code=status.HTTP_400_BAD_REQUEST,
  331. detail=ERROR_MESSAGES.DEFAULT(e),
  332. )
  333. @app.post("/web")
  334. def store_web(form_data: StoreWebForm, user=Depends(get_current_user)):
  335. # "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm"
  336. try:
  337. loader = WebBaseLoader(form_data.url)
  338. data = loader.load()
  339. collection_name = form_data.collection_name
  340. if collection_name == "":
  341. collection_name = calculate_sha256_string(form_data.url)[:63]
  342. store_data_in_vector_db(data, collection_name, overwrite=True)
  343. return {
  344. "status": True,
  345. "collection_name": collection_name,
  346. "filename": form_data.url,
  347. }
  348. except Exception as e:
  349. log.exception(e)
  350. raise HTTPException(
  351. status_code=status.HTTP_400_BAD_REQUEST,
  352. detail=ERROR_MESSAGES.DEFAULT(e),
  353. )
  354. def store_data_in_vector_db(data, collection_name, overwrite: bool = False) -> bool:
  355. text_splitter = RecursiveCharacterTextSplitter(
  356. chunk_size=app.state.CHUNK_SIZE,
  357. chunk_overlap=app.state.CHUNK_OVERLAP,
  358. add_start_index=True,
  359. )
  360. docs = text_splitter.split_documents(data)
  361. if len(docs) > 0:
  362. log.info(f"store_data_in_vector_db {docs}")
  363. return store_docs_in_vector_db(docs, collection_name, overwrite), None
  364. else:
  365. raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT)
  366. def store_text_in_vector_db(
  367. text, metadata, collection_name, overwrite: bool = False
  368. ) -> bool:
  369. text_splitter = RecursiveCharacterTextSplitter(
  370. chunk_size=app.state.CHUNK_SIZE,
  371. chunk_overlap=app.state.CHUNK_OVERLAP,
  372. add_start_index=True,
  373. )
  374. docs = text_splitter.create_documents([text], metadatas=[metadata])
  375. return store_docs_in_vector_db(docs, collection_name, overwrite)
  376. def store_docs_in_vector_db(docs, collection_name, overwrite: bool = False) -> bool:
  377. log.info(f"store_docs_in_vector_db {docs} {collection_name}")
  378. texts = [doc.page_content for doc in docs]
  379. metadatas = [doc.metadata for doc in docs]
  380. try:
  381. if overwrite:
  382. for collection in CHROMA_CLIENT.list_collections():
  383. if collection_name == collection.name:
  384. log.info(f"deleting existing collection {collection_name}")
  385. CHROMA_CLIENT.delete_collection(name=collection_name)
  386. collection = CHROMA_CLIENT.create_collection(name=collection_name)
  387. embedding_func = query_embeddings_function(
  388. app.state.RAG_EMBEDDING_ENGINE,
  389. app.state.RAG_EMBEDDING_MODEL,
  390. app.state.sentence_transformer_ef,
  391. app.state.OPENAI_API_KEY,
  392. app.state.OPENAI_API_BASE_URL,
  393. )
  394. embedding_texts = list(map(lambda x: x.replace("\n", " "), texts))
  395. embeddings = embedding_func(embedding_texts)
  396. for batch in create_batches(
  397. api=CHROMA_CLIENT,
  398. ids=[str(uuid.uuid1()) for _ in texts],
  399. metadatas=metadatas,
  400. embeddings=embeddings,
  401. documents=texts,
  402. ):
  403. collection.add(*batch)
  404. return True
  405. except Exception as e:
  406. log.exception(e)
  407. if e.__class__.__name__ == "UniqueConstraintError":
  408. return True
  409. return False
  410. def get_loader(filename: str, file_content_type: str, file_path: str):
  411. file_ext = filename.split(".")[-1].lower()
  412. known_type = True
  413. known_source_ext = [
  414. "go",
  415. "py",
  416. "java",
  417. "sh",
  418. "bat",
  419. "ps1",
  420. "cmd",
  421. "js",
  422. "ts",
  423. "css",
  424. "cpp",
  425. "hpp",
  426. "h",
  427. "c",
  428. "cs",
  429. "sql",
  430. "log",
  431. "ini",
  432. "pl",
  433. "pm",
  434. "r",
  435. "dart",
  436. "dockerfile",
  437. "env",
  438. "php",
  439. "hs",
  440. "hsc",
  441. "lua",
  442. "nginxconf",
  443. "conf",
  444. "m",
  445. "mm",
  446. "plsql",
  447. "perl",
  448. "rb",
  449. "rs",
  450. "db2",
  451. "scala",
  452. "bash",
  453. "swift",
  454. "vue",
  455. "svelte",
  456. ]
  457. if file_ext == "pdf":
  458. loader = PyPDFLoader(file_path, extract_images=app.state.PDF_EXTRACT_IMAGES)
  459. elif file_ext == "csv":
  460. loader = CSVLoader(file_path)
  461. elif file_ext == "rst":
  462. loader = UnstructuredRSTLoader(file_path, mode="elements")
  463. elif file_ext == "xml":
  464. loader = UnstructuredXMLLoader(file_path)
  465. elif file_ext in ["htm", "html"]:
  466. loader = BSHTMLLoader(file_path, open_encoding="unicode_escape")
  467. elif file_ext == "md":
  468. loader = UnstructuredMarkdownLoader(file_path)
  469. elif file_content_type == "application/epub+zip":
  470. loader = UnstructuredEPubLoader(file_path)
  471. elif (
  472. file_content_type
  473. == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
  474. or file_ext in ["doc", "docx"]
  475. ):
  476. loader = Docx2txtLoader(file_path)
  477. elif file_content_type in [
  478. "application/vnd.ms-excel",
  479. "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
  480. ] or file_ext in ["xls", "xlsx"]:
  481. loader = UnstructuredExcelLoader(file_path)
  482. elif file_ext in known_source_ext or (
  483. file_content_type and file_content_type.find("text/") >= 0
  484. ):
  485. loader = TextLoader(file_path, autodetect_encoding=True)
  486. else:
  487. loader = TextLoader(file_path, autodetect_encoding=True)
  488. known_type = False
  489. return loader, known_type
  490. @app.post("/doc")
  491. def store_doc(
  492. collection_name: Optional[str] = Form(None),
  493. file: UploadFile = File(...),
  494. user=Depends(get_current_user),
  495. ):
  496. # "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm"
  497. log.info(f"file.content_type: {file.content_type}")
  498. try:
  499. unsanitized_filename = file.filename
  500. filename = os.path.basename(unsanitized_filename)
  501. file_path = f"{UPLOAD_DIR}/{filename}"
  502. contents = file.file.read()
  503. with open(file_path, "wb") as f:
  504. f.write(contents)
  505. f.close()
  506. f = open(file_path, "rb")
  507. if collection_name == None:
  508. collection_name = calculate_sha256(f)[:63]
  509. f.close()
  510. loader, known_type = get_loader(filename, file.content_type, file_path)
  511. data = loader.load()
  512. try:
  513. result = store_data_in_vector_db(data, collection_name)
  514. if result:
  515. return {
  516. "status": True,
  517. "collection_name": collection_name,
  518. "filename": filename,
  519. "known_type": known_type,
  520. }
  521. except Exception as e:
  522. raise HTTPException(
  523. status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
  524. detail=e,
  525. )
  526. except Exception as e:
  527. log.exception(e)
  528. if "No pandoc was found" in str(e):
  529. raise HTTPException(
  530. status_code=status.HTTP_400_BAD_REQUEST,
  531. detail=ERROR_MESSAGES.PANDOC_NOT_INSTALLED,
  532. )
  533. else:
  534. raise HTTPException(
  535. status_code=status.HTTP_400_BAD_REQUEST,
  536. detail=ERROR_MESSAGES.DEFAULT(e),
  537. )
  538. class TextRAGForm(BaseModel):
  539. name: str
  540. content: str
  541. collection_name: Optional[str] = None
  542. @app.post("/text")
  543. def store_text(
  544. form_data: TextRAGForm,
  545. user=Depends(get_current_user),
  546. ):
  547. collection_name = form_data.collection_name
  548. if collection_name == None:
  549. collection_name = calculate_sha256_string(form_data.content)
  550. result = store_text_in_vector_db(
  551. form_data.content,
  552. metadata={"name": form_data.name, "created_by": user.id},
  553. collection_name=collection_name,
  554. )
  555. if result:
  556. return {"status": True, "collection_name": collection_name}
  557. else:
  558. raise HTTPException(
  559. status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
  560. detail=ERROR_MESSAGES.DEFAULT(),
  561. )
  562. @app.get("/scan")
  563. def scan_docs_dir(user=Depends(get_admin_user)):
  564. for path in Path(DOCS_DIR).rglob("./**/*"):
  565. try:
  566. if path.is_file() and not path.name.startswith("."):
  567. tags = extract_folders_after_data_docs(path)
  568. filename = path.name
  569. file_content_type = mimetypes.guess_type(path)
  570. f = open(path, "rb")
  571. collection_name = calculate_sha256(f)[:63]
  572. f.close()
  573. loader, known_type = get_loader(
  574. filename, file_content_type[0], str(path)
  575. )
  576. data = loader.load()
  577. try:
  578. result = store_data_in_vector_db(data, collection_name)
  579. if result:
  580. sanitized_filename = sanitize_filename(filename)
  581. doc = Documents.get_doc_by_name(sanitized_filename)
  582. if doc == None:
  583. doc = Documents.insert_new_doc(
  584. user.id,
  585. DocumentForm(
  586. **{
  587. "name": sanitized_filename,
  588. "title": filename,
  589. "collection_name": collection_name,
  590. "filename": filename,
  591. "content": (
  592. json.dumps(
  593. {
  594. "tags": list(
  595. map(
  596. lambda name: {"name": name},
  597. tags,
  598. )
  599. )
  600. }
  601. )
  602. if len(tags)
  603. else "{}"
  604. ),
  605. }
  606. ),
  607. )
  608. except Exception as e:
  609. log.exception(e)
  610. pass
  611. except Exception as e:
  612. log.exception(e)
  613. return True
  614. @app.get("/reset/db")
  615. def reset_vector_db(user=Depends(get_admin_user)):
  616. CHROMA_CLIENT.reset()
  617. @app.get("/reset")
  618. def reset(user=Depends(get_admin_user)) -> bool:
  619. folder = f"{UPLOAD_DIR}"
  620. for filename in os.listdir(folder):
  621. file_path = os.path.join(folder, filename)
  622. try:
  623. if os.path.isfile(file_path) or os.path.islink(file_path):
  624. os.unlink(file_path)
  625. elif os.path.isdir(file_path):
  626. shutil.rmtree(file_path)
  627. except Exception as e:
  628. log.error("Failed to delete %s. Reason: %s" % (file_path, e))
  629. try:
  630. CHROMA_CLIENT.reset()
  631. except Exception as e:
  632. log.exception(e)
  633. return True