Browse Source

Merge pull request #1050 from jannikstdl/rag-pdf-ocr

feat: added OCR functionality to the PDF loader
Timothy Jaeryang Baek 1 year ago
parent
commit
8fb5f54751
2 changed files with 2 additions and 1 deletion
  1. 1 1
      backend/apps/rag/main.py
  2. 1 0
      backend/requirements.txt

+ 1 - 1
backend/apps/rag/main.py

@@ -425,7 +425,7 @@ def get_loader(filename: str, file_content_type: str, file_path: str):
     ]
 
     if file_ext == "pdf":
-        loader = PyPDFLoader(file_path)
+        loader = PyPDFLoader(file_path, extract_images=True)
     elif file_ext == "csv":
         loader = CSVLoader(file_path)
     elif file_ext == "rst":

+ 1 - 0
backend/requirements.txt

@@ -34,6 +34,7 @@ pandas
 openpyxl
 pyxlsb
 xlrd
+rapidocr-onnxruntime
 
 faster-whisper