-
Notifications
You must be signed in to change notification settings - Fork 40
Expand file tree
/
Copy pathembed.py
More file actions
48 lines (39 loc) · 1.69 KB
/
embed.py
File metadata and controls
48 lines (39 loc) · 1.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import os
from datetime import datetime
from werkzeug.utils import secure_filename
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from get_vector_db import get_vector_db
TEMP_FOLDER = os.getenv('TEMP_FOLDER', './_temp')
# Function to check if the uploaded file is allowed (only PDF files)
def allowed_file(filename):
return '.' in filename and filename.rsplit('.', 1)[1].lower() in {'pdf'}
# Function to save the uploaded file to the temporary folder
def save_file(file):
# Save the uploaded file with a secure filename and return the file path
ct = datetime.now()
ts = ct.timestamp()
filename = str(ts) + "_" + secure_filename(file.filename)
file_path = os.path.join(TEMP_FOLDER, filename)
file.save(file_path)
return file_path
# Function to load and split the data from the PDF file
def load_and_split_data(file_path):
# Load the PDF file and split the data into chunks
loader = UnstructuredPDFLoader(file_path=file_path)
data = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=7500, chunk_overlap=100)
chunks = text_splitter.split_documents(data)
return chunks
# Main function to handle the embedding process
def embed(file):
# Check if the file is valid, save it, load and split the data, add to the database, and remove the temporary file
if file.filename != '' and file and allowed_file(file.filename):
file_path = save_file(file)
chunks = load_and_split_data(file_path)
db = get_vector_db()
db.add_documents(chunks)
db.persist()
os.remove(file_path)
return True
return False