-
Notifications
You must be signed in to change notification settings - Fork 0
/
vector_db_build.py
28 lines (23 loc) · 946 Bytes
/
vector_db_build.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import os
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import CharacterTextSplitter
import concatenator
# Build one persistent Chroma vector store per text file in SOURCE_DIR.
#
# The script intentionally runs at module top level (no ``__main__`` guard)
# so that its behaviour when executed or imported stays the same as before.

# Directory scanned for input text files.
SOURCE_DIR = "extracted_text"
# Parent directory under which one Chroma store per input file is written.
VECTOR_ROOT = "./vector/my_data"
# Number of trailing characters dropped from a file name to obtain the store
# name. NOTE(review): presumably this strips a fixed 8-character suffix added
# by the extraction step -- confirm against the code that writes SOURCE_DIR.
SUFFIX_LEN = 8


def _store_name(file_name):
    """Return the vector-store name derived from *file_name*.

    The last ``SUFFIX_LEN`` characters are removed. Names that are not longer
    than ``SUFFIX_LEN`` are returned unchanged, so the result is never empty
    and short names cannot all collapse onto the same store directory.
    """
    if len(file_name) > SUFFIX_LEN:
        return file_name[:-SUFFIX_LEN]
    return file_name


# Both objects are independent of the file being processed, so they are
# created once instead of once per loop iteration.
ollamaEmbeddings = OllamaEmbeddings(model="llama3")
# ``chunk_overlap`` is a character count and must be an integer. The previous
# value of 0.5 appears to have produced no overlap in practice, so 0 keeps the
# resulting chunks the same.
# NOTE(review): if a 50% overlap was intended, use chunk_overlap=600 instead.
text_splitter = CharacterTextSplitter(chunk_size=1200, chunk_overlap=0)

for file in os.listdir(SOURCE_DIR):
    source_path = f"{SOURCE_DIR}/{file}"

    # Skip hidden entries such as ".DS_Store".
    if file.startswith("."):
        continue
    # Skip sub-directories and other non-regular entries; TextLoader expects
    # a readable file.
    if not os.path.isfile(source_path):
        continue

    name = _store_name(file)
    print(f"Now embedding {name}...")

    loaded = TextLoader(source_path)
    raw_doc = loaded.load()
    split_docs = text_splitter.split_documents(raw_doc)

    vectorstore = Chroma.from_documents(
        documents=split_docs,
        embedding=ollamaEmbeddings,
        persist_directory=f"{VECTOR_ROOT}/{name}",
    )
    # NOTE(review): recent Chroma releases are documented to persist
    # automatically, which would make this call redundant -- confirm against
    # the installed version before removing it.
    vectorstore.persist()

print("Done!")