forked from llmware-ai/llmware
-
Notifications
You must be signed in to change notification settings - Fork 0
/
embeddings_fast_start.py
63 lines (40 loc) · 2.19 KB
/
embeddings_fast_start.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
"""
# This example shows the general recipe for creating an embedding. This scenario uses FAISS for local
# laptop deployment.
"""
import os
from llmware.library import Library
from llmware.retrieval import Query
from llmware.setup import Setup
def embeddings_fast_start(library_name, vector_db="faiss", embedding_model="mini-lm-sbert"):

    """Create a library from sample files, build vector embeddings, and run a sample semantic query.

    Parameters
    ----------
    library_name : str
        Name of the new library to create and populate.
    vector_db : str, optional
        Vector database to store the embeddings in (default "faiss" for local use).
    embedding_model : str, optional
        Embedding model name used to generate the vectors (default "mini-lm-sbert").

    Returns
    -------
    list of dict
        The semantic query results; each entry includes keys such as
        "text", "file_source", "page_num" and "distance".
    """

    # Step 1 - create and populate a library from the sample "AgreementsLarge" folder
    print(f"\nstep 1 - creating and populating library: {library_name}...")
    library = Library().create_new_library(library_name)
    sample_files_path = Setup().load_sample_files()
    library.add_files(input_folder_path=os.path.join(sample_files_path, "AgreementsLarge"))

    # To create vector embeddings you just need to specify the embedding model and the vector embedding DB
    # For examples of using HuggingFace and SentenceTransformer models, see those examples in this same folder
    print(f"\n > Generating embedding vectors and storing in '{vector_db}'...")
    library.install_new_embedding(embedding_model_name=embedding_model, vector_db=vector_db)

    # When doing semantic queries, the most recent vector DB used for embeddings will be used.
    # We retrieve the top 10 hits for "Salary" (result_count=10)
    q = Query(library)
    print(f"\n > Running a query for 'Salary'...")
    query_results = q.semantic_query(query="Salary", result_count=10, results_only=True)

    for i, entries in enumerate(query_results):

        # each query result is a dictionary with many useful keys
        text = entries["text"]
        document_source = entries["file_source"]
        page_num = entries["page_num"]
        vector_distance = entries["distance"]

        # for display purposes only, we truncate the text sample at 125 characters
        if len(text) > 125:
            text = text[0:125] + " ... "

        print("\nupdate: query results - {} - document - {} - page num - {} - distance - {} "
              .format(i, document_source, page_num, vector_distance))

        print("update: text sample - ", text)

    return query_results
if __name__ == "__main__":

    # default vector DB is 'faiss' (runs locally) -> switch to 'milvus' once installed and running
    selected_db = "faiss"

    embeddings_fast_start("embedding_test_1", vector_db=selected_db)