From 7e77e6fbc57b3617dcdbb4c9db315f1584cd47eb Mon Sep 17 00:00:00 2001
From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com>
Date: Wed, 28 Jun 2023 08:44:21 -0400
Subject: [PATCH 1/8] basic skeleton for pytest

---
 tests/__init__.py   | 0
 tests/test_basic.py | 6 ++++++
 2 files changed, 6 insertions(+)
 create mode 100644 tests/__init__.py
 create mode 100644 tests/test_basic.py

diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_basic.py b/tests/test_basic.py
new file mode 100644
index 0000000..0512076
--- /dev/null
+++ b/tests/test_basic.py
@@ -0,0 +1,6 @@
+import pepembed
+
+class Testpepembed:
+    def test_search(self):
+        """Basic example of a test"""
+        assert 1 == 1

From 03c43d6a49c7faa5b7aa94fcaea82ad4d7db5f67 Mon Sep 17 00:00:00 2001
From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com>
Date: Wed, 28 Jun 2023 11:36:10 -0400
Subject: [PATCH 2/8] Add flattening of dict to parse all attributes from PEP.
 Adjust basic test.

---
 pepembed/const.py                            |  1 +
 pepembed/pepembed.py                         |  7 +++-
 requirements/requirements-all.txt            |  2 +-
 tests/data/keywordstest.txt                  |  7 ++++
 tests/data/testconfigs/GSE105734_samples.csv |  5 +++
 tests/data/testconfigs/testpep1.yaml         | 42 ++++++++++++++++++++
 tests/test_basic.py                          | 27 ++++++++++++-
 7 files changed, 87 insertions(+), 4 deletions(-)
 create mode 100644 tests/data/keywordstest.txt
 create mode 100644 tests/data/testconfigs/GSE105734_samples.csv
 create mode 100644 tests/data/testconfigs/testpep1.yaml

diff --git a/pepembed/const.py b/pepembed/const.py
index 47f8008..6b9d522 100644
--- a/pepembed/const.py
+++ b/pepembed/const.py
@@ -17,6 +17,7 @@
 }
 
 DEFAULT_KEYWORDS = ["cell", "protocol", "description", "processing", "source"]
+DEFAULT_MODEL = "sentence-transformers/all-MiniLM-L12-v2"
 
 MIN_DESCRIPTION_LENGTH = 5
 
diff --git a/pepembed/pepembed.py b/pepembed/pepembed.py
index e1a4398..30e5ba4 100644
--- a/pepembed/pepembed.py
+++ b/pepembed/pepembed.py
@@ -4,6 +4,7 @@
 from peppy.const import SAMPLE_MODS_KEY, CONSTANT_KEY, CONFIG_KEY, NAME_KEY
 from sentence_transformers import SentenceTransformer
 
+import flatdict
 
 from .utils import read_in_key_words
 from .const import DEFAULT_KEYWORDS, MIN_DESCRIPTION_LENGTH
@@ -43,14 +44,16 @@ def mine_metadata_from_dict(
         ):
             return project[NAME_KEY] or ""
 
-        project_level_dict: dict = project_config[SAMPLE_MODS_KEY][CONSTANT_KEY]
+        #project_level_dict: dict = project_config[SAMPLE_MODS_KEY][CONSTANT_KEY]
+        #Flatten dictionary
+        project_level_dict: dict = flatdict.FlatDict(project_config)
         project_level_attrs = list(project_level_dict.keys())
         desc = ""
 
         # build up a description
         for attr in project_level_attrs:
             if any([kw in attr for kw in self.keywords]):
-                desc += project_level_dict[attr] + " "
+                desc += str(project_level_dict[attr]) + " "
 
         # return if description is sufficient
         if len(desc) > min_desc_length:
diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt
index b5e3327..45b4de1 100644
--- a/requirements/requirements-all.txt
+++ b/requirements/requirements-all.txt
@@ -3,6 +3,6 @@ sentence-transformers
 peppy
 python-dotenv
 qdrant-client
-psycopg2
+#psycopg2
 ubiquerg
 tqdm
diff --git a/tests/data/keywordstest.txt b/tests/data/keywordstest.txt
new file mode 100644
index 0000000..85e87f6
--- /dev/null
+++ b/tests/data/keywordstest.txt
@@ -0,0 +1,7 @@
+protocol
+description
+processing
+source
+series_summary
+file_path
+series_pubmed_id
\ No newline at end of file
diff --git a/tests/data/testconfigs/GSE105734_samples.csv b/tests/data/testconfigs/GSE105734_samples.csv
new file mode 100644
index 0000000..9a6d8b5
--- /dev/null
+++ b/tests/data/testconfigs/GSE105734_samples.csv
@@ -0,0 +1,5 @@
+,age,gse,lab,sex,sra,file,line,link,type,antibody,assembly,file_url,biosample,dev_stage,file_size,ref_genome,size_range,assay_title,sample_name,sample_type,health_state,sample_title,sample_status,biomaterial_type,named_annotation,sample_series_id,sample_taxid_ch1,possible_controls,sample_platform_id,encode_release_date,sample_contact_city,sample_contact_name,sample_molecule_ch1,sample_organism_ch1,sample_channel_count,sample_contact_email,sample_contact_state,sample_geo_accession,sample_data_row_count,sample_library_source,sample_contact_address,sample_contact_country,sample_source_name_ch1,sample_submission_date,sample_instrument_model,sample_last_update_date,sample_library_strategy,library_encode_accession,sample_contact_institute,sample_library_selection,biosample_encode_accession,technical_replicate_number,biological_replicate_number,experiment_encode_accession,sample_extract_protocol_ch1,sample_contact_zip_postal_code
+0,69 year,GSE105734,"Michael Snyder, StanfordMichael Snyder, Stanford",female,https://www.ncbi.nlm.nih.gov/sra?term=SRX3322636,GSM2827569_ENCFF540SXF_peaks_hg19.bed.gz,MCF-7,ENCBS053YJT at ENCODE; https://www.encodeproject.org/ENCBS053YJT/Derived from ENCODE donor ENCDO000AAE; https://www.encodeproject.org/ENCDO000AAE/growth protocol; https://www.encodeproject.org/documents/2a32df77-1325-4a2d-af71-2f2b68eb9830/@@download/attachment/Snyder_MCF7%20Cell%20Growth%20Protocol.pdfENCODE dbxrefs Cellosaurus CVCL_0031; http://web.expasy.org/cellosaurus/CVCL_0031ATCC sample source; http://www.atcc.org/Products/All/HTB-22.aspx,BED,FOS,"GRCh38, hg19",ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2827nnn/GSM2827569/suppl/GSM2827569_ENCFF540SXF_peaks_hg19.bed.gz,https://www.ncbi.nlm.nih.gov/biosample/SAMN06121513,adult,7028129," GRCh38, hg19",450-650,ChIP-seq,chip-seq_from_mcf-7_enclb200wlx__5,SRA,breast cancer (adenocarcinoma),ChIP-seq from MCF-7 (ENCLB200WLX),Public on Oct 23 2017,immortalized cell line,NA000141117.1 (GSM2827569_ENCFF050ZFM_signal_p-value_GRCh38.bigWig)NA000142208.1 (GSM2827569_ENCFF893EFT_fold_change_over_control_GRCh38.bigWig)NA000143034.1 (GSM2827569_ENCFF279XPX_fold_change_over_control_hg19.bigWig)NA000143211.1 (GSM2827569_ENCFF097HJB_signal_p-value_hg19.bigWig),GSE105734,9606,"ENCSR594UCI, ENCSR217LRF",GPL11154,2016-12-19,Stanford,"ENCODE,,DCC",genomic DNA,Homo sapiens,1,encode-help@lists.stanford.edu,CA,GSM2827569,0,genomic,300 Pasteur Dr,USA,Homo sapiens MCF-7 immortalized cell line,Oct 21 2017,Illumina HiSeq 2000,May 15 2019,ChIP-Seq,ENCLB200WLX,ENCODE DCC,ChIP,ENCBS053YJT (SAMN06121513),1,3,ENCSR569XNP,not provided,94305-5120
+1,69 year,GSE105734,"Michael Snyder, StanfordMichael Snyder, Stanford",female,https://www.ncbi.nlm.nih.gov/sra?term=SRX3322636,GSM2827569_ENCFF673PVH_peaks_GRCh38.bed.gz,MCF-7,ENCBS053YJT at ENCODE; https://www.encodeproject.org/ENCBS053YJT/Derived from ENCODE donor ENCDO000AAE; https://www.encodeproject.org/ENCDO000AAE/growth protocol; https://www.encodeproject.org/documents/2a32df77-1325-4a2d-af71-2f2b68eb9830/@@download/attachment/Snyder_MCF7%20Cell%20Growth%20Protocol.pdfENCODE dbxrefs Cellosaurus CVCL_0031; http://web.expasy.org/cellosaurus/CVCL_0031ATCC sample source; http://www.atcc.org/Products/All/HTB-22.aspx,BED,FOS,"GRCh38, hg19",ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2827nnn/GSM2827569/suppl/GSM2827569_ENCFF673PVH_peaks_GRCh38.bed.gz,https://www.ncbi.nlm.nih.gov/biosample/SAMN06121513,adult,6936801," GRCh38, hg19",450-650,ChIP-seq,chip-seq_from_mcf-7_enclb200wlx__6,SRA,breast cancer (adenocarcinoma),ChIP-seq from MCF-7 (ENCLB200WLX),Public on Oct 23 2017,immortalized cell line,NA000141117.1 (GSM2827569_ENCFF050ZFM_signal_p-value_GRCh38.bigWig)NA000142208.1 (GSM2827569_ENCFF893EFT_fold_change_over_control_GRCh38.bigWig)NA000143034.1 (GSM2827569_ENCFF279XPX_fold_change_over_control_hg19.bigWig)NA000143211.1 (GSM2827569_ENCFF097HJB_signal_p-value_hg19.bigWig),GSE105734,9606,"ENCSR594UCI, ENCSR217LRF",GPL11154,2016-12-19,Stanford,"ENCODE,,DCC",genomic DNA,Homo sapiens,1,encode-help@lists.stanford.edu,CA,GSM2827569,0,genomic,300 Pasteur Dr,USA,Homo sapiens MCF-7 immortalized cell line,Oct 21 2017,Illumina HiSeq 2000,May 15 2019,ChIP-Seq,ENCLB200WLX,ENCODE DCC,ChIP,ENCBS053YJT (SAMN06121513),1,3,ENCSR569XNP,not provided,94305-5120
+2,69 year,GSE105734,"Michael Snyder, StanfordMichael Snyder, Stanford",female,https://www.ncbi.nlm.nih.gov/sra?term=SRX3322637,GSM2827570_ENCFF458JNP_peaks_GRCh38.bed.gz,MCF-7,ENCBS168ISE at ENCODE; https://www.encodeproject.org/ENCBS168ISE/Derived from ENCODE donor ENCDO000AAE; https://www.encodeproject.org/ENCDO000AAE/growth protocol; https://www.encodeproject.org/documents/2a32df77-1325-4a2d-af71-2f2b68eb9830/@@download/attachment/Snyder_MCF7%20Cell%20Growth%20Protocol.pdfENCODE dbxrefs Cellosaurus CVCL_0031; http://web.expasy.org/cellosaurus/CVCL_0031ATCC sample source; http://www.atcc.org/Products/All/HTB-22.aspx,BED,FOS,"GRCh38, hg19",ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2827nnn/GSM2827570/suppl/GSM2827570_ENCFF458JNP_peaks_GRCh38.bed.gz,https://www.ncbi.nlm.nih.gov/biosample/SAMN06121651,adult,6830691," GRCh38, hg19",450-650,ChIP-seq,chip-seq_from_mcf-7_enclb859hnt__1,SRA,breast cancer (adenocarcinoma),ChIP-seq from MCF-7 (ENCLB859HNT),Public on Oct 23 2017,immortalized cell line,NA000141642.1 (GSM2827570_ENCFF250RPS_signal_p-value_GRCh38.bigWig)NA000142635.1 (GSM2827570_ENCFF936JHA_fold_change_over_control_hg19.bigWig)NA000142941.1 (GSM2827570_ENCFF490HEP_fold_change_over_control_GRCh38.bigWig)NA000143639.1 (GSM2827570_ENCFF950XOS_signal_p-value_hg19.bigWig),GSE105734,9606,"ENCSR594UCI, ENCSR217LRF",GPL11154,2016-12-19,Stanford,"ENCODE,,DCC",genomic DNA,Homo sapiens,1,encode-help@lists.stanford.edu,CA,GSM2827570,0,genomic,300 Pasteur Dr,USA,Homo sapiens MCF-7 immortalized cell line,Oct 21 2017,Illumina HiSeq 2000,May 15 2019,ChIP-Seq,ENCLB859HNT,ENCODE DCC,ChIP,ENCBS168ISE (SAMN06121651),1,1,ENCSR569XNP,not provided,94305-5120
+3,69 year,GSE105734,"Michael Snyder, StanfordMichael Snyder, Stanford",female,https://www.ncbi.nlm.nih.gov/sra?term=SRX3322637,GSM2827570_ENCFF834BCC_peaks_hg19.bed.gz,MCF-7,ENCBS168ISE at ENCODE; https://www.encodeproject.org/ENCBS168ISE/Derived from ENCODE donor ENCDO000AAE; https://www.encodeproject.org/ENCDO000AAE/growth protocol; https://www.encodeproject.org/documents/2a32df77-1325-4a2d-af71-2f2b68eb9830/@@download/attachment/Snyder_MCF7%20Cell%20Growth%20Protocol.pdfENCODE dbxrefs Cellosaurus CVCL_0031; http://web.expasy.org/cellosaurus/CVCL_0031ATCC sample source; http://www.atcc.org/Products/All/HTB-22.aspx,BED,FOS,"GRCh38, hg19",ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2827nnn/GSM2827570/suppl/GSM2827570_ENCFF834BCC_peaks_hg19.bed.gz,https://www.ncbi.nlm.nih.gov/biosample/SAMN06121651,adult,6839779," GRCh38, hg19",450-650,ChIP-seq,chip-seq_from_mcf-7_enclb859hnt__4,SRA,breast cancer (adenocarcinoma),ChIP-seq from MCF-7 (ENCLB859HNT),Public on Oct 23 2017,immortalized cell line,NA000141642.1 (GSM2827570_ENCFF250RPS_signal_p-value_GRCh38.bigWig)NA000142635.1 (GSM2827570_ENCFF936JHA_fold_change_over_control_hg19.bigWig)NA000142941.1 (GSM2827570_ENCFF490HEP_fold_change_over_control_GRCh38.bigWig)NA000143639.1 (GSM2827570_ENCFF950XOS_signal_p-value_hg19.bigWig),GSE105734,9606,"ENCSR594UCI, ENCSR217LRF",GPL11154,2016-12-19,Stanford,"ENCODE,,DCC",genomic DNA,Homo sapiens,1,encode-help@lists.stanford.edu,CA,GSM2827570,0,genomic,300 Pasteur Dr,USA,Homo sapiens MCF-7 immortalized cell line,Oct 21 2017,Illumina HiSeq 2000,May 15 2019,ChIP-Seq,ENCLB859HNT,ENCODE DCC,ChIP,ENCBS168ISE (SAMN06121651),1,1,ENCSR569XNP,not provided,94305-5120
diff --git a/tests/data/testconfigs/testpep1.yaml b/tests/data/testconfigs/testpep1.yaml
new file mode 100644
index 0000000..97f5f10
--- /dev/null
+++ b/tests/data/testconfigs/testpep1.yaml
@@ -0,0 +1,42 @@
+pep_version: 2.1.0
+project_name: GSE105734
+sample_table: GSE105734_samples.csv
+sample_modifiers:
+  append:
+    description: "Mammary gland, adenocarcinoma. (PMID: 4357757)FOS ChIP-seq on human MCF-7"
+    output_file_path: FILES
+    sample_description: https://www.encodeproject.org/experiments/ENCSR569XNP/, ***************
+    sample_data_processing: See GSM*_README.txt supplementary file linked below
+  derive:
+    sources:
+      FILES: ./{gse}/{file}
+    attributes: 
+     - output_file_path
+experiment_metadata:
+  series_type: Genome binding/occupancy profiling by high throughput sequencing
+  series_gp_id: PRJNA63443
+  series_title: ChIP-seq from MCF-7 (ENCSR569XNP)
+  series_status: Public on Oct 23 2017
+  series_project: ENCODE
+  series_summary: FOS ChIP-seq on human MCF-7 +  + For data usage terms and conditions, please refer to http://www.genome.gov/27528022 and http://www.genome.gov/Pages/Research/ENCODE/ENCODE_Data_Use_Policy_for_External_Users_03-07-14.pdf
+  series_pubmed_id: 22955616
+  series_sample_id: GSM2827569 + GSM2827570
+  series_platform_id: GPL11154
+  series_contact_city: Stanford
+  series_contact_name: ENCODE,,DCC
+  series_sample_taxid: 9606
+  series_contact_email: encode-help@lists.stanford.edu
+  series_contact_state: CA
+  series_geo_accession: GSE105734
+  series_overall_design: https://www.encodeproject.org/ENCSR569XNP/
+  series_platform_taxid: 9606
+  series_contact_address: 300 Pasteur Dr
+  series_contact_country: USA
+  series_sample_organism: Homo sapiens
+  series_submission_date: Oct 21 2017
+  series_last_update_date: Jul 25 2021
+  series_contact_institute: ENCODE DCC
+  series_platform_organism: Homo sapiens
+  series_supplementary_file: ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF124QJZ_peaks_hg19.bed.gz + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF160WPD_fold_change_over_control_GRCh38.bigWig + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF170POB_optimal_idr_thresholded_peaks_GRCh38.bed.gz + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF174XVH_signal_p-value_GRCh38.bigWig + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF185VWZ_fold_change_over_control_hg19.bigWig + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF220ZQF_conservative_idr_thresholded_peaks_GRCh38.bigBed + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF234YXL_peaks_GRCh38.bigBed + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF345JQL_optimal_idr_thresholded_peaks_hg19.bigBed + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF397ZDJ_peaks_hg19.bigBed + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF419PYB_signal_p-value_hg19.bigWig + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF538VTB_conservative_idr_thresholded_peaks_hg19.bigBed + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF663JQC_conservative_idr_thresholded_peaks_hg19.bed.gz + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF732LJO_optimal_idr_thresholded_peaks_GRCh38.bigBed + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF888RBY_peaks_GRCh38.bed.gz + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF890YVJ_conservative_idr_thresholded_peaks_GRCh38.bed.gz + 
ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF965AIZ_optimal_idr_thresholded_peaks_hg19.bed.gz + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_RAW.tar
+  series_contact_zip_postal_code: 94305-5120
+
diff --git a/tests/test_basic.py b/tests/test_basic.py
index 0512076..e3b7a17 100644
--- a/tests/test_basic.py
+++ b/tests/test_basic.py
@@ -1,6 +1,31 @@
+import peppy
+from logging import getLogger
 import pepembed
+from pepembed.pepembed import PEPEncoder
+from pepembed.const import *
+import os
+from peppy import Project
+import flatdict
+
+_LOGGER = getLogger("pepembed")
 
 class Testpepembed:
     def test_search(self):
         """Basic example of a test"""
-        assert 1 == 1
+        # DEFAULT_MODEL is "sentence-transformers/all-MiniLM-L12-v2" (noted as the argparser default).
+        # Resolve fixtures relative to this file so the test passes from any working directory.
+        data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
+        keywordsfilepath = os.path.join(data_dir, "keywordstest.txt")
+        encoder = PEPEncoder(DEFAULT_MODEL, keywords_file=keywordsfilepath)
+
+        # Load the example PEP and convert it to the dict form passed to the encoder.
+        p = peppy.Project(os.path.join(data_dir, "testconfigs", "testpep1.yaml"))
+        p = p.to_dict(extended=True)
+
+        d = encoder.mine_metadata_from_dict(p, min_desc_length=20)
+
+        # At least one flattened config value must show up in the mined description.
+        found = any(str(v) in d for _, v in flatdict.FlatDict(p['_config']).items())
+
+        assert found
+

From e8751e659f9716271cd6993b362727f747f6b9f5 Mon Sep 17 00:00:00 2001
From: Nathan LeRoy <NLeRoy917@gmail.com>
Date: Wed, 28 Jun 2023 11:48:23 -0400
Subject: [PATCH 3/8] add flatdict to reqs

---
 .vscode/settings.json             |  7 +++
 requirements/requirements-all.txt |  1 +
 scripts/database_umap.py          | 76 +++++++++++++++++++++++++++++++
 3 files changed, 84 insertions(+)
 create mode 100644 .vscode/settings.json
 create mode 100644 scripts/database_umap.py

diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..9b38853
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,7 @@
+{
+    "python.testing.pytestArgs": [
+        "tests"
+    ],
+    "python.testing.unittestEnabled": false,
+    "python.testing.pytestEnabled": true
+}
\ No newline at end of file
diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt
index 45b4de1..f61cb06 100644
--- a/requirements/requirements-all.txt
+++ b/requirements/requirements-all.txt
@@ -6,3 +6,4 @@ qdrant-client
 #psycopg2
 ubiquerg
 tqdm
+flatdict
diff --git a/scripts/database_umap.py b/scripts/database_umap.py
new file mode 100644
index 0000000..f416d13
--- /dev/null
+++ b/scripts/database_umap.py
@@ -0,0 +1,76 @@
+# %%
+import os
+import numpy as np
+from qdrant_client import QdrantClient
+
+# %%
+# get the qdrant connection info
+QDRANT_HOST = os.environ.get("QDRANT_HOST")
+QDRANT_PORT = os.environ.get("QDRANT_PORT")
+QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")
+
+# connect to qdrant
+qdrant = QdrantClient(
+    url=QDRANT_HOST, 
+    port=QDRANT_PORT,
+    api_key=QDRANT_API_KEY,
+    timeout=1000
+)
+
+# %%
+# get number of embeddings
+n_embeddings = qdrant.get_collection(collection_name="pephub").points_count
+
+# %%
+SAMPLE_SIZE = 10000
+BATCH_SIZE = 10
+# Randomly sample embeddings with the qdrant scroll API, BATCH_SIZE points at
+# a time, starting each batch from a freshly drawn random offset.
+# NOTE(review): assumes point ids are integers in [0, n_embeddings) - confirm.
+embeddings = []
+for i in range(SAMPLE_SIZE // BATCH_SIZE):
+    print(f"Batch {i}")
+    offset = int(np.random.randint(0, max(n_embeddings - BATCH_SIZE, 1)))
+    points, _next_offset = qdrant.scroll(
+        collection_name="pephub",
+        limit=BATCH_SIZE,
+        with_payload=False,
+        with_vectors=True,
+        offset=offset
+    )
+    embeddings.append(points)
+
+# %%
+# flatten the list
+embeddings = [e for batch in embeddings for e in batch]
+
+# %%
+embeddings = [np.array(e.vector) for e in embeddings]
+
+# %%
+from umap import UMAP
+
+reducer = UMAP(n_components=2, random_state=42)
+umap_embedding = reducer.fit_transform(embeddings)
+
+# %%
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+# rcParams only apply to figures created afterwards, so set the dpi first
+plt.rcParams['figure.dpi'] = 300
+
+_, ax = plt.subplots(figsize=(5, 5))
+sns.scatterplot(
+    x=umap_embedding[:,0],
+    y=umap_embedding[:,1],
+    s=5,
+    linewidth=0,
+    ax=ax
+)
+
+ax.set_title("UMAP of GEO Sample Descriptions")
+ax.set_xlabel("UMAP 1", fontsize=14)
+ax.set_ylabel("UMAP 2", fontsize=14)
+
+# %%

From a63d09d6acbc4ea9f7ac6c071a08ff2220876997 Mon Sep 17 00:00:00 2001
From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com>
Date: Wed, 28 Jun 2023 12:05:59 -0400
Subject: [PATCH 4/8] Add requirements, adjust standard keywords, add skeleton
 for dynamically generating keywords #2

---
 keywords.txt                      |  7 ++++++-
 pepembed/cli.py                   | 10 +++++++---
 pepembed/pepembed.py              |  8 +++++---
 pepembed/utils.py                 |  7 +++++++
 requirements/requirements-all.txt |  3 ++-
 5 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/keywords.txt b/keywords.txt
index d5a8b35..67fb17b 100644
--- a/keywords.txt
+++ b/keywords.txt
@@ -2,4 +2,9 @@ cell
 protocol
 description
 processing
-source
\ No newline at end of file
+source
+table
+file_path
+pep_version
+project_name
+experiment_metadata
\ No newline at end of file
diff --git a/pepembed/cli.py b/pepembed/cli.py
index 38493fb..a1a64d6 100644
--- a/pepembed/cli.py
+++ b/pepembed/cli.py
@@ -132,14 +132,17 @@ def main():
 
     # connect to qdrant
     qdrant = QdrantClient(
-        url=QDRANT_HOST, 
+        url=QDRANT_HOST,
         port=QDRANT_PORT,
         api_key=QDRANT_API_KEY,
-
     )
 
     # get the collection info
-    COLLECTION = args.qdrant_collection or os.environ.get("QDRANT_COLLECTION") or QDRANT_DEFAULT_COLLECTION
+    COLLECTION = (
+        args.qdrant_collection
+        or os.environ.get("QDRANT_COLLECTION")
+        or QDRANT_DEFAULT_COLLECTION
+    )
 
     # recreate the collection if necessary
     if args.recreate_collection:
@@ -212,6 +215,7 @@ def main():
     """
     )
 
+
 if __name__ == "__main__":
     try:
         sys.exit(main())
diff --git a/pepembed/pepembed.py b/pepembed/pepembed.py
index 30e5ba4..934fdd0 100644
--- a/pepembed/pepembed.py
+++ b/pepembed/pepembed.py
@@ -35,7 +35,9 @@ def mine_metadata_from_dict(
         :param project: A dictionary representing a peppy.Project instance.
         :param min_desc_length: The minimum length of the description.
         """
-        project_config = project.get(CONFIG_KEY) or project.get(CONFIG_KEY.replace("_", ""))
+        project_config = project.get(CONFIG_KEY) or project.get(
+            CONFIG_KEY.replace("_", "")
+        )
         if project_config is None:
             return ""
         if (
@@ -44,8 +46,8 @@ def mine_metadata_from_dict(
         ):
             return project[NAME_KEY] or ""
 
-        #project_level_dict: dict = project_config[SAMPLE_MODS_KEY][CONSTANT_KEY]
-        #Flatten dictionary
+        # project_level_dict: dict = project_config[SAMPLE_MODS_KEY][CONSTANT_KEY]
+        # Flatten dictionary
         project_level_dict: dict = flatdict.FlatDict(project_config)
         project_level_attrs = list(project_level_dict.keys())
         desc = ""
diff --git a/pepembed/utils.py b/pepembed/utils.py
index 1e02532..989afde 100644
--- a/pepembed/utils.py
+++ b/pepembed/utils.py
@@ -10,6 +10,13 @@ def read_in_key_words(key_words_file: str) -> List[str]:
     return key_words
 
 
+def generate_key_words(key_words_file: str) -> List[str]:
+    """Stub: meant to derive keywords from the most common shared PEP attributes; currently ignores key_words_file and returns an empty list."""
+    # TODO Generate a dynamic list of keywords for custom PEPs
+    key_words: List[str] = []
+    return key_words
+
+
 def batch_generator(iterable, batch_size) -> List:
     """Batch generator."""
     l = len(iterable)
diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt
index 45b4de1..e8a13ed 100644
--- a/requirements/requirements-all.txt
+++ b/requirements/requirements-all.txt
@@ -3,6 +3,7 @@ sentence-transformers
 peppy
 python-dotenv
 qdrant-client
-#psycopg2
+psycopg2
 ubiquerg
 tqdm
+flatdict

From 91686df10d0acc9b66d62f37d1d77514fb2bf2d2 Mon Sep 17 00:00:00 2001
From: Nathan LeRoy <NLeRoy917@gmail.com>
Date: Mon, 17 Jul 2023 10:57:22 -0400
Subject: [PATCH 5/8] more work

---
 pepembed/cli.py   |  17 +++-
 pepembed/const.py |   2 +-
 run_index.py      | 248 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 261 insertions(+), 6 deletions(-)
 create mode 100644 run_index.py

diff --git a/pepembed/cli.py b/pepembed/cli.py
index a1a64d6..955e814 100644
--- a/pepembed/cli.py
+++ b/pepembed/cli.py
@@ -1,3 +1,4 @@
+# %%
 import sys
 import logging
 import os
@@ -15,7 +16,7 @@
     LOGGING_LEVEL,
     PKG_NAME,
     PROJECT_TABLE,
-    PROJECT_COLUMN,
+    CONFIG_COLUMN,
     PROJECT_NAME_COLUMN,
     NAMESPACE_COLUMN,
     TAG_COLUMN,
@@ -28,7 +29,7 @@
 from .pepembed import PEPEncoder
 from .utils import batch_generator
 
-
+# %%
 def main():
     """Entry point for the CLI."""
     load_dotenv()
@@ -74,7 +75,7 @@ def main():
     # get list of peps
     _LOGGER.info("Pulling PEPs from database.")
     curs.execute(
-        f"SELECT {NAMESPACE_COLUMN}, {PROJECT_NAME_COLUMN}, {TAG_COLUMN}, {PROJECT_COLUMN}, {ROW_ID_COLUMN} FROM {PROJECT_TABLE}"
+        f"SELECT {NAMESPACE_COLUMN}, {PROJECT_NAME_COLUMN}, {TAG_COLUMN}, {CONFIG_COLUMN}, {ROW_ID_COLUMN} FROM {PROJECT_TABLE}"
     )
     projects = curs.fetchall()
 
@@ -93,9 +94,9 @@ def main():
 
     # we need to work in batches since its much faster
     projects_encoded = []
-    for batch in tqdm(
+    for i, batch in enumerate(tqdm(
         batch_generator(projects, BATCH_SIZE), total=len(projects) // BATCH_SIZE
-    ):
+    )):
         # build list of descriptions for batch
         descs = []
         for p in batch:
@@ -104,6 +105,10 @@ def main():
                 descs.append(d)
             else:
                 descs.append(f"{p[0]} {p[1]} {p[2]}")
+        
+        # every 100th batch, print out the first description
+        if i % 100 == 0:
+            _LOGGER.info(f"First description: {descs[0]}")
 
         # encode descriptions
         try:
@@ -151,6 +156,7 @@ def main():
             vectors_config=models.VectorParams(
                 size=EMBEDDING_DIM, distance=models.Distance.COSINE
             ),
+            on_disk_payload=True,
         )
         collection_info = qdrant.get_collection(collection_name=COLLECTION)
     else:
@@ -166,6 +172,7 @@ def main():
                 vectors_config=models.VectorParams(
                     size=EMBEDDING_DIM, distance=models.Distance.COSINE
                 ),
+                on_disk_payload=True,
             )
             collection_info = qdrant.get_collection(collection_name=COLLECTION)
 
diff --git a/pepembed/const.py b/pepembed/const.py
index 6b9d522..1eed4ae 100644
--- a/pepembed/const.py
+++ b/pepembed/const.py
@@ -22,7 +22,7 @@
 MIN_DESCRIPTION_LENGTH = 5
 
 PROJECT_TABLE = "projects"
-PROJECT_COLUMN = "project_value"
+CONFIG_COLUMN = "config"
 PROJECT_NAME_COLUMN = "name"
 NAMESPACE_COLUMN = "namespace"
 TAG_COLUMN = "tag"
diff --git a/run_index.py b/run_index.py
new file mode 100644
index 0000000..2b6e1d1
--- /dev/null
+++ b/run_index.py
@@ -0,0 +1,248 @@
+# %%
+import sys
+import logging
+import os
+import psycopg2
+
+from qdrant_client import QdrantClient
+from qdrant_client.http import models
+from qdrant_client.http.models import PointStruct
+from tqdm import tqdm
+from dotenv import load_dotenv
+from logmuse import init_logger
+from argparse import Namespace
+
+from pepembed.const import (
+    LEVEL_BY_VERBOSITY,
+    LOGGING_LEVEL,
+    PKG_NAME,
+    PROJECT_TABLE,
+    CONFIG_COLUMN,
+    PROJECT_NAME_COLUMN,
+    NAMESPACE_COLUMN,
+    TAG_COLUMN,
+    ROW_ID_COLUMN,
+    DEFAULT_BATCH_SIZE,
+    QDRANT_DEFAULT_COLLECTION,
+    DEFAULT_UPSERT_BATCH_SIZE,
+)
+from pepembed.pepembed import PEPEncoder
+from pepembed.utils import batch_generator
+
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+# %%
+load_dotenv()
+args = Namespace(
+    postgres_user=os.environ.get("POSTGRES_USER"),
+    postgres_password=os.environ.get("POSTGRES_PASSWORD"),
+    postgres_host=os.environ.get("POSTGRES_HOST"),
+    postgres_db=os.environ.get("POSTGRES_DB"),
+    postgres_port=os.environ.get("POSTGRES_PORT"),
+    qdrant_host=os.environ.get("QDRANT_HOST"),
+    qdrant_port=os.environ.get("QDRANT_PORT"),
+    qdrant_api_key=os.environ.get("QDRANT_API_KEY"),
+    qdrant_collection=os.environ.get("QDRANT_COLLECTION"),
+    dbg=False,
+    verbosity=None,
+    logging_level=None,
+    recreate_collection=True,
+    hf_model="sentence-transformers/all-MiniLM-L12-v2",
+    keywords_file="keywords.txt",
+    batch_size=DEFAULT_BATCH_SIZE,
+    upsert_batch_size=DEFAULT_UPSERT_BATCH_SIZE,
+
+)
+
+# %%
+# Set the logging level.
+if args.dbg:
+    # Debug mode takes precedence and will listen for all messages.
+    level = args.logging_level or logging.DEBUG
+elif args.verbosity is not None:
+    # Verbosity-framed specification trumps logging_level.
+    level = LEVEL_BY_VERBOSITY[args.verbosity]
+else:
+    # Normally, we're not in debug mode, and there's not verbosity.
+    level = LOGGING_LEVEL
+
+# initialize the logger
+logger_kwargs = {"level": level, "devmode": args.dbg}
+init_logger(name="peppy", **logger_kwargs)
+global _LOGGER
+_LOGGER = init_logger(name=PKG_NAME, **logger_kwargs)
+
+# %%
+# pull list of peps
+_LOGGER.info("Establishing connection to database.")
+conn = psycopg2.connect(
+    user=(args.postgres_user or os.environ.get("POSTGRES_USER")),
+    password=(args.postgres_password or os.environ.get("POSTGRES_PASSWORD")),
+    host=(args.postgres_host or os.environ.get("POSTGRES_HOST")),
+    database=(args.postgres_db or os.environ.get("POSTGRES_DB")),
+    port=(args.postgres_port or 5432),
+)
+curs = conn.cursor()
+
+# %%
+# test connection
+_LOGGER.info("Testing connection to database.")
+curs.execute("SELECT 1")
+res = curs.fetchone()
+if not res == (1,):
+    _LOGGER.error("Connection to database failed.")
+    sys.exit(1)
+
+# %%
+# get list of peps
+_LOGGER.info("Pulling PEPs from database.")
+curs.execute(
+    f"SELECT {NAMESPACE_COLUMN}, {PROJECT_NAME_COLUMN}, {TAG_COLUMN}, {CONFIG_COLUMN}, {ROW_ID_COLUMN} FROM {PROJECT_TABLE}"
+)
+projects = curs.fetchall()
+
+# log how many PEP rows were pulled from the database
+_LOGGER.info(f"Found {len(projects)} PEPs.")
+
+# %%
+# initialize encoder
+_LOGGER.info("Initializing encoder.")
+encoder = PEPEncoder(args.hf_model, keywords_file=args.keywords_file)
+EMBEDDING_DIM = int(encoder.get_sentence_embedding_dimension())
+_LOGGER.info(f"Computing embeddings of {EMBEDDING_DIM} dimensions.")
+
+# %%
+# encode PEPs in batches
+_LOGGER.info("Encoding PEPs.")
+BATCH_SIZE = args.batch_size or DEFAULT_BATCH_SIZE
+
+# we need to work in batches since its much faster
+projects_encoded = []
+for i, batch in enumerate(tqdm(
+    batch_generator(projects, BATCH_SIZE), total=len(projects) // BATCH_SIZE
+)):
+    # build list of descriptions for batch
+    descs = []
+    for p in batch:
+        d = encoder.mine_metadata_from_dict(p[3], min_desc_length=20)
+        # p is a row tuple: (namespace, name, tag, config, row id), per the SELECT.
+        # A missing (None) or empty description cannot be embedded meaningfully,
+        # so fall back to the "namespace name tag" triple for this PEP.
+        descs.append(d or f"{p[0]} {p[1]} {p[2]}")
+    
+    # every 100th batch, print out the first description
+    if i % 100 == 0:
+        _LOGGER.info(f"First description: {descs[0]}")
+
+    # encode descriptions
+    try:
+        embeddings = encoder.encode(descs)
+        projects_encoded.extend(
+            [
+                dict(
+                    id=p[4],
+                    registry=f"{p[0]}/{p[1]}:{p[2]}",
+                    description=desc,
+                    embedding=embd,
+                )
+                for p, desc, embd in zip(batch, descs, embeddings)
+            ]
+        )
+    except Exception as e:
+        _LOGGER.error(f"Error encoding batch: {e}")
+
+# %%
+_LOGGER.info("Encoding complete.")
+_LOGGER.info("Connecting to Qdrant.")
+
+# get the qdrant connection info
+QDRANT_HOST = args.qdrant_host or os.environ.get("QDRANT_HOST")
+QDRANT_PORT = args.qdrant_port or os.environ.get("QDRANT_PORT")
+QDRANT_API_KEY = args.qdrant_api_key or os.environ.get("QDRANT_API_KEY")
+
+# connect to qdrant
+qdrant = QdrantClient(
+    url=QDRANT_HOST,
+    port=QDRANT_PORT,
+    api_key=QDRANT_API_KEY,
+)
+
+# get the collection info
+COLLECTION = (
+    args.qdrant_collection
+    or os.environ.get("QDRANT_COLLECTION")
+    or QDRANT_DEFAULT_COLLECTION
+)
+
+# recreate the collection if necessary
+if args.recreate_collection:
+    qdrant.recreate_collection(
+        collection_name=COLLECTION,
+        vectors_config=models.VectorParams(
+            size=EMBEDDING_DIM, distance=models.Distance.COSINE
+        ),
+        on_disk_payload=True,
+    )
+    collection_info = qdrant.get_collection(collection_name=COLLECTION)
+else:
+    try:
+        collection_info = qdrant.get_collection(collection_name=COLLECTION)
+    except Exception as e:
+        _LOGGER.error(
+            f"Error getting collection info. Collection {COLLECTION} might not exist."
+        )
+        _LOGGER.info("Recreating collection.")
+        qdrant.recreate_collection(
+            collection_name=COLLECTION,
+            vectors_config=models.VectorParams(
+                size=EMBEDDING_DIM, distance=models.Distance.COSINE
+            ),
+            on_disk_payload=True,
+        )
+        collection_info = qdrant.get_collection(collection_name=COLLECTION)
+
+# verify status of collection after getting or creating
+_LOGGER.info(f"Collection status: {collection_info.status}")
+
+# insert embeddings into qdrant
+_LOGGER.info("Inserting embeddings into Qdrant.")
+_LOGGER.info("Building point strcutures.")
+
+# build up point structs
+all_points = [
+    PointStruct(
+        id=p["id"],
+        vector=p["embedding"].tolist(),
+        payload={"registry": p["registry"], "description": p["description"]},
+    )
+    for p in tqdm(projects_encoded, total=len(projects_encoded))
+]
+
+# determine upsert batch size
+UPSERT_BATCH_SIZE = args.upsert_batch_size or DEFAULT_UPSERT_BATCH_SIZE
+
+# upsert in batches, it will timeout if we do not
+# a good batch size is ~1000 vectors. Running locally, this is super quick.
+for batch in tqdm(
+    batch_generator(all_points, UPSERT_BATCH_SIZE),
+    total=len(all_points) // UPSERT_BATCH_SIZE,
+):
+    operation_info = qdrant.upsert(collection_name=COLLECTION, wait=True, points=batch)
+
+    # raise explicitly: a bare assert is stripped when Python runs with -O
+    if operation_info.status != "completed":
+        raise RuntimeError(f"Qdrant upsert failed with status {operation_info.status}")
+
+conn.close()
+
+_LOGGER.info("Done.")
+_LOGGER.info(
+    f"View the collection at https://{QDRANT_HOST}:{QDRANT_PORT}/collections/{COLLECTION}"
+)
+_LOGGER.info(
+    f"""View some points and their paylods with the following curl command:
+    curl -H "Content-type: application/json" -d '{{
+        "ids": [0, 3, 100]
+    }}' 'http://{QDRANT_HOST}:{QDRANT_PORT}/collections/{COLLECTION}/points'
+"""
+)
\ No newline at end of file

From 7a024ad9c04bf2bc203a25de509bc5d5530739de Mon Sep 17 00:00:00 2001
From: Nathan LeRoy <NLeRoy917@gmail.com>
Date: Mon, 17 Jul 2023 17:06:11 -0400
Subject: [PATCH 6/8] use description first to index so things look better on
 the UI

---
 pepembed/pepembed.py | 14 +++++++++++---
 run_index.py         |  7 +++++--
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/pepembed/pepembed.py b/pepembed/pepembed.py
index 934fdd0..8382360 100644
--- a/pepembed/pepembed.py
+++ b/pepembed/pepembed.py
@@ -35,9 +35,12 @@ def mine_metadata_from_dict(
         :param project: A dictionary representing a peppy.Project instance.
         :param min_desc_length: The minimum length of the description.
         """
-        project_config = project.get(CONFIG_KEY) or project.get(
-            CONFIG_KEY.replace("_", "")
-        )
+        # project_config = project.get(CONFIG_KEY) or project.get(
+        #     CONFIG_KEY.replace("_", "")
+        # )
+        # fix bug where config key is not in the project,
+        # new database schema does not have config key
+        project_config = project
         if project_config is None:
             return ""
         if (
@@ -52,6 +55,11 @@ def mine_metadata_from_dict(
         project_level_attrs = list(project_level_dict.keys())
         desc = ""
 
+        # use description first
+        if "description" in project_level_attrs:
+            desc += project_level_dict["description"] + " "
+            project_level_attrs.remove("description")
+            
         # build up a description
         for attr in project_level_attrs:
             if any([kw in attr for kw in self.keywords]):
diff --git a/run_index.py b/run_index.py
index 2b6e1d1..ccabbb5 100644
--- a/run_index.py
+++ b/run_index.py
@@ -118,9 +118,10 @@
 
 # we need to work in batches since its much faster
 projects_encoded = []
-for i, batch in enumerate(tqdm(
+i = 0
+for batch in tqdm(
     batch_generator(projects, BATCH_SIZE), total=len(projects) // BATCH_SIZE
-)):
+):
     # build list of descriptions for batch
     descs = []
     for p in batch:
@@ -150,6 +151,8 @@
         )
     except Exception as e:
         _LOGGER.error(f"Error encoding batch: {e}")
+    
+    i += 1
 
 # %%
 _LOGGER.info("Encoding complete.")

From 52fa2c17261217965e56ab313aa6b15de8e803bc Mon Sep 17 00:00:00 2001
From: Nathan LeRoy <NLeRoy917@gmail.com>
Date: Mon, 17 Jul 2023 17:07:15 -0400
Subject: [PATCH 7/8] clarify comments in mine_metadata_from_dict

---
 pepembed/pepembed.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pepembed/pepembed.py b/pepembed/pepembed.py
index 8382360..567914e 100644
--- a/pepembed/pepembed.py
+++ b/pepembed/pepembed.py
@@ -55,12 +55,13 @@ def mine_metadata_from_dict(
         project_level_attrs = list(project_level_dict.keys())
         desc = ""
 
-        # use description first
+        # use description first - this is just for 
+        # the UI, so things look good
         if "description" in project_level_attrs:
             desc += project_level_dict["description"] + " "
             project_level_attrs.remove("description")
             
-        # build up a description
+        # build up a description using the rest
         for attr in project_level_attrs:
             if any([kw in attr for kw in self.keywords]):
                 desc += str(project_level_dict[attr]) + " "

From 28c357b2f1bef2340dc6824d67853a97e1b4e7ac Mon Sep 17 00:00:00 2001
From: Nathan LeRoy <NLeRoy917@gmail.com>
Date: Mon, 11 Dec 2023 15:24:46 -0500
Subject: [PATCH 8/8] switch to fastembed and add Qdrant scalar quantization

---
 .gitignore                        |  1 +
 pepembed/const.py                 |  2 --
 pepembed/pepembed.py              | 51 ++++++-------------------------
 production.env                    |  2 +-
 requirements/requirements-all.txt |  2 +-
 run_index.py                      | 37 ++++++++++++++--------
 6 files changed, 36 insertions(+), 59 deletions(-)

diff --git a/.gitignore b/.gitignore
index 8854835..4009f71 100644
--- a/.gitignore
+++ b/.gitignore
@@ -161,3 +161,4 @@ cython_debug/
 
 # qdrant
 qdrant_storage/
+local_cache/
\ No newline at end of file
diff --git a/pepembed/const.py b/pepembed/const.py
index 1eed4ae..431a036 100644
--- a/pepembed/const.py
+++ b/pepembed/const.py
@@ -1,4 +1,3 @@
-from sentence_transformers import __version__ as st_version
 from platform import python_version
 from logging import CRITICAL, DEBUG, ERROR, INFO, WARN
 
@@ -12,7 +11,6 @@
 QDRANT_DEFAULT_COLLECTION = "pephub"
 
 VERSIONS = {
-    "sentence_transformers_version": st_version,
     "python_version": python_version(),
 }
 
diff --git a/pepembed/pepembed.py b/pepembed/pepembed.py
index 567914e..3ac539c 100644
--- a/pepembed/pepembed.py
+++ b/pepembed/pepembed.py
@@ -2,7 +2,7 @@
 from typing import List, Dict, Any, Union
 from peppy import Project
 from peppy.const import SAMPLE_MODS_KEY, CONSTANT_KEY, CONFIG_KEY, NAME_KEY
-from sentence_transformers import SentenceTransformer
+from fastembed.embedding import FlagEmbedding as Embedding
 
 import flatdict
 
@@ -10,7 +10,7 @@
 from .const import DEFAULT_KEYWORDS, MIN_DESCRIPTION_LENGTH
 
 
-class PEPEncoder(SentenceTransformer):
+class PEPEncoder(Embedding):
     """
     Simple wrapper of the sentence trasnformer class that lets you
     embed metadata inside a PEP.
@@ -55,11 +55,13 @@ def mine_metadata_from_dict(
         project_level_attrs = list(project_level_dict.keys())
         desc = ""
 
-        # use description first - this is just for 
-        # the UI, so things look good
-        if "description" in project_level_attrs:
-            desc += project_level_dict["description"] + " "
-            project_level_attrs.remove("description")
+        # find the first key containing "summary"; if one exists, put its value
+        # first in the description and remove the key so it is not added again
+        for attr in project_level_attrs:
+            if "summary" in attr:
+                desc += str(project_level_dict[attr]) + " "
+                project_level_attrs.remove(attr)
+                break
             
         # build up a description using the rest
         for attr in project_level_attrs:
@@ -87,38 +89,3 @@ def mine_metadata_from_pep(
         return self.mine_metadata_from_dict(
             project_dict, min_desc_length=min_desc_length
         )
-
-    def embed(
-        self, projects: Union[dict, List[dict], Project, List[Project]], **kwargs
-    ) -> np.ndarray:
-        """
-        Embed a PEP based on it's metadata.
-
-        :param projects: A PEP or list of PEPs to embed.
-        :param kwargs: Keyword arguments to pass to the `encode` method of the SentenceTransformer class.
-        """
-        # if single dictionary is passed
-        if isinstance(projects, dict):
-            desc = self.mine_metadata_from_dict(projects)
-            return super().encode(desc, **kwargs)
-
-        # if single peppy.Project is passed
-        elif isinstance(projects, Project):
-            desc = self.mine_metadata_from_pep(projects)
-            return super().encode(desc, **kwargs)
-
-        # if list of dictionaries is passed
-        elif isinstance(projects, list) and isinstance(projects[0], dict):
-            descs = [self.mine_metadata_from_dict(p) for p in projects]
-            return super().encode(descs, **kwargs)
-
-        # if list of peppy.Projects is passed
-        elif isinstance(projects, list) and isinstance(projects[0], Project):
-            descs = [self.mine_metadata_from_pep(p) for p in projects]
-            return super().encode(descs, **kwargs)
-
-        # else, return ValueError
-        else:
-            raise ValueError(
-                "Invalid input type. Must be a dictionary, peppy.Project, list of dictionaries, or list of peppy.Projects."
-            )
diff --git a/production.env b/production.env
index 7a6ed74..b092853 100755
--- a/production.env
+++ b/production.env
@@ -7,4 +7,4 @@ export QDRANT_HOST=`pass databio/pephub/qdrant_host`
 export QDRANT_PORT=6333
 export QDRANT_API_KEY=`pass databio/pephub/qdrant_api_key`
 
-export HF_MODEL="sentence-transformers/all-MiniLM-L12-v2"
+export HF_MODEL="BAAI/bge-small-en-v1.5"
diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt
index e8a13ed..82d49c2 100644
--- a/requirements/requirements-all.txt
+++ b/requirements/requirements-all.txt
@@ -1,5 +1,5 @@
 logmuse
-sentence-transformers
+fastembed
 peppy
 python-dotenv
 qdrant-client
diff --git a/run_index.py b/run_index.py
index ccabbb5..f193bb4 100644
--- a/run_index.py
+++ b/run_index.py
@@ -47,11 +47,10 @@
     verbosity=None,
     logging_level=None,
     recreate_collection=True,
-    hf_model="sentence-transformers/all-MiniLM-L12-v2",
+    hf_model=os.environ.get("HF_MODEL"),
     keywords_file="keywords.txt",
     batch_size=DEFAULT_BATCH_SIZE,
     upsert_batch_size=DEFAULT_UPSERT_BATCH_SIZE,
-
 )
 
 # %%
@@ -108,9 +107,8 @@
 # initialize encoder
 _LOGGER.info("Initializing encoder.")
 encoder = PEPEncoder(args.hf_model, keywords_file=args.keywords_file)
-EMBEDDING_DIM = int(encoder.get_sentence_embedding_dimension())
+EMBEDDING_DIM = 384 # hardcoded for sentence-transformers/all-MiniLM-L12-v2 and BAAI/bge-small-en-v1.5
 _LOGGER.info(f"Computing embeddings of {EMBEDDING_DIM} dimensions.")
-
 # %%
 # encode PEPs in batches
 _LOGGER.info("Encoding PEPs.")
@@ -120,7 +118,7 @@
 projects_encoded = []
 i = 0
 for batch in tqdm(
-    batch_generator(projects, BATCH_SIZE), total=len(projects) // BATCH_SIZE
+    batch_generator(projects, BATCH_SIZE), total=(len(projects) // BATCH_SIZE)
 ):
     # build list of descriptions for batch
     descs = []
@@ -130,14 +128,12 @@
             descs.append(d)
         else:
             descs.append(f"{p[0]} {p[1]} {p[2]}")
-    
     # every 100th batch, print out the first description
     if i % 100 == 0:
         _LOGGER.info(f"First description: {descs[0]}")
-
     # encode descriptions
     try:
-        embeddings = encoder.encode(descs)
+        embeddings = encoder.embed(descs)
         projects_encoded.extend(
             [
                 dict(
@@ -151,7 +147,6 @@
         )
     except Exception as e:
         _LOGGER.error(f"Error encoding batch: {e}")
-    
     i += 1
 
 # %%
@@ -170,6 +165,7 @@
     api_key=QDRANT_API_KEY,
 )
 
+# %%
 # get the collection info
 COLLECTION = (
     args.qdrant_collection
@@ -177,6 +173,7 @@
     or QDRANT_DEFAULT_COLLECTION
 )
 
+# %%
 # recreate the collection if necessary
 if args.recreate_collection:
     qdrant.recreate_collection(
@@ -185,6 +182,13 @@
             size=EMBEDDING_DIM, distance=models.Distance.COSINE
         ),
         on_disk_payload=True,
+        quantization_config=models.ScalarQuantization(
+            scalar=models.ScalarQuantizationConfig(
+                type=models.ScalarType.INT8,
+                quantile=0.99,
+                always_ram=True,
+            ),
+        ),
     )
     collection_info = qdrant.get_collection(collection_name=COLLECTION)
 else:
@@ -201,6 +205,13 @@
                 size=EMBEDDING_DIM, distance=models.Distance.COSINE
             ),
             on_disk_payload=True,
+            quantization_config=models.ScalarQuantization(
+                scalar=models.ScalarQuantizationConfig(
+                    type=models.ScalarType.INT8,
+                    quantile=0.99,
+                    always_ram=True,
+                ),
+            ),
         )
         collection_info = qdrant.get_collection(collection_name=COLLECTION)
 
@@ -211,6 +222,7 @@
 _LOGGER.info("Inserting embeddings into Qdrant.")
 _LOGGER.info("Building point strcutures.")
 
+# %%
 # build up point structs
 all_points = [
     PointStruct(
@@ -224,15 +236,14 @@
 # determine upsert batch size
 UPSERT_BATCH_SIZE = args.upsert_batch_size or DEFAULT_UPSERT_BATCH_SIZE
 
+# %%
 # upsert in batches, it will timeout if we do not
 # a good batch size is ~1000 vectors. Running locally, this is super quick.
 for batch in tqdm(
     batch_generator(all_points, UPSERT_BATCH_SIZE),
     total=len(all_points) // UPSERT_BATCH_SIZE,
 ):
-    operation_info = qdrant.upsert(
-        collection_name=COLLECTION, wait=True, points=batch
-    )
+    operation_info = qdrant.upsert(collection_name=COLLECTION, wait=True, points=batch)
 
     assert operation_info.status == "completed"
 
@@ -248,4 +259,4 @@
         "ids": [0, 3, 100]
     }}' 'http://{QDRANT_HOST}:{QDRANT_PORT}/collections/{COLLECTION}/points'
 """
-)
\ No newline at end of file
+)