From 7e77e6fbc57b3617dcdbb4c9db315f1584cd47eb Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Wed, 28 Jun 2023 08:44:21 -0400 Subject: [PATCH 1/8] basic skeleton for pytest --- tests/__init__.py | 0 tests/test_basic.py | 6 ++++++ 2 files changed, 6 insertions(+) create mode 100644 tests/__init__.py create mode 100644 tests/test_basic.py diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_basic.py b/tests/test_basic.py new file mode 100644 index 0000000..0512076 --- /dev/null +++ b/tests/test_basic.py @@ -0,0 +1,6 @@ +import pepembed + +class Testpepembed: + def test_search(self): + """Basic example of a test""" + assert 1 == 1 From 03c43d6a49c7faa5b7aa94fcaea82ad4d7db5f67 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Wed, 28 Jun 2023 11:36:10 -0400 Subject: [PATCH 2/8] Add flattening of dict to parse all attributes from PEP. Adjust basic test. --- pepembed/const.py | 1 + pepembed/pepembed.py | 7 +++- requirements/requirements-all.txt | 2 +- tests/data/keywordstest.txt | 7 ++++ tests/data/testconfigs/GSE105734_samples.csv | 5 +++ tests/data/testconfigs/testpep1.yaml | 42 ++++++++++++++++++++ tests/test_basic.py | 27 ++++++++++++- 7 files changed, 87 insertions(+), 4 deletions(-) create mode 100644 tests/data/keywordstest.txt create mode 100644 tests/data/testconfigs/GSE105734_samples.csv create mode 100644 tests/data/testconfigs/testpep1.yaml diff --git a/pepembed/const.py b/pepembed/const.py index 47f8008..6b9d522 100644 --- a/pepembed/const.py +++ b/pepembed/const.py @@ -17,6 +17,7 @@ } DEFAULT_KEYWORDS = ["cell", "protocol", "description", "processing", "source"] +DEFAULT_MODEL = "sentence-transformers/all-MiniLM-L12-v2" MIN_DESCRIPTION_LENGTH = 5 diff --git a/pepembed/pepembed.py b/pepembed/pepembed.py index e1a4398..30e5ba4 100644 --- a/pepembed/pepembed.py +++ b/pepembed/pepembed.py @@ -4,6 +4,7 @@ from peppy.const import SAMPLE_MODS_KEY, CONSTANT_KEY, CONFIG_KEY, NAME_KEY from sentence_transformers import SentenceTransformer +import flatdict from .utils import read_in_key_words from .const import DEFAULT_KEYWORDS, MIN_DESCRIPTION_LENGTH @@ -43,14 +44,16 @@ def mine_metadata_from_dict( ): return project[NAME_KEY] or "" - project_level_dict: dict = project_config[SAMPLE_MODS_KEY][CONSTANT_KEY] + #project_level_dict: dict = project_config[SAMPLE_MODS_KEY][CONSTANT_KEY] + #Flatten dictionary + project_level_dict: dict = flatdict.FlatDict(project_config) project_level_attrs = list(project_level_dict.keys()) desc = "" # build up a description for attr in project_level_attrs: if any([kw in attr for kw in self.keywords]): - desc += project_level_dict[attr] + " " + desc += str(project_level_dict[attr]) + " " # return if description is sufficient if len(desc) > min_desc_length: diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index b5e3327..45b4de1 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -3,6 +3,6 @@ sentence-transformers peppy python-dotenv qdrant-client -psycopg2 +#psycopg2 ubiquerg tqdm diff --git a/tests/data/keywordstest.txt b/tests/data/keywordstest.txt new file mode 100644 index 0000000..85e87f6 --- /dev/null +++ b/tests/data/keywordstest.txt @@ -0,0 +1,7 @@ +protocol +description +processing +source +series_summary +file_path +series_pubmed_id \ No newline at end of file diff --git a/tests/data/testconfigs/GSE105734_samples.csv b/tests/data/testconfigs/GSE105734_samples.csv new file mode 100644 index 0000000..9a6d8b5 --- /dev/null +++ b/tests/data/testconfigs/GSE105734_samples.csv @@ -0,0 +1,5 @@ +,age,gse,lab,sex,sra,file,line,link,type,antibody,assembly,file_url,biosample,dev_stage,file_size,ref_genome,size_range,assay_title,sample_name,sample_type,health_state,sample_title,sample_status,biomaterial_type,named_annotation,sample_series_id,sample_taxid_ch1,possible_controls,sample_platform_id,encode_release_date,sample_contact_city,sample_contact_name,sample_molecule_ch1,sample_organism_ch1,sample_channel_count,sample_contact_email,sample_contact_state,sample_geo_accession,sample_data_row_count,sample_library_source,sample_contact_address,sample_contact_country,sample_source_name_ch1,sample_submission_date,sample_instrument_model,sample_last_update_date,sample_library_strategy,library_encode_accession,sample_contact_institute,sample_library_selection,biosample_encode_accession,technical_replicate_number,biological_replicate_number,experiment_encode_accession,sample_extract_protocol_ch1,sample_contact_zip_postal_code +0,69 year,GSE105734,"Michael Snyder, StanfordMichael Snyder, Stanford",female,https://www.ncbi.nlm.nih.gov/sra?term=SRX3322636,GSM2827569_ENCFF540SXF_peaks_hg19.bed.gz,MCF-7,ENCBS053YJT at ENCODE; https://www.encodeproject.org/ENCBS053YJT/Derived from ENCODE donor ENCDO000AAE; https://www.encodeproject.org/ENCDO000AAE/growth protocol; https://www.encodeproject.org/documents/2a32df77-1325-4a2d-af71-2f2b68eb9830/@@download/attachment/Snyder_MCF7%20Cell%20Growth%20Protocol.pdfENCODE dbxrefs Cellosaurus CVCL_0031; http://web.expasy.org/cellosaurus/CVCL_0031ATCC sample source; http://www.atcc.org/Products/All/HTB-22.aspx,BED,FOS,"GRCh38, hg19",ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2827nnn/GSM2827569/suppl/GSM2827569_ENCFF540SXF_peaks_hg19.bed.gz,https://www.ncbi.nlm.nih.gov/biosample/SAMN06121513,adult,7028129," GRCh38, hg19",450-650,ChIP-seq,chip-seq_from_mcf-7_enclb200wlx__5,SRA,breast cancer (adenocarcinoma),ChIP-seq from MCF-7 (ENCLB200WLX),Public on Oct 23 2017,immortalized cell line,NA000141117.1 (GSM2827569_ENCFF050ZFM_signal_p-value_GRCh38.bigWig)NA000142208.1 (GSM2827569_ENCFF893EFT_fold_change_over_control_GRCh38.bigWig)NA000143034.1 (GSM2827569_ENCFF279XPX_fold_change_over_control_hg19.bigWig)NA000143211.1 (GSM2827569_ENCFF097HJB_signal_p-value_hg19.bigWig),GSE105734,9606,"ENCSR594UCI, ENCSR217LRF",GPL11154,2016-12-19,Stanford,"ENCODE,,DCC",genomic DNA,Homo sapiens,1,encode-help@lists.stanford.edu,CA,GSM2827569,0,genomic,300 Pasteur Dr,USA,Homo sapiens MCF-7 immortalized cell line,Oct 21 2017,Illumina HiSeq 2000,May 15 2019,ChIP-Seq,ENCLB200WLX,ENCODE DCC,ChIP,ENCBS053YJT (SAMN06121513),1,3,ENCSR569XNP,not provided,94305-5120 +1,69 year,GSE105734,"Michael Snyder, StanfordMichael Snyder, Stanford",female,https://www.ncbi.nlm.nih.gov/sra?term=SRX3322636,GSM2827569_ENCFF673PVH_peaks_GRCh38.bed.gz,MCF-7,ENCBS053YJT at ENCODE; https://www.encodeproject.org/ENCBS053YJT/Derived from ENCODE donor ENCDO000AAE; https://www.encodeproject.org/ENCDO000AAE/growth protocol; https://www.encodeproject.org/documents/2a32df77-1325-4a2d-af71-2f2b68eb9830/@@download/attachment/Snyder_MCF7%20Cell%20Growth%20Protocol.pdfENCODE dbxrefs Cellosaurus CVCL_0031; http://web.expasy.org/cellosaurus/CVCL_0031ATCC sample source; http://www.atcc.org/Products/All/HTB-22.aspx,BED,FOS,"GRCh38, hg19",ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2827nnn/GSM2827569/suppl/GSM2827569_ENCFF673PVH_peaks_GRCh38.bed.gz,https://www.ncbi.nlm.nih.gov/biosample/SAMN06121513,adult,6936801," GRCh38, hg19",450-650,ChIP-seq,chip-seq_from_mcf-7_enclb200wlx__6,SRA,breast cancer (adenocarcinoma),ChIP-seq from MCF-7 (ENCLB200WLX),Public on Oct 23 2017,immortalized cell line,NA000141117.1 (GSM2827569_ENCFF050ZFM_signal_p-value_GRCh38.bigWig)NA000142208.1 (GSM2827569_ENCFF893EFT_fold_change_over_control_GRCh38.bigWig)NA000143034.1 (GSM2827569_ENCFF279XPX_fold_change_over_control_hg19.bigWig)NA000143211.1 (GSM2827569_ENCFF097HJB_signal_p-value_hg19.bigWig),GSE105734,9606,"ENCSR594UCI, ENCSR217LRF",GPL11154,2016-12-19,Stanford,"ENCODE,,DCC",genomic DNA,Homo sapiens,1,encode-help@lists.stanford.edu,CA,GSM2827569,0,genomic,300 Pasteur Dr,USA,Homo sapiens MCF-7 immortalized cell line,Oct 21 2017,Illumina HiSeq 2000,May 15 2019,ChIP-Seq,ENCLB200WLX,ENCODE DCC,ChIP,ENCBS053YJT (SAMN06121513),1,3,ENCSR569XNP,not provided,94305-5120 +2,69 year,GSE105734,"Michael Snyder, StanfordMichael Snyder, Stanford",female,https://www.ncbi.nlm.nih.gov/sra?term=SRX3322637,GSM2827570_ENCFF458JNP_peaks_GRCh38.bed.gz,MCF-7,ENCBS168ISE at ENCODE; https://www.encodeproject.org/ENCBS168ISE/Derived from ENCODE donor ENCDO000AAE; https://www.encodeproject.org/ENCDO000AAE/growth protocol; https://www.encodeproject.org/documents/2a32df77-1325-4a2d-af71-2f2b68eb9830/@@download/attachment/Snyder_MCF7%20Cell%20Growth%20Protocol.pdfENCODE dbxrefs Cellosaurus CVCL_0031; http://web.expasy.org/cellosaurus/CVCL_0031ATCC sample source; http://www.atcc.org/Products/All/HTB-22.aspx,BED,FOS,"GRCh38, hg19",ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2827nnn/GSM2827570/suppl/GSM2827570_ENCFF458JNP_peaks_GRCh38.bed.gz,https://www.ncbi.nlm.nih.gov/biosample/SAMN06121651,adult,6830691," GRCh38, hg19",450-650,ChIP-seq,chip-seq_from_mcf-7_enclb859hnt__1,SRA,breast cancer (adenocarcinoma),ChIP-seq from MCF-7 (ENCLB859HNT),Public on Oct 23 2017,immortalized cell line,NA000141642.1 (GSM2827570_ENCFF250RPS_signal_p-value_GRCh38.bigWig)NA000142635.1 (GSM2827570_ENCFF936JHA_fold_change_over_control_hg19.bigWig)NA000142941.1 (GSM2827570_ENCFF490HEP_fold_change_over_control_GRCh38.bigWig)NA000143639.1 (GSM2827570_ENCFF950XOS_signal_p-value_hg19.bigWig),GSE105734,9606,"ENCSR594UCI, ENCSR217LRF",GPL11154,2016-12-19,Stanford,"ENCODE,,DCC",genomic DNA,Homo sapiens,1,encode-help@lists.stanford.edu,CA,GSM2827570,0,genomic,300 Pasteur Dr,USA,Homo sapiens MCF-7 immortalized cell line,Oct 21 2017,Illumina HiSeq 2000,May 15 2019,ChIP-Seq,ENCLB859HNT,ENCODE DCC,ChIP,ENCBS168ISE (SAMN06121651),1,1,ENCSR569XNP,not provided,94305-5120 +3,69 year,GSE105734,"Michael Snyder, StanfordMichael Snyder, Stanford",female,https://www.ncbi.nlm.nih.gov/sra?term=SRX3322637,GSM2827570_ENCFF834BCC_peaks_hg19.bed.gz,MCF-7,ENCBS168ISE at ENCODE; https://www.encodeproject.org/ENCBS168ISE/Derived from ENCODE donor ENCDO000AAE; https://www.encodeproject.org/ENCDO000AAE/growth protocol; https://www.encodeproject.org/documents/2a32df77-1325-4a2d-af71-2f2b68eb9830/@@download/attachment/Snyder_MCF7%20Cell%20Growth%20Protocol.pdfENCODE dbxrefs Cellosaurus CVCL_0031; http://web.expasy.org/cellosaurus/CVCL_0031ATCC sample source; http://www.atcc.org/Products/All/HTB-22.aspx,BED,FOS,"GRCh38, hg19",ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2827nnn/GSM2827570/suppl/GSM2827570_ENCFF834BCC_peaks_hg19.bed.gz,https://www.ncbi.nlm.nih.gov/biosample/SAMN06121651,adult,6839779," GRCh38, hg19",450-650,ChIP-seq,chip-seq_from_mcf-7_enclb859hnt__4,SRA,breast cancer (adenocarcinoma),ChIP-seq from MCF-7 (ENCLB859HNT),Public on Oct 23 2017,immortalized cell line,NA000141642.1 (GSM2827570_ENCFF250RPS_signal_p-value_GRCh38.bigWig)NA000142635.1 (GSM2827570_ENCFF936JHA_fold_change_over_control_hg19.bigWig)NA000142941.1 (GSM2827570_ENCFF490HEP_fold_change_over_control_GRCh38.bigWig)NA000143639.1 (GSM2827570_ENCFF950XOS_signal_p-value_hg19.bigWig),GSE105734,9606,"ENCSR594UCI, ENCSR217LRF",GPL11154,2016-12-19,Stanford,"ENCODE,,DCC",genomic DNA,Homo sapiens,1,encode-help@lists.stanford.edu,CA,GSM2827570,0,genomic,300 Pasteur Dr,USA,Homo sapiens MCF-7 immortalized cell line,Oct 21 2017,Illumina HiSeq 2000,May 15 2019,ChIP-Seq,ENCLB859HNT,ENCODE DCC,ChIP,ENCBS168ISE (SAMN06121651),1,1,ENCSR569XNP,not provided,94305-5120 diff --git a/tests/data/testconfigs/testpep1.yaml b/tests/data/testconfigs/testpep1.yaml new file mode 100644 index 0000000..97f5f10 --- /dev/null +++ b/tests/data/testconfigs/testpep1.yaml @@ -0,0 +1,42 @@ +pep_version: 2.1.0 +project_name: GSE105734 +sample_table: GSE105734_samples.csv +sample_modifiers: + append: + description: "Mammary gland, adenocarcinoma. (PMID: 4357757)FOS ChIP-seq on human MCF-7" + output_file_path: FILES + sample_description: https://www.encodeproject.org/experiments/ENCSR569XNP/, *************** + sample_data_processing: See GSM*_README.txt supplementary file linked below + derive: + sources: + FILES: ./{gse}/{file} + attributes: + - output_file_path +experiment_metadata: + series_type: Genome binding/occupancy profiling by high throughput sequencing + series_gp_id: PRJNA63443 + series_title: ChIP-seq from MCF-7 (ENCSR569XNP) + series_status: Public on Oct 23 2017 + series_project: ENCODE + series_summary: FOS ChIP-seq on human MCF-7 + + For data usage terms and conditions, please refer to http://www.genome.gov/27528022 and http://www.genome.gov/Pages/Research/ENCODE/ENCODE_Data_Use_Policy_for_External_Users_03-07-14.pdf + series_pubmed_id: 22955616 + series_sample_id: GSM2827569 + GSM2827570 + series_platform_id: GPL11154 + series_contact_city: Stanford + series_contact_name: ENCODE,,DCC + series_sample_taxid: 9606 + series_contact_email: encode-help@lists.stanford.edu + series_contact_state: CA + series_geo_accession: GSE105734 + series_overall_design: https://www.encodeproject.org/ENCSR569XNP/ + series_platform_taxid: 9606 + series_contact_address: 300 Pasteur Dr + series_contact_country: USA + series_sample_organism: Homo sapiens + series_submission_date: Oct 21 2017 + series_last_update_date: Jul 25 2021 + series_contact_institute: ENCODE DCC + series_platform_organism: Homo sapiens + series_supplementary_file: ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF124QJZ_peaks_hg19.bed.gz + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF160WPD_fold_change_over_control_GRCh38.bigWig + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF170POB_optimal_idr_thresholded_peaks_GRCh38.bed.gz + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF174XVH_signal_p-value_GRCh38.bigWig + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF185VWZ_fold_change_over_control_hg19.bigWig + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF220ZQF_conservative_idr_thresholded_peaks_GRCh38.bigBed + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF234YXL_peaks_GRCh38.bigBed + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF345JQL_optimal_idr_thresholded_peaks_hg19.bigBed + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF397ZDJ_peaks_hg19.bigBed + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF419PYB_signal_p-value_hg19.bigWig + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF538VTB_conservative_idr_thresholded_peaks_hg19.bigBed + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF663JQC_conservative_idr_thresholded_peaks_hg19.bed.gz + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF732LJO_optimal_idr_thresholded_peaks_GRCh38.bigBed + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF888RBY_peaks_GRCh38.bed.gz + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF890YVJ_conservative_idr_thresholded_peaks_GRCh38.bed.gz + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF965AIZ_optimal_idr_thresholded_peaks_hg19.bed.gz + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_RAW.tar + series_contact_zip_postal_code: 94305-5120 + diff --git a/tests/test_basic.py b/tests/test_basic.py index 0512076..e3b7a17 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -1,6 +1,31 @@ +import peppy +from logging import getLogger import pepembed +from pepembed.pepembed import PEPEncoder +from pepembed.const import * +import os +from peppy import Project +import flatdict + +_LOGGER = getLogger("pepembed") class Testpepembed: def test_search(self): """Basic example of a test""" - assert 1 == 1 + #hf_model = "sentence-transformers/all-MiniLM-L12-v2" #this is the default in argsparser + found = False + keywordsfilepath = os.path.join(os.getcwd() + "/tests/data/keywordstest.txt") + + encoder = PEPEncoder(DEFAULT_MODEL, keywords_file=keywordsfilepath) + + p = peppy.Project(os.path.join(os.getcwd() + "/tests/data/testconfigs/testpep1.yaml")) + p = p.to_dict(extended=True) + + d = encoder.mine_metadata_from_dict(p, min_desc_length=20) + + for k, v in flatdict.FlatDict(p['_config']).items(): + if str(v) in d: + found = True + + assert found == True + From e8751e659f9716271cd6993b362727f747f6b9f5 Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Wed, 28 Jun 2023 11:48:23 -0400 Subject: [PATCH 3/8] add flatdict to reqs --- .vscode/settings.json | 7 +++ requirements/requirements-all.txt | 1 + scripts/database_umap.py | 76 +++++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+) create mode 100644 .vscode/settings.json create mode 100644 scripts/database_umap.py diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..9b38853 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,7 @@ +{ + "python.testing.pytestArgs": [ + "tests" + ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true +} \ No newline at end of file diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 45b4de1..f61cb06 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -6,3 +6,4 @@ qdrant-client #psycopg2 ubiquerg tqdm +flatdict diff --git a/scripts/database_umap.py b/scripts/database_umap.py new file mode 100644 index 0000000..f416d13 --- /dev/null +++ b/scripts/database_umap.py @@ -0,0 +1,76 @@ +# %% +import os +import numpy as np +from qdrant_client import QdrantClient + +# %% +# get the qdrant connection info +QDRANT_HOST = os.environ.get("QDRANT_HOST") +QDRANT_PORT = os.environ.get("QDRANT_PORT") +QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY") + +# connect to qdrant +qdrant = QdrantClient( + url=QDRANT_HOST, + port=QDRANT_PORT, + api_key=QDRANT_API_KEY, + timeout=1000 +) + +# %% +# get number of embeddings +n_embeddings = qdrant.get_collection(collection_name="pephub").points_count + +# %% +SAMPLE_SIZE = 10000 +BATCH_SIZE = 10 +# randomly sample embeddings using the qdrant scroll API +# in batches of 10 +# generate a random offset +embeddings = [] +for i in range(SAMPLE_SIZE // BATCH_SIZE): + print(f"Batch {i}") + offset = np.random.randint(0, n_embeddings - 10) + result = qdrant.scroll( + collection_name="pephub", + limit=BATCH_SIZE, + with_payload=False, + with_vectors=True, + offset=offset + ) + embeddings.append(list(result)[0]) + +# %% +# flatten the list +embeddings = [e for batch in embeddings for e in batch] + +# %% +embeddings = [np.array(e.vector) for e in embeddings] + +# %% +from umap import UMAP + +reducer = UMAP(n_components=2, random_state=42) +umap_embedding = reducer.fit_transform(embeddings) + +# %% +import matplotlib.pyplot as plt +import seaborn as sns + +_, ax = plt.subplots(figsize=(5, 5)) + +plt.rcParams['figure.dpi'] = 300 + +sns.scatterplot( + x=umap_embedding[:,0], + y=umap_embedding[:,1], + s=5, + linewidth=0, + ax=ax +) + +ax.set_title("UMAP of GEO Sample Descriptions") +ax.set_xlabel("UMAP 1", fontsize=14) +ax.set_ylabel("UMAP 2", fontsize=14) + +# %% From a63d09d6acbc4ea9f7ac6c071a08ff2220876997 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Wed, 28 Jun 2023 12:05:59 -0400 Subject: [PATCH 4/8] Add requirements, adjust standard keywords, add skeleton for dynamically generating keywords #2 --- keywords.txt | 7 ++++++- pepembed/cli.py | 10 +++++++--- pepembed/pepembed.py | 8 +++++--- pepembed/utils.py | 7 +++++++ requirements/requirements-all.txt | 3 ++- 5 files changed, 27 insertions(+), 8 deletions(-) diff --git a/keywords.txt b/keywords.txt index d5a8b35..67fb17b 100644 --- a/keywords.txt +++ b/keywords.txt @@ -2,4 +2,9 @@ cell protocol description processing -source \ No newline at end of file +source +table +file_path +pep_version +project_name +experiment_metadata \ No newline at end of file diff --git a/pepembed/cli.py b/pepembed/cli.py index 38493fb..a1a64d6 100644 --- a/pepembed/cli.py +++ b/pepembed/cli.py @@ -132,14 +132,17 @@ def main(): # connect to qdrant qdrant = QdrantClient( - url=QDRANT_HOST, + url=QDRANT_HOST, port=QDRANT_PORT, api_key=QDRANT_API_KEY, - ) # get the collection info - COLLECTION = args.qdrant_collection or os.environ.get("QDRANT_COLLECTION") or QDRANT_DEFAULT_COLLECTION + COLLECTION = ( + args.qdrant_collection + or os.environ.get("QDRANT_COLLECTION") + or QDRANT_DEFAULT_COLLECTION + ) # recreate the collection if necessary if args.recreate_collection: @@ -212,6 +215,7 @@ def main(): """ ) + if __name__ == "__main__": try: sys.exit(main()) diff --git a/pepembed/pepembed.py b/pepembed/pepembed.py index 30e5ba4..934fdd0 100644 --- a/pepembed/pepembed.py +++ b/pepembed/pepembed.py @@ -35,7 +35,9 @@ def mine_metadata_from_dict( :param project: A dictionary representing a peppy.Project instance. :param min_desc_length: The minimum length of the description. """ - project_config = project.get(CONFIG_KEY) or project.get(CONFIG_KEY.replace("_", "")) + project_config = project.get(CONFIG_KEY) or project.get( + CONFIG_KEY.replace("_", "") + ) if project_config is None: return "" if ( @@ -44,8 +46,8 @@ def mine_metadata_from_dict( ): return project[NAME_KEY] or "" - #project_level_dict: dict = project_config[SAMPLE_MODS_KEY][CONSTANT_KEY] - #Flatten dictionary + # project_level_dict: dict = project_config[SAMPLE_MODS_KEY][CONSTANT_KEY] + # Flatten dictionary project_level_dict: dict = flatdict.FlatDict(project_config) project_level_attrs = list(project_level_dict.keys()) desc = "" diff --git a/pepembed/utils.py b/pepembed/utils.py index 1e02532..989afde 100644 --- a/pepembed/utils.py +++ b/pepembed/utils.py @@ -10,6 +10,13 @@ def read_in_key_words(key_words_file: str) -> List[str]: return key_words +def generate_key_words(key_words_file: str) -> List[str]: + """Generates keywords based on current PEPs by finding most common shared attributes""" + # TODO Generate a dynamic list of keywords for custom PEPs + key_words = [] + return key_words + + def batch_generator(iterable, batch_size) -> List: """Batch generator.""" l = len(iterable) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 45b4de1..e8a13ed 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -3,6 +3,7 @@ sentence-transformers peppy python-dotenv qdrant-client -#psycopg2 +psycopg2 ubiquerg tqdm +flatdict From 91686df10d0acc9b66d62f37d1d77514fb2bf2d2 Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Mon, 17 Jul 2023 10:57:22 -0400 Subject: [PATCH 5/8] more work --- pepembed/cli.py | 17 +++- pepembed/const.py | 2 +- run_index.py | 248 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 261 insertions(+), 6 deletions(-) create mode 100644 run_index.py diff --git a/pepembed/cli.py b/pepembed/cli.py index a1a64d6..955e814 100644 --- a/pepembed/cli.py +++ b/pepembed/cli.py @@ -1,3 +1,4 @@ +# %% import sys import logging import os @@ -15,7 +16,7 @@ LOGGING_LEVEL, PKG_NAME, PROJECT_TABLE, - PROJECT_COLUMN, + CONFIG_COLUMN, PROJECT_NAME_COLUMN, NAMESPACE_COLUMN, TAG_COLUMN, @@ -28,7 +29,7 @@ from .pepembed import PEPEncoder from .utils import batch_generator - +# %% def main(): """Entry point for the CLI.""" load_dotenv() @@ -74,7 +75,7 @@ def main(): # get list of peps _LOGGER.info("Pulling PEPs from database.") curs.execute( - f"SELECT {NAMESPACE_COLUMN}, {PROJECT_NAME_COLUMN}, {TAG_COLUMN}, {PROJECT_COLUMN}, {ROW_ID_COLUMN} FROM {PROJECT_TABLE}" + f"SELECT {NAMESPACE_COLUMN}, {PROJECT_NAME_COLUMN}, {TAG_COLUMN}, {CONFIG_COLUMN}, {ROW_ID_COLUMN} FROM {PROJECT_TABLE}" ) projects = curs.fetchall() @@ -93,9 +94,9 @@ def main(): # we need to work in batches since its much faster projects_encoded = [] - for batch in tqdm( + for i, batch in enumerate(tqdm( batch_generator(projects, BATCH_SIZE), total=len(projects) // BATCH_SIZE - ): + )): # build list of descriptions for batch descs = [] for p in batch: @@ -104,6 +105,10 @@ def main(): descs.append(d) else: descs.append(f"{p[0]} {p[1]} {p[2]}") + + # every 100th batch, print out the first description + if i % 100 == 0: + _LOGGER.info(f"First description: {descs[0]}") # encode descriptions try: @@ -151,6 +156,7 @@ def main(): vectors_config=models.VectorParams( size=EMBEDDING_DIM, distance=models.Distance.COSINE ), + on_disk_payload=True, ) collection_info = qdrant.get_collection(collection_name=COLLECTION) else: @@ -166,6 +172,7 @@ def main(): vectors_config=models.VectorParams( size=EMBEDDING_DIM, distance=models.Distance.COSINE ), + on_disk_payload=True, ) collection_info = qdrant.get_collection(collection_name=COLLECTION) diff --git a/pepembed/const.py b/pepembed/const.py index 6b9d522..1eed4ae 100644 --- a/pepembed/const.py +++ b/pepembed/const.py @@ -22,7 +22,7 @@ MIN_DESCRIPTION_LENGTH = 5 PROJECT_TABLE = "projects" -PROJECT_COLUMN = "project_value" +CONFIG_COLUMN = "config" PROJECT_NAME_COLUMN = "name" NAMESPACE_COLUMN = "namespace" TAG_COLUMN = "tag" diff --git a/run_index.py b/run_index.py new file mode 100644 index 0000000..2b6e1d1 --- /dev/null +++ b/run_index.py @@ -0,0 +1,248 @@ +# %% +import sys +import logging +import os +import psycopg2 + +from qdrant_client import QdrantClient +from qdrant_client.http import models +from qdrant_client.http.models import PointStruct +from tqdm import tqdm +from dotenv import load_dotenv +from logmuse import init_logger +from argparse import Namespace + +from pepembed.const import ( + LEVEL_BY_VERBOSITY, + LOGGING_LEVEL, + PKG_NAME, + PROJECT_TABLE, + CONFIG_COLUMN, + PROJECT_NAME_COLUMN, + NAMESPACE_COLUMN, + TAG_COLUMN, + ROW_ID_COLUMN, + DEFAULT_BATCH_SIZE, + QDRANT_DEFAULT_COLLECTION, + DEFAULT_UPSERT_BATCH_SIZE, +) +from pepembed.pepembed import PEPEncoder +from pepembed.utils import batch_generator + +logging.basicConfig(stream=sys.stdout, level=logging.INFO) + +# %% +load_dotenv() +args = Namespace( + postgres_user=os.environ.get("POSTGRES_USER"), + postgres_password=os.environ.get("POSTGRES_PASSWORD"), + postgres_host=os.environ.get("POSTGRES_HOST"), + postgres_db=os.environ.get("POSTGRES_DB"), + postgres_port=os.environ.get("POSTGRES_PORT"), + qdrant_host=os.environ.get("QDRANT_HOST"), + qdrant_port=os.environ.get("QDRANT_PORT"), + qdrant_api_key=os.environ.get("QDRANT_API_KEY"), + qdrant_collection=os.environ.get("QDRANT_COLLECTION"), + dbg=False, + verbosity=None, + logging_level=None, + recreate_collection=True, + hf_model="sentence-transformers/all-MiniLM-L12-v2", + keywords_file="keywords.txt", + batch_size=DEFAULT_BATCH_SIZE, + upsert_batch_size=DEFAULT_UPSERT_BATCH_SIZE, + +) + +# %% +# Set the logging level. +if args.dbg: + # Debug mode takes precedence and will listen for all messages. + level = args.logging_level or logging.DEBUG +elif args.verbosity is not None: + # Verbosity-framed specification trumps logging_level. + level = LEVEL_BY_VERBOSITY[args.verbosity] +else: + # Normally, we're not in debug mode, and there's not verbosity. + level = LOGGING_LEVEL + +# initialize the logger +logger_kwargs = {"level": level, "devmode": args.dbg} +init_logger(name="peppy", **logger_kwargs) +global _LOGGER +_LOGGER = init_logger(name=PKG_NAME, **logger_kwargs) + +# %% +# pull list of peps +_LOGGER.info("Establishing connection to database.") +conn = psycopg2.connect( + user=(args.postgres_user or os.environ.get("POSTGRES_USER")), + password=(args.postgres_password or os.environ.get("POSTGRES_PASSWORD")), + host=(args.postgres_host or os.environ.get("POSTGRES_HOST")), + database=(args.postgres_db or os.environ.get("POSTGRES_DB")), + port=(args.postgres_port or 5432), +) +curs = conn.cursor() + +# %% +# test connection +_LOGGER.info("Testing connection to database.") +curs.execute("SELECT 1") +res = curs.fetchone() +if not res == (1,): + _LOGGER.error("Connection to database failed.") + sys.exit(1) + +# %% +# get list of peps +_LOGGER.info("Pulling PEPs from database.") +curs.execute( + f"SELECT {NAMESPACE_COLUMN}, {PROJECT_NAME_COLUMN}, {TAG_COLUMN}, {CONFIG_COLUMN}, {ROW_ID_COLUMN} FROM {PROJECT_TABLE}" +) +projects = curs.fetchall() + +# map list of tuples to list of dicts +_LOGGER.info(f"Found {len(projects)} PEPs.") + +# %% +# initialize encoder +_LOGGER.info("Initializing encoder.") +encoder = PEPEncoder(args.hf_model, keywords_file=args.keywords_file) +EMBEDDING_DIM = int(encoder.get_sentence_embedding_dimension()) +_LOGGER.info(f"Computing embeddings of {EMBEDDING_DIM} dimensions.") + +# %% +# encode PEPs in batches +_LOGGER.info("Encoding PEPs.") +BATCH_SIZE = args.batch_size or DEFAULT_BATCH_SIZE + +# we need to work in batches since its much faster +projects_encoded = [] +for i, batch in enumerate(tqdm( + batch_generator(projects, BATCH_SIZE), total=len(projects) // BATCH_SIZE +)): + # build list of descriptions for batch + descs = [] + for p in batch: + d = encoder.mine_metadata_from_dict(p[3], min_desc_length=20) + if d != "" or d is None: + descs.append(d) + else: + descs.append(f"{p[0]} {p[1]} {p[2]}") + + # every 100th batch, print out the first description + if i % 100 == 0: + _LOGGER.info(f"First description: {descs[0]}") + + # encode descriptions + try: + embeddings = encoder.encode(descs) + projects_encoded.extend( + [ + dict( + id=p[4], + registry=f"{p[0]}/{p[1]}:{p[2]}", + description=desc, + embedding=embd, + ) + for p, desc, embd in zip(batch, descs, embeddings) + ] + ) + except Exception as e: + _LOGGER.error(f"Error encoding batch: {e}") + +# %% +_LOGGER.info("Encoding complete.") +_LOGGER.info("Connecting to Qdrant.") + +# get the qdrant connection info +QDRANT_HOST = args.qdrant_host or os.environ.get("QDRANT_HOST") +QDRANT_PORT = args.qdrant_port or os.environ.get("QDRANT_PORT") +QDRANT_API_KEY = args.qdrant_api_key or os.environ.get("QDRANT_API_KEY") + +# connect to qdrant +qdrant = QdrantClient( + url=QDRANT_HOST, + port=QDRANT_PORT, + api_key=QDRANT_API_KEY, +) + +# get the collection info +COLLECTION = ( + args.qdrant_collection + or os.environ.get("QDRANT_COLLECTION") + or QDRANT_DEFAULT_COLLECTION +) + +# recreate the collection if necessary +if args.recreate_collection: + qdrant.recreate_collection( + collection_name=COLLECTION, + vectors_config=models.VectorParams( + size=EMBEDDING_DIM, distance=models.Distance.COSINE + ), + on_disk_payload=True, + ) + collection_info = qdrant.get_collection(collection_name=COLLECTION) +else: + try: + collection_info = qdrant.get_collection(collection_name=COLLECTION) + except Exception as e: + _LOGGER.error( + f"Error getting collection info. Collection {COLLECTION} might not exist." + ) + _LOGGER.info("Recreating collection.") + qdrant.recreate_collection( + collection_name=COLLECTION, + vectors_config=models.VectorParams( + size=EMBEDDING_DIM, distance=models.Distance.COSINE + ), + on_disk_payload=True, + ) + collection_info = qdrant.get_collection(collection_name=COLLECTION) + +# verify status of collection after getting or creating +_LOGGER.info(f"Collection status: {collection_info.status}") + +# insert embeddings into qdrant +_LOGGER.info("Inserting embeddings into Qdrant.") +_LOGGER.info("Building point strcutures.") + +# build up point structs +all_points = [ + PointStruct( + id=p["id"], + vector=p["embedding"].tolist(), + payload={"registry": p["registry"], "description": p["description"]}, + ) + for p in tqdm(projects_encoded, total=len(projects_encoded)) +] + +# determine upsert batch size +UPSERT_BATCH_SIZE = args.upsert_batch_size or DEFAULT_UPSERT_BATCH_SIZE + +# upsert in batches, it will timeout if we do not +# a good batch size is ~1000 vectors. Running locally, this is super quick. +for batch in tqdm( + batch_generator(all_points, UPSERT_BATCH_SIZE), + total=len(all_points) // UPSERT_BATCH_SIZE, +): + operation_info = qdrant.upsert( + collection_name=COLLECTION, wait=True, points=batch + ) + + assert operation_info.status == "completed" + +conn.close() + +_LOGGER.info("Done.") +_LOGGER.info( + f"View the collection at https://{QDRANT_HOST}:{QDRANT_PORT}/collections/{COLLECTION}" +) +_LOGGER.info( + f"""View some points and their paylods with the following curl command: + curl -H "Content-type: application/json" -d '{{ + "ids": [0, 3, 100] + }}' 'http://{QDRANT_HOST}:{QDRANT_PORT}/collections/{COLLECTION}/points' +""" +) \ No newline at end of file From 7a024ad9c04bf2bc203a25de509bc5d5530739de Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Mon, 17 Jul 2023 17:06:11 -0400 Subject: [PATCH 6/8] use description first to index so things look better on the UI --- pepembed/pepembed.py | 14 +++++++++++--- run_index.py | 7 +++++-- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/pepembed/pepembed.py b/pepembed/pepembed.py index 934fdd0..8382360 100644 --- a/pepembed/pepembed.py +++ b/pepembed/pepembed.py @@ -35,9 +35,12 @@ def mine_metadata_from_dict( :param project: A dictionary representing a peppy.Project instance. :param min_desc_length: The minimum length of the description. """ - project_config = project.get(CONFIG_KEY) or project.get( - CONFIG_KEY.replace("_", "") - ) + # project_config = project.get(CONFIG_KEY) or project.get( + # CONFIG_KEY.replace("_", "") + # ) + # fix bug where config key is not in the project, + # new database schema does not have config key + project_config = project if project_config is None: return "" if ( @@ -52,6 +55,11 @@ def mine_metadata_from_dict( project_level_attrs = list(project_level_dict.keys()) desc = "" + # use description first + if "description" in project_level_attrs: + desc += project_level_dict["description"] + " " + project_level_attrs.remove("description") + # build up a description for attr in project_level_attrs: if any([kw in attr for kw in self.keywords]): diff --git a/run_index.py b/run_index.py index 2b6e1d1..ccabbb5 100644 --- a/run_index.py +++ b/run_index.py @@ -118,9 +118,10 @@ # we need to work in batches since its much faster projects_encoded = [] -for i, batch in enumerate(tqdm( +i = 0 +for batch in tqdm( batch_generator(projects, BATCH_SIZE), total=len(projects) // BATCH_SIZE -)): +): # build list of descriptions for batch descs = [] for p in batch: @@ -150,6 +151,8 @@ ) except Exception as e: _LOGGER.error(f"Error encoding batch: {e}") + + i += 1 # %% _LOGGER.info("Encoding complete.") From 52fa2c17261217965e56ab313aa6b15de8e803bc Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Mon, 17 Jul 2023 17:07:15 -0400 Subject: [PATCH 7/8] comment --- pepembed/pepembed.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pepembed/pepembed.py b/pepembed/pepembed.py index 8382360..567914e 100644 --- a/pepembed/pepembed.py +++ b/pepembed/pepembed.py @@ -55,12 +55,13 @@ def mine_metadata_from_dict( project_level_attrs = list(project_level_dict.keys()) desc = "" - # use description first + # use description first - this is just for + # the UI, so things look good if "description" in project_level_attrs: desc += project_level_dict["description"] + " " project_level_attrs.remove("description") - # build up a description + # build up a description using the rest for attr in project_level_attrs: if any([kw in attr for kw in self.keywords]): desc += str(project_level_dict[attr]) + " " From 28c357b2f1bef2340dc6824d67853a97e1b4e7ac Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Mon, 11 Dec 2023 15:24:46 -0500 Subject: [PATCH 8/8] updates --- .gitignore | 1 + pepembed/const.py | 2 -- pepembed/pepembed.py | 51 ++++++------------------------- production.env | 2 +- requirements/requirements-all.txt | 2 +- run_index.py | 37 ++++++++++++++-------- 6 files changed, 36 insertions(+), 59 deletions(-) diff --git a/.gitignore b/.gitignore index 8854835..4009f71 100644 --- a/.gitignore +++ b/.gitignore @@ -161,3 +161,4 @@ cython_debug/ # qdrant qdrant_storage/ +local_cache/ \ No newline at end of file diff --git a/pepembed/const.py b/pepembed/const.py index 1eed4ae..431a036 100644 --- a/pepembed/const.py +++ b/pepembed/const.py @@ -1,4 +1,3 @@ -from sentence_transformers import __version__ as st_version from platform import python_version from logging import CRITICAL, DEBUG, ERROR, INFO, WARN @@ -12,7 +11,6 @@ QDRANT_DEFAULT_COLLECTION = "pephub" VERSIONS = { - "sentence_transformers_version": st_version, "python_version": python_version(), } diff --git a/pepembed/pepembed.py b/pepembed/pepembed.py index 567914e..3ac539c 100644 --- a/pepembed/pepembed.py +++ b/pepembed/pepembed.py @@ -2,7 +2,7 @@ from typing import List, Dict, Any, Union from peppy import Project from peppy.const import SAMPLE_MODS_KEY, CONSTANT_KEY, CONFIG_KEY, NAME_KEY -from sentence_transformers import SentenceTransformer +from fastembed.embedding import FlagEmbedding as Embedding import flatdict @@ -10,7 +10,7 @@ from .const import DEFAULT_KEYWORDS, MIN_DESCRIPTION_LENGTH -class PEPEncoder(SentenceTransformer): +class PEPEncoder(Embedding): """ Simple wrapper of the sentence trasnformer class that lets you embed metadata inside a PEP. @@ -55,11 +55,13 @@ def mine_metadata_from_dict( project_level_attrs = list(project_level_dict.keys()) desc = "" - # use description first - this is just for - # the UI, so things look good - if "description" in project_level_attrs: - desc += project_level_dict["description"] + " " - project_level_attrs.remove("description") + # search for "summary" in keys, if found, use that first, then pop it out + # should catch if key simply contains "summary" + for attr in project_level_attrs: + if "summary" in attr: + desc += str(project_level_dict[attr]) + " " + project_level_attrs.remove(attr) + break # build up a description using the rest for attr in project_level_attrs: @@ -87,38 +89,3 @@ def mine_metadata_from_pep( return self.mine_metadata_from_dict( project_dict, min_desc_length=min_desc_length ) - - def embed( - self, projects: Union[dict, List[dict], Project, List[Project]], **kwargs - ) -> np.ndarray: - """ - Embed a PEP based on it's metadata. - - :param projects: A PEP or list of PEPs to embed. - :param kwargs: Keyword arguments to pass to the `encode` method of the SentenceTransformer class. - """ - # if single dictionary is passed - if isinstance(projects, dict): - desc = self.mine_metadata_from_dict(projects) - return super().encode(desc, **kwargs) - - # if single peppy.Project is passed - elif isinstance(projects, Project): - desc = self.mine_metadata_from_pep(projects) - return super().encode(desc, **kwargs) - - # if list of dictionaries is passed - elif isinstance(projects, list) and isinstance(projects[0], dict): - descs = [self.mine_metadata_from_dict(p) for p in projects] - return super().encode(descs, **kwargs) - - # if list of peppy.Projects is passed - elif isinstance(projects, list) and isinstance(projects[0], Project): - descs = [self.mine_metadata_from_pep(p) for p in projects] - return super().encode(descs, **kwargs) - - # else, return ValueError - else: - raise ValueError( - "Invalid input type. Must be a dictionary, peppy.Project, list of dictionaries, or list of peppy.Projects." - ) diff --git a/production.env b/production.env index 7a6ed74..b092853 100755 --- a/production.env +++ b/production.env @@ -7,4 +7,4 @@ export QDRANT_HOST=`pass databio/pephub/qdrant_host` export QDRANT_PORT=6333 export QDRANT_API_KEY=`pass databio/pephub/qdrant_api_key` -export HF_MODEL="sentence-transformers/all-MiniLM-L12-v2" +export HF_MODEL="BAAI/bge-small-en-v1.5" diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index e8a13ed..82d49c2 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,5 +1,5 @@ logmuse -sentence-transformers +fastembed peppy python-dotenv qdrant-client diff --git a/run_index.py b/run_index.py index ccabbb5..f193bb4 100644 --- a/run_index.py +++ b/run_index.py @@ -47,11 +47,10 @@ verbosity=None, logging_level=None, recreate_collection=True, - hf_model="sentence-transformers/all-MiniLM-L12-v2", + hf_model=os.environ.get("HF_MODEL"), keywords_file="keywords.txt", batch_size=DEFAULT_BATCH_SIZE, upsert_batch_size=DEFAULT_UPSERT_BATCH_SIZE, - ) # %% @@ -108,9 +107,8 @@ # initialize encoder _LOGGER.info("Initializing encoder.") encoder = PEPEncoder(args.hf_model, keywords_file=args.keywords_file) -EMBEDDING_DIM = int(encoder.get_sentence_embedding_dimension()) +EMBEDDING_DIM = 384 # hardcoded for sentence-transformers/all-MiniLM-L12-v2 and BAAI/bge-small-en-v1.5 _LOGGER.info(f"Computing embeddings of {EMBEDDING_DIM} dimensions.") - # %% # encode PEPs in batches _LOGGER.info("Encoding PEPs.") @@ -120,7 +118,7 @@ projects_encoded = [] i = 0 for batch in tqdm( - batch_generator(projects, BATCH_SIZE), total=len(projects) // BATCH_SIZE + batch_generator(projects, BATCH_SIZE), total=(len(projects) // BATCH_SIZE) ): # build list of descriptions for batch descs = [] @@ -130,14 +128,12 @@ descs.append(d) else: descs.append(f"{p[0]} {p[1]} {p[2]}") - # every 100th batch, print out the first description if i % 100 == 0: _LOGGER.info(f"First description: {descs[0]}") - # encode descriptions try: - embeddings = encoder.encode(descs) + embeddings = encoder.embed(descs) projects_encoded.extend( [ dict( @@ -151,7 +147,6 @@ ) except Exception as e: _LOGGER.error(f"Error encoding batch: {e}") - i += 1 # %% @@ -170,6 +165,7 @@ api_key=QDRANT_API_KEY, ) +# %% # get the collection info COLLECTION = ( args.qdrant_collection @@ -177,6 +173,7 @@ or QDRANT_DEFAULT_COLLECTION ) +# %% # recreate the collection if necessary if args.recreate_collection: qdrant.recreate_collection( @@ -185,6 +182,13 @@ size=EMBEDDING_DIM, distance=models.Distance.COSINE ), on_disk_payload=True, + quantization_config=models.ScalarQuantization( + scalar=models.ScalarQuantizationConfig( + type=models.ScalarType.INT8, + quantile=0.99, + always_ram=True, + ), + ), ) collection_info = qdrant.get_collection(collection_name=COLLECTION) else: @@ -201,6 +205,13 @@ size=EMBEDDING_DIM, distance=models.Distance.COSINE ), on_disk_payload=True, + quantization_config=models.ScalarQuantization( + scalar=models.ScalarQuantizationConfig( + type=models.ScalarType.INT8, + quantile=0.99, + always_ram=True, + ), + ), ) collection_info = qdrant.get_collection(collection_name=COLLECTION) @@ -211,6 +222,7 @@ _LOGGER.info("Inserting embeddings into Qdrant.") _LOGGER.info("Building point strcutures.") +# %% # build up point structs all_points = [ PointStruct( @@ -224,15 +236,14 @@ # determine upsert batch size UPSERT_BATCH_SIZE = args.upsert_batch_size or DEFAULT_UPSERT_BATCH_SIZE +# %% # upsert in batches, it will timeout if we do not # a good batch size is ~1000 vectors. Running locally, this is super quick. for batch in tqdm( batch_generator(all_points, UPSERT_BATCH_SIZE), total=len(all_points) // UPSERT_BATCH_SIZE, ): - operation_info = qdrant.upsert( - collection_name=COLLECTION, wait=True, points=batch - ) + operation_info = qdrant.upsert(collection_name=COLLECTION, wait=True, points=batch) assert operation_info.status == "completed" @@ -248,4 +259,4 @@ "ids": [0, 3, 100] }}' 'http://{QDRANT_HOST}:{QDRANT_PORT}/collections/{COLLECTION}/points' """ -) \ No newline at end of file +)