diff --git a/.gitignore b/.gitignore index a5c82b5..0794186 100644 --- a/.gitignore +++ b/.gitignore @@ -161,5 +161,5 @@ cython_debug/ # qdrant qdrant_storage/ - -scripts/ \ No newline at end of file +local_cache/ +scripts/ diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..9b38853 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,7 @@ +{ + "python.testing.pytestArgs": [ + "tests" + ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true +} \ No newline at end of file diff --git a/keywords.txt b/keywords.txt index d5a8b35..67fb17b 100644 --- a/keywords.txt +++ b/keywords.txt @@ -2,4 +2,9 @@ cell protocol description processing -source \ No newline at end of file +source +table +file_path +pep_version +project_name +experiment_metadata \ No newline at end of file diff --git a/pepembed/cli.py b/pepembed/cli.py index 1ceb18b..fb59c46 100644 --- a/pepembed/cli.py +++ b/pepembed/cli.py @@ -1,3 +1,4 @@ +# %% import sys import logging import os @@ -16,7 +17,7 @@ PKG_NAME, DESCRIPTION_COLUNM, PROJECT_TABLE, - PROJECT_COLUMN, + CONFIG_COLUMN, PROJECT_NAME_COLUMN, NAMESPACE_COLUMN, TAG_COLUMN, @@ -29,7 +30,7 @@ from .pepembed import PEPEncoder from .utils import batch_generator - +# %% def main(): """Entry point for the CLI.""" load_dotenv() @@ -75,7 +76,7 @@ def main(): # get list of peps _LOGGER.info("Pulling PEPs from database.") curs.execute( - f"SELECT {NAMESPACE_COLUMN}, {PROJECT_NAME_COLUMN}, {TAG_COLUMN}, {PROJECT_COLUMN}, {DESCRIPTION_COLUNM}, {ROW_ID_COLUMN} FROM {PROJECT_TABLE}" + f"SELECT {NAMESPACE_COLUMN}, {PROJECT_NAME_COLUMN}, {TAG_COLUMN}, {CONFIG_COLUMN}, {ROW_ID_COLUMN} FROM {PROJECT_TABLE}" ) projects = curs.fetchall() @@ -94,9 +95,9 @@ def main(): # we need to work in batches since its much faster projects_encoded = [] - for batch in tqdm( + for i, batch in enumerate(tqdm( batch_generator(projects, BATCH_SIZE), total=len(projects) // BATCH_SIZE - ): + )): # build list of descriptions for batch descs = [] for p in batch: @@ -105,6 +106,10 @@ def main(): descs.append(d) else: descs.append(f"{p[0]} {p[1]} {p[2]}") + + # every 100th batch, print out the first description + if i % 100 == 0: + _LOGGER.info(f"First description: {descs[0]}") # encode descriptions try: @@ -133,13 +138,17 @@ def main(): # connect to qdrant qdrant = QdrantClient( - url=QDRANT_HOST, + url=QDRANT_HOST, port=QDRANT_PORT, api_key=QDRANT_API_KEY, ) # get the collection info - COLLECTION = args.qdrant_collection or os.environ.get("QDRANT_COLLECTION") or QDRANT_DEFAULT_COLLECTION + COLLECTION = ( + args.qdrant_collection + or os.environ.get("QDRANT_COLLECTION") + or QDRANT_DEFAULT_COLLECTION + ) # recreate the collection if necessary if args.recreate_collection: @@ -148,7 +157,7 @@ def main(): vectors_config=models.VectorParams( size=EMBEDDING_DIM, distance=models.Distance.COSINE ), - on_disk_payload=True + on_disk_payload=True, ) collection_info = qdrant.get_collection(collection_name=COLLECTION) else: @@ -164,7 +173,7 @@ def main(): vectors_config=models.VectorParams( size=EMBEDDING_DIM, distance=models.Distance.COSINE ), - on_disk_payload=True + on_disk_payload=True, ) collection_info = qdrant.get_collection(collection_name=COLLECTION) @@ -214,6 +223,7 @@ def main(): """ ) + if __name__ == "__main__": try: sys.exit(main()) diff --git a/pepembed/const.py b/pepembed/const.py index 6040590..4abd2dc 100644 --- a/pepembed/const.py +++ b/pepembed/const.py @@ -1,4 +1,3 @@ -from sentence_transformers import __version__ as st_version from platform import python_version from logging import CRITICAL, DEBUG, ERROR, INFO, WARN @@ -12,16 +11,16 @@ QDRANT_DEFAULT_COLLECTION = "pephub" VERSIONS = { - "sentence_transformers_version": st_version, "python_version": python_version(), } DEFAULT_KEYWORDS = ["cell", "protocol", "description", "processing", "source"] +DEFAULT_MODEL = "sentence-transformers/all-MiniLM-L12-v2" MIN_DESCRIPTION_LENGTH = 5 PROJECT_TABLE = "projects" -PROJECT_COLUMN = "project_value" +CONFIG_COLUMN = "config" PROJECT_NAME_COLUMN = "name" CONFIG_COLUMN = "config" NAMESPACE_COLUMN = "namespace" diff --git a/pepembed/pepembed.py b/pepembed/pepembed.py index 90c5e0a..3ac539c 100644 --- a/pepembed/pepembed.py +++ b/pepembed/pepembed.py @@ -2,14 +2,15 @@ from typing import List, Dict, Any, Union from peppy import Project from peppy.const import SAMPLE_MODS_KEY, CONSTANT_KEY, CONFIG_KEY, NAME_KEY -from sentence_transformers import SentenceTransformer +from fastembed.embedding import FlagEmbedding as Embedding +import flatdict from .utils import read_in_key_words from .const import DEFAULT_KEYWORDS, MIN_DESCRIPTION_LENGTH -class PEPEncoder(SentenceTransformer): +class PEPEncoder(Embedding): """ Simple wrapper of the sentence trasnformer class that lets you embed metadata inside a PEP. @@ -34,7 +35,11 @@ def mine_metadata_from_dict( :param project: A dictionary representing a peppy.Project instance. :param min_desc_length: The minimum length of the description. """ - # project_config = project.get(CONFIG_KEY) or project.get(CONFIG_KEY.replace("_", "")) + # project_config = project.get(CONFIG_KEY) or project.get( + # CONFIG_KEY.replace("_", "") + # ) + # fix bug where config key is not in the project, + # new database schema does not have config key project_config = project if project_config is None: return "" @@ -44,14 +49,24 @@ def mine_metadata_from_dict( ): return project[NAME_KEY] or "" - project_level_dict: dict = project_config[SAMPLE_MODS_KEY][CONSTANT_KEY] + # project_level_dict: dict = project_config[SAMPLE_MODS_KEY][CONSTANT_KEY] + # Flatten dictionary + project_level_dict: dict = flatdict.FlatDict(project_config) project_level_attrs = list(project_level_dict.keys()) desc = "" - # build up a description + # search for "summary" in keys, if found, use that first, then pop it out + # should catch if key simply contains "summary" + for attr in project_level_attrs: + if "summary" in attr: + desc += str(project_level_dict[attr]) + " " + project_level_attrs.remove(attr) + break + + # build up a description using the rest for attr in project_level_attrs: if any([kw in attr for kw in self.keywords]): - desc += project_level_dict[attr] + " " + desc += str(project_level_dict[attr]) + " " # return if description is sufficient if len(desc) > min_desc_length: @@ -74,38 +89,3 @@ def mine_metadata_from_pep( return self.mine_metadata_from_dict( project_dict, min_desc_length=min_desc_length ) - - def embed( - self, projects: Union[dict, List[dict], Project, List[Project]], **kwargs - ) -> np.ndarray: - """ - Embed a PEP based on it's metadata. - - :param projects: A PEP or list of PEPs to embed. - :param kwargs: Keyword arguments to pass to the `encode` method of the SentenceTransformer class. - """ - # if single dictionary is passed - if isinstance(projects, dict): - desc = self.mine_metadata_from_dict(projects) - return super().encode(desc, **kwargs) - - # if single peppy.Project is passed - elif isinstance(projects, Project): - desc = self.mine_metadata_from_pep(projects) - return super().encode(desc, **kwargs) - - # if list of dictionaries is passed - elif isinstance(projects, list) and isinstance(projects[0], dict): - descs = [self.mine_metadata_from_dict(p) for p in projects] - return super().encode(descs, **kwargs) - - # if list of peppy.Projects is passed - elif isinstance(projects, list) and isinstance(projects[0], Project): - descs = [self.mine_metadata_from_pep(p) for p in projects] - return super().encode(descs, **kwargs) - - # else, return ValueError - else: - raise ValueError( - "Invalid input type. Must be a dictionary, peppy.Project, list of dictionaries, or list of peppy.Projects." - ) diff --git a/pepembed/utils.py b/pepembed/utils.py index 1e02532..989afde 100644 --- a/pepembed/utils.py +++ b/pepembed/utils.py @@ -10,6 +10,13 @@ def read_in_key_words(key_words_file: str) -> List[str]: return key_words +def generate_key_words(key_words_file: str) -> List[str]: + """Generates keywords based on current PEPs by finding most common shared attributes""" + # TODO Generate a dynamic list of keywords for custom PEPs + key_words = [] + return key_words + + def batch_generator(iterable, batch_size) -> List: """Batch generator.""" l = len(iterable) diff --git a/production.env b/production.env index 7a6ed74..b092853 100755 --- a/production.env +++ b/production.env @@ -7,4 +7,4 @@ export QDRANT_HOST=`pass databio/pephub/qdrant_host` export QDRANT_PORT=6333 export QDRANT_API_KEY=`pass databio/pephub/qdrant_api_key` -export HF_MODEL="sentence-transformers/all-MiniLM-L12-v2" +export HF_MODEL="BAAI/bge-small-en-v1.5" diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index b5e3327..82d49c2 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,8 +1,9 @@ logmuse -sentence-transformers +fastembed peppy python-dotenv qdrant-client psycopg2 ubiquerg tqdm +flatdict diff --git a/run_index.py b/run_index.py index 34f09b6..3714ac1 100644 --- a/run_index.py +++ b/run_index.py @@ -47,7 +47,7 @@ verbosity=None, logging_level=None, recreate_collection=True, - hf_model="sentence-transformers/all-MiniLM-L12-v2", + hf_model=os.environ.get("HF_MODEL"), keywords_file="keywords.txt", batch_size=DEFAULT_BATCH_SIZE, upsert_batch_size=DEFAULT_UPSERT_BATCH_SIZE, @@ -107,9 +107,8 @@ # initialize encoder _LOGGER.info("Initializing encoder.") encoder = PEPEncoder(args.hf_model, keywords_file=args.keywords_file) -EMBEDDING_DIM = int(encoder.get_sentence_embedding_dimension()) +EMBEDDING_DIM = 384 # hardcoded for sentence-transformers/all-MiniLM-L12-v2 and BAAI/bge-small-en-v1.5 _LOGGER.info(f"Computing embeddings of {EMBEDDING_DIM} dimensions.") - # %% # encode PEPs in batches _LOGGER.info("Encoding PEPs.") @@ -117,9 +116,10 @@ # we need to work in batches since its much faster projects_encoded = [] -for i, batch in enumerate(tqdm( - batch_generator(projects, BATCH_SIZE), total=len(projects) // BATCH_SIZE -)): +i = 0 +for batch in tqdm( + batch_generator(projects, BATCH_SIZE), total=(len(projects) // BATCH_SIZE) +): # build list of descriptions for batch descs = [] for p in batch: @@ -128,14 +128,12 @@ descs.append(d) else: descs.append(f"{p[0]} {p[1]} {p[2]}") - # every 100th batch, print out the first description if i % 100 == 0: _LOGGER.info(f"First description: {descs[0]}") - # encode descriptions try: - embeddings = encoder.encode(descs) + embeddings = encoder.embed(descs) projects_encoded.extend( [ dict( @@ -149,6 +147,7 @@ ) except Exception as e: _LOGGER.error(f"Error encoding batch: {e}") + i += 1 # %% _LOGGER.info("Encoding complete.") @@ -181,6 +180,13 @@ size=EMBEDDING_DIM, distance=models.Distance.COSINE ), on_disk_payload=True, + quantization_config=models.ScalarQuantization( + scalar=models.ScalarQuantizationConfig( + type=models.ScalarType.INT8, + quantile=0.99, + always_ram=True, + ), + ), ) collection_info = qdrant.get_collection(collection_name=COLLECTION) else: @@ -197,6 +203,13 @@ size=EMBEDDING_DIM, distance=models.Distance.COSINE ), on_disk_payload=True, + quantization_config=models.ScalarQuantization( + scalar=models.ScalarQuantizationConfig( + type=models.ScalarType.INT8, + quantile=0.99, + always_ram=True, + ), + ), ) collection_info = qdrant.get_collection(collection_name=COLLECTION) @@ -226,9 +239,7 @@ batch_generator(all_points, UPSERT_BATCH_SIZE), total=len(all_points) // UPSERT_BATCH_SIZE, ): - operation_info = qdrant.upsert( - collection_name=COLLECTION, wait=True, points=batch - ) + operation_info = qdrant.upsert(collection_name=COLLECTION, wait=True, points=batch) assert operation_info.status == "completed" @@ -244,4 +255,4 @@ "ids": [0, 3, 100] }}' 'http://{QDRANT_HOST}:{QDRANT_PORT}/collections/{COLLECTION}/points' """ -) \ No newline at end of file +) diff --git a/scripts/database_umap.py b/scripts/database_umap.py new file mode 100644 index 0000000..f416d13 --- /dev/null +++ b/scripts/database_umap.py @@ -0,0 +1,76 @@ +# %% +import os +import numpy as np +from qdrant_client import QdrantClient + +# %% +# get the qdrant connection info +QDRANT_HOST = os.environ.get("QDRANT_HOST") +QDRANT_PORT = os.environ.get("QDRANT_PORT") +QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY") + +# connect to qdrant +qdrant = QdrantClient( + url=QDRANT_HOST, + port=QDRANT_PORT, + api_key=QDRANT_API_KEY, + timeout=1000 +) + +# %% +# get number of embeddings +n_embeddings = qdrant.get_collection(collection_name="pephub").points_count + +# %% +SAMPLE_SIZE = 10000 +BATCH_SIZE = 10 +# randomly sample embeddings using the qdrant scroll API +# in batches of 10 +# generate a random offset +embeddings = [] +for i in range(SAMPLE_SIZE // BATCH_SIZE): + print(f"Batch {i}") + offset = np.random.randint(0, n_embeddings - 10) + result = qdrant.scroll( + collection_name="pephub", + limit=BATCH_SIZE, + with_payload=False, + with_vectors=True, + offset=offset + ) + embeddings.append(list(result)[0]) + +# %% +# flatten the list +embeddings = [e for batch in embeddings for e in batch] + +# %% +embeddings = [np.array(e.vector) for e in embeddings] + +# %% +from umap import UMAP + +reducer = UMAP(n_components=2, random_state=42) +umap_embedding = reducer.fit_transform(embeddings) + +# %% +import matplotlib.pyplot as plt +import seaborn as sns + +_, ax = plt.subplots(figsize=(5, 5)) + +plt.rcParams['figure.dpi'] = 300 + +sns.scatterplot( + x=umap_embedding[:,0], + y=umap_embedding[:,1], + s=5, + linewidth=0, + ax=ax +) + +ax.set_title("UMAP of GEO Sample Descriptions") +ax.set_xlabel("UMAP 1", fontsize=14) +ax.set_ylabel("UMAP 2", fontsize=14) + +# %% diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/keywordstest.txt b/tests/data/keywordstest.txt new file mode 100644 index 0000000..85e87f6 --- /dev/null +++ b/tests/data/keywordstest.txt @@ -0,0 +1,7 @@ +protocol +description +processing +source +series_summary +file_path +series_pubmed_id \ No newline at end of file diff --git a/tests/data/testconfigs/GSE105734_samples.csv b/tests/data/testconfigs/GSE105734_samples.csv new file mode 100644 index 0000000..9a6d8b5 --- /dev/null +++ b/tests/data/testconfigs/GSE105734_samples.csv @@ -0,0 +1,5 @@ +,age,gse,lab,sex,sra,file,line,link,type,antibody,assembly,file_url,biosample,dev_stage,file_size,ref_genome,size_range,assay_title,sample_name,sample_type,health_state,sample_title,sample_status,biomaterial_type,named_annotation,sample_series_id,sample_taxid_ch1,possible_controls,sample_platform_id,encode_release_date,sample_contact_city,sample_contact_name,sample_molecule_ch1,sample_organism_ch1,sample_channel_count,sample_contact_email,sample_contact_state,sample_geo_accession,sample_data_row_count,sample_library_source,sample_contact_address,sample_contact_country,sample_source_name_ch1,sample_submission_date,sample_instrument_model,sample_last_update_date,sample_library_strategy,library_encode_accession,sample_contact_institute,sample_library_selection,biosample_encode_accession,technical_replicate_number,biological_replicate_number,experiment_encode_accession,sample_extract_protocol_ch1,sample_contact_zip_postal_code +0,69 year,GSE105734,"Michael Snyder, StanfordMichael Snyder, Stanford",female,https://www.ncbi.nlm.nih.gov/sra?term=SRX3322636,GSM2827569_ENCFF540SXF_peaks_hg19.bed.gz,MCF-7,ENCBS053YJT at ENCODE; https://www.encodeproject.org/ENCBS053YJT/Derived from ENCODE donor ENCDO000AAE; https://www.encodeproject.org/ENCDO000AAE/growth protocol; https://www.encodeproject.org/documents/2a32df77-1325-4a2d-af71-2f2b68eb9830/@@download/attachment/Snyder_MCF7%20Cell%20Growth%20Protocol.pdfENCODE dbxrefs Cellosaurus CVCL_0031; http://web.expasy.org/cellosaurus/CVCL_0031ATCC sample source; http://www.atcc.org/Products/All/HTB-22.aspx,BED,FOS,"GRCh38, hg19",ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2827nnn/GSM2827569/suppl/GSM2827569_ENCFF540SXF_peaks_hg19.bed.gz,https://www.ncbi.nlm.nih.gov/biosample/SAMN06121513,adult,7028129," GRCh38, hg19",450-650,ChIP-seq,chip-seq_from_mcf-7_enclb200wlx__5,SRA,breast cancer (adenocarcinoma),ChIP-seq from MCF-7 (ENCLB200WLX),Public on Oct 23 2017,immortalized cell line,NA000141117.1 (GSM2827569_ENCFF050ZFM_signal_p-value_GRCh38.bigWig)NA000142208.1 (GSM2827569_ENCFF893EFT_fold_change_over_control_GRCh38.bigWig)NA000143034.1 (GSM2827569_ENCFF279XPX_fold_change_over_control_hg19.bigWig)NA000143211.1 (GSM2827569_ENCFF097HJB_signal_p-value_hg19.bigWig),GSE105734,9606,"ENCSR594UCI, ENCSR217LRF",GPL11154,2016-12-19,Stanford,"ENCODE,,DCC",genomic DNA,Homo sapiens,1,encode-help@lists.stanford.edu,CA,GSM2827569,0,genomic,300 Pasteur Dr,USA,Homo sapiens MCF-7 immortalized cell line,Oct 21 2017,Illumina HiSeq 2000,May 15 2019,ChIP-Seq,ENCLB200WLX,ENCODE DCC,ChIP,ENCBS053YJT (SAMN06121513),1,3,ENCSR569XNP,not provided,94305-5120 +1,69 year,GSE105734,"Michael Snyder, StanfordMichael Snyder, Stanford",female,https://www.ncbi.nlm.nih.gov/sra?term=SRX3322636,GSM2827569_ENCFF673PVH_peaks_GRCh38.bed.gz,MCF-7,ENCBS053YJT at ENCODE; https://www.encodeproject.org/ENCBS053YJT/Derived from ENCODE donor ENCDO000AAE; https://www.encodeproject.org/ENCDO000AAE/growth protocol; https://www.encodeproject.org/documents/2a32df77-1325-4a2d-af71-2f2b68eb9830/@@download/attachment/Snyder_MCF7%20Cell%20Growth%20Protocol.pdfENCODE dbxrefs Cellosaurus CVCL_0031; http://web.expasy.org/cellosaurus/CVCL_0031ATCC sample source; http://www.atcc.org/Products/All/HTB-22.aspx,BED,FOS,"GRCh38, hg19",ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2827nnn/GSM2827569/suppl/GSM2827569_ENCFF673PVH_peaks_GRCh38.bed.gz,https://www.ncbi.nlm.nih.gov/biosample/SAMN06121513,adult,6936801," GRCh38, hg19",450-650,ChIP-seq,chip-seq_from_mcf-7_enclb200wlx__6,SRA,breast cancer (adenocarcinoma),ChIP-seq from MCF-7 (ENCLB200WLX),Public on Oct 23 2017,immortalized cell line,NA000141117.1 (GSM2827569_ENCFF050ZFM_signal_p-value_GRCh38.bigWig)NA000142208.1 (GSM2827569_ENCFF893EFT_fold_change_over_control_GRCh38.bigWig)NA000143034.1 (GSM2827569_ENCFF279XPX_fold_change_over_control_hg19.bigWig)NA000143211.1 (GSM2827569_ENCFF097HJB_signal_p-value_hg19.bigWig),GSE105734,9606,"ENCSR594UCI, ENCSR217LRF",GPL11154,2016-12-19,Stanford,"ENCODE,,DCC",genomic DNA,Homo sapiens,1,encode-help@lists.stanford.edu,CA,GSM2827569,0,genomic,300 Pasteur Dr,USA,Homo sapiens MCF-7 immortalized cell line,Oct 21 2017,Illumina HiSeq 2000,May 15 2019,ChIP-Seq,ENCLB200WLX,ENCODE DCC,ChIP,ENCBS053YJT (SAMN06121513),1,3,ENCSR569XNP,not provided,94305-5120 +2,69 year,GSE105734,"Michael Snyder, StanfordMichael Snyder, Stanford",female,https://www.ncbi.nlm.nih.gov/sra?term=SRX3322637,GSM2827570_ENCFF458JNP_peaks_GRCh38.bed.gz,MCF-7,ENCBS168ISE at ENCODE; https://www.encodeproject.org/ENCBS168ISE/Derived from ENCODE donor ENCDO000AAE; https://www.encodeproject.org/ENCDO000AAE/growth protocol; https://www.encodeproject.org/documents/2a32df77-1325-4a2d-af71-2f2b68eb9830/@@download/attachment/Snyder_MCF7%20Cell%20Growth%20Protocol.pdfENCODE dbxrefs Cellosaurus CVCL_0031; http://web.expasy.org/cellosaurus/CVCL_0031ATCC sample source; http://www.atcc.org/Products/All/HTB-22.aspx,BED,FOS,"GRCh38, hg19",ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2827nnn/GSM2827570/suppl/GSM2827570_ENCFF458JNP_peaks_GRCh38.bed.gz,https://www.ncbi.nlm.nih.gov/biosample/SAMN06121651,adult,6830691," GRCh38, hg19",450-650,ChIP-seq,chip-seq_from_mcf-7_enclb859hnt__1,SRA,breast cancer (adenocarcinoma),ChIP-seq from MCF-7 (ENCLB859HNT),Public on Oct 23 2017,immortalized cell line,NA000141642.1 (GSM2827570_ENCFF250RPS_signal_p-value_GRCh38.bigWig)NA000142635.1 (GSM2827570_ENCFF936JHA_fold_change_over_control_hg19.bigWig)NA000142941.1 (GSM2827570_ENCFF490HEP_fold_change_over_control_GRCh38.bigWig)NA000143639.1 (GSM2827570_ENCFF950XOS_signal_p-value_hg19.bigWig),GSE105734,9606,"ENCSR594UCI, ENCSR217LRF",GPL11154,2016-12-19,Stanford,"ENCODE,,DCC",genomic DNA,Homo sapiens,1,encode-help@lists.stanford.edu,CA,GSM2827570,0,genomic,300 Pasteur Dr,USA,Homo sapiens MCF-7 immortalized cell line,Oct 21 2017,Illumina HiSeq 2000,May 15 2019,ChIP-Seq,ENCLB859HNT,ENCODE DCC,ChIP,ENCBS168ISE (SAMN06121651),1,1,ENCSR569XNP,not provided,94305-5120 +3,69 year,GSE105734,"Michael Snyder, StanfordMichael Snyder, Stanford",female,https://www.ncbi.nlm.nih.gov/sra?term=SRX3322637,GSM2827570_ENCFF834BCC_peaks_hg19.bed.gz,MCF-7,ENCBS168ISE at ENCODE; https://www.encodeproject.org/ENCBS168ISE/Derived from ENCODE donor ENCDO000AAE; https://www.encodeproject.org/ENCDO000AAE/growth protocol; https://www.encodeproject.org/documents/2a32df77-1325-4a2d-af71-2f2b68eb9830/@@download/attachment/Snyder_MCF7%20Cell%20Growth%20Protocol.pdfENCODE dbxrefs Cellosaurus CVCL_0031; http://web.expasy.org/cellosaurus/CVCL_0031ATCC sample source; http://www.atcc.org/Products/All/HTB-22.aspx,BED,FOS,"GRCh38, hg19",ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2827nnn/GSM2827570/suppl/GSM2827570_ENCFF834BCC_peaks_hg19.bed.gz,https://www.ncbi.nlm.nih.gov/biosample/SAMN06121651,adult,6839779," GRCh38, hg19",450-650,ChIP-seq,chip-seq_from_mcf-7_enclb859hnt__4,SRA,breast cancer (adenocarcinoma),ChIP-seq from MCF-7 (ENCLB859HNT),Public on Oct 23 2017,immortalized cell line,NA000141642.1 (GSM2827570_ENCFF250RPS_signal_p-value_GRCh38.bigWig)NA000142635.1 (GSM2827570_ENCFF936JHA_fold_change_over_control_hg19.bigWig)NA000142941.1 (GSM2827570_ENCFF490HEP_fold_change_over_control_GRCh38.bigWig)NA000143639.1 (GSM2827570_ENCFF950XOS_signal_p-value_hg19.bigWig),GSE105734,9606,"ENCSR594UCI, ENCSR217LRF",GPL11154,2016-12-19,Stanford,"ENCODE,,DCC",genomic DNA,Homo sapiens,1,encode-help@lists.stanford.edu,CA,GSM2827570,0,genomic,300 Pasteur Dr,USA,Homo sapiens MCF-7 immortalized cell line,Oct 21 2017,Illumina HiSeq 2000,May 15 2019,ChIP-Seq,ENCLB859HNT,ENCODE DCC,ChIP,ENCBS168ISE (SAMN06121651),1,1,ENCSR569XNP,not provided,94305-5120 diff --git a/tests/data/testconfigs/testpep1.yaml b/tests/data/testconfigs/testpep1.yaml new file mode 100644 index 0000000..97f5f10 --- /dev/null +++ b/tests/data/testconfigs/testpep1.yaml @@ -0,0 +1,42 @@ +pep_version: 2.1.0 +project_name: GSE105734 +sample_table: GSE105734_samples.csv +sample_modifiers: + append: + description: "Mammary gland, adenocarcinoma. (PMID: 4357757)FOS ChIP-seq on human MCF-7" + output_file_path: FILES + sample_description: https://www.encodeproject.org/experiments/ENCSR569XNP/, *************** + sample_data_processing: See GSM*_README.txt supplementary file linked below + derive: + sources: + FILES: ./{gse}/{file} + attributes: + - output_file_path +experiment_metadata: + series_type: Genome binding/occupancy profiling by high throughput sequencing + series_gp_id: PRJNA63443 + series_title: ChIP-seq from MCF-7 (ENCSR569XNP) + series_status: Public on Oct 23 2017 + series_project: ENCODE + series_summary: FOS ChIP-seq on human MCF-7 + + For data usage terms and conditions, please refer to http://www.genome.gov/27528022 and http://www.genome.gov/Pages/Research/ENCODE/ENCODE_Data_Use_Policy_for_External_Users_03-07-14.pdf + series_pubmed_id: 22955616 + series_sample_id: GSM2827569 + GSM2827570 + series_platform_id: GPL11154 + series_contact_city: Stanford + series_contact_name: ENCODE,,DCC + series_sample_taxid: 9606 + series_contact_email: encode-help@lists.stanford.edu + series_contact_state: CA + series_geo_accession: GSE105734 + series_overall_design: https://www.encodeproject.org/ENCSR569XNP/ + series_platform_taxid: 9606 + series_contact_address: 300 Pasteur Dr + series_contact_country: USA + series_sample_organism: Homo sapiens + series_submission_date: Oct 21 2017 + series_last_update_date: Jul 25 2021 + series_contact_institute: ENCODE DCC + series_platform_organism: Homo sapiens + series_supplementary_file: ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF124QJZ_peaks_hg19.bed.gz + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF160WPD_fold_change_over_control_GRCh38.bigWig + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF170POB_optimal_idr_thresholded_peaks_GRCh38.bed.gz + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF174XVH_signal_p-value_GRCh38.bigWig + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF185VWZ_fold_change_over_control_hg19.bigWig + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF220ZQF_conservative_idr_thresholded_peaks_GRCh38.bigBed + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF234YXL_peaks_GRCh38.bigBed + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF345JQL_optimal_idr_thresholded_peaks_hg19.bigBed + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF397ZDJ_peaks_hg19.bigBed + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF419PYB_signal_p-value_hg19.bigWig + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF538VTB_conservative_idr_thresholded_peaks_hg19.bigBed + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF663JQC_conservative_idr_thresholded_peaks_hg19.bed.gz + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF732LJO_optimal_idr_thresholded_peaks_GRCh38.bigBed + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF888RBY_peaks_GRCh38.bed.gz + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF890YVJ_conservative_idr_thresholded_peaks_GRCh38.bed.gz + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_ENCFF965AIZ_optimal_idr_thresholded_peaks_hg19.bed.gz + ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE105nnn/GSE105734/suppl/GSE105734_RAW.tar + series_contact_zip_postal_code: 94305-5120 + diff --git a/tests/test_basic.py b/tests/test_basic.py new file mode 100644 index 0000000..e3b7a17 --- /dev/null +++ b/tests/test_basic.py @@ -0,0 +1,31 @@ +import peppy +from logging import getLogger +import pepembed +from pepembed.pepembed import PEPEncoder +from pepembed.const import * +import os +from peppy import Project +import flatdict + +_LOGGER = getLogger("pepembed") + +class Testpepembed: + def test_search(self): + """Basic example of a test""" + #hf_model = "sentence-transformers/all-MiniLM-L12-v2" #this is the default in argsparser + found = False + keywordsfilepath = os.path.join(os.getcwd() + "/tests/data/keywordstest.txt") + + encoder = PEPEncoder(DEFAULT_MODEL, keywords_file=keywordsfilepath) + + p = peppy.Project(os.path.join(os.getcwd() + "/tests/data/testconfigs/testpep1.yaml")) + p = p.to_dict(extended=True) + + d = encoder.mine_metadata_from_dict(p, min_desc_length=20) + + for k, v in flatdict.FlatDict(p['_config']).items(): + if str(v) in d: + found = True + + assert found == True +