Skip to content

Commit

Permalink
Merge pull request #5 from ChrisgKent/main
Browse files Browse the repository at this point in the history
Schema locked down for primerschemes
  • Loading branch information
ChrisgKent authored Oct 25, 2023
2 parents 51a94a7 + 1f0cbf8 commit 754d4cc
Show file tree
Hide file tree
Showing 57 changed files with 1,019 additions and 2,951 deletions.
104 changes: 82 additions & 22 deletions .github/workflows/build_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,20 @@
import json
import sys
import hashlib

from enum import Enum
import itertools

"""
Version Schema
requires an info.json in the version directory
{
"ampliconsize": 100,
"schemeversion": "v1.0.0",
"schemename": "scheme_name",
"primer.bed.md5": "md5",
"reference.fasta.md5": "md5",
}
"""

Expand All @@ -28,6 +37,16 @@ def parse_version(
) -> dict[str:str]:
version_dict = dict()

# Read in the info.json file
with open(version_path / "info.json") as f:
info_dict = json.load(f)

# Grab index.json fields
version_dict["algorithmversion"] = info_dict["algorithmversion"]
version_dict["status"] = info_dict["status"]
version_dict["authors"] = info_dict["authors"]
version_dict["citations"] = info_dict["citations"]

# Add the primer.bed file
primerbed = version_path / "primer.bed"
version_dict["primer.bed.url"] = create_rawlink(
Expand All @@ -49,24 +68,14 @@ def parse_version(
)
version_dict["config.json.md5"] = hashfile(config)

# Read in the config.json file
with open(config) as f:
config_dict = json.load(f)

# Grab config.json fields
version_dict["algorithmversion"] = config_dict["algorithmversion"]
version_dict["validated"] = config_dict["validated"]
version_dict["authors"] = config_dict["authors"]
version_dict["citation"] = config_dict["citation"]

# Check the hashes in the config.json file match the generated hashes
if version_dict["primer.bed.md5"] != config_dict["primer.bed.md5"]:
if version_dict["primer.bed.md5"] != info_dict["primer.bed.md5"]:
raise ValueError(
f"Hash mismatch for {version_dict['primer.bed.url']}. Expected {version_dict['primer.bed.md5']} but got {config_dict['primer.bed.md5']}"
f"Hash mismatch for {version_dict['primer.bed.url']}. Expected {version_dict['primer.bed.md5']} but got {info_dict['primer.bed.md5']}"
)
if version_dict["reference.fasta.md5"] != config_dict["reference.fasta.md5"]:
if version_dict["reference.fasta.md5"] != info_dict["reference.fasta.md5"]:
raise ValueError(
f"Hash mismatch for {version_dict['reference.fasta.url']}. Expected {version_dict['reference.fasta.md5']} but got {config_dict['reference.fasta.md5']}"
f"Hash mismatch for {version_dict['reference.fasta.url']}. Expected {version_dict['reference.fasta.md5']} but got {info_dict['reference.fasta.md5']}"
)

return version_dict
Expand Down Expand Up @@ -121,14 +130,54 @@ def parse_scheme(scheme_path, repo_url, scheme_name, pclass) -> dict[str:str]:
return scheme_dict


def main():
def traverse_json(json_dict):
    """Yield the path of every version entry in an index-style nested dict.

    The dict is walked four levels deep, in insertion order:
    pclass -> scheme name -> length -> version.

    Yields:
        ``(pclass, scheme_name, length, version)`` tuples, one per version.
    """
    for pclass, pclass_dict in json_dict.items():
        for scheme_name, scheme_dict in pclass_dict.items():
            for length, length_dict in scheme_dict.items():
                # Only the version keys are needed; their payload is read by callers.
                for version in length_dict:
                    yield (pclass, scheme_name, length, version)


def check_consistency(existing_json, new_json):
    """Check that versions present in both indexes still have the same hashes.

    Every path found in both ``existing_json`` and ``new_json`` must keep an
    identical ``reference.fasta.md5`` and ``primer.bed.md5``. Paths that occur
    in only one of the two dicts are ignored.

    Raises:
        ValueError: if either hash differs for a shared path.
        KeyError: if a shared version entry lacks one of the two hash fields.
    """
    # Paths that exist in both the existing and the new index
    shared_paths = set(traverse_json(existing_json)) & set(traverse_json(new_json))

    for pclass, scheme_name, length, version in shared_paths:
        existing_entry = existing_json[pclass][scheme_name][length][version]
        new_entry = new_json[pclass][scheme_name][length][version]

        # Check that the reference hashes are the same
        existing_ref_hash = existing_entry["reference.fasta.md5"]
        new_ref_hash = new_entry["reference.fasta.md5"]
        if existing_ref_hash != new_ref_hash:
            raise ValueError(
                f"Hash changed for {pclass}/{scheme_name}/{length}/{version}/reference.fasta. Expected {existing_ref_hash} but got {new_ref_hash}"
            )

        # Check that the primer.bed hashes are the same. This must compare the
        # bed hashes themselves; re-testing the reference hashes here would let
        # a modified primer.bed pass unnoticed.
        existing_bed_hash = existing_entry["primer.bed.md5"]
        new_bed_hash = new_entry["primer.bed.md5"]
        if existing_bed_hash != new_bed_hash:
            raise ValueError(
                f"Hash changed for {pclass}/{scheme_name}/{length}/{version}/primer.bed. Expected {existing_bed_hash} but got {new_bed_hash}"
            )


def create_index(server_url, repo_url):
# For any Scheme, we can generate a JSON file with the following format:
json_dict = dict()

# server_url = https://github.com/
server_url = sys.argv[1]
repo_url = sys.argv[2]

# Parse panels and schemes
pclasses = ["primerschemes", "primerpanels"]
for pclass in pclasses:
Expand All @@ -146,9 +195,20 @@ def main():
# Add the pclass to the json_dict
json_dict[pclass] = pclass_dict

# Read in the existing index.json file
with open("index.json") as f:
existing_json_dict = json.load(f)

# Check persistence of existing files
check_consistency(existing_json_dict, json_dict)

with open("index.json", "w") as f:
json.dump(json_dict, f, indent=4, sort_keys=True)

return True


if __name__ == "__main__":
main()
server_url = sys.argv[1]
repo_url = sys.argv[2]
create_index(server_url, repo_url)
95 changes: 59 additions & 36 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
# primerschemes
Repo for PrimerSchemes

Central repo for newly generated PrimerSchemes and PrimalPanels

## PrimerScheme Versioning

## New formats

### PrimerScheme Versioning

v(w).(x).(y)

Expand All @@ -14,60 +17,80 @@ v(w).(x).(y)
`y`: Misc version. Used to include changes to file format or change to primer re-balancing volumes (No primer changes)


## PrimerName Format

```{scheme}_{amplicon_number}_{direction}_{primer_number}```
### PrimerName Format

#### scheme
```{uuid}_{amplicon_number}_{direction}_{primer_number}```

- Name of the scheme or abbreviation
- All non alphanumeric will be replaced with `-`
- ```uuid```: 8 digit uuid linking each primer to the corresponding MSA / scheme

#### amplicon_number
- ```amplicon_number```: The number of the amplicon. 0 indexed int. Not enforced to be continuous

- Int
- Not enforced to be continuous
- ```direction```: If the primer is a forward or reverse primer. ```LEFT``` or ```RIGHT```

#### direction
- ```primer_number```: The number of each primer within each 'primercloud'. 0 indexed int. Not enforced to be continuous

- `LEFT`/`RIGHT`

#### primer_number
### Directory Format

- Int
- Not enforced to be continuous


## Directory Format
This is the new format for how schemes will be stored in this repo

```
{scheme_name}
├── {amplicon_size}
│   └── {version}
│   ├── primer.bed
│ ├── amplicon.bed
│   ├── {chrom_name}.fasta *
│ ├── README.md
│ └── work
│ ├── config.json
│ ├── {chrom_name}.png *
│ ├── {chrom_name}.html *
│ └── output.log
├── LICENSE
└── README.md
primerschemes / primerpanels
├── README.md
└── {schemename}
└── {ampliconsize}
└── {schemeversion}
   ├── README.md
   ├── info.json
   ├── primer.bed
   ├── reference.fasta
   └── work
   ├── {msa}.png *
   ├── {msa}.fasta *
   ├── amplicon.bed
   ├── config.json
   ├── file.log
   └── {misc files} *
```
\* Can have multiple

For autogenerated schemes: scheme_name = {common-name}--{taxid}
- ```schemename```: Lower case, containing only [a-z0-9-]; must start and end with an alphanumeric character. Use of '--' is reserved for autogenerated schemes ({common-name}--{taxid})

- ```ampliconsize```: The ampliconsize specified when the scheme was created. Actual size is ±10%

## File Format
- ```schemeversion```: Follows naming in PrimerScheme Versioning

`{scheme_name}.fasta`: The MSA used for scheme generation. The scheme uses the MSA indexing for bed files

## File Formats

`{msa}.fasta`: The MSA used for scheme generation.

`primer.bed`: The bedfile that contains the primers in the current version

`amplicon.bed`: The bedfile that contains the amplicons of the current version

`reference.fasta`: The "reference" genomes which the primers are indexed against. Can either be consensus genomes derived from the MSA or a specific genome in the MSA (first genome)

`info.json`:
```json
{
"ampliconsize": "int",
"schemeversion": "str",
"schemename": "str",
"primer.bed.md5": "str",
"reference.fasta.md5": "str",
"status": ["withdrawn", "deprecated", "autogenerated","draft","testing","validated"],
"citations": "list[str]",
"authors": "list[str]",
"algorithmversion": "str"
}
```

`config.json`: All scheme params dumped from primaldigest

`file.log`: Output log from primaldigest

`{msa}.png`: Figure showing a scheme overview


------------------------------------------------------------------------
Expand Down
Loading

0 comments on commit 754d4cc

Please sign in to comment.