Skip to content

Commit

Permalink
Merge pull request #186 from include-dcc/validation-v5
Browse files Browse the repository at this point in the history
include data validator implementation
  • Loading branch information
lopierra authored Jun 11, 2024
2 parents 4fd0076 + ae6d4bf commit c010095
Show file tree
Hide file tree
Showing 14 changed files with 1,247 additions and 1 deletion.
6 changes: 5 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
[tool.poetry]
name = "linkml-project-template"
#name = "linkml-project-template"
name = "src"
version = "0.1.0"
description = "Enter description of your project here"
authors = ["Mark A. Miller <[email protected]>"]
Expand All @@ -24,3 +25,6 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry.extras]
docs = ["linkml", "mkdocs-material"]

[tool.poetry.scripts]
validate-data = "src.data_validation.cli:main"
Empty file added src/data_validation/__init__.py
Empty file.
43 changes: 43 additions & 0 deletions src/data_validation/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import argparse
from .validation import (
validate_study,
validate_participant,
validate_condition,
validate_biospecimen,
validate_datafile,
validate_dataset,
validate_datasetmanifest
)

# Lookup table from the CLI entity name to the function that validates
# a CSV file of that entity.
entity_validators = dict(
    study=validate_study,
    participant=validate_participant,
    condition=validate_condition,
    biospecimen=validate_biospecimen,
    datafile=validate_datafile,
    dataset=validate_dataset,
    datasetmanifest=validate_datasetmanifest,
)


def main():
    """Command-line entry point: validate one CSV file for one entity type.

    Parses ``input_file``, ``entity`` and the optional ``-o/--output``
    directory from the command line, then calls the validator registered
    for that entity in ``entity_validators``. When ``--output`` is omitted
    the validator receives ``None`` for its output path.
    """
    cli = argparse.ArgumentParser(description='Validate data from a CSV file using Pydantic models')
    cli.add_argument('input_file', help='Path to the input CSV file')
    cli.add_argument('-o', '--output', help='Path to the directory to save error logs')
    cli.add_argument('entity', choices=entity_validators.keys(), help='Entity to validate')
    options = cli.parse_args()

    # Tell the user what is about to be processed.
    print(f"Validating {options.entity} data from file: {options.input_file}")

    # Dispatch to the validator registered for the requested entity.
    entity_validators[options.entity](options.input_file, options.output)

    # Signal that the run finished.
    print("Validation complete!")


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
37 changes: 37 additions & 0 deletions src/data_validation/validate_biospecimen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from src.include_linkml.include_pydantic import Study, Participant, Condition, Biospecimen
from pydantic import ValidationError


def validate_biospecimen_entry(row):
    """Validate one biospecimen record by constructing a ``Biospecimen`` model.

    Args:
        row: Mapping-like record keyed by the biospecimen CSV column
            headers (a row Series when called through
            ``DataFrame.apply(axis=1)``).

    Returns:
        ``(True, None)`` when the model is built without error, otherwise
        ``(False, (record_id, error))`` where ``record_id`` is
        ``"<Study Code>-<Sample External ID>"`` and ``error`` is the caught
        ``ValidationError``.
    """
    try:
        # Construction is the validation step; the instance itself is not needed.
        Biospecimen(
            studyCode=row['Study Code'],
            participantGlobalId=row['Participant Global ID'],
            participantExternalId=row['Participant External ID'],
            sampleGlobalId=row['Sample Global ID'],
            sampleExternalId=row['Sample External ID'],
            sampleType=row['Sample Type'],
            ageAtBiospecimenCollection=row['Age At Biospecimen Collection'],
            parentSampleGlobalId=row['Parent Sample Global ID'],
            parentSampleExternalId=row['Parent Sample External ID'],
            parentSampleType=row['Parent Sample Type'],
            collectionGlobalId=row['Collection Global ID'],
            collectionExternalId=row['Collection External ID'],
            collectionSampleType=row['Collection Sample Type'],
            containerGlobalId=row['Container Global ID'],
            containerExternalId=row['Container External ID'],
            volume=row['Volume'],
            volumeUnit=row['Volume Unit'],
            concentration=row['Concentration'],
            concentrationUnit=row['Concentration Unit'],
            laboratoryProcedure=row['Laboratory Procedure'],
            biospecimenStorage=row['Biospecimen Storage'],
            sampleAvailability=row['Sample Availability'],
            containerAvailability=row['Container Availability'],
        )
    except ValidationError as e:
        # Format rather than concatenate: an ID that is numeric or missing
        # would make ``str + value`` raise TypeError here and hide the
        # validation error that is being reported.
        record_id = f"{row['Study Code']}-{row['Sample External ID']}"
        return False, (record_id, e)
    return True, None
34 changes: 34 additions & 0 deletions src/data_validation/validate_condition.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from src.include_linkml.include_pydantic import Study, Participant, Condition
from pydantic import ValidationError


def validate_condition_entry(row):
    """Validate one condition record by constructing a ``Condition`` model.

    Args:
        row: Mapping-like record keyed by the condition CSV column headers
            (a row Series when called through ``DataFrame.apply(axis=1)``).

    Returns:
        ``(True, None)`` when the model is built without error, otherwise
        ``(False, (record_id, error))`` where ``record_id`` is
        ``"<Study Code>-<Participant External ID>-<Event ID>"`` and
        ``error`` is the caught ``ValidationError``.
    """
    try:
        # Construction is the validation step; the instance itself is not needed.
        Condition(
            studyCode=row['Study Code'],
            participantGlobalId=row['Participant Global ID'],
            participantExternalId=row['Participant External ID'],
            eventId=row['Event ID'],
            eventType=row['Event Type'],
            conditionMeasureSourceText=row['Condition or Measure Source Text'],
            ageAtConditionMeasureObservation=row['Age At Condition or Measure Observation'],
            conditionInterpretation=row['Condition Interpretation'],
            conditionStatus=row['Condition Status'],
            conditionDataSource=row['Condition Data Source'],
            hpoLabel=row['HPO Label'],
            hpoCode=row['HPO Code'],
            mondoLabel=row['MONDO Label'],
            mondoCode=row['MONDO Code'],
            maxoLabel=row['MAXO Label'],
            maxoCode=row['MAXO Code'],
            otherLabel=row['Other Label'],
            otherCode=row['Other Code'],
            measureValue=row['Measure Value'],
            measureUnit=row['Measure Unit'],
        )
    except ValidationError as e:
        # Format every part: previously only the participant ID was wrapped
        # in str(), so a numeric or missing study code / event ID raised
        # TypeError while building the identifier.
        record_id = (
            f"{row['Study Code']}-{row['Participant External ID']}-{row['Event ID']}"
        )
        return False, (record_id, e)
    return True, None
33 changes: 33 additions & 0 deletions src/data_validation/validate_datafile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from src.include_linkml.include_pydantic import Study, Participant, Condition, Biospecimen, DataFile
from pydantic import ValidationError


def validate_datafile_entry(row):
    """Validate one data-file record by constructing a ``DataFile`` model.

    The pipe-delimited "Experimental Strategy" and "Experimental Platform"
    cells are split into lists, and "File Size" is converted with ``int``
    before being handed to the model.

    Args:
        row: Mapping-like record keyed by the data-file CSV column headers
            (a row Series when called through ``DataFrame.apply(axis=1)``).

    Returns:
        ``(True, None)`` when the model is built without error, otherwise
        ``(False, (record_id, error))`` where ``record_id`` is
        ``"<Sample External ID>-<File Global ID>"`` and ``error`` is the
        caught exception (a ``ValidationError``, or the ``ValueError`` /
        ``TypeError`` raised by the ``int`` conversion).
    """
    try:
        # Construction is the validation step; the instance itself is not needed.
        DataFile(
            studyCode=row['Study Code'],
            participantGlobalId=row['Participant Global ID'],
            participantExternalId=row['Participant External ID'],
            sampleGlobalId=row['Sample Global ID'],
            sampleExternalId=row['Sample External ID'],
            fileName=row['File Name'],
            fileGlobalId=row['File Global ID'],
            fileS3Location=row['File S3 Location'],
            fileUploadLocation=row['File Upload Location'],
            drsUri=row['DRS URI'],
            fileHash=row['File Hash'],
            dataAccess=row['Data Access'],
            dataCategory=row['Data Category'],
            dataType=row['Data Type'],
            experimentalStrategy=str(row['Experimental Strategy']).split('|'),
            experimentalPlatform=str(row['Experimental Platform']).split('|'),
            fileFormat=row['File Format'],
            fileSize=int(row['File Size']),
            fileSizeUnit=row['File Size Unit'],
        )
    except (ValidationError, ValueError, TypeError) as e:
        # ValueError/TypeError come from int() on a blank or non-numeric
        # "File Size"; report the row as invalid instead of aborting the run.
        # The identifier is formatted rather than concatenated so numeric or
        # missing IDs cannot raise TypeError here.
        record_id = f"{row['Sample External ID']}-{row['File Global ID']}"
        return False, (record_id, e)
    return True, None
34 changes: 34 additions & 0 deletions src/data_validation/validate_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from src.include_linkml.include_pydantic import Study, Participant, Condition, Biospecimen, DataFile, Dataset
from pydantic import ValidationError


def validate_dataset_entry(row):
    """Validate one dataset record by constructing a ``Dataset`` model.

    Pipe-delimited cells ("Experimental Strategy", "Experimental Platform",
    "Publication", "DbGaP") are split into lists, the two expected-count
    columns are converted with ``int`` and "Is Harmonized?" with ``bool``
    before being handed to the model.

    Args:
        row: Mapping-like record keyed by the dataset CSV column headers
            (a row Series when called through ``DataFrame.apply(axis=1)``).

    Returns:
        ``(True, None)`` when the model is built without error, otherwise
        ``(False, (record_id, error))`` where ``record_id`` is
        ``"<Dataset Name>-<Dataset External ID>"`` and ``error`` is the
        caught exception (a ``ValidationError``, or the ``ValueError`` /
        ``TypeError`` raised by an ``int`` conversion).
    """
    try:
        # Construction is the validation step; the instance itself is not needed.
        Dataset(
            studyCode=row['Study Code'],
            datasetName=row['Dataset Name'],
            datasetDescription=row['Dataset Description'],
            datasetGlobalId=row['Dataset Global ID'],
            datasetExternalId=row['Dataset External ID'],
            expectedNumberOfParticipants=int(row['Expected Number of Participants']),
            expectedNumberOfFiles=int(row['Expected Number of Files']),
            dataCollectionStartYear=row['Data Collection Start Year'],
            dataCollectionEndYear=row['Data Collection End Year'],
            dataCategory=row['Data Category'],
            dataType=row['Data Type'],
            experimentalStrategy=str(row['Experimental Strategy']).split('|'),
            experimentalPlatform=str(row['Experimental Platform']).split('|'),
            publication=str(row['Publication']).split('|'),
            accessLimitations=row['Access Limitations'],
            accessRequirements=row['Access Requirements'],
            dbgap=str(row['DbGaP']).split('|'),
            otherRepository=row['Other Repository'],
            otherAccessAuthority=row['Other Access Authority'],
            # NOTE(review): bool() of any non-empty string (e.g. "False") is
            # True - confirm this column already holds real booleans.
            isHarmonized=bool(row['Is Harmonized?']),
        )
    except (ValidationError, ValueError, TypeError) as e:
        # ValueError/TypeError come from int() on a blank or non-numeric
        # count; report the row as invalid instead of aborting the run.
        # The identifier is formatted rather than concatenated so a missing
        # or non-string dataset name cannot raise TypeError here.
        record_id = f"{row['Dataset Name']}-{row['Dataset External ID']}"
        return False, (record_id, e)
    return True, None
20 changes: 20 additions & 0 deletions src/data_validation/validate_datasetmanifest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from src.include_linkml.include_pydantic import Study, Participant, Condition, Biospecimen, DataFile, Dataset, DatasetManifest
from pydantic import ValidationError


def validate_datasetmanifest_entry(row):
    """Validate one dataset-manifest record by constructing a ``DatasetManifest``.

    Args:
        row: Mapping-like record keyed by the dataset-manifest CSV column
            headers (a row Series when called through
            ``DataFrame.apply(axis=1)``).

    Returns:
        ``(True, None)`` when the model is built without error, otherwise
        ``(False, (record_id, error))`` where ``record_id`` is
        ``"<Dataset Name>-<Dataset External ID>"`` and ``error`` is the
        caught ``ValidationError``.
    """
    try:
        # Construction is the validation step; the instance itself is not needed.
        DatasetManifest(
            studyCode=row['Study Code'],
            datasetName=row['Dataset Name'],
            datasetGlobalId=row['Dataset Global ID'],
            datasetExternalId=row['Dataset External ID'],
            fileName=row['File Name'],
            fileGlobalId=row['File Global ID'],
        )
    except ValidationError as e:
        # Format rather than concatenate so a missing or non-string dataset
        # name cannot raise TypeError while the error is being reported.
        record_id = f"{row['Dataset Name']}-{row['Dataset External ID']}"
        return False, (record_id, e)
    return True, None
32 changes: 32 additions & 0 deletions src/data_validation/validate_participant.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from src.include_linkml.include_pydantic import Study, Participant
from pydantic import ValidationError


def validate_participant_entry(row):
    """Validate one participant record by constructing a ``Participant`` model.

    Args:
        row: Mapping-like record keyed by the participant CSV column headers
            (a row Series when called through ``DataFrame.apply(axis=1)``).

    Returns:
        ``(True, None)`` when the model is built without error, otherwise
        ``(False, (record_id, error))`` where ``record_id`` is
        ``"<Study Code>-<Participant External ID>"`` and ``error`` is the
        caught ``ValidationError``.
    """
    try:
        # Construction is the validation step; the instance itself is not needed.
        Participant(
            studyCode=row['Study Code'],
            participantGlobalId=row['Participant Global ID'],
            participantExternalId=row['Participant External ID'],
            familyId=row['Family ID'],
            familyType=row['Family Type'],
            fatherId=row['Father ID'],
            motherId=row['Mother ID'],
            siblingId=row['Sibling ID'],
            otherFamilyMemberId=row['Other Family Member ID'],
            familyRelationship=row['Family Relationship'],
            sex=row['Sex'],
            race=row['Race'],
            ethnicity=row['Ethnicity'],
            downSyndromeStatus=row['Down Syndrome Status'],
            ageAtFirstPatientEngagement=row['Age at First Patient Engagement'],
            firstPatientEngagementEvent=row['First Patient Engagement Event'],
            outcomesVitalStatus=row['Outcomes Vital Status'],
            ageAtLastVitalStatus=row['Age at Last Vital Status'],
        )
    except ValidationError as e:
        # Format rather than concatenate: a numeric or missing participant ID
        # would make ``str + value`` raise TypeError here and hide the
        # validation error that is being reported.
        record_id = f"{row['Study Code']}-{row['Participant External ID']}"
        return False, (record_id, e)
    return True, None
38 changes: 38 additions & 0 deletions src/data_validation/validate_study.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from src.include_linkml.include_pydantic import Study
from pydantic import ValidationError


def validate_study_entry(row):
    """Validate one study record by constructing a ``Study`` model.

    Multi-valued cells are split on ``'|'`` into lists and "Expected Number
    of Participants" is converted with ``int`` before being handed to the
    model.

    Args:
        row: Mapping-like record keyed by the study CSV column headers
            (a row Series when called through ``DataFrame.apply(axis=1)``).

    Returns:
        ``(True, None)`` when the model is built without error, otherwise
        ``(False, (study_code, error))`` where ``error`` is the caught
        exception: a ``ValidationError``, or the ``AttributeError`` /
        ``ValueError`` / ``TypeError`` raised while converting a cell.
    """
    try:
        # Construction is the validation step; the instance itself is not needed.
        Study(
            studyCode=row['Study Code'],
            studyTitle=row['Study Title'],
            program=row['Program'].split('|'),
            studyDescription=row['Study Description'],
            principalInvestigatorName=row['Principal Investigator Name'].split('|'),
            studyContactName=row['Study Contact Name'].split('|'),
            studyContactInstitution=row['Study Contact Institution'].split('|'),
            studyContactEmail=row['Study Contact Email'].split('|'),
            vbrEmail=row['VBR Email'],
            vbrUrl=row['VBR URL'],
            vbrReadme=row['VBR Readme'],
            researchDomain=row['Research Domain'].split('|'),
            participantLifespanStage=row['Participant Lifespan Stage'].split('|'),
            selectionCriteria=row['Selection Criteria'],
            studyDesign=row['Study Design'],
            clinicalDataSourceType=row['Clinical Data Source Type'].split('|'),
            dataCategory=row['Data Category'].split('|'),
            studyWebsite=row['Study Website'],
            dbgap=str(row['dbGaP']).split('|'),
            publication=str(row['Publication']).split('|'),
            expectedNumberOfParticipants=int(row['Expected Number of Participants']),
            guidType=row['GUID Type'],
            acknowledgments=row['Acknowledgements'].split('|'),
            citationStatement=row['Citation Statement'].split('|'),
        )
    except (ValidationError, AttributeError, ValueError, TypeError) as e:
        # A blank cell is not a str, so ``.split`` raises AttributeError and
        # ``int`` raises ValueError/TypeError. Report such rows as invalid
        # instead of letting one bad cell abort the whole run.
        return False, (row['Study Code'], e)
    return True, None
44 changes: 44 additions & 0 deletions src/data_validation/validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from src.data_validation.validation_utils import validate_data
from src.data_validation.validate_study import validate_study_entry
from src.data_validation.validate_participant import validate_participant_entry
from src.data_validation.validate_condition import validate_condition_entry
from src.data_validation.validate_biospecimen import validate_biospecimen_entry
from src.data_validation.validate_datafile import validate_datafile_entry
from src.data_validation.validate_dataset import validate_dataset_entry
from src.data_validation.validate_datasetmanifest import validate_datasetmanifest_entry


def validate_study(file_path, output_path='.'):
    """Validate every row of a study CSV file.

    Args:
        file_path: Path to the study CSV file.
        output_path: Directory passed on to ``validate_data`` for the
            results file; defaults to the current directory.

    Returns:
        The ``(valid_count, invalid_count)`` pair from ``validate_data``.
    """
    # Columns whose values are normalized before validation.
    normalized_columns = [
        'Study Code',
        'Program',
        'Research Domain',
        'Participant Lifespan Stage',
        'Clinical Data Source Type',
        'Data Category',
        'GUID Type',
    ]
    return validate_data(file_path, normalized_columns, validate_study_entry, output_path)


def validate_participant(file_path, output_path='.'):
    """Validate every row of a participant CSV file.

    Args:
        file_path: Path to the participant CSV file.
        output_path: Directory passed on to ``validate_data`` for the
            results file; defaults to the current directory.

    Returns:
        The ``(valid_count, invalid_count)`` pair from ``validate_data``.
    """
    # Columns whose values are normalized before validation.
    normalized_columns = [
        'Study Code',
        'Family Type',
        'Family Relationship',
        'Sex',
        'Race',
        'Ethnicity',
        'Down Syndrome Status',
        'Outcomes Vital Status',
    ]
    return validate_data(file_path, normalized_columns, validate_participant_entry, output_path)


def validate_condition(file_path, output_path='.'):
    """Validate every row of a condition CSV file.

    Args:
        file_path: Path to the condition CSV file.
        output_path: Directory passed on to ``validate_data`` for the
            results file; defaults to the current directory.

    Returns:
        The ``(valid_count, invalid_count)`` pair from ``validate_data``.
    """
    # Columns whose values are normalized before validation.
    normalized_columns = [
        'Study Code',
        'Condition Interpretation',
        'Condition Status',
        'Condition Data Source',
    ]
    return validate_data(file_path, normalized_columns, validate_condition_entry, output_path)


def validate_biospecimen(file_path, output_path='.'):
    """Validate every row of a biospecimen CSV file.

    Args:
        file_path: Path to the biospecimen CSV file.
        output_path: Directory passed on to ``validate_data`` for the
            results file; defaults to the current directory.

    Returns:
        The ``(valid_count, invalid_count)`` pair from ``validate_data``.
    """
    # Columns whose values are normalized before validation.
    normalized_columns = [
        'Study Code',
        'Sample Availability',
        'Container Availability',
    ]
    return validate_data(file_path, normalized_columns, validate_biospecimen_entry, output_path)


def validate_datafile(file_path, output_path='.'):
    """Validate every row of a data-file CSV file.

    Args:
        file_path: Path to the data-file CSV file.
        output_path: Directory passed on to ``validate_data`` for the
            results file; defaults to the current directory.

    Returns:
        The ``(valid_count, invalid_count)`` pair from ``validate_data``.
    """
    # Columns whose values are normalized before validation.
    normalized_columns = [
        'Study Code',
        'Data Access',
        'Data Category',
    ]
    return validate_data(file_path, normalized_columns, validate_datafile_entry, output_path)


def validate_dataset(file_path, output_path='.'):
    """Validate every row of a dataset CSV file.

    Args:
        file_path: Path to the dataset CSV file.
        output_path: Directory passed on to ``validate_data`` for the
            results file; defaults to the current directory.

    Returns:
        The ``(valid_count, invalid_count)`` pair from ``validate_data``.
    """
    # 'Data Access' used to be listed here, but validate_dataset_entry never
    # reads that column; selecting it raised KeyError on any dataset CSV that
    # does not carry it. Only columns the row validator consumes are cleaned.
    string_columns = ['Study Code', 'Data Category']
    return validate_data(file_path, string_columns, validate_dataset_entry, output_path)

def validate_datasetmanifest(file_path, output_path='.'):
    """Validate every row of a dataset-manifest CSV file.

    Args:
        file_path: Path to the dataset-manifest CSV file.
        output_path: Directory passed on to ``validate_data`` for the
            results file; defaults to the current directory.

    Returns:
        The ``(valid_count, invalid_count)`` pair from ``validate_data``.
    """
    # Only the study code is normalized before validation.
    normalized_columns = ['Study Code']
    return validate_data(file_path, normalized_columns, validate_datasetmanifest_entry, output_path)
57 changes: 57 additions & 0 deletions src/data_validation/validation_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import pandas as pd
import os
from datetime import datetime


def clean_string(value):
    """Normalize a cell value to a lower-case, underscore-separated token.

    Strings are lower-cased and every space, hyphen and slash is replaced
    by an underscore. A non-string value that ``pd.isna`` reports as missing
    yields ``None``; any other non-string value is converted with ``str``
    and normalized the same way.
    """
    if not isinstance(value, str):
        if pd.isna(value):
            return None
        value = str(value)
    separators = str.maketrans({' ': '_', '-': '_', '/': '_'})
    return value.lower().translate(separators)


def clean_dataframe_strings(df, string_columns):
    """Normalize the listed columns of ``df`` in place with ``clean_string``.

    Args:
        df: DataFrame to modify.
        string_columns: List of column names whose values are passed
            through ``clean_string``.

    Returns:
        None. ``df`` is modified in place.

    Raises:
        KeyError: If a listed column is not present in ``df``.
    """
    for column in string_columns:
        # Series.map is used per column because the element-wise
        # DataFrame.map only exists in pandas 2.1 and later.
        df[column] = df[column].map(clean_string)


def validate_dataframe(df, entry_validator, input_file_name=None, output_path=None):
    """Run ``entry_validator`` on every row of ``df`` and print a summary.

    Args:
        df: DataFrame holding one record per row.
        entry_validator: Callable taking a row Series and returning
            ``(is_valid, error_info)``; for invalid rows ``error_info`` is a
            ``(record_id, error)`` pair.
        input_file_name: Optional name used as the prefix of the results file.
        output_path: Directory for the results file. When falsy, nothing is
            written.

    Returns:
        ``(valid_count, invalid_count)``.
    """
    # Iterate rows explicitly rather than with df.apply(axis=1): for a frame
    # with no rows apply() does not return a Series of result tuples, which
    # made the unpacking below fail on an empty CSV.
    outcomes = [entry_validator(row) for _, row in df.iterrows()]
    # Keep the Series-of-tuples shape that save_validation_results receives.
    validation_results = pd.Series(outcomes, index=df.index, dtype=object)
    invalid_count = sum(1 for is_valid, _ in outcomes if not is_valid)
    valid_count = len(outcomes) - invalid_count
    print("Number of errors by record type:")
    for is_valid, error_info in outcomes:
        if not is_valid:
            # Only the first whitespace-separated token of the error text is
            # shown; guard against an empty error string.
            tokens = str(error_info[1]).split()
            print(f"{error_info[0]}: {tokens[0] if tokens else ''}")
    total_records = df.shape[0]
    print(f"Total number of records in the file: {total_records}")
    print(f"Number of records with error: {invalid_count}")
    if output_path:
        output_file_path = save_validation_results(validation_results, input_file_name, output_path)
        print(f"Validation results saved to: {output_file_path}")
    return valid_count, invalid_count


def save_validation_results(validation_results, input_file_name, output_path):
    """Write one line per validation result to a dated text file.

    Args:
        validation_results: Iterable of per-row results; each item is
            written as ``str(item)``.
        input_file_name: Optional prefix for the file name. When falsy the
            file is named ``validation_results_<date>.txt``.
        output_path: Directory to write into; created if it does not exist.

    Returns:
        The path of the file that was written.
    """
    os.makedirs(output_path, exist_ok=True)
    current_date = datetime.now().strftime("%Y-%m-%d")
    if input_file_name:
        output_file_name = f'{input_file_name}_validation_results_{current_date}.txt'
    else:
        output_file_name = f'validation_results_{current_date}.txt'
    output_file_path = os.path.join(output_path, output_file_name)
    # Explicit UTF-8 so error text containing non-ASCII characters can be
    # written regardless of the platform's default encoding.
    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.write('\n'.join(str(item) for item in validation_results))
    return output_file_path


def read_csv_file(file_path):
    """Load the CSV at ``file_path`` into a DataFrame using pandas defaults."""
    frame = pd.read_csv(file_path)
    return frame


def validate_data(file_path, string_columns, validation_function, output_path='.'):
    """Read a CSV file, normalize selected columns and validate each row.

    Args:
        file_path: Path to the CSV file to validate.
        string_columns: Columns normalized with ``clean_dataframe_strings``
            before validation.
        validation_function: Row validator passed to ``validate_dataframe``.
        output_path: Directory for the results file; defaults to the
            current directory.

    Returns:
        The ``(valid_count, invalid_count)`` pair from ``validate_dataframe``.
    """
    # The bare file name (extension included) prefixes the results file name.
    source_name = os.path.basename(file_path)
    frame = read_csv_file(file_path)
    clean_dataframe_strings(frame, string_columns)
    counts = validate_dataframe(
        frame,
        validation_function,
        input_file_name=source_name,
        output_path=output_path,
    )
    return counts
Loading

0 comments on commit c010095

Please sign in to comment.