generated from linkml/linkml-project-template
-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #186 from include-dcc/validation-v5
include data validator implementation
- Loading branch information
Showing
14 changed files
with
1,247 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
[tool.poetry] | ||
name = "linkml-project-template" | ||
#name = "linkml-project-template" | ||
name = "src" | ||
version = "0.1.0" | ||
description = "Enter description of your project here" | ||
authors = ["Mark A. Miller <[email protected]>"] | ||
|
@@ -24,3 +25,6 @@ build-backend = "poetry.core.masonry.api" | |
|
||
[tool.poetry.extras] | ||
docs = ["linkml", "mkdocs-material"] | ||
|
||
[tool.poetry.scripts] | ||
validate-data = "src.data_validation.cli:main" |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
import argparse | ||
from .validation import ( | ||
validate_study, | ||
validate_participant, | ||
validate_condition, | ||
validate_biospecimen, | ||
validate_datafile, | ||
validate_dataset, | ||
validate_datasetmanifest | ||
) | ||
|
||
# Lookup table used by the CLI: entity name given on the command line -> validator.
entity_validators = dict(
    study=validate_study,
    participant=validate_participant,
    condition=validate_condition,
    biospecimen=validate_biospecimen,
    datafile=validate_datafile,
    dataset=validate_dataset,
    datasetmanifest=validate_datasetmanifest,
)
|
||
|
||
def main():
    """Command-line entry point: validate one CSV file for a chosen entity.

    Reads ``input_file``, ``entity`` and the optional ``-o/--output``
    directory from the command line, then calls the validator registered
    for that entity in ``entity_validators``. When ``-o`` is omitted,
    ``None`` is passed as the output path.
    """
    cli = argparse.ArgumentParser(
        description='Validate data from a CSV file using Pydantic models'
    )
    cli.add_argument('input_file', help='Path to the input CSV file')
    cli.add_argument('-o', '--output', help='Path to the directory to save error logs')
    cli.add_argument('entity', choices=entity_validators.keys(), help='Entity to validate')
    options = cli.parse_args()

    # Tell the user what is about to be processed.
    print(f"Validating {options.entity} data from file: {options.input_file}")

    # Dispatch to the validator registered for the requested entity.
    run_validation = entity_validators[options.entity]
    run_validation(options.input_file, options.output)

    # Signal that the run finished.
    print("Validation complete!")


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
from src.include_linkml.include_pydantic import Study, Participant, Condition, Biospecimen | ||
from pydantic import ValidationError | ||
|
||
|
||
def validate_biospecimen_entry(row):
    """Validate one biospecimen record against the ``Biospecimen`` model.

    Args:
        row: Mapping-like record (for example a pandas Series) keyed by the
            CSV column headers read below.

    Returns:
        ``(True, None)`` when the model accepts the row, otherwise
        ``(False, (record_key, error))`` where ``record_key`` is
        ``"<Study Code>-<Sample External ID>"`` and ``error`` is the
        ``ValidationError`` that was raised.
    """
    try:
        Biospecimen(
            studyCode=row['Study Code'],
            participantGlobalId=row['Participant Global ID'],
            participantExternalId=row['Participant External ID'],
            sampleGlobalId=row['Sample Global ID'],
            sampleExternalId=row['Sample External ID'],
            sampleType=row['Sample Type'],
            ageAtBiospecimenCollection=row['Age At Biospecimen Collection'],
            parentSampleGlobalId=row['Parent Sample Global ID'],
            parentSampleExternalId=row['Parent Sample External ID'],
            parentSampleType=row['Parent Sample Type'],
            collectionGlobalId=row['Collection Global ID'],
            collectionExternalId=row['Collection External ID'],
            collectionSampleType=row['Collection Sample Type'],
            containerGlobalId=row['Container Global ID'],
            containerExternalId=row['Container External ID'],
            volume=row['Volume'],
            volumeUnit=row['Volume Unit'],
            concentration=row['Concentration'],
            concentrationUnit=row['Concentration Unit'],
            laboratoryProcedure=row['Laboratory Procedure'],
            biospecimenStorage=row['Biospecimen Storage'],
            sampleAvailability=row['Sample Availability'],
            containerAvailability=row['Container Availability'],
        )
    except ValidationError as e:
        # Format instead of concatenating with "+": an ID that is not a string
        # (a number, NaN or None) would otherwise raise TypeError here and
        # hide the validation error being reported.
        record_key = f"{row['Study Code']}-{row['Sample External ID']}"
        return False, (record_key, e)
    # Validation successful
    return True, None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
from src.include_linkml.include_pydantic import Study, Participant, Condition | ||
from pydantic import ValidationError | ||
|
||
|
||
def validate_condition_entry(row):
    """Validate one condition record against the ``Condition`` model.

    Args:
        row: Mapping-like record (for example a pandas Series) keyed by the
            CSV column headers read below.

    Returns:
        ``(True, None)`` when the model accepts the row, otherwise
        ``(False, (record_key, error))`` where ``record_key`` is
        ``"<Study Code>-<Participant External ID>-<Event ID>"`` and ``error``
        is the ``ValidationError`` that was raised.
    """
    try:
        Condition(
            studyCode=row['Study Code'],
            participantGlobalId=row['Participant Global ID'],
            participantExternalId=row['Participant External ID'],
            eventId=row['Event ID'],
            eventType=row['Event Type'],
            conditionMeasureSourceText=row['Condition or Measure Source Text'],
            ageAtConditionMeasureObservation=row['Age At Condition or Measure Observation'],
            conditionInterpretation=row['Condition Interpretation'],
            conditionStatus=row['Condition Status'],
            conditionDataSource=row['Condition Data Source'],
            hpoLabel=row['HPO Label'],
            hpoCode=row['HPO Code'],
            mondoLabel=row['MONDO Label'],
            mondoCode=row['MONDO Code'],
            maxoLabel=row['MAXO Label'],
            maxoCode=row['MAXO Code'],
            otherLabel=row['Other Label'],
            otherCode=row['Other Code'],
            measureValue=row['Measure Value'],
            measureUnit=row['Measure Unit'],
        )
    except ValidationError as e:
        # Every key part is formatted, not only the participant ID: a
        # non-string Study Code or Event ID (number, NaN or None) would make
        # "+" concatenation raise TypeError inside this handler.
        record_key = (
            f"{row['Study Code']}-{row['Participant External ID']}-{row['Event ID']}"
        )
        return False, (record_key, e)
    # Validation successful
    return True, None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
from src.include_linkml.include_pydantic import Study, Participant, Condition, Biospecimen, DataFile | ||
from pydantic import ValidationError | ||
|
||
|
||
def validate_datafile_entry(row):
    """Validate one data-file record against the ``DataFile`` model.

    ``Experimental Strategy`` and ``Experimental Platform`` are stringified
    and split on ``|`` into lists; ``File Size`` is converted with ``int``.

    Args:
        row: Mapping-like record (for example a pandas Series) keyed by the
            CSV column headers read below.

    Returns:
        ``(True, None)`` when the model accepts the row, otherwise
        ``(False, (record_key, error))`` where ``record_key`` is
        ``"<Sample External ID>-<File Global ID>"`` and ``error`` is the
        exception raised while converting or validating the row.
    """
    try:
        DataFile(
            studyCode=row['Study Code'],
            participantGlobalId=row['Participant Global ID'],
            participantExternalId=row['Participant External ID'],
            sampleGlobalId=row['Sample Global ID'],
            sampleExternalId=row['Sample External ID'],
            fileName=row['File Name'],
            fileGlobalId=row['File Global ID'],
            fileS3Location=row['File S3 Location'],
            fileUploadLocation=row['File Upload Location'],
            drsUri=row['DRS URI'],
            fileHash=row['File Hash'],
            dataAccess=row['Data Access'],
            dataCategory=row['Data Category'],
            dataType=row['Data Type'],
            experimentalStrategy=str(row['Experimental Strategy']).split('|'),
            experimentalPlatform=str(row['Experimental Platform']).split('|'),
            fileFormat=row['File Format'],
            fileSize=int(row['File Size']),
            fileSizeUnit=row['File Size Unit'],
        )
    except (ValidationError, ValueError, TypeError) as e:
        # ValueError/TypeError cover a failed int() conversion (blank, NaN or
        # non-numeric File Size) so one bad cell is reported for its row
        # instead of aborting the whole run.
        # The key is formatted rather than concatenated so non-string IDs
        # cannot raise TypeError inside this handler.
        record_key = f"{row['Sample External ID']}-{row['File Global ID']}"
        return False, (record_key, e)
    # Validation successful
    return True, None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
from src.include_linkml.include_pydantic import Study, Participant, Condition, Biospecimen, DataFile, Dataset | ||
from pydantic import ValidationError | ||
|
||
|
||
def _parse_bool_cell(value):
    """Interpret a CSV cell as a boolean without reading every non-empty string as True."""
    if isinstance(value, str):
        text = value.strip().lower()
        if text in ('true', 't', 'yes', 'y', '1'):
            return True
        if text in ('false', 'f', 'no', 'n', '0', ''):
            return False
        # Unrecognised text is handed to the model unchanged so it is judged
        # by the model instead of being silently read as True.
        return value
    return bool(value)


def validate_dataset_entry(row):
    """Validate one dataset record against the ``Dataset`` model.

    Pipe-delimited cells (``Experimental Strategy``, ``Experimental
    Platform``, ``Publication``, ``DbGaP``) are stringified and split on
    ``|``; the two expected-count columns are converted with ``int``;
    ``Is Harmonized?`` is parsed as a boolean.

    Args:
        row: Mapping-like record (for example a pandas Series) keyed by the
            CSV column headers read below.

    Returns:
        ``(True, None)`` when the model accepts the row, otherwise
        ``(False, (record_key, error))`` where ``record_key`` is
        ``"<Dataset Name>-<Dataset External ID>"`` and ``error`` is the
        exception raised while converting or validating the row.
    """
    try:
        Dataset(
            studyCode=row['Study Code'],
            datasetName=row['Dataset Name'],
            datasetDescription=row['Dataset Description'],
            datasetGlobalId=row['Dataset Global ID'],
            datasetExternalId=row['Dataset External ID'],
            expectedNumberOfParticipants=int(row['Expected Number of Participants']),
            expectedNumberOfFiles=int(row['Expected Number of Files']),
            dataCollectionStartYear=row['Data Collection Start Year'],
            dataCollectionEndYear=row['Data Collection End Year'],
            dataCategory=row['Data Category'],
            dataType=row['Data Type'],
            experimentalStrategy=str(row['Experimental Strategy']).split('|'),
            experimentalPlatform=str(row['Experimental Platform']).split('|'),
            publication=str(row['Publication']).split('|'),
            accessLimitations=row['Access Limitations'],
            accessRequirements=row['Access Requirements'],
            dbgap=str(row['DbGaP']).split('|'),
            otherRepository=row['Other Repository'],
            otherAccessAuthority=row['Other Access Authority'],
            # bool("False") and bool("No") are True, so text cells need
            # explicit parsing.
            isHarmonized=_parse_bool_cell(row['Is Harmonized?']),
        )
    except (ValidationError, ValueError, TypeError) as e:
        # ValueError/TypeError cover failed int() conversions so one bad cell
        # is reported for its row instead of aborting the whole run.
        # The key is formatted rather than concatenated so a non-string
        # Dataset Name cannot raise TypeError inside this handler.
        record_key = f"{row['Dataset Name']}-{row['Dataset External ID']}"
        return False, (record_key, e)
    # Validation successful
    return True, None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
from src.include_linkml.include_pydantic import Study, Participant, Condition, Biospecimen, DataFile, Dataset, DatasetManifest | ||
from pydantic import ValidationError | ||
|
||
|
||
def validate_datasetmanifest_entry(row):
    """Validate one dataset-manifest record against the ``DatasetManifest`` model.

    Args:
        row: Mapping-like record (for example a pandas Series) keyed by the
            CSV column headers read below.

    Returns:
        ``(True, None)`` when the model accepts the row, otherwise
        ``(False, (record_key, error))`` where ``record_key`` is
        ``"<Dataset Name>-<Dataset External ID>"`` and ``error`` is the
        ``ValidationError`` that was raised.
    """
    try:
        DatasetManifest(
            studyCode=row['Study Code'],
            datasetName=row['Dataset Name'],
            datasetGlobalId=row['Dataset Global ID'],
            datasetExternalId=row['Dataset External ID'],
            fileName=row['File Name'],
            fileGlobalId=row['File Global ID'],
        )
    except ValidationError as e:
        # Format instead of concatenating with "+": a Dataset Name that is not
        # a string (for example NaN for a blank cell) would otherwise raise
        # TypeError inside this handler.
        record_key = f"{row['Dataset Name']}-{row['Dataset External ID']}"
        return False, (record_key, e)
    # Validation successful
    return True, None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
from src.include_linkml.include_pydantic import Study, Participant | ||
from pydantic import ValidationError | ||
|
||
|
||
def validate_participant_entry(row):
    """Validate one participant record against the ``Participant`` model.

    Args:
        row: Mapping-like record (for example a pandas Series) keyed by the
            CSV column headers read below.

    Returns:
        ``(True, None)`` when the model accepts the row, otherwise
        ``(False, (record_key, error))`` where ``record_key`` is
        ``"<Study Code>-<Participant External ID>"`` and ``error`` is the
        ``ValidationError`` that was raised.
    """
    try:
        Participant(
            studyCode=row['Study Code'],
            participantGlobalId=row['Participant Global ID'],
            participantExternalId=row['Participant External ID'],
            familyId=row['Family ID'],
            familyType=row['Family Type'],
            fatherId=row['Father ID'],
            motherId=row['Mother ID'],
            siblingId=row['Sibling ID'],
            otherFamilyMemberId=row['Other Family Member ID'],
            familyRelationship=row['Family Relationship'],
            sex=row['Sex'],
            race=row['Race'],
            ethnicity=row['Ethnicity'],
            downSyndromeStatus=row['Down Syndrome Status'],
            ageAtFirstPatientEngagement=row['Age at First Patient Engagement'],
            firstPatientEngagementEvent=row['First Patient Engagement Event'],
            outcomesVitalStatus=row['Outcomes Vital Status'],
            ageAtLastVitalStatus=row['Age at Last Vital Status'],
        )
    except ValidationError as e:
        # Format instead of concatenating with "+": a Participant External ID
        # that is not a string (a number or NaN) would otherwise raise
        # TypeError inside this handler.
        record_key = f"{row['Study Code']}-{row['Participant External ID']}"
        return False, (record_key, e)
    # Validation successful
    return True, None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
from src.include_linkml.include_pydantic import Study | ||
from pydantic import ValidationError | ||
|
||
|
||
def _split_multivalued(value):
    """Split a pipe-delimited cell into a list; return non-string cells unchanged."""
    if isinstance(value, str):
        return value.split('|')
    # A blank cell arrives as NaN or None and has no .split(); pass it through
    # so the model decides whether a missing value is acceptable.
    return value


def validate_study_entry(row):
    """Validate one study record against the ``Study`` model.

    Multi-valued cells are split on ``|`` into lists and
    ``Expected Number of Participants`` is converted with ``int``.

    Args:
        row: Mapping-like record (for example a pandas Series) keyed by the
            CSV column headers read below.

    Returns:
        ``(True, None)`` when the model accepts the row, otherwise
        ``(False, (study_code, error))`` where ``error`` is the exception
        raised while converting or validating the row.
    """
    try:
        Study(
            studyCode=row['Study Code'],
            studyTitle=row['Study Title'],
            program=_split_multivalued(row['Program']),
            studyDescription=row['Study Description'],
            principalInvestigatorName=_split_multivalued(row['Principal Investigator Name']),
            studyContactName=_split_multivalued(row['Study Contact Name']),
            studyContactInstitution=_split_multivalued(row['Study Contact Institution']),
            studyContactEmail=_split_multivalued(row['Study Contact Email']),
            vbrEmail=row['VBR Email'],
            vbrUrl=row['VBR URL'],
            vbrReadme=row['VBR Readme'],
            researchDomain=_split_multivalued(row['Research Domain']),
            participantLifespanStage=_split_multivalued(row['Participant Lifespan Stage']),
            selectionCriteria=row['Selection Criteria'],
            studyDesign=row['Study Design'],
            clinicalDataSourceType=_split_multivalued(row['Clinical Data Source Type']),
            dataCategory=_split_multivalued(row['Data Category']),
            studyWebsite=row['Study Website'],
            dbgap=str(row['dbGaP']).split('|'),
            publication=str(row['Publication']).split('|'),
            expectedNumberOfParticipants=int(row['Expected Number of Participants']),
            guidType=row['GUID Type'],
            acknowledgments=_split_multivalued(row['Acknowledgements']),
            citationStatement=_split_multivalued(row['Citation Statement']),
        )
    except (ValidationError, ValueError, TypeError) as e:
        # ValueError/TypeError cover a failed int() conversion so one bad cell
        # is reported for its row instead of aborting the whole run.
        return False, (row['Study Code'], e)
    # Validation successful
    return True, None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
from src.data_validation.validation_utils import validate_data | ||
from src.data_validation.validate_study import validate_study_entry | ||
from src.data_validation.validate_participant import validate_participant_entry | ||
from src.data_validation.validate_condition import validate_condition_entry | ||
from src.data_validation.validate_biospecimen import validate_biospecimen_entry | ||
from src.data_validation.validate_datafile import validate_datafile_entry | ||
from src.data_validation.validate_dataset import validate_dataset_entry | ||
from src.data_validation.validate_datasetmanifest import validate_datasetmanifest_entry | ||
|
||
|
||
def validate_study(file_path, output_path='.'):
    """Validate a Study CSV file.

    Passes the columns to be string-cleaned and the per-row
    ``validate_study_entry`` check to ``validate_data`` and returns its
    ``(valid_count, invalid_count)`` result.
    """
    return validate_data(
        file_path=file_path,
        string_columns=[
            'Study Code',
            'Program',
            'Research Domain',
            'Participant Lifespan Stage',
            'Clinical Data Source Type',
            'Data Category',
            'GUID Type',
        ],
        validation_function=validate_study_entry,
        output_path=output_path,
    )
|
||
|
||
def validate_participant(file_path, output_path='.'):
    """Validate a Participant CSV file.

    Passes the columns to be string-cleaned and the per-row
    ``validate_participant_entry`` check to ``validate_data`` and returns
    its ``(valid_count, invalid_count)`` result.
    """
    return validate_data(
        file_path=file_path,
        string_columns=[
            'Study Code',
            'Family Type',
            'Family Relationship',
            'Sex',
            'Race',
            'Ethnicity',
            'Down Syndrome Status',
            'Outcomes Vital Status',
        ],
        validation_function=validate_participant_entry,
        output_path=output_path,
    )
|
||
|
||
def validate_condition(file_path, output_path='.'):
    """Validate a Condition CSV file.

    Passes the columns to be string-cleaned and the per-row
    ``validate_condition_entry`` check to ``validate_data`` and returns its
    ``(valid_count, invalid_count)`` result.
    """
    return validate_data(
        file_path=file_path,
        string_columns=[
            'Study Code',
            'Condition Interpretation',
            'Condition Status',
            'Condition Data Source',
        ],
        validation_function=validate_condition_entry,
        output_path=output_path,
    )
|
||
|
||
def validate_biospecimen(file_path, output_path='.'):
    """Validate a Biospecimen CSV file.

    Passes the columns to be string-cleaned and the per-row
    ``validate_biospecimen_entry`` check to ``validate_data`` and returns
    its ``(valid_count, invalid_count)`` result.
    """
    return validate_data(
        file_path=file_path,
        string_columns=[
            'Study Code',
            'Sample Availability',
            'Container Availability',
        ],
        validation_function=validate_biospecimen_entry,
        output_path=output_path,
    )
|
||
|
||
def validate_datafile(file_path, output_path='.'):
    """Validate a DataFile CSV file.

    Passes the columns to be string-cleaned and the per-row
    ``validate_datafile_entry`` check to ``validate_data`` and returns its
    ``(valid_count, invalid_count)`` result.
    """
    return validate_data(
        file_path=file_path,
        string_columns=[
            'Study Code',
            'Data Access',
            'Data Category',
        ],
        validation_function=validate_datafile_entry,
        output_path=output_path,
    )
|
||
|
||
def validate_dataset(file_path, output_path='.'):
    """Validate a Dataset CSV file.

    Passes the columns to be string-cleaned and the per-row
    ``validate_dataset_entry`` check to ``validate_data`` and returns its
    ``(valid_count, invalid_count)`` result.
    """
    # NOTE(review): 'Data Access' is not among the columns read by
    # validate_dataset_entry - confirm the dataset CSV really has this column.
    return validate_data(
        file_path=file_path,
        string_columns=[
            'Study Code',
            'Data Category',
            'Data Access',
        ],
        validation_function=validate_dataset_entry,
        output_path=output_path,
    )
|
||
def validate_datasetmanifest(file_path, output_path='.'):
    """Validate a DatasetManifest CSV file.

    Passes the column to be string-cleaned and the per-row
    ``validate_datasetmanifest_entry`` check to ``validate_data`` and
    returns its ``(valid_count, invalid_count)`` result.
    """
    return validate_data(
        file_path=file_path,
        string_columns=['Study Code'],
        validation_function=validate_datasetmanifest_entry,
        output_path=output_path,
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
import pandas as pd | ||
import os | ||
from datetime import datetime | ||
|
||
|
||
def clean_string(value):
    """Normalise a cell value to a lower-case, underscore-separated string.

    Strings are lower-cased and every space, hyphen and slash becomes an
    underscore. Missing values (as reported by ``pd.isna``) become ``None``.
    Any other value is converted with ``str`` and normalised the same way.
    """
    if not isinstance(value, str):
        if pd.isna(value):
            return None
        value = str(value)
    # One translation pass maps ' ', '-' and '/' to '_'.
    separators_to_underscore = str.maketrans(' -/', '___')
    return value.lower().translate(separators_to_underscore)
|
||
|
||
def clean_dataframe_strings(df, string_columns):
    """Apply ``clean_string`` to every cell of the given columns, in place.

    Args:
        df: DataFrame to modify; the selected columns are overwritten.
        string_columns: List of column labels to normalise.

    Returns:
        None. ``df`` is mutated.
    """
    selected = df[string_columns]
    # Element-wise DataFrame.map only exists in pandas >= 2.1; older releases
    # provide the same operation as applymap. The check is made on the class
    # so a column that happens to be named "map" cannot confuse it.
    if hasattr(pd.DataFrame, "map"):
        cleaned = selected.map(clean_string)
    else:
        cleaned = selected.applymap(clean_string)
    df[string_columns] = cleaned
|
||
|
||
def validate_dataframe(df, entry_validator, input_file_name=None, output_path=None):
    """Run ``entry_validator`` on every row of ``df`` and print a summary.

    Args:
        df: DataFrame whose rows are validated one at a time.
        entry_validator: Callable taking a row and returning
            ``(is_valid, error_info)``; for invalid rows ``error_info`` is
            indexed as ``(record_key, error)``.
        input_file_name: Optional name passed to ``save_validation_results``
            for building the log file name.
        output_path: Directory for the results log; nothing is saved when it
            is falsy.

    Returns:
        ``(valid_count, invalid_count)``.
    """
    # Rows are validated in an explicit loop: DataFrame.apply(axis=1) returns
    # a DataFrame rather than a Series of tuples for an empty frame, which
    # broke the tuple-unpacking loop below for header-only CSV files.
    outcomes = [entry_validator(row) for _, row in df.iterrows()]
    validation_results = pd.Series(outcomes, index=df.index, dtype=object)

    total_records = len(outcomes)
    valid_count = sum(1 for is_valid, _ in outcomes if is_valid)
    invalid_count = total_records - valid_count

    print("Number of errors by record type:")
    for is_valid, error_info in outcomes:
        if not is_valid:
            # Only the first whitespace-separated token of the error text is
            # shown; an empty message prints nothing instead of raising
            # IndexError.
            tokens = str(error_info[1]).split()
            print(f"{error_info[0]}: {tokens[0] if tokens else ''}")
    print(f"Total number of records in the file: {total_records}")
    print(f"Number of records with error: {invalid_count}")
    if output_path:
        output_file_path = save_validation_results(validation_results, input_file_name, output_path)
        print(f"Validation results saved to: {output_file_path}")
    return valid_count, invalid_count
|
||
|
||
def save_validation_results(validation_results, input_file_name, output_path):
    """Write one line per validation result to a dated text file.

    Args:
        validation_results: Iterable of results; each item is written as
            ``str(item)``.
        input_file_name: Prefix for the log file name, or a falsy value to
            omit the prefix.
        output_path: Directory for the log; created if it does not exist.

    Returns:
        Path of the file that was written.
    """
    os.makedirs(output_path, exist_ok=True)
    current_date = datetime.now().strftime("%Y-%m-%d")
    if input_file_name:
        output_file_name = f'{input_file_name}_validation_results_{current_date}.txt'
    else:
        output_file_name = f'validation_results_{current_date}.txt'
    output_file_path = os.path.join(output_path, output_file_name)
    # Explicit UTF-8: record keys and error text come from user data and may
    # contain characters the platform default encoding cannot represent.
    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.write('\n'.join(str(item) for item in validation_results))
    return output_file_path
|
||
|
||
def read_csv_file(file_path):
    """Load the CSV at ``file_path`` into a DataFrame with pandas' default options."""
    frame = pd.read_csv(file_path)
    return frame
|
||
|
||
def validate_data(file_path, string_columns, validation_function, output_path='.'):
    """Read a CSV file, clean its string columns and validate every row.

    Args:
        file_path: Path of the CSV file to validate.
        string_columns: Columns passed to ``clean_dataframe_strings``.
        validation_function: Per-row validator passed to ``validate_dataframe``.
        output_path: Directory passed on for the results log.

    Returns:
        ``(valid_count, invalid_count)`` as reported by ``validate_dataframe``.
    """
    frame = read_csv_file(file_path)
    clean_dataframe_strings(frame, string_columns)
    # Only the file's base name is passed on for naming the results log.
    base_name = os.path.split(file_path)[1]
    n_valid, n_invalid = validate_dataframe(
        frame,
        validation_function,
        input_file_name=base_name,
        output_path=output_path,
    )
    return n_valid, n_invalid
Oops, something went wrong.