Skip to content

Commit

Permalink
Merge pull request #199 from include-dcc/validation_updates
Browse files Browse the repository at this point in the history
NA and other minor error handling
  • Loading branch information
madanucd authored Jul 25, 2024
2 parents 66842f8 + d03fae8 commit b48876d
Show file tree
Hide file tree
Showing 9 changed files with 171 additions and 155 deletions.
50 changes: 25 additions & 25 deletions src/data_validation/validate_biospecimen.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,37 @@
from src.include_linkml.include_pydantic import Study, Participant, Condition, Biospecimen
from src.data_validation.validation_utils import handle_nan
from pydantic import ValidationError


def validate_biospecimen_entry(row):
    """Validate one biospecimen record against the ``Biospecimen`` model.

    Args:
        row: Record indexable by lower-cased column name (``row['study code']``
            etc.). Presumably a pandas Series -- confirm against the caller.

    Returns:
        ``(True, None)`` when the model accepts the record, otherwise
        ``(False, (identifier, error))`` where ``identifier`` is
        ``"<study code>-<sample external id>"`` and ``error`` is the
        ``ValidationError`` raised by the model.
    """
    try:
        # handle_nan presumably maps NaN/NA cells to None so that optional
        # fields validate -- confirm in validation_utils.
        # The two availability fields are passed through unwrapped.
        Biospecimen(
            studyCode=handle_nan(row['study code']),
            participantGlobalId=handle_nan(row['participant global id']),
            participantExternalId=handle_nan(row['participant external id']),
            sampleGlobalId=handle_nan(row['sample global id']),
            sampleExternalId=handle_nan(row['sample external id']),
            sampleType=handle_nan(row['sample type']),
            ageAtBiospecimenCollection=handle_nan(row['age at biospecimen collection']),
            parentSampleGlobalId=handle_nan(row['parent sample global id']),
            parentSampleExternalId=handle_nan(row['parent sample external id']),
            parentSampleType=handle_nan(row['parent sample type']),
            collectionGlobalId=handle_nan(row['collection global id']),
            collectionExternalId=handle_nan(row['collection external id']),
            collectionSampleType=handle_nan(row['collection sample type']),
            containerGlobalId=handle_nan(row['container global id']),
            containerExternalId=handle_nan(row['container external id']),
            volume=handle_nan(row['volume']),
            volumeUnit=handle_nan(row['volume unit']),
            concentration=handle_nan(row['concentration']),
            concentrationUnit=handle_nan(row['concentration unit']),
            laboratoryProcedure=handle_nan(row['laboratory procedure']),
            biospecimenStorage=handle_nan(row['biospecimen storage']),
            sampleAvailability=row['sample availability'],
            containerAvailability=row['container availability'],
        )
    except ValidationError as e:
        # Format via f-string (implicit str()) so NaN / numeric cells cannot
        # break the construction of the error identifier.
        identifier = f"{row['study code']}-{row['sample external id']}"
        return False, (identifier, e)
    # Validation successful
    return True, None
44 changes: 22 additions & 22 deletions src/data_validation/validate_condition.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,34 @@
from src.include_linkml.include_pydantic import Study, Participant, Condition
from src.data_validation.validation_utils import handle_nan
from pydantic import ValidationError


def validate_condition_entry(row):
    """Validate one condition record against the ``Condition`` model.

    Args:
        row: Record indexable by lower-cased column name. Presumably a
            pandas Series -- confirm against the caller.

    Returns:
        ``(True, None)`` when the model accepts the record, otherwise
        ``(False, (identifier, error))`` where ``identifier`` is
        ``"<study code>-<participant external id>-<event id>"`` and
        ``error`` is the ``ValidationError`` raised by the model.
    """
    try:
        # handle_nan presumably maps NaN/NA cells to None -- confirm in
        # validation_utils. studyCode and the three condition* status fields
        # are passed through unwrapped.
        Condition(
            studyCode=row['study code'],
            participantGlobalId=handle_nan(row['participant global id']),
            participantExternalId=handle_nan(row['participant external id']),
            eventId=handle_nan(row['event id']),
            eventType=handle_nan(row['event type']),
            conditionMeasureSourceText=handle_nan(row['condition or measure source text']),
            ageAtConditionMeasureObservation=handle_nan(row['age at condition or measure observation']),
            conditionInterpretation=row['condition interpretation'],
            conditionStatus=row['condition status'],
            conditionDataSource=row['condition data source'],
            hpoLabel=handle_nan(row['hpo label']),
            hpoCode=handle_nan(row['hpo code']),
            mondoLabel=handle_nan(row['mondo label']),
            mondoCode=handle_nan(row['mondo code']),
            maxoLabel=handle_nan(row['maxo label']),
            maxoCode=handle_nan(row['maxo code']),
            otherLabel=handle_nan(row['other label']),
            otherCode=handle_nan(row['other code']),
            measureValue=handle_nan(row['measure value']),
            measureUnit=handle_nan(row['measure unit']),
        )
    except ValidationError as e:
        # f-string formatting applies str() to each part, so missing or
        # numeric identifiers cannot raise while reporting the failure.
        identifier = (
            f"{row['study code']}-{row['participant external id']}-{row['event id']}"
        )
        return False, (identifier, e)
    # Validation successful
    return True, None
42 changes: 21 additions & 21 deletions src/data_validation/validate_datafile.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,33 @@
from src.include_linkml.include_pydantic import Study, Participant, Condition, Biospecimen, DataFile
from src.data_validation.validation_utils import handle_nan
from pydantic import ValidationError


def validate_datafile_entry(row):
    """Validate one data-file record against the ``DataFile`` model.

    Args:
        row: Record indexable by lower-cased column name. Presumably a
            pandas Series -- confirm against the caller.

    Returns:
        ``(True, None)`` when the model accepts the record, otherwise
        ``(False, (identifier, error))`` where ``identifier`` is
        ``"<sample external id>-<file global id>"`` and ``error`` is the
        ``ValidationError`` raised by the model.
    """
    try:
        # handle_nan presumably maps NaN/NA cells to None -- confirm in
        # validation_utils. studyCode, dataAccess and dataCategory are
        # passed through unwrapped.
        DataFile(
            studyCode=row['study code'],
            participantGlobalId=handle_nan(row['participant global id']),
            participantExternalId=handle_nan(row['participant external id']),
            sampleGlobalId=handle_nan(row['sample global id']),
            sampleExternalId=handle_nan(row['sample external id']),
            fileName=handle_nan(row['file name']),
            fileGlobalId=handle_nan(row['file global id']),
            fileS3Location=handle_nan(row['file s3 location']),
            fileUploadLocation=handle_nan(row['file upload location']),
            drsUri=handle_nan(row['drs uri']),
            fileHash=handle_nan(row['file hash']),
            dataAccess=row['data access'],
            dataCategory=row['data category'],
            dataType=handle_nan(row['data type']),
            # Pipe-delimited multi-value cells become lists; an empty cell
            # becomes []. str() keeps a non-string cell (e.g. a number) from
            # raising AttributeError on .split().
            experimentalStrategy=(
                str(row['experimental strategy']).split('|')
                if handle_nan(row['experimental strategy']) else []
            ),
            experimentalPlatform=(
                str(row['experimental platform']).split('|')
                if handle_nan(row['experimental platform']) else []
            ),
            fileFormat=handle_nan(row['file format']),
            fileSize=handle_nan(row['file size']),
            fileSizeUnit=handle_nan(row['file size unit']),
        )
    except ValidationError as e:
        # f-string formatting applies str() to each part of the identifier.
        identifier = f"{row['sample external id']}-{row['file global id']}"
        return False, (identifier, e)
    # Validation successful
    return True, None
44 changes: 22 additions & 22 deletions src/data_validation/validate_dataset.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,34 @@
from src.include_linkml.include_pydantic import Study, Participant, Condition, Biospecimen, DataFile, Dataset
from src.data_validation.validation_utils import handle_nan
from pydantic import ValidationError


def validate_dataset_entry(row):
    """Validate one dataset record against the ``Dataset`` model.

    Args:
        row: Record indexable by lower-cased column name. Presumably a
            pandas Series -- confirm against the caller.

    Returns:
        ``(True, None)`` when the model accepts the record, otherwise
        ``(False, (identifier, error))`` where ``identifier`` is
        ``"<dataset name>-<dataset external id>"`` and ``error`` is the
        ``ValidationError`` raised by the model.
    """
    try:
        # handle_nan presumably maps NaN/NA cells to None -- confirm in
        # validation_utils. studyCode and dataCategory are passed through
        # unwrapped.
        Dataset(
            studyCode=row['study code'],
            datasetName=handle_nan(row['dataset name']),
            datasetDescription=handle_nan(row['dataset description']),
            datasetGlobalId=handle_nan(row['dataset global id']),
            datasetExternalId=handle_nan(row['dataset external id']),
            expectedNumberOfParticipants=handle_nan(row['expected number of participants']),
            expectedNumberOfFiles=handle_nan(row['expected number of files']),
            dataCollectionStartYear=handle_nan(row['data collection start year']),
            dataCollectionEndYear=handle_nan(row['data collection end year']),
            dataCategory=row['data category'],
            dataType=handle_nan(row['data type']),
            # Pipe-delimited multi-value cells become lists; an empty cell
            # becomes []. str() keeps a non-string cell (e.g. a number) from
            # raising AttributeError on .split().
            experimentalStrategy=(
                str(row['experimental strategy']).split('|')
                if handle_nan(row['experimental strategy']) else []
            ),
            experimentalPlatform=(
                str(row['experimental platform']).split('|')
                if handle_nan(row['experimental platform']) else []
            ),
            publication=(
                str(row['publication']).split('|')
                if handle_nan(row['publication']) else []
            ),
            accessLimitations=handle_nan(row['access limitations']),
            accessRequirements=handle_nan(row['access requirements']),
            dbgap=(
                str(row['dbgap']).split('|')
                if handle_nan(row['dbgap']) else []
            ),
            otherRepository=handle_nan(row['other repository']),
            otherAccessAuthority=handle_nan(row['other access authority']),
            # NOTE(review): bool() is True for any non-empty string
            # (including "False"/"No") and for NaN -- confirm how this
            # column is encoded before relying on the result.
            isHarmonized=bool(row['is harmonized?']),
        )
    except ValidationError as e:
        # f-string formatting applies str() to both parts; plain "+" on the
        # dataset name raised TypeError when that cell was NaN / non-string,
        # masking the validation error being reported.
        identifier = f"{row['dataset name']}-{row['dataset external id']}"
        return False, (identifier, e)
    # Validation successful
    return True, None
16 changes: 8 additions & 8 deletions src/data_validation/validate_datasetmanifest.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
from src.include_linkml.include_pydantic import Study, Participant, Condition, Biospecimen, DataFile, Dataset, DatasetManifest
from src.data_validation.validation_utils import handle_nan
from pydantic import ValidationError


def validate_datasetmanifest_entry(row):
    """Validate one dataset-manifest record against ``DatasetManifest``.

    Args:
        row: Record indexable by lower-cased column name. Presumably a
            pandas Series -- confirm against the caller.

    Returns:
        ``(True, None)`` when the model accepts the record, otherwise
        ``(False, (identifier, error))`` where ``identifier`` is
        ``"<dataset name>-<dataset external id>"`` and ``error`` is the
        ``ValidationError`` raised by the model.
    """
    try:
        # handle_nan presumably maps NaN/NA cells to None -- confirm in
        # validation_utils. studyCode is passed through unwrapped.
        DatasetManifest(
            studyCode=row['study code'],
            datasetName=handle_nan(row['dataset name']),
            datasetGlobalId=handle_nan(row['dataset global id']),
            datasetExternalId=handle_nan(row['dataset external id']),
            fileName=handle_nan(row['file name']),
            fileGlobalId=handle_nan(row['file global id']),
        )
    except ValidationError as e:
        # f-string formatting applies str() to each part of the identifier.
        identifier = f"{row['dataset name']}-{row['dataset external id']}"
        return False, (identifier, e)
    # Validation successful
    return True, None
40 changes: 20 additions & 20 deletions src/data_validation/validate_participant.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,32 @@
from src.include_linkml.include_pydantic import Study, Participant
from src.data_validation.validation_utils import handle_nan
from pydantic import ValidationError


def validate_participant_entry(row):
    """Validate one participant record against the ``Participant`` model.

    Args:
        row: Record indexable by lower-cased column name. Presumably a
            pandas Series -- confirm against the caller.

    Returns:
        ``(True, None)`` when the model accepts the record, otherwise
        ``(False, (identifier, error))`` where ``identifier`` is
        ``"<study code>-<participant external id>"`` and ``error`` is the
        ``ValidationError`` raised by the model.
    """
    try:
        # handle_nan presumably maps NaN/NA cells to None -- confirm in
        # validation_utils. studyCode, familyType, familyRelationship, sex,
        # race, ethnicity, downSyndromeStatus and outcomesVitalStatus are
        # passed through unwrapped.
        Participant(
            studyCode=row['study code'],
            participantGlobalId=handle_nan(row['participant global id']),
            participantExternalId=handle_nan(row['participant external id']),
            familyId=handle_nan(row['family id']),
            familyType=row['family type'],
            fatherId=handle_nan(row['father id']),
            motherId=handle_nan(row['mother id']),
            siblingId=handle_nan(row['sibling id']),
            otherFamilyMemberId=handle_nan(row['other family member id']),
            familyRelationship=row['family relationship'],
            sex=row['sex'],
            race=row['race'],
            ethnicity=row['ethnicity'],
            downSyndromeStatus=row['down syndrome status'],
            ageAtFirstPatientEngagement=handle_nan(row['age at first patient engagement']),
            firstPatientEngagementEvent=handle_nan(row['first patient engagement event']),
            outcomesVitalStatus=row['outcomes vital status'],
            ageAtLastVitalStatus=handle_nan(row['age at last vital status']),
        )
    except ValidationError as e:
        # f-string formatting applies str() to each part of the identifier.
        identifier = f"{row['study code']}-{row['participant external id']}"
        return False, (identifier, e)
    # Validation successful
    return True, None
Loading

0 comments on commit b48876d

Please sign in to comment.