CDCgov · cbrinson-rise8 · Dec 2, 2024 · Dec 18, 2024 · Dec 18, 2024 · Dec 18, 2024
@@ -83,9 +83,13 @@ linkage evaluation phase. The following features are supported:
 
 :   The patient's email address.
 
-`DRIVERS_LICENSE`
+`IDENTIFIER`
 
-:   The patient's driver's license number.
+:   Any of the patient's identifiers. This is a catch-all that covers every identifier type recorded for the patient.
+
+`IDENTIFIER:<type>`
+
+:   The patient's identifiers of one specific type. For example, `IDENTIFIER:MR` would be the patient's medical record number.
 
 
 ### Blocking Key Types
@@ -97,10 +101,6 @@ patient data and used during query retrieval. The following blocking key types a
 
 :   The patients birthdate in the format `YYYY-MM-DD`.
 
-`MRN` (ID: **2**)
-
-:   The last 4 characters of a patient's medical record number.
-
 `SEX` (ID: **3**)
 
 :   The patient's sex in the format of `M`, `F`, or `U` for unknown.
@@ -129,6 +129,10 @@ patient data and used during query retrieval. The following blocking key types a
 
 :   The first 4 characters of the patient's email address.
 
+`IDENTIFIER` (ID: **10**)
+
+:   The identifier triplet containing only the type, authority, and last 4 characters of the value.
+
 
 ### Evaluation Functions
 

@@ -9,7 +9,7 @@
             {
                 "blocking_keys": [
                     "BIRTHDATE",
-                    "MRN",
+                    "IDENTIFIER",  
                     "SEX"
                 ],
                 "evaluators": [
@@ -75,7 +75,7 @@
             {
                 "blocking_keys": [
                     "BIRTHDATE",
-                    "MRN",
+                    "IDENTIFIER",
                     "SEX"
                 ],
                 "evaluators": [
@@ -106,7 +106,7 @@
                         "CITY": 2.438553006137189,
                         "FIRST_NAME": 6.849475906891162,
                         "LAST_NAME": 6.350720397426025,
-                        "MRN": 0.3051262572525359,
+                        "IDENTIFIER": 0.3051262572525359,
                         "SEX": 0.7510419059643679,
                         "STATE": 0.022376768992488694,
                         "ZIP": 4.975031471124867
@@ -148,7 +148,7 @@
                         "CITY": 2.438553006137189,
                         "FIRST_NAME": 6.849475906891162,
                         "LAST_NAME": 6.350720397426025,
-                        "MRN": 0.3051262572525359,
+                        "IDENTIFIER": 0.3051262572525359,
                         "SEX": 0.7510419059643679,
                         "STATE": 0.022376768992488694,
                         "ZIP": 4.975031471124867

@@ -33,26 +33,19 @@ def fhir_record_to_pii_record(fhir_record: dict) -> schemas.PIIRecord:
         "birthDate": fhir_record.get("birthDate"),
         "sex": fhir_record.get("gender"),
         "address": fhir_record.get("address", []),
-        "mrn": None,
-        "ssn": None,
         "race": None,
         "gender": None,
         "telecom": fhir_record.get("telecom", []),
-        "drivers_license": None,
+        "identifiers": [],
     }
     for identifier in fhir_record.get("identifier", []):
-        for coding in identifier.get("type", {}).get("coding", []):
-            if coding.get("code") == "MR":
-                val["mrn"] = identifier.get("value")
-            elif coding.get("code") == "SS":
-                val["ssn"] = identifier.get("value")
-            elif coding.get("code") == "DL":
-                license_number = identifier.get("value")
-                authority = identifier.get("assigner", {}).get("identifier", {}).get("value", "")  # Assuming `issuer` contains authority info
-                val["drivers_license"] = {
-                    "value": license_number,
-                    "authority": authority
-                }
+        for code in identifier.get("type", {}).get("coding", []):
+            val["identifiers"].append({
+                "value": identifier.get("value"),
+                "type": code.get("code"),
+                "authority": identifier.get("assigner", {}).get("identifier", {}).get("value", ""),
+            })
+            break   # Shouldn't be more than 1 code
     for address in val["address"]:
         address["county"] = address.get("district", "")
         for extension in address.get("extension", []):

@@ -51,7 +51,7 @@ def compare(
     details: dict[str, typing.Any] = {"patient.reference_id": str(patient.reference_id)}
     for e in evals:
         # TODO: can we do this check earlier?
-        feature = getattr(schemas.Feature, e.feature, None)
+        feature = schemas.Feature.parse(e.feature)
         if feature is None:
             raise ValueError(f"Invalid comparison field: {e.feature}")
         # Evaluate the comparison function and append the result to the list

@@ -179,7 +179,7 @@ def compare_fuzzy_match(
       beyond which to classify the strings as a partial match.
     :return: A float indicating whether the features are a fuzzy match.
     """
-    similarity_measure, threshold = _get_fuzzy_params(str(key), **kwargs)
+    similarity_measure, threshold = _get_fuzzy_params(str(key.attribute), **kwargs)
     comp_func = getattr(rapidfuzz.distance, similarity_measure).normalized_similarity
     for x in record.feature_iter(key):
         for y in patient.record.feature_iter(key):
@@ -203,11 +203,11 @@ def compare_probabilistic_fuzzy_match(
       beyond which to classify the strings as a partial match.
     :return: A float of the score the feature comparison earned.
     """
-    log_odds = kwargs.get("log_odds", {}).get(str(key))
+    log_odds = kwargs.get("log_odds", {}).get(str(key.attribute))
     if log_odds is None:
         raise ValueError(f"Log odds not found for feature {key}")
 
-    similarity_measure, threshold = _get_fuzzy_params(str(key), **kwargs)
+    similarity_measure, threshold = _get_fuzzy_params(str(key.attribute), **kwargs)
     comp_func = getattr(rapidfuzz.distance, similarity_measure).normalized_similarity
     max_score = 0.0
     for x in patient.record.feature_iter(key):

@@ -121,14 +121,14 @@ class BlockingKey(enum.Enum):
     """
 
     BIRTHDATE = ("BIRTHDATE", 1, "Date of birth as YYYY-MM-DD")
-    MRN = ("MRN", 2, "Last 4 characters of Medical record number")
     SEX = ("SEX", 3, "Sex at birth; M, F or U")
     ZIP = ("ZIP", 4, "5 digital US Postal Code")
     FIRST_NAME = ("FIRST_NAME", 5, "First 4 characters of the first name")
     LAST_NAME = ("LAST_NAME", 6, "First 4 characters of the last name")
     ADDRESS = ("ADDRESS", 7, "First 4 characters of the address")
     PHONE = ("PHONE", 8, "Last 4 characters of the phone number")
     EMAIL = ("EMAIL", 9, "First 4 characters of the email address")
+    IDENTIFIER = ("IDENTIFIER", 10, "Identifier triplet with only last 4 character of the value. Format \"type:authority:value\"")
 
     def __init__(self, value: str, _id: int, description: str):
         self._value = value

@@ -11,6 +11,7 @@
 from .mpi import PatientRef
 from .mpi import PersonRef
 from .pii import Feature
+from .pii import FeatureAttribute
 from .pii import PIIRecord
 from .seed import Cluster
 from .seed import ClusterGroup
@@ -22,6 +23,7 @@
     "AlgorithmPass",
     "AlgorithmSummary",
     "Feature",
+    "FeatureAttribute",
     "PIIRecord",
     "Prediction",
     "LinkInput",

@@ -13,7 +13,6 @@
 
 from recordlinker.linking import matchers
 from recordlinker.models.mpi import BlockingKey
-from recordlinker.schemas.pii import Feature
 
 
 class Evaluator(pydantic.BaseModel):
@@ -23,10 +22,9 @@ class Evaluator(pydantic.BaseModel):
 
     model_config = pydantic.ConfigDict(from_attributes=True, use_enum_values=True)
 
-    feature: Feature
+    feature: str
     func: matchers.FeatureFunc
 
-
 class AlgorithmPass(pydantic.BaseModel):
     """
     The schema for an algorithm pass record.

@@ -0,0 +1,195 @@
+import enum
+import re
+import typing
+
+import pydantic
+
+
+class IdentifierType(enum.Enum):
+    """
+    Identifier type codes, e.g. MR (medical record), SS (social security), DL (driver's license); presumably HL7 Table 0203 - TODO confirm.
+    """
+    AC = "AC"
+    ACSN = "ACSN"
+    AIN = "AIN"
+    AM = "AM"
+    AMA = "AMA"
+    AN = "AN"
+    ANC = "ANC"
+    AND = "AND"
+    ANON = "ANON"
+    ANT = "ANT"
+    APRN = "APRN"
+    ASID = "ASID"
+    BA = "BA"
+    BC = "BC"
+    BCFN = "BCFN"
+    BCT = "BCT"
+    BR = "BR"
+    BRN = "BRN"
+    BSNR = "BSNR"
+    CAII = "CAII"
+    CC = "CC"
+    CONM = "CONM"
+    CY = "CY"
+    CZ = "CZ"
+    DC = "DC"
+    DCFN = "DCFN"
+    DDS = "DDS"
+    DEA = "DEA"
+    DFN = "DFN"
+    DI = "DI"
+    DL = "DL"
+    DN = "DN"
+    DO = "DO"
+    DP = "DP"
+    DPM = "DPM"
+    DR = "DR"
+    DS = "DS"
+    DSG = "DSG"
+    EI = "EI"
+    EN = "EN"
+    ESN = "ESN"
+    FDR = "FDR"
+    FDRFN = "FDRFN"
+    FGN = "FGN"
+    FI = "FI"
+    FILL = "FILL"
+    GI = "GI"
+    GIN = "GIN"
+    GL = "GL"
+    GN = "GN"
+    HC = "HC"
+    IND = "IND"
+    IRISTEM = "IRISTEM"
+    JHN = "JHN"
+    LACSN = "LACSN"
+    LANR = "LANR"
+    LI = "LI"
+    LN = "LN"
+    LR = "LR"
+    MA = "MA"
+    MB = "MB"
+    MC = "MC"
+    MCD = "MCD"
+    MCN = "MCN"
+    MCR = "MCR"
+    MCT = "MCT"
+    MD = "MD"
+    MI = "MI"
+    MR = "MR"
+    MRT = "MRT"
+    MS = "MS"
+    NBSNR = "NBSNR"
+    NCT = "NCT"
+    NE = "NE"
+    NH = "NH"
+    NI = "NI"
+    NII = "NII"
+    NIIP = "NIIP"
+    NP = "NP"
+    NPI = "NPI"
+    OBI = "OBI"
+    OD = "OD"
+    PA = "PA"
+    PC = "PC"
+    PCN = "PCN"
+    PE = "PE"
+    PEN = "PEN"
+    PGN = "PGN"
+    PHC = "PHC"
+    PHE = "PHE"
+    PHO = "PHO"
+    PI = "PI"
+    PIN = "PIN"
+    PLAC = "PLAC"
+    PN = "PN"
+    PNT = "PNT"
+    PPIN = "PPIN"
+    PPN = "PPN"
+    PRC = "PRC"
+    PRN = "PRN"
+    PT = "PT"
+    QA = "QA"
+    RI = "RI"
+    RN = "RN"
+    RPH = "RPH"
+    RR = "RR"
+    RRI = "RRI"
+    RRP = "RRP"
+    SAMN = "SAMN"
+    SB = "SB"
+    SID = "SID"
+    SL = "SL"
+    SN = "SN"
+    SNBSN = "SNBSN"
+    SNO = "SNO"
+    SP = "SP"
+    SR = "SR"
+    SRX = "SRX"
+    SS = "SS"
+    STN = "STN"
+    TAX = "TAX"
+    TN = "TN"
+    TPR = "TPR"
+    TRL = "TRL"
+    U = "U"
+    UDI = "UDI"
+    UPIN = "UPIN"
+    USID = "USID"
+    VN = "VN"
+    VP = "VP"
+    VS = "VS"
+    WC = "WC"
+    WCN = "WCN"
+    WP = "WP"
+    XV = "XV"
+    XX = "XX"
+
+    def __str__(self):
+        """
+        Return the member's bare code (e.g. `MR`) instead of the default `IdentifierType.MR` form.
+        """
+        return self.value
+
+
+class Identifier(pydantic.BaseModel):
+    """
+    Schema for one patient identifier: a typed `value` plus an optional `authority`; extra fields are allowed.
+    """
+
+    model_config = pydantic.ConfigDict(extra="allow")
+
+    type: IdentifierType
+    value: str
+    authority: typing.Optional[str] = None
+
+    @pydantic.field_validator("type", mode="before")
+    def parse_type(cls, value):
+        """
+        Convert a truthy raw type into an IdentifierType (unknown codes raise ValueError); falsy input is returned unchanged.
+        """
+        if value:   
+            return IdentifierType(value)
+        return value
+
+    # TODO: decide whether to keep SSN normalization; `value` cannot be None, so a malformed SSN currently becomes ''.
+    @pydantic.field_validator("value", mode="before")
+    def parse_value(cls, value, values):
+        """
+        Normalize SS-type values to XXX-XX-XXXX (malformed ones become ''); values of every other type are returned unchanged.
+        """
+        if values.data.get("type") == IdentifierType.SS:           
+            val = str(value).strip()
+
+            if re.match(r"^\d{3}-\d{2}-\d{4}$", val):
+                return val
+
+            if len(val) != 9 or not val.isdigit():
+                return ''
+
+            # Exactly nine bare digits remain: insert dashes to get the standard XXX-XX-XXXX form
+            formatted_ssn = f"{val[:3]}-{val[3:5]}-{val[5:]}"
+            return formatted_ssn
+
+        return value