Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Llm06 sensitive info disclosure #4

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
data/
.env/
.idea/
.DS_Store
Expand Down
17 changes: 0 additions & 17 deletions ai_sanitizer_app/sensitive_data_sanitizer.py

This file was deleted.

5 changes: 5 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,8 @@ matplotlib
seaborn
tqdm
pytest
presidio-anonymizer
presidio_analyzer



File renamed without changes.
25 changes: 12 additions & 13 deletions ai_sanitizer_app/config.py → sensitive_info_disclosure/config.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,42 @@
# Maps each entity label to the regex used to find it and the placeholder text
# substituted for every match. SensitiveDataSanitizer.sanitize_input iterates
# these entries in order and applies each pattern with re.IGNORECASE.
SENSITIVE_DATA_CONFIGS = {
"EMAIL": {
# NOTE(review): `[A-Z|a-z]` is a character class, so it also admits a literal
# "|" in the top-level domain; `[A-Za-z]` is presumably what was intended.
"pattern": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b",
"placeholder": "[EMAIL_ADDRESS]"
"placeholder": "<EMAIL_ADDRESS>"
},
"CREDIT_CARD": {
"pattern": r"\b(?:\d{4}[-\s]?){3}\d{4}\b",
"placeholder": "[CREDIT_CARD]"
"placeholder": "<CREDIT_CARD>"
},
"US_SSN": {
# NOTE(review): the `(?!\d{9}$)` lookahead only rejects an undashed 9-digit run
# when it ends the input, because `$` is anchored to the end of the text.
# Presumably meant to leave bare 9-digit numbers to US_BANK_ACCOUNT - confirm.
"pattern": r"\b(?!\d{9}$)\d{3}-?\d{2}-?\d{4}\b",
"placeholder": "[US_SSN]"
"placeholder": "<US_SSN>"
},
"US_BANK_ACCOUNT": {
"pattern": r"\b\d{9}\b",
"placeholder": "[US_BANK_ACCOUNT]"
"placeholder": "<US_BANK_ACCOUNT>"
},
"PHONE_NUMBER": {
"pattern": r"\b\d{3}-?\d{3}-?\d{4}\b",
"placeholder": "[PHONE_NUMBER]"
"placeholder": "<PHONE_NUMBER>"
},
"IP_ADDRESS": {
"pattern": r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b",
"placeholder": "[IP_ADDRESS]"
"placeholder": "<IP_ADDRESS>"
},
"UUID": {
"pattern": r"\b[0-9a-f]{8}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{12}\b",
"placeholder": "[UUID]"
"placeholder": "<UUID>"
},
"US_DRIVING_LICENSE": {
"pattern": r"\b[A-Z]{1,2}\d{4,8}\b",
"placeholder": "[US_DRIVING_LICENSE]"
"placeholder": "<US_DRIVING_LICENSE>"
},
"IBAN_CODE": {
"pattern": r"\b[A-Z]{2}[0-9]{2}[a-zA-Z0-9]{4}[0-9]{14}\b",
"placeholder": "[IBAN_CODE]"
"placeholder": "<IBAN_CODE>"
},
"PERSON_NAME": {
"pattern": r"\b[A-Z][a-z]*(?:-[A-Z][a-z]*)? "
r"(?:[A-Z]\.? )?[A-Z][a-z]*(?:-[A-Z][a-z]*)?(?:,? (Jr\.|Sr\.|III|IV|Ph\.D\.))?",
"placeholder": "[PERSON_NAME]"
# SensitiveDataSanitizer.sanitize_input never reads the pattern or placeholder
# of "OTHER"; encountering this key makes it run the Presidio analyzer and
# anonymizer over the text instead of a regex substitution.
"OTHER": {
"pattern": r"\b[A-Z][a-z]+\s[A-Z][a-z]+\b",
"placeholder": ""
}
}
32 changes: 32 additions & 0 deletions sensitive_info_disclosure/sensitive_data_sanitizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import re
from sensitive_info_disclosure.config import SENSITIVE_DATA_CONFIGS
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine


class SensitiveDataSanitizer:
    """Mask sensitive values in free text.

    Every entry of ``SENSITIVE_DATA_CONFIGS`` is processed in dictionary
    order. Entries other than ``"OTHER"`` are applied as case-insensitive
    regex substitutions; the ``"OTHER"`` entry triggers a Presidio
    analyze/anonymize pass over the text produced so far.
    """

    def __init__(self) -> None:
        self.sensitive_data = SENSITIVE_DATA_CONFIGS
        self.analyzer = AnalyzerEngine()
        self.anonymizer = AnonymizerEngine()

    def sanitize_input(self, input_content: str) -> str:
        """Return *input_content* with every configured entity masked.

        Args:
            input_content: Text that may contain sensitive values.

        Returns:
            The text after all configured substitutions have been applied.
        """
        # Nothing to mask; also avoids running the Presidio pass on an
        # empty string.
        if input_content == "":
            return input_content

        sanitized_content = input_content
        for entity, details in self.sensitive_data.items():
            if entity == "OTHER":
                sanitized_content = self._anonymize_with_presidio(
                    sanitized_content
                )
            else:
                sanitized_content = self._mask_pattern(
                    sanitized_content,
                    details["pattern"],
                    details["placeholder"],
                )
        return sanitized_content

    def _anonymize_with_presidio(self, text: str) -> str:
        """Run the Presidio analyzer and anonymizer over *text*."""
        analysis_results = self.analyzer.analyze(text=text, language="en")
        anonymized = self.anonymizer.anonymize(
            text=text,
            analyzer_results=analysis_results,
        )
        return anonymized.text

    @staticmethod
    def _mask_pattern(text: str, pattern: str, placeholder: str) -> str:
        """Replace every case-insensitive match of *pattern* with *placeholder*."""
        # A callable replacement inserts the placeholder verbatim. Passing
        # the string itself makes re.sub treat backslashes and group
        # references in it as template syntax, which can raise re.error or
        # silently alter the output.
        return re.sub(
            pattern,
            lambda _match: placeholder,
            text,
            flags=re.IGNORECASE,
        )


5 changes: 2 additions & 3 deletions tests/test_sensitive_data_sanitizer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pytest
from ai_sanitizer_app.sensitive_data_sanitizer import SensitiveDataSanitizer
from ai_sanitizer_app.config import SENSITIVE_DATA_CONFIGS
from sensitive_info_disclosure.sensitive_data_sanitizer import SensitiveDataSanitizer
from sensitive_info_disclosure.config import SENSITIVE_DATA_CONFIGS

test_cases = {
"CREDIT_CARD": "4012-8888-8888-8881",
Expand All @@ -12,7 +12,6 @@
"UUID": "a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11",
"US_DRIVING_LICENSE": "CA1234567",
"US_BANK_ACCOUNT": "123456789",
"PERSON_NAME": "John A. Doe Jr.",
}


Expand Down