This lesson covers data security principles and data masking techniques to protect sensitive information.
Data Masking is the process of hiding original data with modified content (characters or other data) to protect sensitive information while maintaining the format and structure of the original data.
Masking Objectives
Static Data Masking
Dynamic Data Masking
Substitution
Shuffling
Nulling/Deletion
Encryption
-- Example of data masking techniques
WITH masking_examples AS (
SELECT
customer_id,
-- Original sensitive data
email,
phone,
social_security_number,
credit_card_number,
-- Masked versions using different techniques
-- Email masking (substitution)
CASE
WHEN email LIKE '%@%' THEN
CONCAT('user', customer_id, '@example.com')
ELSE '[email protected]'
END as masked_email,
-- Phone masking (format preservation)
CASE
WHEN phone REGEXP '^[0-9]{10}$' THEN
CONCAT('XXX-XXX-', RIGHT(phone, 4))
WHEN phone REGEXP '^[0-9]{3}-[0-9]{3}-[0-9]{4}$' THEN
CONCAT('XXX-XXX-', SUBSTRING(phone, 9, 4))
ELSE 'XXX-XXX-XXXX'
END as masked_phone,
-- SSN masking (partial masking)
CASE
WHEN social_security_number REGEXP '^[0-9]{9}$' THEN
CONCAT('XXX-XX-', RIGHT(social_security_number, 4))
ELSE 'XXX-XX-XXXX'
END as masked_ssn,
-- Credit card masking (tokenization)
CASE
WHEN credit_card_number REGEXP '^[0-9]{16}$' THEN
CONCAT('XXXX-XXXX-XXXX-', RIGHT(credit_card_number, 4))
ELSE 'XXXX-XXXX-XXXX-XXXX'
END as masked_credit_card
FROM customers
)
SELECT
'Data Masking Example' as example_type,
customer_id,
masked_email,
masked_phone,
masked_ssn,
masked_credit_card,
-- Masking effectiveness
CASE
WHEN masked_email NOT LIKE '%@%' THEN 'Failed'
WHEN masked_email LIKE '%example.com' THEN 'Success'
ELSE 'Partial'
END as email_masking_status,
CASE
WHEN masked_phone LIKE 'XXX%' THEN 'Success'
ELSE 'Failed'
END as phone_masking_status
FROM masking_examples
LIMIT 10;
Anonymization vs Masking
Anonymization Techniques
Security Risks
Compliance Issues
Data Substitution
Data Subset Creation
Environment Isolation
-- Generate masked test data from production
WITH test_data_generation AS (
SELECT
customer_id,
-- Generate synthetic test data
CONCAT('testuser', customer_id) as test_username,
CONCAT('test', customer_id, '@example.com') as test_email,
-- Generate realistic but fake phone numbers
CONCAT('555-',
LPAD(FLOOR(RANDOM() * 1000), 3, '0'), '-',
LPAD(FLOOR(RANDOM() * 10000), 4, '0')) as test_phone,
-- Generate test addresses
CONCAT(FLOOR(RANDOM() * 9999) + 1, ' Test St.') as test_address,
CONCAT('Test City', FLOOR(RANDOM() * 10) + 1) as test_city,
-- Generate test dates within realistic ranges
DATE('2020-01-01', '+' || (FLOOR(RANDOM() * 1460)) || ' days') as test_created_date,
-- Generate test financial data
ROUND(RANDOM() * 100000 + 50000, 2) as test_income,
ROUND(RANDOM() * 10000, 2) as test_balance,
-- Data quality indicators
'SYNTHETIC' as data_source,
CURRENT_TIMESTAMP as generation_timestamp
FROM customers
WHERE customer_id <= 1000 -- Limit to subset for testing
)
SELECT
'Test Data Generation' as process_type,
COUNT(*) as records_generated,
COUNT(DISTINCT test_email) as unique_emails,
COUNT(DISTINCT test_phone) as unique_phones,
-- Data quality checks
COUNT(CASE WHEN test_email LIKE '%@example.com' THEN 1 END) as valid_emails,
COUNT(CASE WHEN test_phone REGEXP '^[0-9]{3}-[0-9]{3}-[0-9]{4}$' THEN 1 END) as valid_phones,
-- Date range validation
MIN(test_created_date) as earliest_date,
MAX(test_created_date) as latest_date,
-- Financial data validation
MIN(test_income) as min_income,
MAX(test_income) as max_income,
AVG(test_income) as avg_income
FROM test_data_generation;
Access Control
Data Protection
Monitoring and Auditing
Key Requirements
Data Subject Rights
Protected Health Information (PHI)
Security Rules
-- GDPR/HIPAA compliance monitoring dashboard
WITH compliance_metrics AS (
-- Data retention compliance
SELECT
'Data Retention' as compliance_area,
table_name,
COUNT(*) as total_records,
COUNT(CASE WHEN created_date < CURRENT_DATE - INTERVAL '7 years' THEN 1 END) as expired_records,
ROUND(COUNT(CASE WHEN created_date < CURRENT_DATE - INTERVAL '7 years' THEN 1 END) * 100.0 / COUNT(*), 2) as non_compliance_percentage,
CASE
WHEN COUNT(CASE WHEN created_date < CURRENT_DATE - INTERVAL '7 years' THEN 1 END) = 0 THEN 'Compliant'
WHEN COUNT(CASE WHEN created_date < CURRENT_DATE - INTERVAL '7 years' THEN 1 END) < COUNT(*) * 0.05 THEN 'Minor Issues'
ELSE 'Non-Compliant'
END as compliance_status
FROM customer_data
GROUP BY table_name
UNION ALL
-- Data access monitoring
SELECT
'Access Control' as compliance_area,
'Data Access Logs' as table_name,
COUNT(*) as total_access_events,
COUNT(CASE WHEN access_reason IS NULL THEN 1 END) as unauthorized_access,
ROUND(COUNT(CASE WHEN access_reason IS NULL THEN 1 END) * 100.0 / COUNT(*), 2) as non_compliance_percentage,
CASE
WHEN COUNT(CASE WHEN access_reason IS NULL THEN 1 END) = 0 THEN 'Compliant'
WHEN COUNT(CASE WHEN access_reason IS NULL THEN 1 END) < COUNT(*) * 0.01 THEN 'Minor Issues'
ELSE 'Non-Compliant'
END as compliance_status
FROM access_logs
WHERE access_timestamp >= CURRENT_DATE - INTERVAL '30 days'
GROUP BY 'Data Access Logs'
UNION ALL
-- Data masking verification
SELECT
'Data Masking' as compliance_area,
'PII Fields' as table_name,
COUNT(*) as total_pii_fields,
COUNT(CASE WHEN is_masked = false THEN 1 END) as unmasked_fields,
ROUND(COUNT(CASE WHEN is_masked = false THEN 1 END) * 100.0 / COUNT(*), 2) as non_compliance_percentage,
CASE
WHEN COUNT(CASE WHEN is_masked = false THEN 1 END) = 0 THEN 'Compliant'
WHEN COUNT(CASE WHEN is_masked = false THEN 1 END) < COUNT(*) * 0.05 THEN 'Minor Issues'
ELSE 'Non-Compliant'
END as compliance_status
FROM pii_field_audit
GROUP BY 'PII Fields'
)
SELECT
compliance_area,
table_name,
total_records as total_items,
expired_records as compliance_issues,
non_compliance_percentage,
compliance_status,
-- Action recommendations
CASE
WHEN compliance_status = 'Non-Compliant' THEN 'Immediate Action Required'
WHEN compliance_status = 'Minor Issues' THEN 'Schedule Review'
ELSE 'Maintain Compliance'
END as recommended_action
FROM compliance_metrics
ORDER BY non_compliance_percentage DESC;
Data Classification
Privacy by Design
Documentation and Reporting
Data Collection
Data Storage
Data Processing
Data Disposal
Authentication
Authorization
Accountability
-- Monitor data access for security compliance
WITH access_analysis AS (
SELECT
user_id,
user_role,
table_accessed,
operation_type,
access_timestamp,
-- Risk assessment
CASE
WHEN table_accessed IN 'customers', 'patients', 'employees' THEN 'High Risk'
WHEN table_accessed IN 'orders', 'transactions', 'claims' THEN 'Medium Risk'
ELSE 'Low Risk'
END as risk_level,
-- Access pattern analysis
COUNT(*) OVER (PARTITION BY user_id, table_accessed) as user_table_access_count,
COUNT(*) OVER (PARTITION BY user_id) as total_user_access,
-- Time-based analysis
EXTRACT(HOUR FROM access_timestamp) as access_hour,
CASE
WHEN EXTRACT(HOUR FROM access_timestamp) BETWEEN 9 AND 17 THEN 'Business Hours'
ELSE 'After Hours'
END as access_time_category
FROM access_logs
WHERE access_timestamp >= CURRENT_DATE - INTERVAL '7 days'
)
SELECT
'Access Control Analysis' as analysis_type,
user_id,
user_role,
table_accessed,
operation_type,
risk_level,
user_table_access_count,
total_user_access,
access_time_category,
-- Security flags
CASE
WHEN risk_level = 'High Risk' AND user_role NOT IN 'admin', 'manager' THEN 'Suspicious Access'
WHEN access_time_category = 'After Hours' AND user_role NOT IN 'admin', 'analyst' THEN 'Unusual Time'
WHEN user_table_access_count > 1000 THEN 'Excessive Access'
ELSE 'Normal'
END as security_flag,
-- Recommended action
CASE
WHEN security_flag != 'Normal' THEN 'Investigate'
ELSE 'Monitor'
END as recommended_action
FROM access_analysis
WHERE security_flag != 'Normal'
ORDER BY risk_level DESC, user_table_access_count DESC;
Technical Controls
Administrative Controls
Physical Controls
Detection
Response
Recovery
In the next lesson, we'll explore bias, ethics, and fairness in data analytics to understand the social implications of data-driven decisions.