This lesson covers data quality dimensions and profiling techniques to ensure reliable data analysis.
Data Profiling is the systematic examination of data to understand its structure, content, and quality. It involves analyzing data to identify patterns, anomalies, and quality issues.
Profiling Objectives
Types of Duplicates
Duplicate Detection Methods
-- Find duplicate records based on key fields
WITH duplicate_candidates AS (
SELECT
customer_id,
email,
first_name,
last_name,
phone,
-- Count occurrences of same email
COUNT(*) OVER (PARTITION BY email) as email_count,
-- Count occurrences of same name
COUNT(*) OVER (PARTITION BY first_name, last_name) as name_count,
-- Count occurrences of same phone
COUNT(*) OVER (PARTITION BY phone) as phone_count
FROM customers
WHERE email IS NOT NULL
)
SELECT
'Duplicate Analysis' as analysis_type,
email,
first_name,
last_name,
phone,
email_count,
name_count,
phone_count,
-- Duplicate classification
CASE
WHEN email_count > 1 THEN 'Email Duplicate'
WHEN name_count > 1 THEN 'Name Duplicate'
WHEN phone_count > 1 THEN 'Phone Duplicate'
ELSE 'Unique'
END as duplicate_type,
-- Risk level
CASE
WHEN email_count > 2 OR name_count > 2 THEN 'High Risk'
WHEN email_count > 1 OR name_count > 1 THEN 'Medium Risk'
ELSE 'Low Risk'
END as risk_level
FROM duplicate_candidates
WHERE email_count > 1 OR name_count > 1 OR phone_count > 1
ORDER BY email_count DESC, name_count DESC;
Types of Missing Data
Missing Data Patterns
-- Analyze missing data patterns
SELECT
'Missing Data Analysis' as analysis_type,
table_name,
column_name,
data_type,
COUNT(*) as total_records,
COUNT(CASE WHEN column_value IS NULL THEN 1 END) as null_count,
COUNT(CASE WHEN column_value = '' THEN 1 END) as empty_count,
COUNT(CASE WHEN column_value IS NULL OR column_value = '' THEN 1 END) as total_missing,
-- Missing percentage
ROUND(COUNT(CASE WHEN column_value IS NULL OR column_value = '' THEN 1 END) * 100.0 / COUNT(*), 2) as missing_percentage,
-- Missing pattern classification
CASE
WHEN COUNT(CASE WHEN column_value IS NULL OR column_value = '' THEN 1 END) * 100.0 / COUNT(*) > 50 THEN 'Severe Missing'
WHEN COUNT(CASE WHEN column_value IS NULL OR column_value = '' THEN 1 END) * 100.0 / COUNT(*) > 20 THEN 'High Missing'
WHEN COUNT(CASE WHEN column_value IS NULL OR column_value = '' THEN 1 END) * 100.0 / COUNT(*) > 5 THEN 'Moderate Missing'
WHEN COUNT(CASE WHEN column_value IS NULL OR column_value = '' THEN 1 END) * 100.0 / COUNT(*) > 0 THEN 'Low Missing'
ELSE 'Complete'
END as missing_severity,
-- Recommended action
CASE
WHEN COUNT(CASE WHEN column_value IS NULL OR column_value = '' THEN 1 END) * 100.0 / COUNT(*) > 50 THEN 'Investigate Data Collection'
WHEN COUNT(CASE WHEN column_value IS NULL OR column_value = '' THEN 1 END) * 100.0 / COUNT(*) > 20 THEN 'Impute or Remove'
WHEN COUNT(CASE WHEN column_value IS NULL OR column_value = '' THEN 1 END) * 100.0 / COUNT(*) > 5 THEN 'Monitor Closely'
ELSE 'No Action Needed'
END as recommended_action
FROM data_profile
GROUP BY table_name, column_name, data_type
ORDER BY missing_percentage DESC;
Data Pattern Types
Pattern Detection Methods
-- Analyze data patterns in text fields
SELECT
'Pattern Analysis' as analysis_type,
column_name,
data_type,
COUNT(*) as total_records,
-- Email pattern analysis
COUNT(CASE WHEN column_value LIKE '%@%.%' THEN 1 END) as email_format_count,
COUNT(CASE WHEN column_value REGEXP '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' THEN 1 END) as valid_email_count,
-- Phone pattern analysis
COUNT(CASE WHEN column_value REGEXP '^\+?[0-9]{10,15}$' THEN 1 END) as phone_format_count,
-- Text pattern analysis
AVG(LENGTH(column_value)) as avg_length,
MIN(LENGTH(column_value)) as min_length,
MAX(LENGTH(column_value)) as max_length,
-- Pattern quality
CASE
WHEN data_type = 'email' AND COUNT(CASE WHEN column_value REGEXP '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' THEN 1 END) = COUNT(*) THEN 'Excellent'
WHEN data_type = 'email' AND COUNT(CASE WHEN column_value REGEXP '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' THEN 1 END) > COUNT(*) * 0.9 THEN 'Good'
WHEN data_type = 'email' AND COUNT(CASE WHEN column_value REGEXP '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' THEN 1 END) > COUNT(*) * 0.7 THEN 'Fair'
ELSE 'Poor'
END as pattern_quality
FROM data_profile
WHERE data_type IN ('email', 'phone', 'text')
GROUP BY column_name, data_type
ORDER BY pattern_quality;
Data Type Validation
Business Rule Validation
-- Comprehensive data validation checks
SELECT
'Data Validation' as validation_type,
table_name,
column_name,
validation_rule,
COUNT(*) as total_records,
COUNT(CASE WHEN validation_passed = true THEN 1 END) as valid_records,
COUNT(CASE WHEN validation_passed = false THEN 1 END) as invalid_records,
-- Validation pass rate
ROUND(COUNT(CASE WHEN validation_passed = true THEN 1 END) * 100.0 / COUNT(*), 2) as pass_rate,
-- Validation status
CASE
WHEN COUNT(CASE WHEN validation_passed = true THEN 1 END) * 100.0 / COUNT(*) >= 95 THEN 'Excellent'
WHEN COUNT(CASE WHEN validation_passed = true THEN 1 END) * 100.0 / COUNT(*) >= 90 THEN 'Good'
WHEN COUNT(CASE WHEN validation_passed = true THEN 1 END) * 100.0 / COUNT(*) >= 80 THEN 'Fair'
ELSE 'Poor'
END as validation_status
FROM validation_results
GROUP BY table_name, column_name, validation_rule
ORDER BY pass_rate ASC;
Cross-Table Consistency
Within-Table Consistency
-- Cross-table consistency checks
WITH consistency_checks AS (
-- Check referential integrity
SELECT
'Referential Integrity' as check_type,
'orders.customer_id' as field_checked,
COUNT(*) as total_orders,
COUNT(CASE WHEN c.customer_id IS NULL THEN 1 END) as orphaned_orders,
-- Consistency rate
ROUND((COUNT(*) - COUNT(CASE WHEN c.customer_id IS NULL THEN 1 END)) * 100.0 / COUNT(*), 2) as consistency_rate
FROM orders o
LEFT JOIN customers c ON o.customer_id = c.customer_id
UNION ALL
-- Check date consistency
SELECT
'Date Consistency' as check_type,
'orders.order_date vs ship_date' as field_checked,
COUNT(*) as total_orders,
COUNT(CASE WHEN ship_date < order_date THEN 1 END) as inconsistent_dates,
-- Consistency rate
ROUND((COUNT(*) - COUNT(CASE WHEN ship_date < order_date THEN 1 END)) * 100.0 / COUNT(*), 2) as consistency_rate
FROM orders
WHERE ship_date IS NOT NULL
)
SELECT
check_type,
field_checked,
total_orders,
orphaned_orders as inconsistent_records,
consistency_rate,
-- Consistency status
CASE
WHEN consistency_rate >= 99 THEN 'Excellent'
WHEN consistency_rate >= 95 THEN 'Good'
WHEN consistency_rate >= 90 THEN 'Fair'
ELSE 'Poor'
END as consistency_status,
-- Action needed
CASE
WHEN consistency_rate < 90 THEN 'Immediate Action Required'
WHEN consistency_rate < 95 THEN 'Investigate Issues'
WHEN consistency_rate < 99 THEN 'Monitor Closely'
ELSE 'Maintain Standards'
END as action_needed
FROM consistency_checks
ORDER BY consistency_rate ASC;
Continuous Monitoring
Quality Metrics
Financial Impact
Operational Impact
Statistical Impact
Decision Impact
-- Analyze impact of data quality issues on analysis
SELECT
'Quality Impact Analysis' as analysis_type,
quality_issue_type,
affected_table,
affected_analysis,
COUNT(*) as affected_records,
-- Impact metrics
ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM analysis_results), 2) as impact_percentage,
-- Business impact
CASE
WHEN quality_issue_type = 'Missing Key Fields' THEN 'High Impact'
WHEN quality_issue_type = 'Duplicate Records' THEN 'Medium Impact'
WHEN quality_issue_type = 'Invalid Formats' THEN 'Medium Impact'
WHEN quality_issue_type = 'Inconsistent Values' THEN 'Low Impact'
ELSE 'Unknown Impact'
END as business_impact,
-- Recommended action
CASE
WHEN quality_issue_type = 'Missing Key Fields' THEN 'Immediate Data Collection Fix'
WHEN quality_issue_type = 'Duplicate Records' THEN 'Deduplication Required'
WHEN quality_issue_type = 'Invalid Formats' THEN 'Validation Rules Needed'
WHEN quality_issue_type = 'Inconsistent Values' THEN 'Standardization Required'
ELSE 'Investigate Further'
END as recommended_action
FROM quality_impact_analysis
GROUP BY quality_issue_type, affected_table, affected_analysis
ORDER BY impact_percentage DESC;
Customer Analytics
Financial Reporting
Marketing Campaigns
Prevention Strategies
Correction Strategies
Monitoring Strategies
In the next lesson, we'll explore data security and masking to understand how to protect sensitive data while maintaining usability.