This lesson introduces the machine learning concepts most relevant to data analysts, using SQL throughout to prepare data and illustrate the core algorithms.
Supervised Learning
1. Classification
Goal: Predict discrete categories
Examples: Spam vs Not Spam, Customer Churn (Yes/No), Disease Diagnosis
Output: Class labels (0, 1, 2, etc.)
2. Regression
Goal: Predict continuous values
Examples: House Prices, Sales Forecast, Temperature Prediction
Output: Numerical values (123.45, 67.89, etc.)
Unsupervised Learning
1. Clustering
Goal: Group similar data points
Examples: Customer Segmentation, Document Grouping, Image Segmentation
Output: Cluster assignments (Cluster 1, Cluster 2, etc.)
2. Association
Goal: Find relationships between variables
Examples: Market Basket Analysis, Recommendation Systems
Output: Association rules (If A, then B)
3. Dimensionality Reduction
Goal: Reduce number of features while preserving information
Examples: Feature Extraction, Data Visualization, Noise Reduction
Output: Reduced feature set
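Dimensionality reduction itself (for example, PCA) is normally performed outside the database, but SQL can handle a first screening pass. The query below is a minimal sketch, assuming a hypothetical customer_features table with the numeric columns used later in this lesson; it flags a feature pair that is so strongly correlated, or so close to constant, that one of the two adds little information.
-- Feature-screening sketch: highly correlated or near-constant features are candidates to drop
SELECT
    'total_purchases vs avg_order_value' as feature_pair,
    ROUND(CORR(total_purchases, avg_order_value), 3) as correlation,
    ROUND(STDDEV(total_purchases), 2) as stddev_feature_1,
    ROUND(STDDEV(avg_order_value), 2) as stddev_feature_2,
    CASE
        WHEN ABS(CORR(total_purchases, avg_order_value)) > 0.9 THEN 'Redundant pair - consider dropping one'
        WHEN STDDEV(total_purchases) = 0 OR STDDEV(avg_order_value) = 0 THEN 'Near-constant feature - little information'
        ELSE 'Keep both features'
    END as recommendation
FROM customer_features;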
Classification Data Preparation
-- Prepare data for supervised learning (classification)
WITH customer_features AS (
SELECT
customer_id,
-- Demographic features
age,
income_level,
gender,
location,
-- Behavioral features
total_purchases,
avg_order_value,
days_since_last_purchase,
website_visits_last_30d,
email_opens_last_30d,
-- Engagement features
support_tickets_count,
product_returns_count,
loyalty_program_status,
-- Target variable (what we want to predict)
CASE
WHEN churn_date IS NOT NULL AND churn_date <= CURRENT_DATE THEN 1
ELSE 0
END as churned,
-- Feature engineering
CASE
WHEN days_since_last_purchase > 90 THEN 'inactive'
WHEN days_since_last_purchase > 30 THEN 'at_risk'
ELSE 'active'
END as activity_status,
total_purchases / NULLIF(DATEDIFF(CURRENT_DATE, signup_date), 0) as purchase_frequency,
avg_order_value / NULLIF(income_level, 0) as spending_ratio
FROM customers c
-- (order history is assumed to be pre-aggregated onto the customers table, so no join to orders is needed here)
WHERE signup_date >= '2023-01-01'
),
training_data AS (
SELECT
customer_id,
-- Numeric features
age,
income_level,
total_purchases,
avg_order_value,
days_since_last_purchase,
website_visits_last_30d,
email_opens_last_30d,
support_tickets_count,
product_returns_count,
purchase_frequency,
spending_ratio,
-- Categorical features (would be encoded in ML pipeline)
gender,
location,
activity_status,
loyalty_program_status,
-- Target variable
churned,
-- Train/test split (80/20)
CASE
WHEN MOD(ABS(CRC32(customer_id)), 100) < 80 THEN 'training'
ELSE 'testing'
END as dataset_split
FROM customer_features
WHERE age IS NOT NULL
AND income_level IS NOT NULL
AND total_purchases IS NOT NULL
)
SELECT
dataset_split,
COUNT(*) as total_records,
COUNT(CASE WHEN churned = 1 THEN 1 END) as churned_customers,
COUNT(CASE WHEN churned = 0 THEN 1 END) as active_customers,
ROUND(COUNT(CASE WHEN churned = 1 THEN 1 END) * 100.0 / COUNT(*), 2) as churn_rate,
-- Feature statistics for training set
ROUND(AVG(CASE WHEN dataset_split = 'training' THEN age END), 1) as avg_age_training,
ROUND(AVG(CASE WHEN dataset_split = 'training' THEN income_level END), 0) as avg_income_training,
ROUND(AVG(CASE WHEN dataset_split = 'training' THEN total_purchases END), 1) as avg_purchases_training
FROM training_data
GROUP BY dataset_split
ORDER BY dataset_split;
Regression Data Preparation
-- Prepare data for supervised learning (regression)
WITH sales_features AS (
SELECT
sale_id,
-- Target variable (what we want to predict)
sale_amount as target_variable,
-- Temporal features
EXTRACT(YEAR FROM sale_date) as sale_year,
EXTRACT(MONTH FROM sale_date) as sale_month,
EXTRACT(DOW FROM sale_date) as sale_dayofweek,
EXTRACT(QUARTER FROM sale_date) as sale_quarter,
-- Product features
product_category,
product_price,
product_age_days,
-- Customer features
customer_age,
customer_income_level,
customer_previous_purchases,
customer_days_since_first_purchase,
-- Promotion features
promotion_applied,
discount_percentage,
-- Store/Channel features
store_location,
sales_channel,
-- Feature engineering
CASE
WHEN EXTRACT(MONTH FROM sale_date) IN (11, 12) THEN 'holiday_season'
WHEN EXTRACT(MONTH FROM sale_date) IN (6, 7, 8) THEN 'summer_season'
ELSE 'regular_season'
END as season_type,
product_price * (1 - discount_percentage / 100.0) as effective_price,
customer_previous_purchases / NULLIF(customer_days_since_first_purchase, 0) as customer_purchase_frequency
FROM sales s
JOIN products p ON s.product_id = p.product_id
JOIN customers c ON s.customer_id = c.customer_id
WHERE sale_date >= '2023-01-01'
),
training_data AS (
SELECT
sale_id,
target_variable,
-- Numeric features
sale_year,
sale_month,
sale_dayofweek,
sale_quarter,
product_price,
product_age_days,
customer_age,
customer_income_level,
customer_previous_purchases,
customer_days_since_first_purchase,
discount_percentage,
effective_price,
customer_purchase_frequency,
-- Categorical features
product_category,
season_type,
promotion_applied,
store_location,
sales_channel,
-- Train/validation/test split (70/15/15)
CASE
WHEN MOD(ABS(CRC32(sale_id)), 100) < 70 THEN 'training'
WHEN MOD(ABS(CRC32(sale_id)), 100) < 85 THEN 'validation'
ELSE 'testing'
END as dataset_split
FROM sales_features
WHERE target_variable IS NOT NULL
AND target_variable > 0
)
SELECT
dataset_split,
COUNT(*) as total_records,
ROUND(AVG(target_variable), 2) as avg_sale_amount,
ROUND(STDDEV(target_variable), 2) as stddev_sale_amount,
ROUND(MIN(target_variable), 2) as min_sale_amount,
ROUND(MAX(target_variable), 2) as max_sale_amount,
-- Feature correlations with target (simplified)
ROUND(CORR(target_variable, product_price), 3) as price_correlation,
ROUND(CORR(target_variable, discount_percentage), 3) as discount_correlation,
ROUND(CORR(target_variable, customer_income_level), 3) as income_correlation
FROM training_data
GROUP BY dataset_split
ORDER BY dataset_split;
Clustering Data Preparation
-- Prepare data for unsupervised learning (clustering)
WITH customer_behavior_features AS (
SELECT
c.customer_id,
-- Recency, Frequency, Monetary (RFM) features
DATEDIFF(CURRENT_DATE, MAX(order_date)) as recency_days,
COUNT(DISTINCT order_id) as frequency_count,
SUM(order_total) as monetary_value,
-- Behavioral features
AVG(order_total) as avg_order_value,
STDDEV(order_total) as order_value_variance,
COUNT(DISTINCT product_category) as category_diversity,
-- Temporal patterns
COUNT(DISTINCT EXTRACT(MONTH FROM order_date)) as active_months,
AVG(DATEDIFF(next_order_date, order_date)) as avg_days_between_orders,
-- Channel preferences
COUNT(CASE WHEN sales_channel = 'online' THEN 1 END) as online_orders,
COUNT(CASE WHEN sales_channel = 'retail' THEN 1 END) as retail_orders,
COUNT(CASE WHEN sales_channel = 'mobile' THEN 1 END) as mobile_orders,
-- Product preferences
COUNT(CASE WHEN product_category = 'electronics' THEN 1 END) as electronics_purchases,
COUNT(CASE WHEN product_category = 'clothing' THEN 1 END) as clothing_purchases,
COUNT(CASE WHEN product_category = 'home' THEN 1 END) as home_purchases,
-- Engagement metrics
website_visits_last_30d,
email_opens_last_30d,
support_tickets_count,
-- Derived features
-- (aggregate expressions are repeated because column aliases cannot be reused within the same SELECT)
SUM(order_total) / NULLIF(COUNT(DISTINCT order_id), 0) as avg_monetary_per_transaction,
CASE
    WHEN DATEDIFF(CURRENT_DATE, MAX(order_date)) <= 30 THEN 'very_recent'
    WHEN DATEDIFF(CURRENT_DATE, MAX(order_date)) <= 90 THEN 'recent'
    WHEN DATEDIFF(CURRENT_DATE, MAX(order_date)) <= 180 THEN 'moderate'
    ELSE 'inactive'
END as recency_segment,
COUNT(DISTINCT order_id) / NULLIF(DATEDIFF(CURRENT_DATE, MIN(order_date)) / 30, 0) as monthly_frequency
FROM customers c
JOIN orders o ON c.customer_id = o.customer_id
LEFT JOIN (
SELECT
order_id,
customer_id,
LEAD(order_date) OVER (PARTITION BY customer_id ORDER BY order_date) as next_order_date
FROM orders
) order_dates ON o.order_id = order_dates.order_id
WHERE order_date >= '2023-01-01'
GROUP BY c.customer_id
),
feature_normalization AS (
SELECT
customer_id,
-- Normalize features to 0-1 scale for clustering
(recency_days - MIN(recency_days) OVER ()) / NULLIF(MAX(recency_days) OVER () - MIN(recency_days) OVER (), 0) as recency_normalized,
(frequency_count - MIN(frequency_count) OVER ()) / NULLIF(MAX(frequency_count) OVER () - MIN(frequency_count) OVER (), 0) as frequency_normalized,
(monetary_value - MIN(monetary_value) OVER ()) / NULLIF(MAX(monetary_value) OVER () - MIN(monetary_value) OVER (), 0) as monetary_normalized,
(avg_order_value - MIN(avg_order_value) OVER ()) / NULLIF(MAX(avg_order_value) OVER () - MIN(avg_order_value) OVER (), 0) as aov_normalized,
(category_diversity - MIN(category_diversity) OVER ()) / NULLIF(MAX(category_diversity) OVER () - MIN(category_diversity) OVER (), 0) as diversity_normalized,
(website_visits_last_30d - MIN(website_visits_last_30d) OVER ()) / NULLIF(MAX(website_visits_last_30d) OVER () - MIN(website_visits_last_30d) OVER (), 0) as web_visits_normalized
FROM customer_behavior_features
WHERE recency_days IS NOT NULL
AND frequency_count > 0
AND monetary_value > 0
)
SELECT
'Feature Statistics' as metric_type,
COUNT(*) as total_customers,
ROUND(AVG(recency_normalized), 3) as avg_recency_score,
ROUND(AVG(frequency_normalized), 3) as avg_frequency_score,
ROUND(AVG(monetary_normalized), 3) as avg_monetary_score,
ROUND(AVG(aov_normalized), 3) as avg_aov_score,
ROUND(AVG(diversity_normalized), 3) as avg_diversity_score,
ROUND(AVG(web_visits_normalized), 3) as avg_web_visits_score
FROM feature_normalization
UNION ALL
SELECT
'Data Quality' as metric_type,
COUNT(*) as total_customers,
COUNT(CASE WHEN recency_days IS NOT NULL THEN 1 END) as has_recency,
COUNT(CASE WHEN frequency_count > 0 THEN 1 END) as has_frequency,
COUNT(CASE WHEN monetary_value > 0 THEN 1 END) as has_monetary,
COUNT(CASE WHEN avg_order_value > 0 THEN 1 END) as has_aov,
COUNT(CASE WHEN category_diversity > 0 THEN 1 END) as has_diversity,
COUNT(CASE WHEN website_visits_last_30d >= 0 THEN 1 END) as has_web_visits
FROM customer_behavior_features;
Regression
1. Linear Regression
-- Linear regression example: Predict house prices
WITH house_features AS (
SELECT
property_id,
price as target_variable,
square_feet,
bedrooms,
bathrooms,
age_years,
lot_size,
garage_spaces,
-- Feature engineering
square_feet / NULLIF(bedrooms, 0) as sqft_per_bedroom,
bathrooms / NULLIF(bedrooms, 0) as bathroom_ratio,
CASE
WHEN age_years <= 5 THEN 'new'
WHEN age_years <= 20 THEN 'modern'
WHEN age_years <= 50 THEN 'established'
ELSE 'old'
END as age_category,
-- Location features
neighborhood_median_income,
school_rating,
crime_rate,
distance_to_downtown
FROM properties
WHERE price IS NOT NULL AND price > 0
),
regression_analysis AS (
SELECT
-- Simple linear regression coefficients (simplified)
CORR(square_feet, target_variable) as sqft_correlation,
CORR(bedrooms, target_variable) as bedrooms_correlation,
CORR(bathrooms, target_variable) as bathrooms_correlation,
CORR(age_years, target_variable) as age_correlation,
CORR(lot_size, target_variable) as lot_correlation,
-- Multiple regression approximation
ROUND(
AVG(target_variable) -
(CORR(square_feet, target_variable) * STDDEV(target_variable) / STDDEV(square_feet) * AVG(square_feet)) -
(CORR(bedrooms, target_variable) * STDDEV(target_variable) / STDDEV(bedrooms) * AVG(bedrooms)) -
(CORR(bathrooms, target_variable) * STDDEV(target_variable) / STDDEV(bathrooms) * AVG(bathrooms))
, 2) as intercept_approximation,
-- Feature importance (absolute correlation with price; higher values indicate stronger linear association)
ROUND(ABS(CORR(square_feet, target_variable)), 3) as sqft_importance,
ROUND(ABS(CORR(bedrooms, target_variable)), 3) as bedrooms_importance,
ROUND(ABS(CORR(bathrooms, target_variable)), 3) as bathrooms_importance,
ROUND(ABS(CORR(age_years, target_variable)), 3) as age_importance
FROM house_features
)
SELECT
'Feature Correlations' as analysis_type,
ROUND(sqft_correlation, 3) as sqft_correlation,
ROUND(bedrooms_correlation, 3) as bedrooms_correlation,
ROUND(bathrooms_correlation, 3) as bathrooms_correlation,
ROUND(age_correlation, 3) as age_correlation,
ROUND(lot_correlation, 3) as lot_correlation,
intercept_approximation as estimated_intercept
FROM regression_analysis
UNION ALL
SELECT
'Feature Importance' as analysis_type,
sqft_importance,
bedrooms_importance,
bathrooms_importance,
age_importance,
NULL as lot_correlation,
NULL as estimated_intercept
FROM regression_analysis
UNION ALL
SELECT
'Model Performance' as analysis_type,
COUNT(*) as total_properties,
ROUND(AVG(target_variable), 0) as avg_price,
ROUND(STDDEV(target_variable), 0) as price_stddev,
ROUND(AVG(square_feet), 0) as avg_sqft,
ROUND(AVG(bedrooms), 1) as avg_bedrooms,
NULL as estimated_intercept
FROM house_features;
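The correlations above show direction and strength but not the fitted line itself. For a single feature, the least-squares slope and intercept follow directly from statistics SQL can already compute: slope = CORR(x, y) * STDDEV(y) / STDDEV(x) and intercept = AVG(y) - slope * AVG(x). The sketch below applies this to square footage alone; the 2,000 sq ft prediction at the end is purely illustrative.
-- Single-feature least-squares fit: price = intercept + slope * square_feet
WITH fit AS (
    SELECT
        CORR(square_feet, price) * STDDEV(price) / NULLIF(STDDEV(square_feet), 0) as slope,
        AVG(price) as avg_price,
        AVG(square_feet) as avg_sqft
    FROM properties
    WHERE price IS NOT NULL AND price > 0
)
SELECT
    ROUND(slope, 2) as price_per_sqft,
    ROUND(avg_price - slope * avg_sqft, 2) as intercept,
    -- Predicted price for a hypothetical 2,000 sq ft property
    ROUND((avg_price - slope * avg_sqft) + slope * 2000, 2) as predicted_price_2000_sqft
FROM fit;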
2. Polynomial Regression
-- Polynomial regression example: Non-linear relationships
WITH sales_time_series AS (
SELECT
date,
sales_amount,
-- Time-based features
DATEDIFF(date, '2023-01-01') as day_number,
EXTRACT(DOY FROM date) as day_of_year,
EXTRACT(MONTH FROM date) as month_number,
-- Polynomial features
POWER(DATEDIFF(date, '2023-01-01'), 2) as day_squared,
POWER(DATEDIFF(date, '2023-01-01'), 3) as day_cubed,
POWER(EXTRACT(DOY FROM date), 2) as doy_squared,
-- Seasonal features
SIN(2 * PI() * EXTRACT(DOY FROM date) / 365.25) as sin_annual,
COS(2 * PI() * EXTRACT(DOY FROM date) / 365.25) as cos_annual,
SIN(2 * PI() * EXTRACT(DOY FROM date) / 7) as sin_weekly,
COS(2 * PI() * EXTRACT(DOY FROM date) / 7) as cos_weekly
FROM daily_sales
WHERE date >= '2023-01-01' AND date < '2024-01-01'
),
polynomial_features AS (
SELECT
date,
sales_amount,
day_number,
day_of_year,
month_number,
day_squared,
day_cubed,
doy_squared,
sin_annual,
cos_annual,
sin_weekly,
cos_weekly,
-- Lag features for time series
LAG(sales_amount, 1) OVER (ORDER BY date) as lag_1_day,
LAG(sales_amount, 7) OVER (ORDER BY date) as lag_7_days,
LAG(sales_amount, 30) OVER (ORDER BY date) as lag_30_days,
-- Moving averages
AVG(sales_amount) OVER (ORDER BY date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) as ma_7_days,
AVG(sales_amount) OVER (ORDER BY date ROWS BETWEEN 29 PRECEDING AND CURRENT ROW) as ma_30_days
FROM sales_time_series
)
SELECT
'Linear Features' as feature_type,
ROUND(CORR(day_number, sales_amount), 3) as linear_correlation,
ROUND(CORR(day_of_year, sales_amount), 3) as doy_correlation,
ROUND(CORR(month_number, sales_amount), 3) as month_correlation,
NULL as poly_correlation,
NULL as seasonal_correlation,
NULL as lag_correlation
FROM polynomial_features
UNION ALL
SELECT
'Polynomial Features' as feature_type,
ROUND(CORR(day_squared, sales_amount), 3) as linear_correlation,
ROUND(CORR(day_cubed, sales_amount), 3) as doy_correlation,
ROUND(CORR(doy_squared, sales_amount), 3) as month_correlation,
NULL as poly_correlation,
NULL as seasonal_correlation,
NULL as lag_correlation
FROM polynomial_features
UNION ALL
SELECT
'Seasonal Features' as feature_type,
ROUND(CORR(sin_annual, sales_amount), 3) as linear_correlation,
ROUND(CORR(cos_annual, sales_amount), 3) as doy_correlation,
ROUND(CORR(sin_weekly, sales_amount), 3) as month_correlation,
ROUND(CORR(cos_weekly, sales_amount), 3) as poly_correlation,
NULL as seasonal_correlation,
NULL as lag_correlation
FROM polynomial_features
UNION ALL
SELECT
'Lag Features' as feature_type,
ROUND(CORR(lag_1_day, sales_amount), 3) as linear_correlation,
ROUND(CORR(lag_7_days, sales_amount), 3) as doy_correlation,
ROUND(CORR(lag_30_days, sales_amount), 3) as month_correlation,
ROUND(CORR(ma_7_days, sales_amount), 3) as poly_correlation,
ROUND(CORR(ma_30_days, sales_amount), 3) as seasonal_correlation,
NULL as lag_correlation
FROM polynomial_features;
Classification
1. Logistic Regression
-- Logistic regression example: Customer churn prediction
WITH customer_churn_features AS (
SELECT
customer_id,
-- Target variable
CASE
WHEN churn_date IS NOT NULL THEN 1
ELSE 0
END as churned,
-- Customer demographics
age,
income_level,
tenure_months,
-- Usage metrics
monthly_spend,
avg_monthly_spend,
usage_frequency,
support_tickets_count,
-- Engagement metrics
last_login_days_ago,
feature_adoption_rate,
-- Satisfaction metrics
satisfaction_score,
net_promoter_score,
-- Derived features
monthly_spend / NULLIF(income_level, 0) as spend_to_income_ratio,
usage_frequency / NULLIF(tenure_months, 0) as usage_frequency_per_month,
satisfaction_score * feature_adoption_rate as engagement_score
FROM customers
WHERE signup_date >= '2022-01-01'
),
logistic_regression_features AS (
SELECT
churned,
age,
income_level,
tenure_months,
monthly_spend,
usage_frequency,
support_tickets_count,
last_login_days_ago,
feature_adoption_rate,
satisfaction_score,
spend_to_income_ratio,
usage_frequency_per_month,
engagement_score,
-- Log transformation for skewed features
LOG(NULLIF(monthly_spend, 0)) as log_monthly_spend,
LOG(NULLIF(usage_frequency, 0)) as log_usage_frequency
FROM customer_churn_features
WHERE monthly_spend > 0 AND usage_frequency > 0
),
feature_analysis AS (
SELECT
COUNT(*) as total_customers,
COUNT(CASE WHEN churned = 1 THEN 1 END) as churned_customers,
AVG(CASE WHEN churned = 1 THEN age END) as avg_age_churned,
AVG(CASE WHEN churned = 0 THEN age END) as avg_age_not_churned,
-- Point-biserial correlation for binary target
AVG(CASE WHEN churned = 1 THEN age END) - AVG(CASE WHEN churned = 0 THEN age END) as age_diff,
AVG(CASE WHEN churned = 1 THEN income_level END) - AVG(CASE WHEN churned = 0 THEN income_level END) as income_diff,
AVG(CASE WHEN churned = 1 THEN tenure_months END) - AVG(CASE WHEN churned = 0 THEN tenure_months END) as tenure_diff,
AVG(CASE WHEN churned = 1 THEN monthly_spend END) - AVG(CASE WHEN churned = 0 THEN monthly_spend END) as spend_diff,
AVG(CASE WHEN churned = 1 THEN satisfaction_score END) - AVG(CASE WHEN churned = 0 THEN satisfaction_score END) as satisfaction_diff,
-- Correlation coefficients
CORR(age, churned) as age_correlation,
CORR(income_level, churned) as income_correlation,
CORR(tenure_months, churned) as tenure_correlation,
CORR(monthly_spend, churned) as spend_correlation,
CORR(satisfaction_score, churned) as satisfaction_correlation
FROM logistic_regression_features
)
SELECT
'Churn Analysis' as analysis_type,
total_customers,
churned_customers,
ROUND(churned_customers * 100.0 / total_customers, 2) as churn_rate,
ROUND(avg_age_churned, 1) as avg_age_churned,
ROUND(avg_age_not_churned, 1) as avg_age_not_churned,
ROUND(age_diff, 1) as age_difference,
ROUND(age_correlation, 3) as age_correlation
FROM feature_analysis
UNION ALL
SELECT
'Feature Importance' as analysis_type,
NULL as total_customers,
NULL as churned_customers,
NULL as churn_rate,
ROUND(income_diff, 0) as avg_age_churned,
ROUND(tenure_diff, 1) as avg_age_not_churned,
ROUND(spend_diff, 2) as age_difference,
ROUND(spend_correlation, 3) as age_correlation
FROM feature_analysis
UNION ALL
SELECT
'Model Insights' as analysis_type,
NULL as total_customers,
NULL as churned_customers,
NULL as churn_rate,
ROUND(satisfaction_diff, 2) as avg_age_churned,
ROUND(satisfaction_correlation, 3) as avg_age_not_churned,
NULL as age_difference,
NULL as age_correlation
FROM feature_analysis;
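Fitting the logistic model itself happens outside SQL (in Python, R, or an in-database ML extension), but once coefficients exist, scoring is just the sigmoid of a linear combination: p = 1 / (1 + e^(-z)). The sketch below assumes the same customers columns used above; its coefficients are illustrative placeholders, not fitted values.
-- Churn-probability scoring sketch (coefficients are illustrative placeholders, not fitted values)
SELECT
    customer_id,
    ROUND(1 / (1 + EXP(-(
        -2.0                              -- intercept (placeholder)
        - 0.03 * tenure_months            -- longer tenure lowers churn risk
        - 0.50 * satisfaction_score       -- higher satisfaction lowers churn risk
        + 0.02 * last_login_days_ago      -- inactivity raises churn risk
    ))), 3) as churn_probability
FROM customers
WHERE signup_date >= '2022-01-01'
ORDER BY churn_probability DESC
LIMIT 20;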
2. Decision Trees
-- Decision tree example: Credit risk assessment
WITH credit_applications AS (
SELECT
application_id,
-- Target variable
CASE
WHEN loan_defaulted = 1 THEN 'high_risk'
WHEN late_payments > 2 THEN 'medium_risk'
ELSE 'low_risk'
END as risk_category,
-- Applicant features
age,
annual_income,
credit_score,
employment_years,
debt_to_income_ratio,
-- Loan features
loan_amount,
loan_term_months,
interest_rate,
-- History features
previous_loans_count,
previous_defaults_count,
-- Derived features
loan_amount / NULLIF(annual_income, 0) as loan_to_income_ratio,
credit_score / 10.0 + employment_years as credit_stability_score,
CASE
WHEN previous_loans_count = 0 THEN 'first_time'
WHEN previous_defaults_count = 0 THEN 'good_history'
ELSE 'risky_history'
END as credit_history_type
FROM loan_applications
WHERE application_date >= '2023-01-01'
),
decision_tree_splits AS (
SELECT
risk_category,
-- Find optimal split points (simplified)
CASE
WHEN credit_score >= 700 THEN 'high_credit'
WHEN credit_score >= 600 THEN 'medium_credit'
ELSE 'low_credit'
END as credit_score_bucket,
CASE
WHEN debt_to_income_ratio <= 0.3 THEN 'low_debt'
WHEN debt_to_income_ratio <= 0.5 THEN 'medium_debt'
ELSE 'high_debt'
END as debt_ratio_bucket,
CASE
WHEN employment_years >= 5 THEN 'stable_employment'
WHEN employment_years >= 2 THEN 'moderate_employment'
ELSE 'unstable_employment'
END as employment_stability,
CASE
WHEN loan_to_income_ratio <= 0.2 THEN 'small_loan'
WHEN loan_to_income_ratio <= 0.5 THEN 'medium_loan'
ELSE 'large_loan'
END as loan_size_bucket
FROM credit_applications
),
tree_analysis AS (
SELECT
credit_score_bucket,
debt_ratio_bucket,
employment_stability,
loan_size_bucket,
COUNT(*) as applications_count,
-- Calculate purity (Gini impurity approximation)
COUNT(CASE WHEN risk_category = 'low_risk' THEN 1 END) * 1.0 / COUNT(*) as low_risk_ratio,
COUNT(CASE WHEN risk_category = 'medium_risk' THEN 1 END) * 1.0 / COUNT(*) as medium_risk_ratio,
COUNT(CASE WHEN risk_category = 'high_risk' THEN 1 END) * 1.0 / COUNT(*) as high_risk_ratio
FROM decision_tree_splits
GROUP BY credit_score_bucket, debt_ratio_bucket, employment_stability, loan_size_bucket
)
SELECT
'Credit Score Split' as split_type,
credit_score_bucket as split_value,
SUM(applications_count) as total_applications,
ROUND(AVG(low_risk_ratio) * 100, 1) as low_risk_pct,
ROUND(AVG(medium_risk_ratio) * 100, 1) as medium_risk_pct,
ROUND(AVG(high_risk_ratio) * 100, 1) as high_risk_pct,
CASE
WHEN AVG(high_risk_ratio) > 0.5 THEN 'High Risk Node'
WHEN AVG(low_risk_ratio) > 0.7 THEN 'Low Risk Node'
ELSE 'Mixed Risk Node'
END as node_classification
FROM tree_analysis
GROUP BY credit_score_bucket
UNION ALL
SELECT
'Debt Ratio Split' as split_type,
debt_ratio_bucket as split_value,
SUM(applications_count) as total_applications,
ROUND(AVG(low_risk_ratio) * 100, 1) as low_risk_pct,
ROUND(AVG(medium_risk_ratio) * 100, 1) as medium_risk_pct,
ROUND(AVG(high_risk_ratio) * 100, 1) as high_risk_pct,
CASE
WHEN AVG(high_risk_ratio) > 0.5 THEN 'High Risk Node'
WHEN AVG(low_risk_ratio) > 0.7 THEN 'Low Risk Node'
ELSE 'Mixed Risk Node'
END as node_classification
FROM tree_analysis
GROUP BY debt_ratio_bucket
UNION ALL
SELECT
'Employment Split' as split_type,
employment_stability as split_value,
SUM(applications_count) as total_applications,
ROUND(AVG(low_risk_ratio) * 100, 1) as low_risk_pct,
ROUND(AVG(medium_risk_ratio) * 100, 1) as medium_risk_pct,
ROUND(AVG(high_risk_ratio) * 100, 1) as high_risk_pct,
CASE
WHEN AVG(high_risk_ratio) > 0.5 THEN 'High Risk Node'
WHEN AVG(low_risk_ratio) > 0.7 THEN 'Low Risk Node'
ELSE 'Mixed Risk Node'
END as node_classification
FROM tree_analysis
GROUP BY employment_stability;
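The split comparison above reports only class proportions. The criterion a decision tree actually optimizes at each split is typically Gini impurity: one minus the sum of squared class proportions, where lower values mean purer nodes. A self-contained sketch against the same loan_applications table:
-- Gini impurity per credit-score bucket: 1 - sum(p_class^2); lower values mean purer splits
WITH splits AS (
    SELECT
        CASE
            WHEN credit_score >= 700 THEN 'high_credit'
            WHEN credit_score >= 600 THEN 'medium_credit'
            ELSE 'low_credit'
        END as credit_score_bucket,
        CASE
            WHEN loan_defaulted = 1 THEN 'high_risk'
            WHEN late_payments > 2 THEN 'medium_risk'
            ELSE 'low_risk'
        END as risk_category
    FROM loan_applications
    WHERE application_date >= '2023-01-01'
)
SELECT
    credit_score_bucket,
    COUNT(*) as applications,
    ROUND(1
        - POWER(SUM(CASE WHEN risk_category = 'low_risk' THEN 1 ELSE 0 END) * 1.0 / COUNT(*), 2)
        - POWER(SUM(CASE WHEN risk_category = 'medium_risk' THEN 1 ELSE 0 END) * 1.0 / COUNT(*), 2)
        - POWER(SUM(CASE WHEN risk_category = 'high_risk' THEN 1 ELSE 0 END) * 1.0 / COUNT(*), 2)
    , 3) as gini_impurity
FROM splits
GROUP BY credit_score_bucket
ORDER BY gini_impurity;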
Algorithm Steps
1. Choose K, the number of clusters
2. Initialize K centroids (for example, by picking K customers as seeds)
3. Assign each data point to its nearest centroid
4. Recalculate each centroid as the mean of its assigned points
5. Repeat steps 3-4 until assignments stop changing
Key Concepts
Centroid: The mean position of all points in a cluster
Distance: Typically squared Euclidean distance on normalized features
K: The number of clusters, chosen up front (see the elbow method below)
Convergence: Iterations stop when cluster assignments no longer change
K-means Implementation (Simplified)
-- K-means clustering implementation using SQL
WITH customer_features AS (
SELECT
customer_id,
-- Normalize features for clustering
(total_spend - MIN(total_spend) OVER ()) / NULLIF(MAX(total_spend) OVER () - MIN(total_spend) OVER (), 0) as spend_normalized,
(purchase_frequency - MIN(purchase_frequency) OVER ()) / NULLIF(MAX(purchase_frequency) OVER () - MIN(purchase_frequency) OVER (), 0) as frequency_normalized,
(avg_basket_size - MIN(avg_basket_size) OVER ()) / NULLIF(MAX(avg_basket_size) OVER () - MIN(avg_basket_size) OVER (), 0) as basket_normalized,
(days_since_last_purchase - MIN(days_since_last_purchase) OVER ()) / NULLIF(MAX(days_since_last_purchase) OVER () - MIN(days_since_last_purchase) OVER (), 0) as recency_normalized
FROM customer_summary
),
initial_centroids AS (
-- Initialize K centroids (K=4 for this example)
SELECT
cluster_id,
spend_normalized as centroid_spend,
frequency_normalized as centroid_frequency,
basket_normalized as centroid_basket,
recency_normalized as centroid_recency
FROM (
    -- Each seed SELECT is parenthesized so that its ORDER BY / LIMIT applies to that branch only
    (SELECT 1 as cluster_id, spend_normalized, frequency_normalized, basket_normalized, recency_normalized
     FROM customer_features ORDER BY customer_id LIMIT 1)
    UNION ALL
    (SELECT 2 as cluster_id, spend_normalized, frequency_normalized, basket_normalized, recency_normalized
     FROM customer_features ORDER BY customer_id DESC LIMIT 1)
    UNION ALL
    (SELECT 3 as cluster_id, spend_normalized, frequency_normalized, basket_normalized, recency_normalized
     FROM customer_features ORDER BY spend_normalized LIMIT 1)
    UNION ALL
    (SELECT 4 as cluster_id, spend_normalized, frequency_normalized, basket_normalized, recency_normalized
     FROM customer_features ORDER BY frequency_normalized DESC LIMIT 1)
) initial_seeds
),
iteration_1_assignments AS (
-- Assign customers to nearest centroids
SELECT
cf.customer_id,
cf.spend_normalized,
cf.frequency_normalized,
cf.basket_normalized,
cf.recency_normalized,
-- Calculate distance to each centroid
ic.cluster_id,
POWER(cf.spend_normalized - ic.centroid_spend, 2) +
POWER(cf.frequency_normalized - ic.centroid_frequency, 2) +
POWER(cf.basket_normalized - ic.centroid_basket, 2) +
POWER(cf.recency_normalized - ic.centroid_recency, 2) as distance_squared,
ROW_NUMBER() OVER (PARTITION BY cf.customer_id ORDER BY
POWER(cf.spend_normalized - ic.centroid_spend, 2) +
POWER(cf.frequency_normalized - ic.centroid_frequency, 2) +
POWER(cf.basket_normalized - ic.centroid_basket, 2) +
POWER(cf.recency_normalized - ic.centroid_recency, 2) ASC) as distance_rank
FROM customer_features cf
CROSS JOIN initial_centroids ic
),
iteration_1_clusters AS (
SELECT
customer_id,
spend_normalized,
frequency_normalized,
basket_normalized,
recency_normalized,
cluster_id as assigned_cluster
FROM iteration_1_assignments
WHERE distance_rank = 1
),
iteration_1_centroids AS (
-- Recalculate centroids
SELECT
cluster_id,
AVG(spend_normalized) as new_centroid_spend,
AVG(frequency_normalized) as new_centroid_frequency,
AVG(basket_normalized) as new_centroid_basket,
AVG(recency_normalized) as new_centroid_recency,
COUNT(*) as cluster_size
FROM iteration_1_clusters
GROUP BY cluster_id
),
final_clusters AS (
SELECT
i1c.customer_id,
i1c.assigned_cluster,
i1c.spend_normalized,
i1c.frequency_normalized,
i1c.basket_normalized,
i1c.recency_normalized,
i1c1.cluster_size,
-- Denormalize for interpretation
i1c.spend_normalized * (SELECT MAX(total_spend) - MIN(total_spend) FROM customer_summary) + (SELECT MIN(total_spend) FROM customer_summary) as estimated_spend,
i1c.frequency_normalized * (SELECT MAX(purchase_frequency) - MIN(purchase_frequency) FROM customer_summary) + (SELECT MIN(purchase_frequency) FROM customer_summary) as estimated_frequency
FROM iteration_1_clusters i1c
JOIN iteration_1_centroids i1c1 ON i1c.assigned_cluster = i1c1.cluster_id
)
SELECT
'Cluster Summary' as analysis_type,
assigned_cluster as cluster_id,
COUNT(*) as customer_count,
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 1) as percentage_of_customers,
ROUND(AVG(estimated_spend), 2) as avg_spend,
ROUND(AVG(estimated_frequency), 1) as avg_frequency,
ROUND(AVG(spend_normalized), 3) as avg_spend_normalized,
ROUND(AVG(frequency_normalized), 3) as avg_frequency_normalized,
ROUND(AVG(basket_normalized), 3) as avg_basket_normalized,
ROUND(AVG(recency_normalized), 3) as avg_recency_normalized,
-- Cluster characterization
CASE
WHEN AVG(spend_normalized) > 0.7 AND AVG(frequency_normalized) > 0.7 THEN 'High Value Champions'
WHEN AVG(spend_normalized) > 0.5 AND AVG(frequency_normalized) < 0.3 THEN 'Big Spenders'
WHEN AVG(spend_normalized) < 0.3 AND AVG(frequency_normalized) > 0.7 THEN 'Frequent Bargain Hunters'
WHEN AVG(recency_normalized) > 0.7 THEN 'At Risk Customers'
ELSE 'Average Customers'
END as cluster_profile
FROM final_clusters
GROUP BY assigned_cluster
ORDER BY assigned_cluster;
Optimal K Selection (Elbow Method)
-- Elbow method to find optimal K
-- Note: customer_features here refers to the normalized-feature CTE from the K-means query above,
-- assumed to be available (for example, materialized as a view or temporary table)
WITH cluster_analysis AS (
SELECT
customer_id,
spend_normalized,
frequency_normalized,
basket_normalized,
recency_normalized
FROM customer_features
),
k_analysis AS (
    -- Test different values of K
    -- Simplified: a real implementation would re-run the clustering for each K and sum squared
    -- distances to the assigned centroids; here the structure of the comparison is illustrated
    -- using distances to the overall feature means
    SELECT
        k.k_value,
        SUM(POWER(ca.spend_normalized - m.avg_spend, 2) +
            POWER(ca.frequency_normalized - m.avg_frequency, 2) +
            POWER(ca.basket_normalized - m.avg_basket, 2) +
            POWER(ca.recency_normalized - m.avg_recency, 2)) as wcss
    FROM cluster_analysis ca
    CROSS JOIN (
        SELECT
            AVG(spend_normalized) as avg_spend,
            AVG(frequency_normalized) as avg_frequency,
            AVG(basket_normalized) as avg_basket,
            AVG(recency_normalized) as avg_recency
        FROM cluster_analysis
    ) m
    CROSS JOIN (
        SELECT 2 as k_value UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5
    ) k
    GROUP BY k.k_value
),
elbow_calculation AS (
SELECT
k_value,
wcss,
LAG(wcss) OVER (ORDER BY k_value) as previous_wcss,
-- Calculate percentage reduction
(LAG(wcss) OVER (ORDER BY k_value) - wcss) / LAG(wcss) OVER (ORDER BY k_value) * 100 as reduction_percentage
FROM k_analysis
)
SELECT
k_value as number_of_clusters,
ROUND(wcss, 2) as within_cluster_sum_of_squares,
ROUND(reduction_percentage, 2) as reduction_from_previous_k,
CASE
WHEN reduction_percentage < 10 THEN 'Diminishing Returns - Consider this K'
WHEN reduction_percentage >= 20 THEN 'Significant Improvement'
ELSE 'Moderate Improvement'
END as interpretation
FROM elbow_calculation
ORDER BY k_value;
Customer Segmentation Analysis
-- Detailed cluster analysis for business insights
WITH customer_clusters AS (
SELECT
customer_id,
cluster_id,
total_spend,
purchase_frequency,
avg_basket_size,
days_since_last_purchase,
customer_lifetime_value,
-- Behavioral metrics
website_visits_last_30d,
email_open_rate,
product_category_diversity,
return_rate
FROM final_customer_clusters
),
cluster_profiles AS (
SELECT
cluster_id,
COUNT(*) as cluster_size,
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 1) as population_percentage,
-- Financial metrics
ROUND(AVG(total_spend), 2) as avg_total_spend,
ROUND(AVG(customer_lifetime_value), 2) as avg_clv,
ROUND(AVG(avg_basket_size), 2) as avg_basket_size,
-- Behavioral metrics
ROUND(AVG(purchase_frequency), 1) as avg_frequency,
ROUND(AVG(days_since_last_purchase), 1) as avg_recency,
ROUND(AVG(website_visits_last_30d), 1) as avg_web_visits,
ROUND(AVG(email_open_rate), 3) as avg_open_rate,
ROUND(AVG(product_category_diversity), 1) as avg_diversity,
ROUND(AVG(return_rate), 3) as avg_return_rate
FROM customer_clusters
GROUP BY cluster_id
),
cluster_rankings AS (
SELECT
cluster_id,
cluster_size,
population_percentage,
avg_total_spend,
avg_clv,
avg_frequency,
avg_recency,
avg_web_visits,
-- Rank clusters by different metrics
ROW_NUMBER() OVER (ORDER BY avg_total_spend DESC) as spend_rank,
ROW_NUMBER() OVER (ORDER BY avg_clv DESC) as clv_rank,
ROW_NUMBER() OVER (ORDER BY avg_frequency DESC) as frequency_rank,
ROW_NUMBER() OVER (ORDER BY avg_recency ASC) as recency_rank, -- Lower is better
ROW_NUMBER() OVER (ORDER BY avg_web_visits DESC) as engagement_rank
FROM cluster_profiles
)
SELECT
cluster_id,
cluster_size,
population_percentage,
avg_total_spend,
avg_clv,
avg_frequency,
avg_recency,
avg_web_visits,
spend_rank,
clv_rank,
frequency_rank,
recency_rank,
engagement_rank,
-- Business segment naming
CASE
WHEN spend_rank = 1 AND frequency_rank <= 2 THEN 'VIP Champions'
WHEN spend_rank <= 2 AND frequency_rank <= 3 THEN 'Loyal Customers'
WHEN frequency_rank = 1 AND spend_rank <= 3 THEN 'Frequent Buyers'
WHEN avg_recency > 90 THEN 'At Risk'
WHEN avg_total_spend < 50 AND frequency_rank > 3 THEN 'Low Value'
WHEN avg_web_visits > 20 AND frequency_rank <= 3 THEN 'Digital Engaged'
ELSE 'Standard Customers'
END as segment_name,
-- Marketing recommendations
CASE
WHEN spend_rank = 1 AND frequency_rank <= 2 THEN 'Exclusive offers, VIP support, early access'
WHEN spend_rank <= 2 AND frequency_rank <= 3 THEN 'Loyalty programs, cross-selling, premium support'
WHEN frequency_rank = 1 AND spend_rank <= 3 THEN 'Volume discounts, bundle offers'
WHEN avg_recency > 90 THEN 'Re-engagement campaigns, special offers, feedback requests'
WHEN avg_total_spend < 50 AND frequency_rank > 3 THEN 'Entry-level products, educational content'
WHEN avg_web_visits > 20 AND frequency_rank <= 3 THEN 'Digital-first campaigns, app notifications'
ELSE 'Standard marketing, periodic newsletters'
END as marketing_strategy,
-- Product recommendations
CASE
WHEN spend_rank = 1 AND frequency_rank <= 2 THEN 'Premium products, exclusive items'
WHEN spend_rank <= 2 AND frequency_rank <= 3 THEN 'Quality products, value bundles'
WHEN frequency_rank = 1 AND spend_rank <= 3 THEN 'Bulk packages, consumables'
WHEN avg_recency > 90 THEN 'New products, special promotions'
WHEN avg_total_spend < 50 AND frequency_rank > 3 THEN 'Budget-friendly options, trial products'
WHEN avg_web_visits > 20 AND frequency_rank <= 3 THEN 'Digital products, online exclusives'
ELSE 'Standard product range'
END as product_strategy
FROM cluster_rankings
ORDER BY avg_clv DESC;
Data Mining vs. Machine Learning
| Aspect | Data Mining | Machine Learning |
|---|---|---|
| Primary Goal | Discover patterns | Make predictions |
| Output | Insights, rules | Predictive models |
| Validation | Statistical significance | Prediction accuracy |
| Human Involvement | High (interpretation) | Medium (feature engineering) |
| Data Requirements | Large datasets | Labeled/unlabeled data |
| Time Horizon | Exploratory, one-time | Ongoing, iterative |
Data Mining: Association Rules (Market Basket Analysis)
-- Data mining example: Market basket analysis
WITH transaction_items AS (
SELECT
transaction_id,
product_id,
product_name,
product_category,
unit_price,
quantity
FROM transaction_details
WHERE transaction_date >= '2023-01-01'
),
item_pairs AS (
-- Find all pairs of items purchased together
SELECT
t1.product_id as item1_id,
t1.product_name as item1_name,
t2.product_id as item2_id,
t2.product_name as item2_name,
t1.transaction_id
FROM transaction_items t1
JOIN transaction_items t2 ON t1.transaction_id = t2.transaction_id
AND t1.product_id < t2.product_id -- Avoid duplicates
),
association_metrics AS (
SELECT
item1_id,
item1_name,
item2_id,
item2_name,
COUNT(DISTINCT transaction_id) as together_count,
-- Support: How often items appear together
COUNT(DISTINCT transaction_id) * 100.0 / (SELECT COUNT(DISTINCT transaction_id) FROM transaction_items) as support_percentage,
-- Confidence: How often item2 appears when item1 is purchased
COUNT(DISTINCT transaction_id) * 100.0 /
(SELECT COUNT(DISTINCT transaction_id) FROM transaction_items WHERE product_id = item1_id) as confidence_percentage,
-- Lift: how much more likely item2 is purchased when item1 is purchased
-- lift = P(item1 and item2) / (P(item1) * P(item2)) = (together * total) / (count_item1 * count_item2)
COUNT(DISTINCT transaction_id) * 1.0 *
    (SELECT COUNT(DISTINCT transaction_id) FROM transaction_items) /
    ((SELECT COUNT(DISTINCT transaction_id) FROM transaction_items WHERE product_id = item1_id) *
     (SELECT COUNT(DISTINCT transaction_id) FROM transaction_items WHERE product_id = item2_id)) as lift_ratio
FROM item_pairs
GROUP BY item1_id, item1_name, item2_id, item2_name
HAVING COUNT(DISTINCT transaction_id) >= 10 -- Minimum support threshold
)
SELECT
item1_name as "If Customer Buys",
item2_name as "They Also Buy",
together_count as "Times Together",
ROUND(support_percentage, 2) as "Support %",
ROUND(confidence_percentage, 2) as "Confidence %",
ROUND(lift_ratio, 2) as "Lift Ratio",
CASE
WHEN lift_ratio > 2.0 THEN 'Strong Association'
WHEN lift_ratio > 1.5 THEN 'Moderate Association'
WHEN lift_ratio > 1.0 THEN 'Weak Association'
ELSE 'No Association'
END as association_strength,
-- Business action
CASE
WHEN lift_ratio > 2.0 AND confidence_percentage > 30 THEN 'Bundle these products'
WHEN lift_ratio > 1.5 AND confidence_percentage > 20 THEN 'Cross-sell recommendation'
WHEN lift_ratio > 1.0 THEN 'Consider for promotion'
ELSE 'No action needed'
END as business_recommendation
FROM association_metrics
WHERE support_percentage >= 0.5 -- Minimum support filter
AND confidence_percentage >= 10 -- Minimum confidence filter
ORDER BY lift_ratio DESC, confidence_percentage DESC
LIMIT 20;
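For reference, the metrics computed above follow the standard association-rule definitions, writing N for the total number of transactions, n(A) for the number of transactions containing item A, and n(A, B) for the number containing both:
support(A => B) = n(A, B) / N
confidence(A => B) = n(A, B) / n(A)
lift(A => B) = confidence(A => B) / (n(B) / N)
A lift above 1 means the two items appear together more often than independence would predict.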
Machine Learning: Predictive Model
-- Machine learning example: Customer lifetime value prediction
WITH customer_features AS (
SELECT
customer_id,
-- Historical features (what we know)
age,
income_level,
signup_date,
total_purchases,
total_spend,
avg_order_value,
purchase_frequency,
days_since_last_purchase,
-- Behavioral features
website_visits_last_30d,
email_open_rate,
support_tickets_count,
product_category_diversity,
-- Target variable (what we want to predict)
customer_lifetime_value,
-- Feature engineering
DATEDIFF(CURRENT_DATE, signup_date) as customer_age_days,
total_spend / NULLIF(total_purchases, 0) as avg_transaction_value,
total_purchases / NULLIF(DATEDIFF(CURRENT_DATE, signup_date) / 30, 0) as monthly_purchase_rate,
CASE
WHEN days_since_last_purchase <= 30 THEN 'active'
WHEN days_since_last_purchase <= 90 THEN 'at_risk'
ELSE 'inactive'
END as activity_status
FROM customers
WHERE signup_date >= '2022-01-01'
),
model_training_data AS (
SELECT
customer_id,
-- Features for ML model
age,
income_level,
customer_age_days,
total_purchases,
avg_transaction_value,
monthly_purchase_rate,
website_visits_last_30d,
email_open_rate,
support_tickets_count,
product_category_diversity,
-- Target variable
customer_lifetime_value,
-- Train/test split
CASE
WHEN MOD(ABS(CRC32(customer_id)), 100) < 80 THEN 'training'
ELSE 'testing'
END as dataset_split
FROM customer_features
WHERE total_purchases > 0 AND customer_lifetime_value > 0
),
feature_importance AS (
-- Calculate feature importance using correlation with target
SELECT
'age' as feature_name,
CORR(age, customer_lifetime_value) as correlation_with_target,
ABS(CORR(age, customer_lifetime_value)) as absolute_importance
FROM model_training_data
WHERE dataset_split = 'training'
UNION ALL
SELECT
'income_level' as feature_name,
CORR(income_level, customer_lifetime_value) as correlation_with_target,
ABS(CORR(income_level, customer_lifetime_value)) as absolute_importance
FROM model_training_data
WHERE dataset_split = 'training'
UNION ALL
SELECT
'total_purchases' as feature_name,
CORR(total_purchases, customer_lifetime_value) as correlation_with_target,
ABS(CORR(total_purchases, customer_lifetime_value)) as absolute_importance
FROM model_training_data
WHERE dataset_split = 'training'
UNION ALL
SELECT
'monthly_purchase_rate' as feature_name,
CORR(monthly_purchase_rate, customer_lifetime_value) as correlation_with_target,
ABS(CORR(monthly_purchase_rate, customer_lifetime_value)) as absolute_importance
FROM model_training_data
WHERE dataset_split = 'training'
),
model_evaluation AS (
-- Simplified model evaluation metrics
SELECT
dataset_split,
COUNT(*) as sample_size,
ROUND(AVG(customer_lifetime_value), 2) as avg_actual_clv,
ROUND(STDDEV(customer_lifetime_value), 2) as stddev_clv,
-- Simple baseline model (predict average)
ROUND(AVG(customer_lifetime_value), 2) as baseline_prediction,
-- Baseline error: the MSE of always predicting the mean equals the variance of the target
ROUND(VAR_POP(customer_lifetime_value), 2) as baseline_mse,
-- Feature-based prediction (simplified linear combination)
ROUND(AVG(age * 10 + income_level * 0.5 + total_purchases * 50 + monthly_purchase_rate * 100), 2) as ml_prediction,
-- ML model error
ROUND(AVG(POWER(customer_lifetime_value - (age * 10 + income_level * 0.5 + total_purchases * 50 + monthly_purchase_rate * 100), 2)), 2) as ml_mse
FROM model_training_data
GROUP BY dataset_split
)
SELECT
'Feature Importance' as analysis_type,
feature_name,
ROUND(correlation_with_target, 3) as correlation,
ROUND(absolute_importance, 3) as importance_score,
ROW_NUMBER() OVER (ORDER BY absolute_importance DESC) as importance_rank,
CASE
WHEN absolute_importance > 0.5 THEN 'High Importance'
WHEN absolute_importance > 0.3 THEN 'Medium Importance'
ELSE 'Low Importance'
END as importance_level
FROM feature_importance
UNION ALL
SELECT
'Model Performance' as analysis_type,
dataset_split as feature_name,
sample_size as correlation,
baseline_mse as importance_score,
ml_mse as importance_rank,
ROUND((baseline_mse - ml_mse) / baseline_mse * 100, 2) as importance_level
FROM model_evaluation
UNION ALL
SELECT
'Business Impact' as analysis_type,
'ML Improvement' as feature_name,
'Error Reduction' as correlation,
ROUND((SELECT baseline_mse FROM model_evaluation WHERE dataset_split = 'testing') -
(SELECT ml_mse FROM model_evaluation WHERE dataset_split = 'testing'), 2) as importance_score,
'Percentage' as importance_rank,
ROUND(((SELECT baseline_mse FROM model_evaluation WHERE dataset_split = 'testing') -
(SELECT ml_mse FROM model_evaluation WHERE dataset_split = 'testing')) /
(SELECT baseline_mse FROM model_evaluation WHERE dataset_split = 'testing') * 100, 2) as importance_level
FROM model_evaluation
WHERE dataset_split = 'testing';
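A convenient way to read the two error figures above: because the baseline MSE of always predicting the mean equals the variance of the target, the ratio of model MSE to baseline MSE gives an R²-style score, R² ≈ 1 - ml_mse / baseline_mse. Values closer to 1 mean the features explain more of the variation in customer lifetime value; a negative value means the hand-picked coefficients do worse than simply predicting the average.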
Data Mining Use Cases
Machine Learning Use Cases
Hybrid Approaches
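As one illustration of a hybrid approach, the output of data mining can feed a machine-learning-driven campaign: association rules suggest what to offer, while a predictive model selects who to target. The sketch below is illustrative only and assumes hypothetical churn_predictions, association_rules, and purchases tables standing in for the outputs of the earlier queries.
-- Hybrid sketch: target at-risk customers (ML score) with products tied to their past purchases (association rules)
-- Assumes hypothetical tables: churn_predictions(customer_id, churn_probability),
-- association_rules(item1_id, item2_id, lift_ratio), purchases(customer_id, product_id)
SELECT
    cp.customer_id,
    cp.churn_probability,
    ar.item2_id as recommended_product,
    ar.lift_ratio
FROM churn_predictions cp
JOIN purchases pu ON cp.customer_id = pu.customer_id
JOIN association_rules ar ON pu.product_id = ar.item1_id
WHERE cp.churn_probability > 0.5   -- at-risk customers from the ML model
  AND ar.lift_ratio > 1.5          -- strong associations from market basket analysis
ORDER BY cp.churn_probability DESC, ar.lift_ratio DESC;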
In the next module, we'll explore business intelligence and governance to understand how to implement these analytics in organizational contexts.