Комментарии (1)
Ответ сгенерирован нейросетью и может содержать ошибки
Методы и техники SQL запросов, которые я применял
SQL — это основной инструмент работы аналитика. За 10+ лет я использовал множество техник для оптимизации и решения сложных аналитических задач. Расскажу о наиболее полезных.
1. Window Functions (оконные функции)
Использование: анализировать данные в контексте (текущая строка + соседние).
-- Пример: рассчитать cumulative revenue по дням
SELECT
date,
revenue,
SUM(revenue) OVER (ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as cumulative_revenue,
LAG(revenue) OVER (ORDER BY date) as previous_day_revenue,
revenue - LAG(revenue) OVER (ORDER BY date) as day_over_day_change,
ROW_NUMBER() OVER (ORDER BY revenue DESC) as revenue_rank
FROM daily_metrics
ORDER BY date;
-- Пример: для каждого пользователя найти предыдущую и следующую покупку
SELECT
user_id,
purchase_date,
LAG(purchase_date) OVER (PARTITION BY user_id ORDER BY purchase_date) as previous_purchase,
LEAD(purchase_date) OVER (PARTITION BY user_id ORDER BY purchase_date) as next_purchase,
purchase_date - LAG(purchase_date) OVER (PARTITION BY user_id ORDER BY purchase_date) as days_since_last_purchase
FROM purchases
WHERE user_id = 123;
-- Пример: процентиль распределения
SELECT
PERCENT_RANK() OVER (ORDER BY revenue) * 100 as percentile,
user_id,
revenue
FROM user_lifetime_values
WHERE percentile >= 90; -- Top 10% users
Когда использовать:
- Cohort analysis (относительно когорты)
- Time series analysis (тренды)
- Ranking (top 10, bottom 10)
- Moving averages (сглаживание noise)
2. CTEs (Common Table Expressions)
Использование: разбить сложный запрос на читаемые части.
-- Пример: многоэтапный анализ
WITH user_segments AS (
-- Шаг 1: создать сегменты
SELECT
user_id,
CASE
WHEN lifetime_value >= 1000 THEN 'VIP'
WHEN lifetime_value >= 100 THEN 'Regular'
ELSE 'Casual'
END as segment,
lifetime_value,
d30_retention
FROM users
),
churned_users AS (
-- Шаг 2: найти уходящих
SELECT
user_id,
segment
FROM user_segments
WHERE d30_retention < 0.1
),
churned_by_segment AS (
-- Шаг 3: агрегировать
SELECT
segment,
COUNT(DISTINCT user_id) as churned_count
FROM churned_users
GROUP BY segment
),
total_by_segment AS (
SELECT
segment,
COUNT(DISTINCT user_id) as total_count
FROM user_segments
GROUP BY segment
)
-- Финальный результат
SELECT
t.segment,
c.churned_count,
t.total_count,
ROUND(100.0 * c.churned_count / t.total_count, 1) as churn_rate
FROM total_by_segment t
LEFT JOIN churned_by_segment c ON t.segment = c.segment;
Преимущества:
- Читаемость
- Переиспользование (несколько CTE в одном запросе)
- Логичное разделение шагов
3. Subqueries & Derived Tables
Использование: фильтрация по агрегированным данным.
-- Пример: найти users, которые потратили больше чем median
SELECT
user_id,
total_spent
FROM users u
JOIN (
SELECT
user_id,
SUM(amount) as total_spent
FROM purchases
GROUP BY user_id
) p ON u.id = p.user_id
WHERE p.total_spent > (
SELECT PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY SUM(amount))
FROM purchases
GROUP BY user_id
);
-- Пример: для каждого дня найти % активных users
SELECT
date,
ROUND(100.0 * dau / total_users, 1) as engagement_percent
FROM (
SELECT
DATE(created_at) as date,
COUNT(DISTINCT user_id) as dau,
(SELECT COUNT(*) FROM users WHERE created_at <= DATE(created_at)) as total_users
FROM events
GROUP BY DATE(created_at)
) daily_stats;
4. Joins (разные типы)
Использование: объединение данных из разных таблиц.
-- INNER JOIN: только совпадающие
SELECT
u.user_id,
u.email,
COUNT(p.id) as purchase_count
FROM users u
INNER JOIN purchases p ON u.id = p.user_id
GROUP BY u.user_id, u.email;
-- LEFT JOIN: все пользователи, даже без покупок
SELECT
u.user_id,
u.email,
COALESCE(COUNT(p.id), 0) as purchase_count
FROM users u
LEFT JOIN purchases p ON u.id = p.user_id
GROUP BY u.user_id, u.email;
-- FULL OUTER JOIN: все комбинации (редко нужен)
SELECT *
FROM users u
FULL OUTER JOIN purchases p ON u.id = p.user_id;
-- ANTI JOIN (using NOT IN или NOT EXISTS): что есть в A но нет в B
SELECT
u.user_id
FROM users u
WHERE NOT EXISTS (
SELECT 1
FROM purchases p
WHERE p.user_id = u.id
); -- Users who never made a purchase
-- UNION: комбинировать результаты двух queries
SELECT user_id, email FROM active_users
UNION
SELECT user_id, email FROM inactive_users;
5. Aggregation Functions
Использование: свести данные в статистику.
-- Базовые
SELECT
segment,
COUNT(*) as count,
COUNT(DISTINCT user_id) as unique_users,
SUM(revenue) as total_revenue,
AVG(revenue) as avg_revenue,
MIN(revenue) as min_revenue,
MAX(revenue) as max_revenue,
STDDEV(revenue) as std_dev,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY revenue) as median
FROM purchases
GROUP BY segment;
-- Условные агрегации
SELECT
COUNT(DISTINCT CASE WHEN revenue > 100 THEN user_id END) as high_value_users,
COUNT(DISTINCT CASE WHEN revenue <= 100 AND revenue > 10 THEN user_id END) as medium_value_users,
COUNT(DISTINCT CASE WHEN revenue <= 10 THEN user_id END) as low_value_users
FROM purchases;
-- FILTER clause (PostgreSQL)
SELECT
COUNT(*) FILTER (WHERE status = 'completed') as completed,
COUNT(*) FILTER (WHERE status = 'pending') as pending,
COUNT(*) FILTER (WHERE status = 'failed') as failed
FROM orders;
6. STRING Functions
Использование: работать с текстом.
-- Разбить строку на части
SELECT
SPLIT_PART(full_name, ' ', 1) as first_name,
SPLIT_PART(full_name, ' ', 2) as last_name
FROM users;
-- Regex matching
SELECT
user_id,
email
FROM users
WHERE email ~ '^[a-zA-Z0-9._%+-]+@gmail\\.com$'; -- Only Gmail
-- String manipulation
SELECT
UPPER(city) as city_uppercase,
LOWER(email) as email_lowercase,
LENGTH(description) as description_length,
SUBSTRING(description, 1, 50) as description_preview
FROM users;
-- CONCAT
SELECT
CONCAT(first_name, ' ', last_name) as full_name,
'User_' || user_id as user_identifier -- || is concatenation operator
FROM users;
7. DATE Functions
Использование: работать с датами и временем.
-- Временные интервалы
SELECT
user_id,
created_at,
NOW() as current_time,
NOW() - created_at as age,
EXTRACT(DAY FROM NOW() - created_at) as days_since_creation,
DATE_TRUNC('month', created_at) as month_start -- First day of month
FROM users;
-- Date arithmetic
SELECT
DATE(purchase_date) as date,
DATE(purchase_date) + INTERVAL '7 days' as week_later,
DATE_ADD(purchase_date, INTERVAL 30 DAY) as month_later,
LAST_DAY(purchase_date) as month_end
FROM purchases;
-- Cohort analysis
SELECT
DATE_TRUNC('week', created_at) as cohort_week,
EXTRACT(WEEK FROM DATE(last_activity) - DATE(created_at)) as week_number,
COUNT(DISTINCT user_id) as users
FROM users
GROUP BY DATE_TRUNC('week', created_at), week_number;
8. CASE Statements (условная логика)
Использование: категоризация и логика.
-- Simple CASE
SELECT
user_id,
CASE status
WHEN 'active' THEN 'Активный'
WHEN 'inactive' THEN 'Неактивный'
WHEN 'suspended' THEN 'Заблокирован'
ELSE 'Неизвестно'
END as status_ru
FROM users;
-- Searched CASE (more flexible)
SELECT
user_id,
revenue,
CASE
WHEN revenue >= 10000 THEN 'VIP'
WHEN revenue >= 1000 THEN 'Premium'
WHEN revenue >= 100 THEN 'Regular'
ELSE 'Free'
END as tier,
CASE
WHEN region IN ('US', 'CA', 'MX') THEN 'Americas'
WHEN region IN ('UK', 'DE', 'FR') THEN 'Europe'
WHEN region IN ('JP', 'CN', 'IN') THEN 'Asia'
ELSE 'Other'
END as region_group
FROM users;
9. GROUP BY & HAVING
Использование: фильтрация по агрегатам.
-- HAVING фильтрует результаты агрегации
SELECT
user_id,
COUNT(*) as purchase_count,
SUM(amount) as total_spent,
AVG(amount) as avg_purchase
FROM purchases
GROUP BY user_id
HAVING COUNT(*) >= 3 -- Only users with 3+ purchases
AND SUM(amount) > 100
ORDER BY total_spent DESC;
10. Recursive CTEs (для hierarchical data)
Использование: работать с деревьями (категории, иерархия команд).
-- Пример: найти весь path от подкатегории до корневой
WITH RECURSIVE category_path AS (
-- Anchor: начать с leaf categories
SELECT
id,
name,
parent_id,
name as full_path,
1 as level
FROM categories
WHERE parent_id IS NULL -- Root categories
UNION ALL
-- Recursive: join с parent
SELECT
c.id,
c.name,
c.parent_id,
cp.full_path || ' > ' || c.name,
cp.level + 1
FROM categories c
JOIN category_path cp ON c.parent_id = cp.id
)
SELECT * FROM category_path
ORDER BY full_path;
11. INDEX & QUERY OPTIMIZATION
Использование: ускорить запросы на больших данных.
-- Explain план для анализа
EXPLAIN ANALYZE
SELECT user_id, COUNT(*)
FROM purchases
WHERE created_at >= '2024-01-01'
GROUP BY user_id;
-- Создать индекс
CREATE INDEX idx_purchases_created_at ON purchases(created_at);
CREATE INDEX idx_purchases_user_created ON purchases(user_id, created_at); -- Composite
-- Partition большие таблицы
CREATE TABLE purchases_2024 PARTITION OF purchases
FOR VALUES FROM ('2024-01-01') TO ('2025-01-01');
12. DISTINCT & GROUP BY vs DISTINCT
Когда использовать что:
-- DISTINCT: когда просто нужны уникальные значения
SELECT DISTINCT user_id FROM purchases; -- 50K unique users
-- GROUP BY: когда нужны агрегаты + уникальные значения
SELECT
user_id,
COUNT(*) as purchase_count,
SUM(amount) as total
FROM purchases
GROUP BY user_id; -- 50K unique users + statistics
-- DISTINCT ON (PostgreSQL specific): keep first row per group
SELECT DISTINCT ON (user_id) user_id, email, created_at
FROM users
ORDER BY user_id, created_at DESC;
13. UNION & Set Operations
Использование: объединять результаты разных queries.
-- UNION: combine + remove duplicates
SELECT user_id FROM subscribed_users
UNION
SELECT user_id FROM trial_users;
-- UNION ALL: combine + keep duplicates (faster)
SELECT revenue FROM ios_purchases
UNION ALL
SELECT revenue FROM android_purchases; -- For SUM
-- EXCEPT: A minus B
SELECT user_id FROM all_users
EXCEPT
SELECT user_id FROM churned_users; -- Active users
-- INTERSECT: A и B together
SELECT user_id FROM email_subscribers
INTERSECT
SELECT user_id FROM app_users; -- Users in both
14. Денормализованные queries & Views
Использование: создать reusable queries.
-- Создать materialized view (snapshot, нужно refresh)
CREATE MATERIALIZED VIEW user_metrics AS
SELECT
u.user_id,
u.email,
COUNT(p.id) as purchase_count,
SUM(p.amount) as lifetime_value,
MAX(p.created_at) as last_purchase_date,
COUNT(DISTINCT DATE(p.created_at)) as purchase_days
FROM users u
LEFT JOIN purchases p ON u.id = p.user_id
GROUP BY u.user_id, u.email;
-- Refresh when needed
REFRESH MATERIALIZED VIEW user_metrics;
-- Use it
SELECT * FROM user_metrics WHERE lifetime_value > 1000;
15. Performance Tips
Которые я использую в большинстве запросов:
-- 1. Фильтруй рано (в WHERE, не в JOIN или HAVING)
SELECT *
FROM purchases
WHERE created_at >= '2024-01-01' -- Filter early!
AND status = 'completed';
-- 2. Агрегируй до JOIN если возможно
SELECT
u.user_id,
summary.total_spent
FROM users u
LEFT JOIN (
SELECT user_id, SUM(amount) as total_spent
FROM purchases
WHERE created_at >= DATE_SUB(NOW(), INTERVAL 30 DAY)
GROUP BY user_id
) summary ON u.id = summary.user_id;
-- 3. Используй LIMIT для testing
SELECT * FROM large_table LIMIT 100; -- Quick preview
-- 4. Избегай SELECT *
SELECT user_id, email, created_at -- Only needed columns
FROM users;
-- 5. Используй EXPLAIN для больших запросов
EXPLAIN ANALYZE
SELECT ...
Практические примеры из опыта
Пример 1: Cohort Retention Анализ
WITH cohort_data AS (
SELECT
DATE_TRUNC('week', u.created_at) as cohort_date,
u.user_id,
MAX(DATE(e.created_at)) as last_activity
FROM users u
LEFT JOIN events e ON u.id = e.user_id
GROUP BY DATE_TRUNC('week', u.created_at), u.user_id
)
SELECT
cohort_date,
EXTRACT(WEEK FROM last_activity - cohort_date) as weeks_since_cohort,
COUNT(DISTINCT user_id) as users_active
FROM cohort_data
WHERE cohort_date >= '2024-01-01'
GROUP BY cohort_date, weeks_since_cohort
ORDER BY cohort_date, weeks_since_cohort;
Пример 2: Multi-level Funnel Analysis
WITH funnel AS (
SELECT
user_id,
MAX(CASE WHEN event = 'view' THEN 1 ELSE 0 END) as step_1_viewed,
MAX(CASE WHEN event = 'add_to_cart' THEN 1 ELSE 0 END) as step_2_carted,
MAX(CASE WHEN event = 'checkout' THEN 1 ELSE 0 END) as step_3_checkout,
MAX(CASE WHEN event = 'purchase' THEN 1 ELSE 0 END) as step_4_purchased
FROM events
WHERE DATE(created_at) >= CURRENT_DATE - INTERVAL 30 DAY
GROUP BY user_id
)
SELECT
SUM(step_1_viewed) as viewed,
SUM(step_2_carted) as carted,
SUM(step_3_checkout) as checkout,
SUM(step_4_purchased) as purchased,
ROUND(100.0 * SUM(step_2_carted) / SUM(step_1_viewed), 1) as view_to_cart,
ROUND(100.0 * SUM(step_3_checkout) / SUM(step_2_carted), 1) as cart_to_checkout,
ROUND(100.0 * SUM(step_4_purchased) / SUM(step_3_checkout), 1) as checkout_to_purchase
FROM funnel;
SQL — это мощный инструмент, и овладение этими техниками позволяет мне решать даже сложные аналитические задачи эффективно и элегантно.