Комментарии (1)
🐱
claude-haiku-4.5PrepBro AI29 мар. 2026 г.(ред.)
Ответ сгенерирован нейросетью и может содержать ошибки
SQL команды для Data Scientists
SQL — критический навык для работы с данными. Рассмотрю основные команды и паттерны, которые использую ежедневно.
1. SELECT — фундаментальная команда
-- Базовый SELECT
SELECT name, email, created_at FROM users LIMIT 10;
-- SELECT с фильтрацией (WHERE)
SELECT * FROM orders
WHERE amount > 100
AND status = 'completed'
AND created_at >= '2024-01-01';
-- SELECT с условиями (CASE WHEN)
SELECT
user_id,
CASE
WHEN amount > 1000 THEN 'VIP'
WHEN amount > 500 THEN 'Premium'
ELSE 'Regular'
END as customer_tier
FROM orders;
-- Distinct — уникальные значения
SELECT DISTINCT country FROM users WHERE is_active = true;
-- ORDER BY с сортировкой
SELECT * FROM products
ORDER BY created_at DESC, price ASC;
2. Aggregation functions
Для подсчета, суммирования, усреднения.
-- Базовые агрегаты
SELECT
COUNT(*) as total_users,
COUNT(DISTINCT country) as countries,
SUM(total_spent) as revenue,
AVG(total_spent) as avg_spending,
MIN(created_at) as earliest_user,
MAX(created_at) as latest_user
FROM users;
-- GROUP BY для группировки
SELECT
DATE_TRUNC('month', created_at) as month,
COUNT(*) as orders,
SUM(amount) as revenue,
AVG(amount) as avg_order,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY amount) as median
FROM orders
GROUP BY DATE_TRUNC('month', created_at)
ORDER BY month DESC;
-- HAVING для фильтрации ПОСЛЕ группировки
SELECT
category,
COUNT(*) as product_count,
AVG(price) as avg_price
FROM products
GROUP BY category
HAVING COUNT(*) > 10 AND AVG(price) > 100;
3. JOIN операции
Для комбинирования данных из разных таблиц.
-- INNER JOIN — только совпадающие
SELECT
u.user_id,
u.name,
o.order_id,
o.amount
FROM users u
INNER JOIN orders o ON u.user_id = o.user_id;
-- LEFT JOIN — все с левой таблицы + совпадающие справа
SELECT
u.user_id,
u.name,
COUNT(o.order_id) as order_count
FROM users u
LEFT JOIN orders o ON u.user_id = o.user_id
GROUP BY u.user_id, u.name;
-- FULL OUTER JOIN — все с обеих сторон
SELECT
u.user_id,
u.name,
d.discount_id
FROM users u
FULL OUTER JOIN discounts d ON u.user_id = d.user_id;
-- Multiple joins
SELECT
u.name,
o.order_id,
p.product_name,
op.quantity,
op.price
FROM users u
INNER JOIN orders o ON u.user_id = o.user_id
INNER JOIN order_products op ON o.order_id = op.order_id
INNER JOIN products p ON op.product_id = p.product_id;
4. Window functions
Мощные функции для анализа.
-- ROW_NUMBER — ранжирование
SELECT
user_id,
created_at,
amount,
ROW_NUMBER() OVER (ORDER BY amount DESC) as rank,
RANK() OVER (ORDER BY amount DESC) as rank_with_ties,
DENSE_RANK() OVER (ORDER BY amount DESC) as dense_rank
FROM orders;
-- PARTITION BY — группировка внутри window function
SELECT
user_id,
order_date,
amount,
SUM(amount) OVER (PARTITION BY user_id ORDER BY order_date) as running_total,
AVG(amount) OVER (PARTITION BY user_id) as user_avg
FROM orders;
-- LAG и LEAD — получение данных из предыдущих/следующих строк
SELECT
order_date,
amount,
LAG(amount, 1) OVER (ORDER BY order_date) as prev_amount,
LEAD(amount, 1) OVER (ORDER BY order_date) as next_amount,
amount - LAG(amount) OVER (ORDER BY order_date) as change
FROM orders;
-- FIRST_VALUE и LAST_VALUE
SELECT
user_id,
order_date,
amount,
FIRST_VALUE(amount) OVER (PARTITION BY user_id ORDER BY order_date) as first_order,
LAST_VALUE(amount) OVER (PARTITION BY user_id ORDER BY order_date ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as last_order
FROM orders;
5. Subqueries и CTE (WITH)
-- Subquery в WHERE
SELECT * FROM users
WHERE user_id IN (
SELECT user_id FROM orders
WHERE amount > 1000
);
-- CTE (Common Table Expression) — более читаемо
WITH high_value_orders AS (
SELECT user_id, COUNT(*) as order_count, SUM(amount) as total_spent
FROM orders
WHERE amount > 1000
GROUP BY user_id
),
user_segments AS (
SELECT
u.user_id,
u.name,
COALESCE(h.order_count, 0) as high_value_order_count,
COALESCE(h.total_spent, 0) as high_value_spent
FROM users u
LEFT JOIN high_value_orders h ON u.user_id = h.user_id
)
SELECT * FROM user_segments WHERE high_value_order_count > 0;
6. UPDATE, INSERT, DELETE
Данные изменяются редко в analytics, но нужно знать.
-- INSERT — добавление строк
INSERT INTO users (name, email, country)
VALUES ('John Doe', 'john@example.com', 'USA');
-- INSERT from SELECT
INSERT INTO user_backup
SELECT * FROM users WHERE created_at < '2020-01-01';
-- UPDATE — изменение данных
UPDATE users
SET last_login = NOW()
WHERE user_id = 123;
-- UPDATE с JOIN
UPDATE users u
SET is_vip = true
FROM orders o
WHERE u.user_id = o.user_id
AND o.amount > 5000;
-- DELETE — удаление
DELETE FROM users WHERE is_active = false AND last_login < NOW() - INTERVAL '1 year';
7. String functions
-- Работа со строками
SELECT
UPPER(name) as name_upper,
LOWER(email) as email_lower,
LENGTH(name) as name_length,
SUBSTRING(email FROM 1 FOR 5) as email_prefix,
CONCAT(first_name, ' ', last_name) as full_name,
TRIM(name) as trimmed_name,
REPLACE(email, '@example.com', '@company.com') as new_email,
SPLIT_PART(email, '@', 1) as username -- PostgreSQL
FROM users;
-- Pattern matching
SELECT * FROM users
WHERE email LIKE '%@gmail.com';
-- Regular expressions (PostgreSQL)
SELECT * FROM users
WHERE email ~ '^[a-z]+@[a-z]+\\.[a-z]+$';
8. Date/Time functions
-- Работа с датами
SELECT
NOW() as current_time,
CURRENT_DATE as today,
DATE_TRUNC('month', created_at) as month_start,
DATE_TRUNC('week', created_at) as week_start,
EXTRACT(YEAR FROM created_at) as year,
EXTRACT(MONTH FROM created_at) as month,
EXTRACT(DAY FROM created_at) as day,
EXTRACT(DOW FROM created_at) as day_of_week, -- 0=Sunday
created_at + INTERVAL '7 days' as week_later,
created_at - INTERVAL '30 days' as month_ago,
DATEDIFF('day', created_at, NOW()) as days_old
FROM users;
-- TO_DATE для парсинга
SELECT TO_DATE('2024-01-15', 'YYYY-MM-DD') as parsed_date;
9. UNION, EXCEPT, INTERSECT
-- UNION — объединение результатов (уникальные)
SELECT name FROM users WHERE country = 'USA'
UNION
SELECT name FROM premium_users WHERE country = 'USA';
-- UNION ALL — объединение (все)
SELECT user_id FROM active_users
UNION ALL
SELECT user_id FROM inactive_users;
-- EXCEPT — разница (в первом, но не во втором)
SELECT user_id FROM users
EXCEPT
SELECT user_id FROM banned_users;
-- INTERSECT — пересечение (в обоих)
SELECT user_id FROM premium_users
INTERSECT
SELECT user_id FROM orders_made_this_month;
10. Оптимизация SQL
-- Плохо: Full table scan
SELECT * FROM large_table WHERE YEAR(created_at) = 2024;
-- Хорошо: Используй диапазон
SELECT * FROM large_table
WHERE created_at >= '2024-01-01' AND created_at < '2025-01-01';
-- Плохо: Функция на индексированном поле
SELECT * FROM users WHERE LOWER(email) = 'john@example.com';
-- Хорошо: Без функции (если поле хранится в lowercase)
SELECT * FROM users WHERE email = 'john@example.com';
-- Плохо: SELECT *
SELECT * FROM large_table;
-- Хорошо: Только нужные колонки
SELECT user_id, name, email FROM users;
11. Window function для ML features
-- RFM анализ (Recency, Frequency, Monetary)
WITH user_stats AS (
SELECT
user_id,
MAX(order_date) as last_order_date,
COUNT(*) as purchase_frequency,
SUM(amount) as lifetime_value,
DATEDIFF('day', MAX(order_date), CURRENT_DATE) as recency_days
FROM orders
GROUP BY user_id
)
SELECT
user_id,
recency_days,
purchase_frequency,
lifetime_value,
NTILE(5) OVER (ORDER BY recency_days DESC) as recency_quintile,
NTILE(5) OVER (ORDER BY purchase_frequency) as frequency_quintile,
NTILE(5) OVER (ORDER BY lifetime_value) as monetary_quintile
FROM user_stats;
-- Time-based features для churn prediction
SELECT
user_id,
DATE_TRUNC('month', order_date) as month,
SUM(amount) as monthly_revenue,
COUNT(*) as monthly_orders,
LAG(SUM(amount)) OVER (PARTITION BY user_id ORDER BY DATE_TRUNC('month', order_date)) as prev_month_revenue
FROM orders
GROUP BY user_id, DATE_TRUNC('month', order_date);
12. Полезные техники для Data Science
-- Аномалия детекция (IQR метод)
WITH quartiles AS (
SELECT
amount,
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY amount) as q1,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY amount) as q3
FROM orders
)
SELECT
order_id,
amount,
CASE
WHEN amount < q1 - 1.5 * (q3 - q1) THEN 'outlier_low'
WHEN amount > q3 + 1.5 * (q3 - q1) THEN 'outlier_high'
ELSE 'normal'
END as anomaly_flag
FROM orders, quartiles;
-- Cohort analysis
WITH cohorts AS (
SELECT
user_id,
DATE_TRUNC('month', MIN(created_at)) as cohort_month
FROM orders
GROUP BY user_id
)
SELECT
DATE_TRUNC('month', o.order_date) as order_month,
c.cohort_month,
DATEDIFF('month', c.cohort_month, DATE_TRUNC('month', o.order_date)) as months_since_cohort,
COUNT(DISTINCT o.user_id) as users
FROM orders o
JOIN cohorts c ON o.user_id = c.user_id
GROUP BY c.cohort_month, DATE_TRUNC('month', o.order_date);
Мой typical workflow
-- 1. Explore data
SELECT * FROM raw_data LIMIT 100;
-- 2. Check data quality
SELECT
COUNT(*) as total_rows,
COUNT(DISTINCT user_id) as unique_users,
COUNT(CASE WHEN value IS NULL THEN 1 END) as null_values
FROM raw_data;
-- 3. Aggregate и clean
WITH cleaned AS (
SELECT
user_id,
DATE_TRUNC('day', created_at) as event_date,
value,
ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY created_at) as event_num
FROM raw_data
WHERE value IS NOT NULL AND created_at >= '2024-01-01'
)
SELECT * FROM cleaned;
-- 4. Export для Python/Pandas
COPY (SELECT * FROM cleaned_data) TO STDOUT WITH CSV HEADER;
Performance tips
- Всегда используй WHERE для фильтрации
- Создавай индексы на JOIN и WHERE полях
- Избегай функций на индексированных полях
- Используй EXPLAIN для анализа query plans
- GROUP BY более efficient чем subqueries
- Partition big tables по дате