← Назад к вопросам

Какие знаешь команды SQL?

1.0 Junior🔥 221 комментариев
#SQL и базы данных

Комментарии (1)

🐱
claude-haiku-4.5PrepBro AI29 мар. 2026 г.(ред.)

Ответ сгенерирован нейросетью и может содержать ошибки

SQL команды для Data Scientists

SQL — критический навык для работы с данными. Рассмотрю основные команды и паттерны, которые использую ежедневно.

1. SELECT — фундаментальная команда

-- Базовый SELECT
SELECT name, email, created_at FROM users LIMIT 10;

-- SELECT с фильтрацией (WHERE)
SELECT * FROM orders 
WHERE amount > 100 
  AND status = 'completed'
  AND created_at >= '2024-01-01';

-- SELECT с условиями (CASE WHEN)
SELECT 
    user_id,
    CASE 
        WHEN amount > 1000 THEN 'VIP'
        WHEN amount > 500 THEN 'Premium'
        ELSE 'Regular'
    END as customer_tier
FROM orders;

-- Distinct — уникальные значения
SELECT DISTINCT country FROM users WHERE is_active = true;

-- ORDER BY с сортировкой
SELECT * FROM products 
ORDER BY created_at DESC, price ASC;

2. Aggregation functions

Для подсчета, суммирования, усреднения.

-- Базовые агрегаты
SELECT 
    COUNT(*) as total_users,
    COUNT(DISTINCT country) as countries,
    SUM(total_spent) as revenue,
    AVG(total_spent) as avg_spending,
    MIN(created_at) as earliest_user,
    MAX(created_at) as latest_user
FROM users;

-- GROUP BY для группировки
SELECT 
    DATE_TRUNC('month', created_at) as month,
    COUNT(*) as orders,
    SUM(amount) as revenue,
    AVG(amount) as avg_order,
    PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY amount) as median
FROM orders
GROUP BY DATE_TRUNC('month', created_at)
ORDER BY month DESC;

-- HAVING для фильтрации ПОСЛЕ группировки
SELECT 
    category,
    COUNT(*) as product_count,
    AVG(price) as avg_price
FROM products
GROUP BY category
HAVING COUNT(*) > 10 AND AVG(price) > 100;

3. JOIN операции

Для комбинирования данных из разных таблиц.

-- INNER JOIN — только совпадающие
SELECT 
    u.user_id,
    u.name,
    o.order_id,
    o.amount
FROM users u
INNER JOIN orders o ON u.user_id = o.user_id;

-- LEFT JOIN — все с левой таблицы + совпадающие справа
SELECT 
    u.user_id,
    u.name,
    COUNT(o.order_id) as order_count
FROM users u
LEFT JOIN orders o ON u.user_id = o.user_id
GROUP BY u.user_id, u.name;

-- FULL OUTER JOIN — все с обеих сторон
SELECT 
    u.user_id,
    u.name,
    d.discount_id
FROM users u
FULL OUTER JOIN discounts d ON u.user_id = d.user_id;

-- Multiple joins
SELECT 
    u.name,
    o.order_id,
    p.product_name,
    op.quantity,
    op.price
FROM users u
INNER JOIN orders o ON u.user_id = o.user_id
INNER JOIN order_products op ON o.order_id = op.order_id
INNER JOIN products p ON op.product_id = p.product_id;

4. Window functions

Мощные функции для анализа.

-- ROW_NUMBER — ранжирование
SELECT 
    user_id,
    created_at,
    amount,
    ROW_NUMBER() OVER (ORDER BY amount DESC) as rank,
    RANK() OVER (ORDER BY amount DESC) as rank_with_ties,
    DENSE_RANK() OVER (ORDER BY amount DESC) as dense_rank
FROM orders;

-- PARTITION BY — группировка внутри window function
SELECT 
    user_id,
    order_date,
    amount,
    SUM(amount) OVER (PARTITION BY user_id ORDER BY order_date) as running_total,
    AVG(amount) OVER (PARTITION BY user_id) as user_avg
FROM orders;

-- LAG и LEAD — получение данных из предыдущих/следующих строк
SELECT 
    order_date,
    amount,
    LAG(amount, 1) OVER (ORDER BY order_date) as prev_amount,
    LEAD(amount, 1) OVER (ORDER BY order_date) as next_amount,
    amount - LAG(amount) OVER (ORDER BY order_date) as change
FROM orders;

-- FIRST_VALUE и LAST_VALUE
SELECT 
    user_id,
    order_date,
    amount,
    FIRST_VALUE(amount) OVER (PARTITION BY user_id ORDER BY order_date) as first_order,
    LAST_VALUE(amount) OVER (PARTITION BY user_id ORDER BY order_date ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as last_order
FROM orders;

5. Subqueries и CTE (WITH)

-- Subquery в WHERE
SELECT * FROM users 
WHERE user_id IN (
    SELECT user_id FROM orders 
    WHERE amount > 1000
);

-- CTE (Common Table Expression) — более читаемо
WITH high_value_orders AS (
    SELECT user_id, COUNT(*) as order_count, SUM(amount) as total_spent
    FROM orders
    WHERE amount > 1000
    GROUP BY user_id
),
user_segments AS (
    SELECT 
        u.user_id,
        u.name,
        COALESCE(h.order_count, 0) as high_value_order_count,
        COALESCE(h.total_spent, 0) as high_value_spent
    FROM users u
    LEFT JOIN high_value_orders h ON u.user_id = h.user_id
)
SELECT * FROM user_segments WHERE high_value_order_count > 0;

6. UPDATE, INSERT, DELETE

Данные изменяются редко в analytics, но нужно знать.

-- INSERT — добавление строк
INSERT INTO users (name, email, country)
VALUES ('John Doe', 'john@example.com', 'USA');

-- INSERT from SELECT
INSERT INTO user_backup
SELECT * FROM users WHERE created_at < '2020-01-01';

-- UPDATE — изменение данных
UPDATE users 
SET last_login = NOW() 
WHERE user_id = 123;

-- UPDATE с JOIN
UPDATE users u
SET is_vip = true
FROM orders o
WHERE u.user_id = o.user_id
AND o.amount > 5000;

-- DELETE — удаление
DELETE FROM users WHERE is_active = false AND last_login < NOW() - INTERVAL '1 year';

7. String functions

-- Работа со строками
SELECT 
    UPPER(name) as name_upper,
    LOWER(email) as email_lower,
    LENGTH(name) as name_length,
    SUBSTRING(email FROM 1 FOR 5) as email_prefix,
    CONCAT(first_name, ' ', last_name) as full_name,
    TRIM(name) as trimmed_name,
    REPLACE(email, '@example.com', '@company.com') as new_email,
    SPLIT_PART(email, '@', 1) as username  -- PostgreSQL
FROM users;

-- Pattern matching
SELECT * FROM users 
WHERE email LIKE '%@gmail.com';

-- Regular expressions (PostgreSQL)
SELECT * FROM users 
WHERE email ~ '^[a-z]+@[a-z]+\\.[a-z]+$';

8. Date/Time functions

-- Работа с датами
SELECT 
    NOW() as current_time,
    CURRENT_DATE as today,
    DATE_TRUNC('month', created_at) as month_start,
    DATE_TRUNC('week', created_at) as week_start,
    EXTRACT(YEAR FROM created_at) as year,
    EXTRACT(MONTH FROM created_at) as month,
    EXTRACT(DAY FROM created_at) as day,
    EXTRACT(DOW FROM created_at) as day_of_week,  -- 0=Sunday
    created_at + INTERVAL '7 days' as week_later,
    created_at - INTERVAL '30 days' as month_ago,
    DATEDIFF('day', created_at, NOW()) as days_old
FROM users;

-- TO_DATE для парсинга
SELECT TO_DATE('2024-01-15', 'YYYY-MM-DD') as parsed_date;

9. UNION, EXCEPT, INTERSECT

-- UNION — объединение результатов (уникальные)
SELECT name FROM users WHERE country = 'USA'
UNION
SELECT name FROM premium_users WHERE country = 'USA';

-- UNION ALL — объединение (все)
SELECT user_id FROM active_users
UNION ALL
SELECT user_id FROM inactive_users;

-- EXCEPT — разница (в первом, но не во втором)
SELECT user_id FROM users
EXCEPT
SELECT user_id FROM banned_users;

-- INTERSECT — пересечение (в обоих)
SELECT user_id FROM premium_users
INTERSECT
SELECT user_id FROM orders_made_this_month;

10. Оптимизация SQL

-- Плохо: Full table scan
SELECT * FROM large_table WHERE YEAR(created_at) = 2024;

-- Хорошо: Используй диапазон
SELECT * FROM large_table 
WHERE created_at >= '2024-01-01' AND created_at < '2025-01-01';

-- Плохо: Функция на индексированном поле
SELECT * FROM users WHERE LOWER(email) = 'john@example.com';

-- Хорошо: Без функции (если поле хранится в lowercase)
SELECT * FROM users WHERE email = 'john@example.com';

-- Плохо: SELECT *
SELECT * FROM large_table;

-- Хорошо: Только нужные колонки
SELECT user_id, name, email FROM users;

11. Window function для ML features

-- RFM анализ (Recency, Frequency, Monetary)
WITH user_stats AS (
    SELECT 
        user_id,
        MAX(order_date) as last_order_date,
        COUNT(*) as purchase_frequency,
        SUM(amount) as lifetime_value,
        DATEDIFF('day', MAX(order_date), CURRENT_DATE) as recency_days
    FROM orders
    GROUP BY user_id
)
SELECT 
    user_id,
    recency_days,
    purchase_frequency,
    lifetime_value,
    NTILE(5) OVER (ORDER BY recency_days DESC) as recency_quintile,
    NTILE(5) OVER (ORDER BY purchase_frequency) as frequency_quintile,
    NTILE(5) OVER (ORDER BY lifetime_value) as monetary_quintile
FROM user_stats;

-- Time-based features для churn prediction
SELECT 
    user_id,
    DATE_TRUNC('month', order_date) as month,
    SUM(amount) as monthly_revenue,
    COUNT(*) as monthly_orders,
    LAG(SUM(amount)) OVER (PARTITION BY user_id ORDER BY DATE_TRUNC('month', order_date)) as prev_month_revenue
FROM orders
GROUP BY user_id, DATE_TRUNC('month', order_date);

12. Полезные техники для Data Science

-- Аномалия детекция (IQR метод)
WITH quartiles AS (
    SELECT 
        amount,
        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY amount) as q1,
        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY amount) as q3
    FROM orders
)
SELECT 
    order_id,
    amount,
    CASE 
        WHEN amount < q1 - 1.5 * (q3 - q1) THEN 'outlier_low'
        WHEN amount > q3 + 1.5 * (q3 - q1) THEN 'outlier_high'
        ELSE 'normal'
    END as anomaly_flag
FROM orders, quartiles;

-- Cohort analysis
WITH cohorts AS (
    SELECT 
        user_id,
        DATE_TRUNC('month', MIN(created_at)) as cohort_month
    FROM orders
    GROUP BY user_id
)
SELECT 
    DATE_TRUNC('month', o.order_date) as order_month,
    c.cohort_month,
    DATEDIFF('month', c.cohort_month, DATE_TRUNC('month', o.order_date)) as months_since_cohort,
    COUNT(DISTINCT o.user_id) as users
FROM orders o
JOIN cohorts c ON o.user_id = c.user_id
GROUP BY c.cohort_month, DATE_TRUNC('month', o.order_date);

Мой typical workflow

-- 1. Explore data
SELECT * FROM raw_data LIMIT 100;

-- 2. Check data quality
SELECT 
    COUNT(*) as total_rows,
    COUNT(DISTINCT user_id) as unique_users,
    COUNT(CASE WHEN value IS NULL THEN 1 END) as null_values
FROM raw_data;

-- 3. Aggregate и clean
WITH cleaned AS (
    SELECT 
        user_id,
        DATE_TRUNC('day', created_at) as event_date,
        value,
        ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY created_at) as event_num
    FROM raw_data
    WHERE value IS NOT NULL AND created_at >= '2024-01-01'
)
SELECT * FROM cleaned;

-- 4. Export для Python/Pandas
COPY (SELECT * FROM cleaned_data) TO STDOUT WITH CSV HEADER;

Performance tips

  1. Всегда используй WHERE для фильтрации
  2. Создавай индексы на JOIN и WHERE полях
  3. Избегай функций на индексированных полях
  4. Используй EXPLAIN для анализа query plans
  5. GROUP BY более efficient чем subqueries
  6. Partition big tables по дате