Как оцениваешь качество модели?

Question

claude-haiku-4.5 · Accepted Answer

Оценка качества модели — многоуровневый процесс, который зависит от типа задачи, характера данных и требований бизнеса. Я использую системный подход, сочетающий различные метрики и методы валидации.

## 1. Выбор метрик в зависимости от задачи

### Классификация

Для **бинарной классификации**:

```python
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, roc_curve, auc
)
import matplotlib.pyplot as plt

# Toy example: true labels, hard predictions, and positive-class scores.
y_test = [0, 1, 1, 0, 1, 1, 0, 0, 1]
y_pred = [0, 1, 1, 0, 0, 1, 0, 1, 1]
y_proba = [0.1, 0.95, 0.92, 0.15, 0.4, 0.88, 0.2, 0.6, 0.85]

# Basic metrics: the first four use the hard predictions,
# ROC-AUC is computed from the scores.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")

# Confusion matrix shows which kind of error the model makes.
cm = confusion_matrix(y_test, y_pred)
# Newlines are written as escapes: a single-quoted f-string cannot span
# several physical lines.
print(f"\nConfusion Matrix:\n{cm}")
print(f"TN: {cm[0,0]}, FP: {cm[0,1]}")
print(f"FN: {cm[1,0]}, TP: {cm[1,1]}")

# ROC curve, with the diagonal drawn as the random-guess baseline.
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f"ROC curve (AUC = {roc_auc:.3f})")
plt.plot([0, 1], [0, 1], "k--", label="Random")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.title("ROC Curve")
plt.grid(True, alpha=0.3)
plt.show()
```

Для **многоклассовой классификации**:

```python
from sklearn.metrics import (
    precision_recall_fscore_support,
    classification_report
)

# Toy three-class example.
y_test = [0, 1, 2, 0, 1, 2, 1, 0]
y_pred = [0, 1, 2, 0, 1, 1, 1, 0]

# Detailed per-class report.
print(classification_report(y_test, y_pred, target_names=["Class A", "Class B", "Class C"]))

# Single summary numbers: metrics averaged with average="weighted".
precision, recall, f1, support = precision_recall_fscore_support(
    y_test, y_pred, average="weighted"
)
# The leading newline is written as an escape: a single-quoted f-string
# cannot span several physical lines.
print(f"\nWeighted Precision: {precision:.4f}")
print(f"Weighted Recall: {recall:.4f}")
print(f"Weighted F1: {f1:.4f}")
```

### Регрессия

```python
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
)
import numpy as np
import matplotlib.pyplot as plt

# Toy regression example: true values and predictions.
y_test = [3.0, 3.5, 2.5, 4.0, 2.8]
y_pred = [3.1, 3.4, 2.3, 4.2, 2.9]

# Core metrics; RMSE is derived from MSE.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"MAPE: {mape:.4f}")
print(f"R²: {r2:.4f}")

# Residuals. Plain Python lists do not support element-wise subtraction,
# so both sides are converted to arrays first.
residuals = np.asarray(y_test) - np.asarray(y_pred)
print(f"\nСредний остаток: {np.mean(residuals):.4f}")
print(f"Стд отклонение остатков: {np.std(residuals):.4f}")

# Residual diagnostics: scatter against predictions (left) and histogram (right).
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.scatter(y_pred, residuals)
plt.axhline(y=0, color="r", linestyle="--")
plt.xlabel("Predicted")
plt.ylabel("Residuals")
plt.title("Residual Plot")
plt.grid(True, alpha=0.3)

plt.subplot(1, 2, 2)
plt.hist(residuals, bins=10, edgecolor="black")
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.title("Distribution of Residuals")
plt.grid(True, alpha=0.3, axis="y")

plt.tight_layout()
plt.show()
```

## 2. Кросс-валидация

```python
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Synthetic data: 100 samples, 10 features, random binary labels.
X_train = np.random.randn(100, 10)
y_train = np.random.randint(0, 2, 100)

model = RandomForestClassifier(random_state=42)

# 5-fold cross-validation scored by ROC-AUC.
scores = cross_val_score(model, X_train, y_train, cv=5, scoring="roc_auc")

mean_score = scores.mean()
# NOTE(review): mean ± 1.96·std of the fold scores describes the spread
# between folds; treat it as a rough interval rather than a strict
# confidence interval for the mean.
half_width = 1.96 * scores.std()

print(f"Scores: {scores}")
print(f"Mean: {mean_score:.4f}")
print(f"Std: {scores.std():.4f}")
print(f"Confidence interval (95%): [{mean_score - half_width:.4f}, {mean_score + half_width:.4f}]")

# Stratified K-fold for imbalanced data: shuffled splits with a fixed seed.
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, val_idx in skf.split(X_train, y_train):
    # Training part of the fold, then the validation part.
    X_train_fold, y_train_fold = X_train[train_idx], y_train[train_idx]
    X_val_fold, y_val_fold = X_train[val_idx], y_train[val_idx]
```

## 3. Анализ переобучения

```python
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt

model = RandomForestClassifier(random_state=42)

# Learning curves: ROC-AUC at ten training-set fractions from 10% to 100%,
# each evaluated with 5-fold cross-validation.
train_sizes, train_scores, val_scores = learning_curve(
    model,
    X_train,
    y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
    scoring="roc_auc",
)

# Aggregate over folds (axis 1) for every training-set size.
train_mean, train_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
val_mean, val_std = np.mean(val_scores, axis=1), np.std(val_scores, axis=1)

plt.figure(figsize=(10, 6))
curves = (
    ("Train score", train_mean, train_std),
    ("Validation score", val_mean, val_std),
)
for curve_label, curve_mean, curve_std in curves:
    # Mean line plus a ±1 std band.
    plt.plot(train_sizes, curve_mean, label=curve_label, marker="o")
    plt.fill_between(train_sizes, curve_mean - curve_std, curve_mean + curve_std, alpha=0.2)
plt.xlabel("Training set size")
plt.ylabel("Score")
plt.title("Learning Curve")
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# How to read the plot:
# - both curves are low -> HIGH BIAS (the model is not complex enough)
# - large gap between the curves -> HIGH VARIANCE (overfitting)
```

## 4. Анализ по сегментам

```python
import pandas as pd

# Evaluate quality separately for each subgroup.
# NOTE(review): X_test, y_test and a fitted model are not defined in this
# snippet; it assumes X_test is a DataFrame with a "segment" column and
# y_test supports boolean-mask indexing. Confirm against the caller.
segments = ["young_users", "old_users", "premium", "free"]

# One pass per segment: mask and predict once, then reuse the predictions
# for both the printed summary and the plotted table.
metrics = {}
for segment in segments:
    mask = X_test["segment"] == segment
    if not mask.any():
        # Nothing to predict on; skip instead of failing inside predict().
        print(f"{segment}: no samples, skipped")
        continue

    y_seg = y_test[mask]
    y_pred_seg = model.predict(X_test[mask])

    metrics[segment] = {
        "Precision": precision_score(y_seg, y_pred_seg),
        "Recall": recall_score(y_seg, y_pred_seg),
        "F1": f1_score(y_seg, y_pred_seg),
    }
    print(f"{segment}: F1 = {metrics[segment]['F1']:.4f}, n = {len(y_seg)}")

# Visualization: columns are segments, rows are metrics; transposed for
# plotting so that each segment becomes one group of bars.
metrics_by_segment = pd.DataFrame(metrics)

metrics_by_segment.T.plot(kind="bar", figsize=(10, 5))
plt.title("Метрики по сегментам")
plt.ylabel("Score")
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3, axis="y")
plt.show()
```

## 5. Стабильность модели

```python
# Evaluate each month separately to see whether quality is stable over time.
# NOTE(review): test_data and a fitted model are not defined in this snippet;
# it assumes the model exposes predict_proba (as RandomForestClassifier does).
for month in range(1, 13):
    month_data = test_data[test_data["month"] == month]
    y_month = month_data["target"]

    # ROC-AUC needs both classes in the true labels; this also covers
    # months with no rows at all.
    if y_month.nunique() < 2:
        print(f"Month {month}: AUC undefined (fewer than two classes)")
        continue

    # Score with the positive-class probability: hard labels from predict()
    # reduce the ROC curve to a single operating point and distort the AUC.
    y_score_month = model.predict_proba(month_data[["feature1", "feature2"]])[:, 1]

    auc_month = roc_auc_score(y_month, y_score_month)
    print(f"Month {month}: AUC = {auc_month:.4f}")

# If the metric degrades over time, the model needs retraining.
```

## 6. Практическая оценка

```python
import numpy as np

# Business-driven analysis: the same scores evaluated at different decision
# thresholds, each matching a different cost of errors.
scenarios = [
    {"name": "High Precision (мало ложных срабатываний)",
     "threshold": 0.9,
     "use_case": "Email marketing (не спамить)"},
    
    {"name": "Balanced (F1)",
     "threshold": 0.5,
     "use_case": "General classification"},
    
    {"name": "High Recall (не пропустить случаи)",
     "threshold": 0.3,
     "use_case": "Fraud detection, disease diagnosis"}
]

# Convert once: a plain list cannot be compared element-wise with a float.
proba = np.asarray(y_proba)

for scenario in scenarios:
    threshold = scenario["threshold"]
    y_pred_threshold = (proba >= threshold).astype(int)
    
    precision = precision_score(y_test, y_pred_threshold)
    recall = recall_score(y_test, y_pred_threshold)
    f1 = f1_score(y_test, y_pred_threshold)
    
    # The leading newline is written as an escape: a single-quoted f-string
    # cannot span several physical lines.
    print(f"\n{scenario['name']} (threshold={threshold})")
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
    print(f"Use case: {scenario['use_case']}")
```

## Моя система оценки

1. **Выбираю метрику** в соответствии с задачей и бизнес-целями
2. **Провожу кросс-валидацию** — проверяю стабильность
3. **Анализирую переобучение** — через learning curves
4. **Валидирую по сегментам** — разные группы пользователей
5. **Проверяю стабильность** — временные ряды, разные периоды
6. **Провожу A/B-тест** — финальная проверка на живых пользователях

Я всегда помню, что высокие офлайн-метрики не гарантируют успех в продакшене. Финальный критерий — влияние на бизнес-метрики.

Как оцениваешь качество модели?

Комментарии (1)

1. Выбор метрик в зависимости от задачи

Классификация

Регрессия

2. Кросс-валидация

3. Анализ переобучения

4. Анализ по сегментам

5. Стабильность модели

6. Практическая оценка

Моя система оценки