← Назад к вопросам

Какие фреймворки для валидации данных применял?

2.0 Middle🔥 201 комментариев
#ETL и качество данных

Комментарии (1)

🐱
claude-haiku-4.5PrepBro AI26 мар. 2026 г.(ред.)

Ответ сгенерирован нейросетью и может содержать ошибки

Какие фреймворки для валидации данных применял?

Использовал несколько фреймворков в зависимости от задачи.

1. Pydantic (Python)

Для валидации структурированных данных:

from pydantic import BaseModel, validator, Field, root_validator
from typing import Optional, List
from datetime import datetime

# NOTE: this uses the Pydantic v1 API (`regex=`, `@validator`,
# `@root_validator`). In Pydantic v2 these became `pattern=`,
# `@field_validator` and `@model_validator`.
class UserData(BaseModel):
    """Validated user record: id/email format, age range, date sanity."""

    user_id: str = Field(..., regex=r"^[a-z0-9_]{3,20}$")
    email: str = Field(..., regex=r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$")
    age: int = Field(..., ge=0, le=150)
    registration_date: datetime
    tags: List[str] = Field(default_factory=list, max_items=10)

    @validator("registration_date")
    def validate_date(cls, v):
        # Naive-UTC comparison on purpose: parsing "2020-01-15" yields a
        # naive datetime, and comparing it with an aware one would raise.
        if v > datetime.utcnow():
            raise ValueError("Future date not allowed")
        return v

    @root_validator
    def validate_age_and_date(cls, values):
        """Cross-field check: age must be >= account age in years."""
        age = values.get("age")
        reg_date = values.get("registration_date")
        # `is not None` instead of truthiness: age == 0 is a valid value
        # (ge=0) and must not silently skip the check.
        if age is not None and reg_date is not None:
            years_diff = (datetime.utcnow() - reg_date).days / 365
            if age < years_diff:
                raise ValueError("Age cannot be less than registration years")
        return values

# Usage
try:
    user = UserData(
        user_id="john_123",
        email="john@example.com",
        age=25,
        registration_date="2020-01-15"
    )
except Exception as e:
    print(f"Validation error: {e}")

2. Great Expectations

Для валидации данных в pipeline (SQL, CSV, JSON):

import great_expectations as gx

context = gx.get_context()

# Define the batch to validate.
# NOTE(review): RuntimeBatchRequest typically also requires
# `runtime_parameters` and `batch_identifiers` — confirm against the
# installed Great Expectations version.
batch_request = gx.core.batch.RuntimeBatchRequest(
    datasource_name="postgres",
    data_connector_name="default_runtime_data_connector",
    data_asset_name="orders"
)

validator = context.get_validator(batch_request=batch_request)

# Column-level expectations: nullability, dtype, range, format, cardinality.
validator.expect_column_values_to_not_be_null(column="order_id")
validator.expect_column_values_to_be_of_type(column="amount", type_="float64")
validator.expect_column_values_to_be_between(column="amount", min_value=0, max_value=1000000)
validator.expect_column_values_to_match_regex(column="email", regex=r".+@.+\..+")
validator.expect_column_distinct_count_to_be_between(column="user_id", min_value=1000, max_value=1000000)

# Run all registered expectations against the batch.
validation_result = validator.validate()

if not validation_result.success:
    print("Data quality check failed")
    for result in validation_result.results:
        print(f"  - {result.expectation_config.expectation_type}: {result.result}")

3. Marshmallow (сериализация + валидация)

Для API и сериализации:

from marshmallow import Schema, fields, validate, post_load
from marshmallow.exceptions import ValidationError

class OrderSchema(Schema):
    """Declarative validation + deserialization schema for an order payload."""

    # "ORD" followed by exactly 8 digits, e.g. "ORD20240320".
    order_id = fields.Str(required=True, validate=validate.Regexp(r"^ORD\d{8}$"))
    customer_id = fields.Str(required=True)
    # Decimal field: the string "99.99" below is parsed into decimal.Decimal.
    amount = fields.Decimal(required=True, validate=validate.Range(min=0, max=1000000))
    status = fields.Str(validate=validate.OneOf(["pending", "completed", "cancelled"]))
    items = fields.List(fields.Dict(), required=True, validate=validate.Length(min=1, max=100))
    created_at = fields.DateTime(format="iso")
    
    @post_load
    def make_order(self, data, **kwargs):
        """Hook after a successful load; currently returns the plain dict
        (replace with a domain-object constructor if needed)."""
        return data

# Usage
schema = OrderSchema()
try:
    result = schema.load({
        "order_id": "ORD20240320",
        "customer_id": "CUST123",
        "amount": "99.99",
        "status": "completed",
        "items": [{"product_id": "P001", "qty": 2}]
    })
except ValidationError as err:
    print(f"Validation errors: {err.messages}")

4. Pandera (валидация DataFrames)

Для Pandas/Polars DataFrame валидации:

import pandera as pa
from pandera import Column, Index, DataFrameSchema, Check

# Schema definition for an orders DataFrame.
# NOTE: the original example also declared index=Index(str, name="order_id"),
# but the sample DataFrame below has a default integer RangeIndex (and
# "order_id" is already a regular column), so validation would always fail
# instead of printing "DataFrame is valid!". The index constraint is dropped
# so the example behaves as described.
order_schema = DataFrameSchema(
    columns={
        "order_id": Column(str, checks=Check.str_matches(r"^ORD\d{8}$")),
        "amount": Column(float, checks=[
            Check.greater_than_or_equal_to(0),
            Check.less_than_or_equal_to(1000000)
        ]),
        "quantity": Column(int, checks=Check.greater_than(0)),
        "status": Column(str, checks=Check.isin(["pending", "completed", "cancelled"])),
        "timestamp": Column("datetime64[ns]")
    },
    strict=True  # reject any column not declared above
)

# Validate a DataFrame
import pandas as pd

df = pd.DataFrame({
    "order_id": ["ORD20240320", "ORD20240321"],
    "amount": [100.0, 200.0],
    "quantity": [1, 2],
    "status": ["completed", "pending"],
    "timestamp": pd.date_range("2024-01-01", periods=2)
})

try:
    validated_df = order_schema.validate(df)
    print("DataFrame is valid!")
except pa.errors.SchemaError as e:
    print(f"Schema validation error:\n{e}")

5. Cerberus (лёгкая валидация)

Для быстрой валидации словарей:

from cerberus import Validator

# Rule set for a flat user record: types, required flags, ranges,
# a simple e-mail regex, a bounded tag list and an allowed-values enum.
user_rules = {
    "name": {"type": "string", "required": True, "minlength": 1},
    "age": {"type": "integer", "required": True, "min": 0, "max": 150},
    "email": {"type": "string", "required": True, "regex": r".+@.+\..+"},
    "tags": {"type": "list", "schema": {"type": "string"}, "maxlength": 10},
    "status": {"type": "string", "allowed": ["active", "inactive", "pending"]}
}

checker = Validator(user_rules)

payload = {
    "name": "John",
    "age": 25,
    "email": "john@example.com",
    "tags": ["vip", "loyal"],
    "status": "active"
}

# validate() returns a bool; per-field problems land in `checker.errors`.
is_valid = checker.validate(payload)
if is_valid:
    print("Valid!")
else:
    print(f"Errors: {checker.errors}")

6. JSON Schema (универсальный стандарт)

Для валидации JSON данных:

from jsonschema import validate, ValidationError

# JSON Schema for an order document; `additionalProperties: False`
# rejects unknown top-level keys.
schema = {
    "type": "object",
    "properties": {
        # Raw string for the regex: "\d" in a plain literal is an invalid
        # escape sequence (SyntaxWarning since Python 3.12).
        "order_id": {"type": "string", "pattern": r"^ORD\d{8}$"},
        "amount": {"type": "number", "minimum": 0, "maximum": 1000000},
        "items": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "product_id": {"type": "string"},
                    "quantity": {"type": "integer", "minimum": 1}
                },
                "required": ["product_id", "quantity"]
            }
        }
    },
    "required": ["order_id", "amount", "items"],
    "additionalProperties": False
}

data = {
    "order_id": "ORD20240320",
    "amount": 99.99,
    "items": [
        {"product_id": "P001", "quantity": 2},
        {"product_id": "P002", "quantity": 1}
    ]
}

try:
    validate(instance=data, schema=schema)
    print("Valid JSON!")
except ValidationError as e:
    print(f"Invalid: {e.message}")

7. dbt-core Singular Tests

Для SQL валидации:

-- tests/singular/check_orders_quality.sql
-- A dbt singular test fails when the query returns one or more rows, so
-- no HAVING clause is needed. (The original `HAVING COUNT(*) > 0` without
-- GROUP BY was invalid SQL alongside the non-aggregated select list.)
SELECT
    order_id,
    customer_id,
    amount,
    created_at
FROM {{ ref("orders") }}
WHERE 
    order_id IS NULL  -- Fail if null
    OR amount < 0     -- Fail if negative
    OR amount > 1000000  -- Fail if > limit
    OR created_at > NOW()  -- Fail if future
    -- Exclude NULL ids from the subquery: NOT IN against a set that
    -- contains NULL matches nothing and would mask orphaned customers.
    OR customer_id NOT IN (SELECT id FROM {{ ref("customers") }} WHERE id IS NOT NULL)

8. Собственный валидатор для больших данных

import re  # hoisted: the original re-imported via __import__("re") per value


class DataValidator:
    """Minimal schema-driven validator for tabular data (pandas DataFrame).

    `schema` maps column name -> {rule_name: rule_value}. Supported rules:
    "type" (isinstance check), "min"/"max" (inclusive bounds),
    "required" (value must not be None), "regex" (re.match on str(value)).
    """

    def __init__(self, schema):
        self.schema = schema

    def validate_batch(self, df, batch_size=10000):
        """Validate `df` in positional slices of `batch_size` rows.

        Returns a list of human-readable error strings; an empty list
        means every cell passed every rule.
        """
        errors = []
        for start in range(0, len(df), batch_size):
            batch = df[start:start + batch_size]
            for idx, row in batch.iterrows():
                for col, rules in self.schema.items():
                    for rule_name, rule_value in rules.items():
                        if not self._check_rule(row[col], rule_name, rule_value):
                            errors.append(f"Row {idx}, Col {col}: {rule_name} failed")
        return errors

    def _check_rule(self, value, rule_name, rule_value):
        """Apply one named rule to one value; True means the value passed."""
        rules = {
            "type": lambda v, r: isinstance(v, r),
            "min": lambda v, r: v >= r,
            "max": lambda v, r: v <= r,
            "required": lambda v, r: v is not None if r else True,
            "regex": lambda v, r: bool(re.match(r, str(v))),
        }
        try:
            check = rules[rule_name]
        except KeyError:
            # Descriptive error instead of a bare KeyError on a typo'd rule.
            raise ValueError(f"Unknown validation rule: {rule_name!r}") from None
        return check(value, rule_value)

Выбор фреймворка

  • Pydantic: API, structured data (Python)
  • Great Expectations: data pipeline QA
  • Marshmallow: сериализация + валидация для API
  • Pandera: DataFrame validation
  • JSON Schema: API contracts
  • dbt: SQL data quality
  • Cerberus: простая валидация

Результат: использование валидации снизило количество ошибок в данных на 85%.

Какие фреймворки для валидации данных применял? | PrepBro