"""
Unit tests - DataValidator.

Validation and cleaning tests for market data.
"""

import pytest
import pandas as pd
import numpy as np

from src.data.data_validator import DataValidator


class TestDataValidation:
    """Checks on the (is_valid, errors) pair returned by DataValidator.validate()."""

    def test_validate_valid_data(self, sample_ohlcv_data):
        """The sample OHLCV fixture is reported valid with an empty error list."""
        checker = DataValidator()

        ok, problems = checker.validate(sample_ohlcv_data)

        assert ok is True
        assert len(problems) == 0

    def test_validate_empty_dataframe(self):
        """An empty DataFrame is rejected and the first error mentions 'empty'."""
        checker = DataValidator()
        empty_frame = pd.DataFrame()

        ok, problems = checker.validate(empty_frame)

        assert ok is False
        assert len(problems) > 0
        first_message = problems[0].lower()
        assert 'empty' in first_message

    def test_validate_missing_columns(self):
        """A frame lacking required columns is rejected with a 'missing columns' error."""
        checker = DataValidator()
        # Only open/close are supplied: high, low and volume are absent.
        partial_columns = {
            'open': [1.1, 1.2],
            'close': [1.15, 1.25],
        }
        frame = pd.DataFrame(partial_columns)

        ok, problems = checker.validate(frame)

        assert ok is False
        assert any('missing columns' in msg.lower() for msg in problems)

    def test_validate_price_inconsistency(self):
        """Rows where low exceeds high are rejected with a 'high < low' error."""
        checker = DataValidator()
        inconsistent = {
            'open': [1.1, 1.2, 1.3],
            'high': [1.15, 1.25, 1.35],
            'low': [1.2, 1.3, 1.4],  # low > high on every row (invalid)
            'close': [1.12, 1.22, 1.32],
            'volume': [1000, 2000, 3000],
        }
        frame = pd.DataFrame(inconsistent)

        ok, problems = checker.validate(frame)

        assert ok is False
        assert any('high < low' in msg.lower() for msg in problems)

    def test_validate_excessive_missing_values(self):
        """Too many NaNs for max_missing_pct=0.05 yields a 'missing values' error."""
        checker = DataValidator(config={'max_missing_pct': 0.05})
        # 5-row pattern repeated 10 times -> 50 rows with NaNs in every price column.
        sparse = {
            'open': [1.1, np.nan, 1.3, np.nan, 1.5] * 10,
            'high': [1.15, 1.25, np.nan, 1.45, 1.55] * 10,
            'low': [1.05, 1.15, 1.25, np.nan, 1.45] * 10,
            'close': [1.12, 1.22, 1.32, 1.42, np.nan] * 10,
            'volume': [1000] * 50,
        }
        frame = pd.DataFrame(sparse)

        ok, problems = checker.validate(frame)

        assert ok is False
        assert any('missing values' in msg.lower() for msg in problems)
class TestDataCleaning:
    """Checks on the DataFrame returned by DataValidator.clean()."""

    def test_clean_removes_duplicates(self):
        """A row appended with an already-present timestamp is dropped by clean()."""
        validator = DataValidator()

        # Lowercase 'h': the uppercase 'H' alias is deprecated since pandas 2.2.
        dates = pd.date_range('2024-01-01', periods=10, freq='1h')
        df = pd.DataFrame({
            'open': [1.1] * 10,
            'high': [1.15] * 10,
            'low': [1.05] * 10,
            'close': [1.12] * 10,
            'volume': [1000] * 10
        }, index=dates)

        # Append a copy of row 5 so one timestamp appears twice.
        df = pd.concat([df, df.iloc[[5]]])
        assert len(df) == 11

        df_clean = validator.clean(df)

        assert len(df_clean) == 10

    def test_clean_sorts_chronologically(self):
        """clean() returns rows with a monotonically increasing index."""
        validator = DataValidator()

        # Lowercase 'h': the uppercase 'H' alias is deprecated since pandas 2.2.
        dates = pd.date_range('2024-01-01', periods=10, freq='1h')
        df = pd.DataFrame({
            'open': [1.1] * 10,
            'high': [1.15] * 10,
            'low': [1.05] * 10,
            'close': [1.12] * 10,
            'volume': [1000] * 10
        }, index=dates)

        # Use a fixed permutation rather than an unseeded df.sample(frac=1):
        # the test input is identical on every run and is guaranteed to be
        # out of order, so the final assertion can never pass vacuously.
        df = df.iloc[[3, 7, 0, 9, 1, 5, 2, 8, 6, 4]]
        assert not df.index.is_monotonic_increasing

        df_clean = validator.clean(df)

        assert df_clean.index.is_monotonic_increasing

    def test_clean_interpolates_missing_values(self):
        """After clean(), the open/high/low columns contain no NaN."""
        validator = DataValidator()

        df = pd.DataFrame({
            'open': [1.1, np.nan, 1.3, 1.4, 1.5],
            'high': [1.15, 1.25, np.nan, 1.45, 1.55],
            'low': [1.05, 1.15, 1.25, np.nan, 1.45],
            'close': [1.12, 1.22, 1.32, 1.42, 1.52],
            'volume': [1000, 2000, 3000, 4000, 5000]
        })

        df_clean = validator.clean(df)

        # Only the absence of NaN is checked, not the filled-in values.
        assert df_clean['open'].isna().sum() == 0
        assert df_clean['high'].isna().sum() == 0
        assert df_clean['low'].isna().sum() == 0

    def test_clean_fixes_price_inconsistencies(self):
        """After clean(), high is >= and low is <= each of open and close."""
        validator = DataValidator()

        df = pd.DataFrame({
            'open': [1.1, 1.2, 1.3],
            'high': [1.05, 1.15, 1.25],  # high < open (invalid)
            'low': [1.15, 1.25, 1.35],   # low > open (invalid)
            'close': [1.12, 1.22, 1.32],
            'volume': [1000, 2000, 3000]
        })

        df_clean = validator.clean(df)

        # high must bound every price from above, low from below.
        assert (df_clean['high'] >= df_clean['low']).all()
        assert (df_clean['high'] >= df_clean['open']).all()
        assert (df_clean['high'] >= df_clean['close']).all()
        assert (df_clean['low'] <= df_clean['open']).all()
        assert (df_clean['low'] <= df_clean['close']).all()


class TestDataQualityReport:
    """Checks on the mapping returned by DataValidator.get_data_quality_report()."""

    def test_generate_quality_report(self, sample_ohlcv_data):
        """The report exposes the expected top-level keys for the sample fixture."""
        validator = DataValidator()

        report = validator.get_data_quality_report(sample_ohlcv_data)

        assert 'total_rows' in report
        assert 'date_range' in report
        assert 'missing_values' in report
        assert 'is_valid' in report
        assert 'price_stats' in report

        assert report['total_rows'] == len(sample_ohlcv_data)
        assert report['is_valid'] is True

    def test_report_includes_statistics(self, sample_ohlcv_data):
        """report['price_stats'] carries close-price mean/std/min/max entries."""
        validator = DataValidator()

        report = validator.get_data_quality_report(sample_ohlcv_data)
        price_stats = report['price_stats']

        assert 'mean_close' in price_stats
        assert 'std_close' in price_stats
        assert 'min_close' in price_stats
        assert 'max_close' in price_stats

        assert price_stats['mean_close'] > 0
        assert price_stats['std_close'] > 0