Skip to content

Statistics Demo

Demonstrates basic statistical analysis: numeric, string, boolean, timestamp, and time-grouped statistics on timeseries data.

Run it: python examples/statistics_demo.py

Modules demonstrated: NumericStatistics, StringStatistics, BooleanStatistics, TimestampStatistics, TimeGroupedStatistics

Related guides: Signal Analytics


#!/usr/bin/env python3
"""
Demonstration of statistics features in ts-shape.

This script shows how to use:
1. NumericStatistics (comprehensive numeric column statistics)
2. StringStatistics (string column analysis)
3. BooleanStatistics (boolean column analysis)
4. TimestampStatistics (timestamp column analysis)
5. TimeGroupedStatistics (time-windowed aggregations)
"""

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import sys
import os

# Add the sibling ../src directory to sys.path so ts_shape can be imported
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))

from ts_shape.features.stats.numeric_stats import NumericStatistics
from ts_shape.features.stats.string_stats import StringStatistics
from ts_shape.features.stats.boolean_stats import BooleanStatistics
from ts_shape.features.stats.timestamp_stats import TimestampStatistics
from ts_shape.features.time_stats.time_stats_numeric import TimeGroupedStatistics


def create_sample_dataframe(n: int = 200, seed: int = 42) -> pd.DataFrame:
    """Create a synthetic DataFrame with multiple column types.

    Args:
        n: Number of rows to generate. Defaults to 200.
        seed: Seed for the private random generator. Defaults to 42.

    Returns:
        A DataFrame with ``n`` rows and the columns:

        * ``systime``: timestamps spaced 5 minutes apart, starting at
          2024-01-01 08:00:00.
        * ``value_double``: draws from a normal distribution (mean 100.0,
          standard deviation 15.0).
        * ``value_integer``: random integers in ``[0, 50)``.
        * ``value_string``: one of ``PRODUCT_A`` .. ``PRODUCT_D`` or ``None``
          (``None`` with probability 0.05).
        * ``value_bool``: ``True``, ``False`` or ``None`` (``None`` with
          probability 0.05).
    """
    # A private RandomState produces the same stream as calling
    # np.random.seed(seed) and then the module-level functions, but it does
    # not reseed the global NumPy generator behind the caller's back.
    rng = np.random.RandomState(seed)

    start_time = datetime(2024, 1, 1, 8, 0, 0)
    timestamps = [start_time + timedelta(minutes=i * 5) for i in range(n)]

    # The draws below must stay in this order: each one consumes the shared
    # random stream, so reordering would change the generated data.
    df = pd.DataFrame({
        'systime': pd.to_datetime(timestamps),
        'value_double': rng.normal(100.0, 15.0, n),
        'value_integer': rng.randint(0, 50, n),
        'value_string': rng.choice(
            ['PRODUCT_A', 'PRODUCT_B', 'PRODUCT_C', 'PRODUCT_D', None],
            n, p=[0.35, 0.30, 0.20, 0.10, 0.05],
        ),
        'value_bool': rng.choice(
            [True, False, None],
            n, p=[0.6, 0.35, 0.05],
        ),
    })

    return df


def demo_numeric_statistics():
    """Demo 1: Numeric column statistics.

    Builds the sample DataFrame and prints NumericStatistics results for the
    'value_double' column: individual statistics, quantiles, derived
    measures, the one-row summary DataFrame, and a describe() table for both
    numeric columns. Everything is written to stdout; nothing is returned.
    """
    print("\n" + "=" * 70)
    print("DEMO 1: Numeric Statistics")
    print("=" * 70)

    df = create_sample_dataframe()
    col = 'value_double'
    print(f"\nAnalyzing column '{col}' ({len(df)} rows)")

    # Individual statistics
    print("\n--- Individual Statistics ---")
    print(f"  Mean:     {NumericStatistics.column_mean(df, col):.4f}")
    print(f"  Median:   {NumericStatistics.column_median(df, col):.4f}")
    print(f"  Std Dev:  {NumericStatistics.column_std(df, col):.4f}")
    print(f"  Variance: {NumericStatistics.column_variance(df, col):.4f}")
    print(f"  Min:      {NumericStatistics.column_min(df, col):.4f}")
    print(f"  Max:      {NumericStatistics.column_max(df, col):.4f}")
    print(f"  Range:    {NumericStatistics.column_range(df, col):.4f}")
    print(f"  IQR:      {NumericStatistics.column_iqr(df, col):.4f}")
    print(f"  Skewness: {NumericStatistics.column_skewness(df, col):.4f}")
    print(f"  Kurtosis: {NumericStatistics.column_kurtosis(df, col):.4f}")

    # Quantiles
    print("\n--- Quantiles ---")
    print(f"  Q1 (25%): {NumericStatistics.column_quantile(df, col, 0.25):.4f}")
    print(f"  Q2 (50%): {NumericStatistics.column_quantile(df, col, 0.50):.4f}")
    print(f"  Q3 (75%): {NumericStatistics.column_quantile(df, col, 0.75):.4f}")
    print(f"  P90:      {NumericStatistics.column_quantile(df, col, 0.90):.4f}")

    # Derived measures
    print("\n--- Derived Measures ---")
    cv = NumericStatistics.coefficient_of_variation(df, col)
    # Test against None rather than truthiness: a coefficient of variation of
    # exactly 0.0 is a valid result but is falsy, and would be shown as "N/A".
    # NOTE(review): presumably None signals an undefined CV (e.g. zero mean);
    # confirm against NumericStatistics.coefficient_of_variation.
    if cv is not None:
        print(f"  Coefficient of Variation: {cv:.4f}")
    else:
        print("  Coefficient of Variation: N/A")
    print(f"  Standard Error of Mean:   {NumericStatistics.standard_error_mean(df, col):.4f}")

    # Summary as DataFrame
    print("\n--- Full Summary (as DataFrame) ---")
    try:
        summary_df = NumericStatistics.summary_as_dataframe(df, col)
        # Only the first row is printed, one "name: value" line per column.
        for col_name in summary_df.columns:
            val = summary_df[col_name].iloc[0]
            if isinstance(val, float):
                print(f"  {col_name}: {val:.4f}")
            else:
                print(f"  {col_name}: {val}")
    except AttributeError as e:
        # pandas >= 2.0 removed Series.mad(); skip gracefully
        print(f"  (Skipped: {e} -- use individual methods instead)")

    # Built-in describe
    print("\n--- pandas describe() ---")
    desc = NumericStatistics.describe(df[['value_double', 'value_integer']])
    print(desc.to_string())


def demo_string_statistics():
    """Demo 2: String column statistics.

    Builds the sample DataFrame and prints StringStatistics results for the
    'value_string' column: basic counts and lengths, the three most common
    strings, pattern-matching counts, a length summary table, and the full
    summary dictionary. Output goes to stdout; nothing is returned.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("DEMO 2: String Statistics")
    print(rule)

    frame = create_sample_dataframe()
    column = 'value_string'
    print(f"\nAnalyzing column '{column}' ({len(frame)} rows)")

    print("\n--- Basic String Statistics ---")
    # (label, StringStatistics method name, format spec). The method is looked
    # up and called inside the loop so each value is computed immediately
    # before its own line is printed.
    basic_rows = (
        ("  Unique values:   ", "count_unique", ""),
        ("  Most frequent:   ", "most_frequent", ""),
        ("  Count of most frequent: ", "count_most_frequent", ""),
        ("  Null count:      ", "count_null", ""),
        ("  Avg string length: ", "average_string_length", ".1f"),
        ("  Longest string:  ", "longest_string", ""),
        ("  Shortest string: ", "shortest_string", ""),
    )
    for label, method_name, spec in basic_rows:
        value = getattr(StringStatistics, method_name)(frame, column)
        print(label + format(value, spec))

    print("\n--- Top 3 Most Common Strings ---")
    print(StringStatistics.most_common_n_strings(frame, 3, column).to_string())

    print("\n--- Pattern Matching ---")
    with_product = StringStatistics.contains_substring_count(frame, 'PRODUCT', column)
    print(f"  Contains 'PRODUCT': {with_product}")
    leading_p = StringStatistics.starts_with_count(frame, 'P', column)
    print(f"  Starts with 'P':    {leading_p}")
    with_digits = StringStatistics.contains_digit_count(frame, column)
    print(f"  Contains digits:    {with_digits}")
    upper_pct = StringStatistics.uppercase_percentage(frame, column)
    print(f"  Uppercase pct:      {upper_pct:.1f}%")

    print("\n--- String Length Summary ---")
    print(StringStatistics.string_length_summary(frame, column).to_string(index=False))

    # Full summary: one "key: value" line per entry.
    print("\n--- Full Summary ---")
    full_summary = StringStatistics.summary_as_dict(frame, column)
    for stat_name, stat_value in full_summary.items():
        print(f"  {stat_name}: {stat_value}")


def demo_boolean_statistics():
    """Demo 3: Boolean column statistics.

    Builds the sample DataFrame and prints BooleanStatistics results for the
    'value_bool' column: counts, percentages, mode and balance check,
    followed by the summary DataFrame. Output goes to stdout; nothing is
    returned.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("DEMO 3: Boolean Statistics")
    print(rule)

    frame = create_sample_dataframe()
    column = 'value_bool'
    print(f"\nAnalyzing column '{column}' ({len(frame)} rows)")

    print("\n--- Boolean Statistics ---")
    # (label, BooleanStatistics method name, format spec, suffix). Methods are
    # resolved and called one row at a time, right before printing that row.
    rows = (
        ("  True count:      ", "count_true", "", ""),
        ("  False count:     ", "count_false", "", ""),
        ("  Null count:      ", "count_null", "", ""),
        ("  Not-null count:  ", "count_not_null", "", ""),
        ("  True percentage: ", "true_percentage", ".1f", "%"),
        ("  False percentage:", "false_percentage", ".1f", "%"),
        ("  Mode:            ", "mode", "", ""),
        ("  Is balanced:     ", "is_balanced", "", ""),
    )
    for label, method_name, spec, suffix in rows:
        value = getattr(BooleanStatistics, method_name)(frame, column)
        print(label + format(value, spec) + suffix)

    # Full summary as DataFrame
    print("\n--- Full Summary ---")
    print(BooleanStatistics.summary_as_dataframe(frame, column).to_string(index=False))


def demo_timestamp_statistics():
    """Demo 4: Timestamp column statistics.

    Builds the sample DataFrame and prints TimestampStatistics results for
    the 'systime' column: range, time-gap analysis, distribution figures,
    the per-hour distribution, and the three most active days. Output goes to
    stdout; nothing is returned.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("DEMO 4: Timestamp Statistics")
    print(rule)

    frame = create_sample_dataframe()
    column = 'systime'
    print(f"\nAnalyzing column '{column}' ({len(frame)} rows)")

    # Each section is (heading, rows); each row is (label, method name).
    # Methods are resolved and called lazily so output order matches the
    # order of computation.
    sections = (
        ("\n--- Timestamp Range ---", (
            ("  Earliest:         ", "earliest_timestamp"),
            ("  Latest:           ", "latest_timestamp"),
            ("  Time range:       ", "timestamp_range"),
            ("  Median timestamp: ", "median_timestamp"),
        )),
        ("\n--- Time Gap Analysis ---", (
            ("  Average time gap: ", "average_time_gap"),
            ("  Std dev of gaps:  ", "standard_deviation_timestamps"),
        )),
        ("\n--- Distribution ---", (
            ("  Most frequent hour: ", "most_frequent_hour"),
            ("  Most frequent day:  ", "most_frequent_day"),
            ("  Null timestamps:    ", "count_null"),
            ("  Valid timestamps:   ", "count_not_null"),
        )),
    )
    for heading, rows in sections:
        print(heading)
        for label, method_name in rows:
            value = getattr(TimestampStatistics, method_name)(frame, column)
            print(f"{label}{value}")

    print("\n--- Hour Distribution ---")
    print(TimestampStatistics.hour_distribution(frame, column).sort_index().to_string())

    print("\n--- Top Activity Days ---")
    print(TimestampStatistics.days_with_most_activity(frame, column, n=3).to_string())


def demo_time_grouped_statistics():
    """Demo 5: Time-grouped statistics.

    Builds the sample DataFrame and prints the first ten rows of several
    TimeGroupedStatistics aggregations of 'value_double' over '1h' windows
    keyed on 'systime': a single statistic, several statistics at once, the
    'diff' statistic, and a custom aggregation function. Output goes to
    stdout; nothing is returned.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("DEMO 5: Time-Grouped Statistics")
    print(rule)

    frame = create_sample_dataframe()
    print(f"\nDataset: {len(frame)} rows, grouped by 1-hour windows")

    # Positional arguments shared by every aggregation call below:
    # (dataframe, time column, value column, window frequency).
    shared = (frame, 'systime', 'value_double', '1h')
    preview = 10

    # Single statistic
    print("\n--- Hourly Mean ---")
    mean_per_hour = TimeGroupedStatistics.calculate_statistic(*shared, 'mean')
    print(mean_per_hour.head(preview).to_string())

    # Multiple statistics
    print("\n--- Hourly Statistics (mean, min, max, range) ---")
    wanted = ['mean', 'min', 'max', 'range']
    several_per_hour = TimeGroupedStatistics.calculate_statistics(*shared, wanted)
    print(several_per_hour.head(preview).to_string())

    # Diff statistic (last - first in window)
    print("\n--- Hourly Difference (last - first) ---")
    diff_per_hour = TimeGroupedStatistics.calculate_statistic(*shared, 'diff')
    print(diff_per_hour.head(preview).to_string())

    # Custom function: std / mean per window, 0 when the mean is exactly 0.
    # Deliberately left as a lambda so the callable handed to the library is
    # the same kind of object as before.
    print("\n--- Custom Aggregation (coefficient of variation per hour) ---")
    std_over_mean = lambda window: 0 if window.mean() == 0 else window.std() / window.mean()
    custom_per_hour = TimeGroupedStatistics.calculate_custom_func(*shared, std_over_mean)
    print(custom_per_hour.head(preview).to_string())


def main():
    """Run all statistics demonstrations.

    Runs the five demo functions in order and prints a completion banner.
    If any demo raises, the error and its traceback are printed and the
    remaining demos are skipped.

    Returns:
        0 when every demo finished, 1 when one of them raised.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("Statistics Features Demonstration")
    print(rule)

    try:
        # The tuple is built inside the try so that name resolution of the
        # demo functions is covered by the same handler as their execution.
        demos = (
            demo_numeric_statistics,
            demo_string_statistics,
            demo_boolean_statistics,
            demo_timestamp_statistics,
            demo_time_grouped_statistics,
        )
        for run_demo in demos:
            run_demo()

        print("\n" + rule)
        print("All demonstrations completed successfully!")
        print(rule)
    except Exception as exc:
        # Top-level boundary of the script: report the failure with a full
        # traceback and signal it through the exit status.
        print(f"\nError during demonstration: {exc}")
        import traceback
        traceback.print_exc()
        return 1
    else:
        return 0


if __name__ == "__main__":
    # Use sys.exit rather than the bare exit() builtin: exit() is injected by
    # the site module for interactive sessions and is not guaranteed to exist
    # (for example under `python -S`). main()'s return value becomes the
    # process exit status.
    sys.exit(main())