Skip to content

ts_shape.features.stats.timestamp_stats ¤

Classes:

  • TimestampStatistics

    Provides class methods to calculate statistics on timestamp columns in a pandas DataFrame.

TimestampStatistics ¤

TimestampStatistics(dataframe: DataFrame, column_name: str = 'systime')

Bases: Base

Provides class methods to calculate statistics on timestamp columns in a pandas DataFrame. The default column for calculations is 'systime'.

Parameters:

  • dataframe ¤

    (DataFrame) –

    The DataFrame to be processed.

  • column_name ¤

    (str, default: 'systime' ) –

    The column to convert to datetime and sort by. Default is 'systime'. If the column is not present in the DataFrame, columns whose names contain 'time' or 'date' are detected instead, converted to datetime, and the first of them is used for sorting.

Methods:

Source code in src/ts_shape/utils/base.py
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
def __init__(self, dataframe: pd.DataFrame, column_name: str = 'systime') -> None:
    """
    Stores a sorted copy of the DataFrame with its time column(s) converted to datetime.

    The caller's DataFrame is never modified; all work happens on a copy that is
    kept in ``self.dataframe``.

    Args:
        dataframe (pd.DataFrame): The DataFrame to be processed.
        column_name (str): The column to convert to datetime and sort by. Default is
            'systime'. If it is present, it is converted with ``errors='coerce'``
            (values that cannot be parsed become NaT). If it is absent, every column
            whose string label contains 'time' or 'date' (case-insensitive) is
            converted instead and the first of them is used for sorting. If no such
            column exists, the copy is left unsorted.
    """
    self.dataframe = dataframe.copy()

    if column_name in self.dataframe.columns:
        self.dataframe[column_name] = pd.to_datetime(self.dataframe[column_name], errors='coerce')
    else:
        # Column labels are not guaranteed to be strings (e.g. integer labels), so
        # only string labels take part in the name-based detection.
        time_columns = [
            col for col in self.dataframe.columns
            if isinstance(col, str) and ('time' in col.lower() or 'date' in col.lower())
        ]

        for col in time_columns:
            self.dataframe[col] = pd.to_datetime(self.dataframe[col], errors='coerce')

        # Sort by the first detected column; with no candidates nothing is sorted.
        if time_columns:
            column_name = time_columns[0]

    # Sort by the datetime column (either specified or detected)
    if column_name in self.dataframe.columns:
        self.dataframe = self.dataframe.sort_values(by=column_name)

average_time_gap classmethod ¤

average_time_gap(dataframe: DataFrame, column_name: str = 'systime') -> Timedelta

Returns the average time gap between consecutive timestamps.

Source code in src/ts_shape/features/stats/timestamp_stats.py
76
77
78
79
80
81
@classmethod
def average_time_gap(cls, dataframe: pd.DataFrame, column_name: str = 'systime') -> pd.Timedelta:
    """Compute the mean spacing between neighbouring timestamps.

    Null entries are discarded and the remaining values are put in ascending
    order before the pairwise differences are averaged.
    """
    valid = dataframe[column_name].dropna()
    ordered = valid.sort_values()
    # The first difference has no predecessor and is therefore null; drop it.
    gaps = ordered.diff().dropna()
    return gaps.mean()

count_most_frequent_timestamp classmethod ¤

count_most_frequent_timestamp(dataframe: DataFrame, column_name: str = 'systime') -> int

Returns the count of the most frequent timestamp in the column.

Source code in src/ts_shape/features/stats/timestamp_stats.py
40
41
42
43
44
@classmethod
def count_most_frequent_timestamp(cls, dataframe: pd.DataFrame, column_name: str = 'systime') -> int:
    """Returns the count of the most frequent timestamp in the column.

    Args:
        dataframe (pd.DataFrame): DataFrame holding the timestamp column.
        column_name (str): Name of the column to inspect. Default is 'systime'.

    Returns:
        int: How many times the most common non-null timestamp occurs.

    Raises:
        IndexError: If the column contains no non-null values.
    """
    # value_counts() orders by descending frequency, so the first entry is the
    # count of the most frequent value; no separate mode lookup is needed.
    frequencies = dataframe[column_name].value_counts()
    # Cast so the result is a plain Python int, as the annotation promises.
    return int(frequencies.iloc[0])

count_not_null classmethod ¤

count_not_null(dataframe: DataFrame, column_name: str = 'systime') -> int

Returns the number of non-null (valid) timestamps in the column.

Source code in src/ts_shape/features/stats/timestamp_stats.py
15
16
17
18
@classmethod
def count_not_null(cls, dataframe: pd.DataFrame, column_name: str = 'systime') -> int:
    """Returns the number of non-null (valid) timestamps in the column.

    Args:
        dataframe (pd.DataFrame): DataFrame holding the timestamp column.
        column_name (str): Name of the column to inspect. Default is 'systime'.

    Returns:
        int: Number of entries that are not null.
    """
    # Series.sum() yields a NumPy integer; cast so the declared ``int`` holds.
    return int(dataframe[column_name].notna().sum())

count_null classmethod ¤

count_null(dataframe: DataFrame, column_name: str = 'systime') -> int

Returns the number of null (NaN) values in the timestamp column.

Source code in src/ts_shape/features/stats/timestamp_stats.py
10
11
12
13
@classmethod
def count_null(cls, dataframe: pd.DataFrame, column_name: str = 'systime') -> int:
    """Returns the number of null (NaN/NaT) values in the timestamp column.

    Args:
        dataframe (pd.DataFrame): DataFrame holding the timestamp column.
        column_name (str): Name of the column to inspect. Default is 'systime'.

    Returns:
        int: Number of entries that are null.
    """
    # Series.sum() yields a NumPy integer; cast so the declared ``int`` holds.
    return int(dataframe[column_name].isna().sum())

days_with_most_activity classmethod ¤

days_with_most_activity(dataframe: DataFrame, column_name: str = 'systime', n: int = 3) -> Series

Returns the top N days with the most timestamp activity.

Source code in src/ts_shape/features/stats/timestamp_stats.py
100
101
102
103
@classmethod
def days_with_most_activity(cls, dataframe: pd.DataFrame, column_name: str = 'systime', n: int = 3) -> pd.Series:
    """Return the ``n`` calendar days on which the most timestamps fall.

    The result maps each calendar date to its number of timestamps, with the
    busiest day first.
    """
    calendar_days = dataframe[column_name].dt.date
    per_day = calendar_days.value_counts()
    return per_day.head(n)

earliest_timestamp classmethod ¤

earliest_timestamp(dataframe: DataFrame, column_name: str = 'systime')

Returns the earliest timestamp in the column.

Source code in src/ts_shape/features/stats/timestamp_stats.py
20
21
22
23
@classmethod
def earliest_timestamp(cls, dataframe: pd.DataFrame, column_name: str = 'systime'):
    """Return the minimum value found in the timestamp column."""
    timestamps = dataframe[column_name]
    return timestamps.min()

get_dataframe ¤

get_dataframe() -> DataFrame

Returns the processed DataFrame.

Source code in src/ts_shape/utils/base.py
34
35
36
def get_dataframe(self) -> pd.DataFrame:
    """Hand back the DataFrame held by this instance (the stored object itself, not a copy)."""
    frame = self.dataframe
    return frame

hour_distribution classmethod ¤

hour_distribution(dataframe: DataFrame, column_name: str = 'systime') -> Series

Returns the distribution of timestamps per hour of the day.

Source code in src/ts_shape/features/stats/timestamp_stats.py
61
62
63
64
@classmethod
def hour_distribution(cls, dataframe: pd.DataFrame, column_name: str = 'systime') -> pd.Series:
    """Count how many timestamps fall into each hour of the day."""
    hours = dataframe[column_name].dt.hour
    return hours.value_counts()

latest_timestamp classmethod ¤

latest_timestamp(dataframe: DataFrame, column_name: str = 'systime')

Returns the latest timestamp in the column.

Source code in src/ts_shape/features/stats/timestamp_stats.py
25
26
27
28
@classmethod
def latest_timestamp(cls, dataframe: pd.DataFrame, column_name: str = 'systime'):
    """Return the maximum value found in the timestamp column."""
    timestamps = dataframe[column_name]
    return timestamps.max()

median_timestamp classmethod ¤

median_timestamp(dataframe: DataFrame, column_name: str = 'systime')

Returns the median timestamp in the column.

Source code in src/ts_shape/features/stats/timestamp_stats.py
83
84
85
86
@classmethod
def median_timestamp(cls, dataframe: pd.DataFrame, column_name: str = 'systime'):
    """Return the median of the values in the timestamp column."""
    timestamps = dataframe[column_name]
    return timestamps.median()

month_distribution classmethod ¤

month_distribution(dataframe: DataFrame, column_name: str = 'systime') -> Series

Returns the distribution of timestamps per month.

Source code in src/ts_shape/features/stats/timestamp_stats.py
51
52
53
54
@classmethod
def month_distribution(cls, dataframe: pd.DataFrame, column_name: str = 'systime') -> pd.Series:
    """Count how many timestamps fall into each calendar month number."""
    months = dataframe[column_name].dt.month
    return months.value_counts()

most_frequent_day classmethod ¤

most_frequent_day(dataframe: DataFrame, column_name: str = 'systime') -> int

Returns the most frequent day of the week (0=Monday, 6=Sunday).

Source code in src/ts_shape/features/stats/timestamp_stats.py
66
67
68
69
@classmethod
def most_frequent_day(cls, dataframe: pd.DataFrame, column_name: str = 'systime') -> int:
    """Returns the most frequent day of the week (0=Monday, 6=Sunday).

    Args:
        dataframe (pd.DataFrame): DataFrame holding the timestamp column.
        column_name (str): Name of the datetime column. Default is 'systime'.

    Returns:
        int: Weekday number of the most common weekday; when several weekdays
        tie, the first entry of ``Series.mode()`` is used.

    Raises:
        IndexError: If the column contains no non-null timestamps.
    """
    # Null timestamps make ``.dt.weekday`` a float Series (NaN placeholders),
    # which would leak a float such as 0.0 to the caller; drop them up front.
    weekdays = dataframe[column_name].dropna().dt.weekday
    return int(weekdays.mode().iloc[0])

most_frequent_hour classmethod ¤

most_frequent_hour(dataframe: DataFrame, column_name: str = 'systime') -> int

Returns the most frequent hour of the day (0-23).

Source code in src/ts_shape/features/stats/timestamp_stats.py
71
72
73
74
@classmethod
def most_frequent_hour(cls, dataframe: pd.DataFrame, column_name: str = 'systime') -> int:
    """Returns the most frequent hour of the day (0-23).

    Args:
        dataframe (pd.DataFrame): DataFrame holding the timestamp column.
        column_name (str): Name of the datetime column. Default is 'systime'.

    Returns:
        int: The most common hour; when several hours tie, the first entry of
        ``Series.mode()`` is used.

    Raises:
        IndexError: If the column contains no non-null timestamps.
    """
    # Null timestamps make ``.dt.hour`` a float Series (NaN placeholders),
    # which would leak a float such as 9.0 to the caller; drop them up front.
    hours = dataframe[column_name].dropna().dt.hour
    return int(hours.mode().iloc[0])

most_frequent_timestamp classmethod ¤

most_frequent_timestamp(dataframe: DataFrame, column_name: str = 'systime')

Returns the most frequent timestamp in the column.

Source code in src/ts_shape/features/stats/timestamp_stats.py
35
36
37
38
@classmethod
def most_frequent_timestamp(cls, dataframe: pd.DataFrame, column_name: str = 'systime'):
    """Return the timestamp that occurs most often in the column.

    When several timestamps share the top frequency, the first entry of
    ``Series.mode()`` is returned.
    """
    modes = dataframe[column_name].mode()
    return modes.iloc[0]

standard_deviation_timestamps classmethod ¤

standard_deviation_timestamps(dataframe: DataFrame, column_name: str = 'systime') -> Timedelta

Returns the standard deviation of the time differences between consecutive timestamps.

Source code in src/ts_shape/features/stats/timestamp_stats.py
88
89
90
91
92
93
@classmethod
def standard_deviation_timestamps(cls, dataframe: pd.DataFrame, column_name: str = 'systime') -> pd.Timedelta:
    """Compute the standard deviation of the gaps between neighbouring timestamps.

    Null entries are discarded and the remaining values are put in ascending
    order before the pairwise differences are taken.
    """
    valid = dataframe[column_name].dropna()
    ordered = valid.sort_values()
    # The first difference has no predecessor and is therefore null; drop it.
    gaps = ordered.diff().dropna()
    return gaps.std()

timestamp_quartiles classmethod ¤

timestamp_quartiles(dataframe: DataFrame, column_name: str = 'systime') -> Series

Returns the 25th, 50th (median), and 75th percentiles of the timestamps.

Source code in src/ts_shape/features/stats/timestamp_stats.py
95
96
97
98
@classmethod
def timestamp_quartiles(cls, dataframe: pd.DataFrame, column_name: str = 'systime') -> pd.Series:
    """Return the 0.25, 0.5 and 0.75 quantiles of the timestamp column.

    The resulting Series is indexed by the quantile levels.
    """
    quartile_levels = [0.25, 0.5, 0.75]
    timestamps = dataframe[column_name]
    return timestamps.quantile(quartile_levels)

timestamp_range classmethod ¤

timestamp_range(dataframe: DataFrame, column_name: str = 'systime')

Returns the time range (difference) between the earliest and latest timestamps.

Source code in src/ts_shape/features/stats/timestamp_stats.py
30
31
32
33
@classmethod
def timestamp_range(cls, dataframe: pd.DataFrame, column_name: str = 'systime'):
    """Return the span between the latest and the earliest timestamp.

    Both endpoints are obtained through the class's own ``latest_timestamp``
    and ``earliest_timestamp`` methods.
    """
    newest = cls.latest_timestamp(dataframe, column_name)
    oldest = cls.earliest_timestamp(dataframe, column_name)
    return newest - oldest

weekday_distribution classmethod ¤

weekday_distribution(dataframe: DataFrame, column_name: str = 'systime') -> Series

Returns the distribution of timestamps per weekday.

Source code in src/ts_shape/features/stats/timestamp_stats.py
56
57
58
59
@classmethod
def weekday_distribution(cls, dataframe: pd.DataFrame, column_name: str = 'systime') -> pd.Series:
    """Count how many timestamps fall on each weekday number."""
    weekdays = dataframe[column_name].dt.weekday
    return weekdays.value_counts()

year_distribution classmethod ¤

year_distribution(dataframe: DataFrame, column_name: str = 'systime') -> Series

Returns the distribution of timestamps per year.

Source code in src/ts_shape/features/stats/timestamp_stats.py
46
47
48
49
@classmethod
def year_distribution(cls, dataframe: pd.DataFrame, column_name: str = 'systime') -> pd.Series:
    """Count how many timestamps fall into each calendar year."""
    years = dataframe[column_name].dt.year
    return years.value_counts()