Skip to content

ts_shape.features.stats.string_stats ¤

Classes:

  • StringStatistics

    Provides class methods to calculate statistics on string columns in a pandas DataFrame.

StringStatistics ¤

StringStatistics(dataframe: DataFrame, column_name: str = 'systime')

Bases: Base

Provides class methods to calculate statistics on string columns in a pandas DataFrame.

Parameters:

  • dataframe ¤

    (DataFrame) –

    The DataFrame to be processed.

  • column_name ¤

    (str, default: 'systime' ) –

    The column to sort by. Default is 'systime'. If the column is not found, the class falls back to the columns whose names contain 'time' or 'date', converts them to datetime, and sorts by the first one detected.

Methods:

Source code in src/ts_shape/utils/base.py
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
def __init__(self, dataframe: pd.DataFrame, column_name: str = 'systime') -> None:
    """
    Initializes the Base with a copy of the DataFrame, converts time columns to datetime,
    and sorts the DataFrame by the specified column (or the first detected time column).

    Args:
        dataframe (pd.DataFrame): The DataFrame to be processed. It is copied, so the
            caller's object is never modified.
        column_name (str): The column to sort by. Default is 'systime'. If the column is
            found, it is converted to datetime (unparseable values become NaT) and used
            for sorting. If it is not found, every column whose name contains 'time' or
            'date' (case-insensitive) is converted to datetime and the first of them is
            used for sorting. If no such column exists, the row order is left unchanged.
    """
    self.dataframe = dataframe.copy()

    if column_name in self.dataframe.columns:
        # errors='coerce' turns unparseable entries into NaT instead of raising
        self.dataframe[column_name] = pd.to_datetime(self.dataframe[column_name], errors='coerce')
    else:
        # Fallback: detect time columns by name. Column labels are not guaranteed to be
        # strings (e.g. integer labels), so only string labels are inspected.
        time_columns = [
            col for col in self.dataframe.columns
            if isinstance(col, str) and ('time' in col.lower() or 'date' in col.lower())
        ]

        # Convert all detected time columns to datetime, if any
        for col in time_columns:
            self.dataframe[col] = pd.to_datetime(self.dataframe[col], errors='coerce')

        # If any time columns are detected, sort by the first one; otherwise, do nothing
        if time_columns:
            column_name = time_columns[0]

    # Sort by the datetime column (either specified or detected)
    if column_name in self.dataframe.columns:
        self.dataframe = self.dataframe.sort_values(by=column_name)

average_string_length classmethod ¤

average_string_length(dataframe: DataFrame, column_name: str = 'value_string') -> float

Returns the average length of strings in the column, excluding null values.

Source code in src/ts_shape/features/stats/string_stats.py
31
32
33
34
@classmethod
def average_string_length(cls, dataframe: pd.DataFrame, column_name: str = 'value_string') -> float:
    """Computes the mean character count over the non-null entries of the column."""
    non_null_values = dataframe[column_name].dropna()
    lengths = non_null_values.str.len()
    return lengths.mean()

contains_digit_count classmethod ¤

contains_digit_count(dataframe: DataFrame, column_name: str = 'value_string') -> int

Counts how many strings contain digits.

Source code in src/ts_shape/features/stats/string_stats.py
94
95
96
97
@classmethod
def contains_digit_count(cls, dataframe: pd.DataFrame, column_name: str = 'value_string') -> int:
    """Returns the number of non-null strings that include at least one digit."""
    present = dataframe[column_name].dropna()
    has_digit = present.str.contains(r'\d')
    return has_digit.sum()

contains_substring_count classmethod ¤

contains_substring_count(dataframe: DataFrame, substring: str, column_name: str = 'value_string') -> int

Counts how many strings contain the specified substring.

Source code in src/ts_shape/features/stats/string_stats.py
61
62
63
64
@classmethod
def contains_substring_count(cls, dataframe: pd.DataFrame, substring: str, column_name: str = 'value_string') -> int:
    """Counts how many non-null strings contain the specified substring.

    The substring is matched literally, not as a regular expression, so characters
    such as '.', '(' or '*' carry no special meaning.
    """
    # regex=False: pandas' default (regex=True) would interpret the substring as a
    # pattern, so "a.b" would also match "axb" and "(" would raise re.error.
    return dataframe[column_name].dropna().str.contains(substring, regex=False).sum()

count_most_frequent classmethod ¤

count_most_frequent(dataframe: DataFrame, column_name: str = 'value_string') -> int

Returns the count of the most frequent string in the column.

Source code in src/ts_shape/features/stats/string_stats.py
20
21
22
23
24
@classmethod
def count_most_frequent(cls, dataframe: pd.DataFrame, column_name: str = 'value_string') -> int:
    """Returns the count of the most frequent string in the column.

    Null values are ignored. Returns 0 when the column has no non-null values.
    """
    # The highest entry of value_counts() is by definition the occurrence count of
    # the most frequent value, so no separate mode lookup is required.
    counts = dataframe[column_name].value_counts()
    if counts.empty:
        return 0
    return int(counts.max())

count_null classmethod ¤

count_null(dataframe: DataFrame, column_name: str = 'value_string') -> int

Returns the number of null (NaN) values in the column.

Source code in src/ts_shape/features/stats/string_stats.py
26
27
28
29
@classmethod
def count_null(cls, dataframe: pd.DataFrame, column_name: str = 'value_string') -> int:
    """Counts the entries of the column that are null (NaN/None)."""
    column = dataframe[column_name]
    null_mask = column.isna()
    return null_mask.sum()

count_unique classmethod ¤

count_unique(dataframe: DataFrame, column_name: str = 'value_string') -> int

Returns the number of unique strings in the column.

Source code in src/ts_shape/features/stats/string_stats.py
10
11
12
13
@classmethod
def count_unique(cls, dataframe: pd.DataFrame, column_name: str = 'value_string') -> int:
    """Counts the distinct values present in the column."""
    column = dataframe[column_name]
    return column.nunique()

ends_with_count classmethod ¤

ends_with_count(dataframe: DataFrame, suffix: str, column_name: str = 'value_string') -> int

Counts how many strings end with the specified suffix.

Source code in src/ts_shape/features/stats/string_stats.py
71
72
73
74
@classmethod
def ends_with_count(cls, dataframe: pd.DataFrame, suffix: str, column_name: str = 'value_string') -> int:
    """Returns the number of non-null strings whose tail matches the given suffix."""
    present = dataframe[column_name].dropna()
    matches = present.str.endswith(suffix)
    return matches.sum()

get_dataframe ¤

get_dataframe() -> DataFrame

Returns the processed DataFrame.

Source code in src/ts_shape/utils/base.py
34
35
36
def get_dataframe(self) -> pd.DataFrame:
    """Hands back the DataFrame currently held by this instance."""
    processed = self.dataframe
    return processed

longest_string classmethod ¤

longest_string(dataframe: DataFrame, column_name: str = 'value_string') -> str

Returns the longest string in the column.

Source code in src/ts_shape/features/stats/string_stats.py
36
37
38
39
@classmethod
def longest_string(cls, dataframe: pd.DataFrame, column_name: str = 'value_string') -> str:
    """Returns the longest string in the column, ignoring null values.

    When several strings share the maximum length, the first one in row order is returned.

    Raises:
        ValueError: If the column contains no non-null values.
    """
    strings = dataframe[column_name].dropna()
    if strings.empty:
        raise ValueError(f"Column '{column_name}' contains no non-null strings.")
    # Positional lookup (argmax + iloc) instead of label lookup (idxmax + loc):
    # with duplicate index labels, .loc would return a Series rather than one string.
    return strings.iloc[strings.str.len().argmax()]

lowercase_percentage classmethod ¤

lowercase_percentage(dataframe: DataFrame, column_name: str = 'value_string') -> float

Returns the percentage of strings that are fully lowercase.

Source code in src/ts_shape/features/stats/string_stats.py
85
86
87
88
89
90
91
92
@classmethod
def lowercase_percentage(cls, dataframe: pd.DataFrame, column_name: str = 'value_string') -> float:
    """Computes the share (0-100) of non-null strings that are entirely lowercase."""
    non_null = dataframe[column_name].dropna()
    total = len(non_null)
    # Guard clause: with nothing to inspect the percentage is defined as 0.0
    if not total:
        return 0.0
    lowercase_hits = non_null.str.islower().sum()
    return (lowercase_hits / total) * 100

most_common_n_strings classmethod ¤

most_common_n_strings(dataframe: DataFrame, n: int, column_name: str = 'value_string') -> Series

Returns the top N most frequent strings in the column.

Source code in src/ts_shape/features/stats/string_stats.py
56
57
58
59
@classmethod
def most_common_n_strings(cls, dataframe: pd.DataFrame, n: int, column_name: str = 'value_string') -> pd.Series:
    """Returns a Series of occurrence counts for the N strings that appear most often."""
    frequencies = dataframe[column_name].value_counts()
    top_n = frequencies.head(n)
    return top_n

most_frequent classmethod ¤

most_frequent(dataframe: DataFrame, column_name: str = 'value_string') -> str

Returns the most frequent string in the column.

Source code in src/ts_shape/features/stats/string_stats.py
15
16
17
18
@classmethod
def most_frequent(cls, dataframe: pd.DataFrame, column_name: str = 'value_string') -> str:
    """Returns the string that occurs most often in the column (first mode on ties)."""
    modes = dataframe[column_name].mode()
    return modes.iloc[0]

shortest_string classmethod ¤

shortest_string(dataframe: DataFrame, column_name: str = 'value_string') -> str

Returns the shortest string in the column.

Source code in src/ts_shape/features/stats/string_stats.py
41
42
43
44
@classmethod
def shortest_string(cls, dataframe: pd.DataFrame, column_name: str = 'value_string') -> str:
    """Returns the shortest string in the column, ignoring null values.

    When several strings share the minimum length, the first one in row order is returned.

    Raises:
        ValueError: If the column contains no non-null values.
    """
    strings = dataframe[column_name].dropna()
    if strings.empty:
        raise ValueError(f"Column '{column_name}' contains no non-null strings.")
    # Positional lookup (argmin + iloc) instead of label lookup (idxmin + loc):
    # with duplicate index labels, .loc would return a Series rather than one string.
    return strings.iloc[strings.str.len().argmin()]

starts_with_count classmethod ¤

starts_with_count(dataframe: DataFrame, prefix: str, column_name: str = 'value_string') -> int

Counts how many strings start with the specified prefix.

Source code in src/ts_shape/features/stats/string_stats.py
66
67
68
69
@classmethod
def starts_with_count(cls, dataframe: pd.DataFrame, prefix: str, column_name: str = 'value_string') -> int:
    """Returns the number of non-null strings that begin with the given prefix."""
    present = dataframe[column_name].dropna()
    matches = present.str.startswith(prefix)
    return matches.sum()

string_length_summary classmethod ¤

string_length_summary(dataframe: DataFrame, column_name: str = 'value_string') -> DataFrame

Returns a summary of string lengths, including min, max, and average lengths.

Source code in src/ts_shape/features/stats/string_stats.py
46
47
48
49
50
51
52
53
54
@classmethod
def string_length_summary(cls, dataframe: pd.DataFrame, column_name: str = 'value_string') -> pd.DataFrame:
    """Builds a one-row DataFrame holding the min, max, and mean length of the non-null strings."""
    non_null = dataframe[column_name].dropna()
    lengths = non_null.str.len()
    # One single-element list per column yields a one-row frame
    summary = {
        'Min Length': [lengths.min()],
        'Max Length': [lengths.max()],
        'Average Length': [lengths.mean()],
    }
    return pd.DataFrame(summary)

summary_as_dataframe classmethod ¤

summary_as_dataframe(dataframe: DataFrame, column_name: str) -> DataFrame

Returns a DataFrame with comprehensive string statistics for the specified column.

Source code in src/ts_shape/features/stats/string_stats.py
120
121
122
123
124
@classmethod
def summary_as_dataframe(cls, dataframe: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Wraps the statistics dictionary from summary_as_dict in a one-row DataFrame."""
    stats_row = cls.summary_as_dict(dataframe, column_name)
    rows = [stats_row]
    return pd.DataFrame(rows)

summary_as_dict classmethod ¤

summary_as_dict(dataframe: DataFrame, column_name: str) -> Dict[str, Union[int, str, float]]

Returns a dictionary with comprehensive string statistics for the specified column.

Source code in src/ts_shape/features/stats/string_stats.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
@classmethod
def summary_as_dict(cls, dataframe: pd.DataFrame, column_name: str) -> Dict[str, Union[int, str, float]]:
    """Collects the string statistics of the specified column into a single dictionary."""
    top_value = cls.most_frequent(dataframe, column_name)
    frequencies = dataframe[column_name].value_counts()
    has_values = not frequencies.empty

    # Keys are inserted one by one in a fixed order, which also determines the
    # column order when the dictionary is turned into a DataFrame.
    stats = {}
    stats['unique_values'] = cls.count_unique(dataframe, column_name)
    stats['most_frequent'] = top_value
    stats['count_most_frequent'] = cls.count_most_frequent(dataframe, column_name)
    stats['count_null'] = cls.count_null(dataframe, column_name)
    stats['average_string_length'] = cls.average_string_length(dataframe, column_name)
    stats['longest_string'] = cls.longest_string(dataframe, column_name)
    stats['shortest_string'] = cls.shortest_string(dataframe, column_name)
    stats['uppercase_percentage'] = cls.uppercase_percentage(dataframe, column_name)
    stats['lowercase_percentage'] = cls.lowercase_percentage(dataframe, column_name)
    stats['contains_digit_count'] = cls.contains_digit_count(dataframe, column_name)
    # Least-common entries fall back to None / 0 when there are no counted values
    stats['least_common'] = frequencies.idxmin() if has_values else None
    stats['frequency_least_common'] = frequencies.min() if has_values else 0
    return stats

uppercase_percentage classmethod ¤

uppercase_percentage(dataframe: DataFrame, column_name: str = 'value_string') -> float

Returns the percentage of strings that are fully uppercase.

Source code in src/ts_shape/features/stats/string_stats.py
76
77
78
79
80
81
82
83
@classmethod
def uppercase_percentage(cls, dataframe: pd.DataFrame, column_name: str = 'value_string') -> float:
    """Computes the share (0-100) of non-null strings that are entirely uppercase."""
    non_null = dataframe[column_name].dropna()
    total = len(non_null)
    # Guard clause: with nothing to inspect the percentage is defined as 0.0
    if not total:
        return 0.0
    uppercase_hits = non_null.str.isupper().sum()
    return (uppercase_hits / total) * 100