ts_shape.features.stats.string_stats ¤

Classes:

StringStatistics –

Provides class methods to calculate statistics on string columns in a pandas DataFrame.

StringStatistics ¤

StringStatistics(dataframe: DataFrame, column_name: str = 'systime')

Bases: Base

Provides class methods to calculate statistics on string columns in a pandas DataFrame.

Parameters:

dataframe ¤
(DataFrame) –

The DataFrame to be processed.
column_name ¤
(str, default: 'systime' ) –

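The column to sort by. Default is 'systime'. If the column is present, it is converted to datetime (values that cannot be parsed become NaT). If the column is not found, the class detects columns whose names contain 'time' or 'date', converts them to datetime, and sorts by the first one detected.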

Methods:

average_string_length –

Returns the average length of strings in the column, excluding null values.
contains_digit_count –

Counts how many strings contain digits.
contains_substring_count –

Counts how many strings contain the specified substring.
count_most_frequent –

Returns the count of the most frequent string in the column.
count_null –

Returns the number of null (NaN) values in the column.
count_unique –

Returns the number of unique strings in the column.
ends_with_count –

Counts how many strings end with the specified suffix.
get_dataframe –

Returns the processed DataFrame.
longest_string –

Returns the longest string in the column.
lowercase_percentage –

Returns the percentage of strings that are fully lowercase.
most_common_n_strings –

Returns the top N most frequent strings in the column.
most_frequent –

Returns the most frequent string in the column.
shortest_string –

Returns the shortest string in the column.
starts_with_count –

Counts how many strings start with the specified prefix.
string_length_summary –

Returns a summary of string lengths, including min, max, and average lengths.
summary_as_dataframe –

Returns a DataFrame with comprehensive string statistics for the specified column.
summary_as_dict –

Returns a dictionary with comprehensive string statistics for the specified column.
uppercase_percentage –

Returns the percentage of strings that are fully uppercase.

Source code in src/ts_shape/utils/base.py

def __init__(self, dataframe: pd.DataFrame, column_name: str = 'systime') -> None:
    """
    Initializes the Base with a copy of the DataFrame, converts time columns to datetime,
    and sorts the copy by the specified column (or by a detected time column).

    The caller's DataFrame is never modified; all work happens on a copy that is
    stored as ``self.dataframe``.

    Args:
        dataframe (pd.DataFrame): The DataFrame to be processed.
        column_name (str): The column to sort by. Default is 'systime'. If the column
            exists it is converted with ``pd.to_datetime`` (unparseable values become
            NaT). If it does not exist, every column whose label contains 'time' or
            'date' (case-insensitive) is converted instead and the first such column
            becomes the sort key. When no sort key can be found the row order is kept.
    """
    self.dataframe = dataframe.copy()

    if column_name in self.dataframe.columns:
        # The requested column exists: coerce it to datetime in place.
        self.dataframe[column_name] = pd.to_datetime(self.dataframe[column_name], errors='coerce')
    else:
        # Fall back to name-based detection. Labels are passed through str()
        # because DataFrame column labels are not guaranteed to be strings
        # (e.g. integer labels), and calling .lower() on those would raise.
        time_columns = [
            col for col in self.dataframe.columns
            if 'time' in str(col).lower() or 'date' in str(col).lower()
        ]

        # Convert all detected time columns to datetime, if any
        for col in time_columns:
            self.dataframe[col] = pd.to_datetime(self.dataframe[col], errors='coerce')

        # The first detected column becomes the sort key; otherwise nothing to sort by
        if time_columns:
            column_name = time_columns[0]

    # Sort by the datetime column (either specified or detected)
    if column_name in self.dataframe.columns:
        self.dataframe = self.dataframe.sort_values(by=column_name)

average_string_length `classmethod` ¤

average_string_length(dataframe: DataFrame, column_name: str = 'value_string') -> float

Returns the average length of strings in the column, excluding null values.

Source code in src/ts_shape/features/stats/string_stats.py

@classmethod
def average_string_length(cls, dataframe: pd.DataFrame, column_name: str = 'value_string') -> float:
    """Computes the mean character count of the non-null entries in the column."""
    non_null = dataframe[column_name].dropna()
    char_counts = non_null.str.len()
    return char_counts.mean()

contains_digit_count `classmethod` ¤

contains_digit_count(dataframe: DataFrame, column_name: str = 'value_string') -> int

Counts how many strings contain digits.

Source code in src/ts_shape/features/stats/string_stats.py

@classmethod
def contains_digit_count(cls, dataframe: pd.DataFrame, column_name: str = 'value_string') -> int:
    """Returns the number of non-null strings that include at least one digit."""
    present = dataframe[column_name].dropna()
    has_digit = present.str.contains(r'\d')
    return has_digit.sum()

contains_substring_count `classmethod` ¤

contains_substring_count(dataframe: DataFrame, substring: str, column_name: str = 'value_string') -> int

Counts how many strings contain the specified substring.

Source code in src/ts_shape/features/stats/string_stats.py

@classmethod
def contains_substring_count(cls, dataframe: pd.DataFrame, substring: str, column_name: str = 'value_string') -> int:
    """
    Counts how many strings contain the specified substring.

    Args:
        dataframe (pd.DataFrame): The DataFrame holding the column.
        substring (str): The literal text to look for. It is NOT interpreted as a
            regular expression, so characters such as '.', '(' or '*' match themselves.
        column_name (str): The column to inspect. Default is 'value_string'.

    Returns:
        int: The number of non-null entries that contain ``substring``.
    """
    non_null = dataframe[column_name].dropna()
    # regex=False: pandas treats the pattern as a regular expression by default,
    # which miscounts substrings like 'a.b' and raises on ones like '('.
    return non_null.str.contains(substring, regex=False).sum()

count_most_frequent `classmethod` ¤

count_most_frequent(dataframe: DataFrame, column_name: str = 'value_string') -> int

Returns the count of the most frequent string in the column.

Source code in src/ts_shape/features/stats/string_stats.py

@classmethod
def count_most_frequent(cls, dataframe: pd.DataFrame, column_name: str = 'value_string') -> int:
    """
    Returns the count of the most frequent string in the column.

    Args:
        dataframe (pd.DataFrame): The DataFrame holding the column.
        column_name (str): The column to inspect. Default is 'value_string'.

    Returns:
        int: How often the most frequent non-null value occurs, or 0 when the
        column has no non-null values.
    """
    # value_counts() ignores nulls; its largest entry is the occurrence count of
    # the most frequent value, so no separate mode lookup is needed.
    counts = dataframe[column_name].value_counts()
    if counts.empty:
        return 0
    return int(counts.max())

count_null `classmethod` ¤

count_null(dataframe: DataFrame, column_name: str = 'value_string') -> int

Returns the number of null (NaN) values in the column.

Source code in src/ts_shape/features/stats/string_stats.py

@classmethod
def count_null(cls, dataframe: pd.DataFrame, column_name: str = 'value_string') -> int:
    """Counts the entries of the column that are null (NaN)."""
    missing_mask = dataframe[column_name].isna()
    return missing_mask.sum()

count_unique `classmethod` ¤

count_unique(dataframe: DataFrame, column_name: str = 'value_string') -> int

Returns the number of unique strings in the column.

Source code in src/ts_shape/features/stats/string_stats.py

@classmethod
def count_unique(cls, dataframe: pd.DataFrame, column_name: str = 'value_string') -> int:
    """Counts the distinct strings found in the column."""
    column = dataframe[column_name]
    return column.nunique()

ends_with_count `classmethod` ¤

ends_with_count(dataframe: DataFrame, suffix: str, column_name: str = 'value_string') -> int

Counts how many strings end with the specified suffix.

Source code in src/ts_shape/features/stats/string_stats.py

@classmethod
def ends_with_count(cls, dataframe: pd.DataFrame, suffix: str, column_name: str = 'value_string') -> int:
    """Returns the number of non-null strings whose ending matches the given suffix."""
    present = dataframe[column_name].dropna()
    matches = present.str.endswith(suffix)
    return matches.sum()

get_dataframe ¤

get_dataframe() -> DataFrame

Returns the processed DataFrame.

Source code in src/ts_shape/utils/base.py

def get_dataframe(self) -> pd.DataFrame:
    """Gives back the DataFrame held by this instance."""
    processed = self.dataframe
    return processed

longest_string `classmethod` ¤

longest_string(dataframe: DataFrame, column_name: str = 'value_string') -> str

Returns the longest string in the column.

Source code in src/ts_shape/features/stats/string_stats.py

@classmethod
def longest_string(cls, dataframe: pd.DataFrame, column_name: str = 'value_string') -> str:
    """
    Returns the longest string in the column.

    Args:
        dataframe (pd.DataFrame): The DataFrame holding the column.
        column_name (str): The column to inspect. Default is 'value_string'.

    Returns:
        str: The longest non-null string; the first one wins when several share
        the maximum length.

    Raises:
        ValueError: If the column has no non-null values.
    """
    # Drop nulls once and renumber the rows: with duplicate index labels a
    # label-based lookup would return several rows instead of a single string.
    strings = dataframe[column_name].dropna().reset_index(drop=True)
    if strings.empty:
        raise ValueError(f"Column '{column_name}' has no non-null values.")
    return strings.loc[strings.str.len().idxmax()]

lowercase_percentage `classmethod` ¤

lowercase_percentage(dataframe: DataFrame, column_name: str = 'value_string') -> float

Returns the percentage of strings that are fully lowercase.

Source code in src/ts_shape/features/stats/string_stats.py

@classmethod
def lowercase_percentage(cls, dataframe: pd.DataFrame, column_name: str = 'value_string') -> float:
    """Computes the share (0-100) of non-null strings that are entirely lowercase."""
    column = dataframe[column_name]
    non_null_total = column.notna().sum()
    if non_null_total > 0:
        lower_hits = column.dropna().str.islower().sum()
        ratio = lower_hits / non_null_total
        return ratio * 100
    # No non-null values: report zero instead of dividing by zero.
    return 0.0

most_common_n_strings `classmethod` ¤

most_common_n_strings(dataframe: DataFrame, n: int, column_name: str = 'value_string') -> Series

Returns the top N most frequent strings in the column.

Source code in src/ts_shape/features/stats/string_stats.py

@classmethod
def most_common_n_strings(cls, dataframe: pd.DataFrame, n: int, column_name: str = 'value_string') -> pd.Series:
    """Gives the N strings that occur most often, together with their counts."""
    frequencies = dataframe[column_name].value_counts()
    top_n = frequencies.head(n)
    return top_n

most_frequent `classmethod` ¤

most_frequent(dataframe: DataFrame, column_name: str = 'value_string') -> str

Returns the most frequent string in the column.

Source code in src/ts_shape/features/stats/string_stats.py

@classmethod
def most_frequent(cls, dataframe: pd.DataFrame, column_name: str = 'value_string') -> str:
    """Gives the string that occurs most often in the column."""
    modes = dataframe[column_name].mode()
    return modes.iloc[0]

shortest_string `classmethod` ¤

shortest_string(dataframe: DataFrame, column_name: str = 'value_string') -> str

Returns the shortest string in the column.

Source code in src/ts_shape/features/stats/string_stats.py

@classmethod
def shortest_string(cls, dataframe: pd.DataFrame, column_name: str = 'value_string') -> str:
    """
    Returns the shortest string in the column.

    Args:
        dataframe (pd.DataFrame): The DataFrame holding the column.
        column_name (str): The column to inspect. Default is 'value_string'.

    Returns:
        str: The shortest non-null string; the first one wins when several share
        the minimum length.

    Raises:
        ValueError: If the column has no non-null values.
    """
    # Drop nulls once and renumber the rows: with duplicate index labels a
    # label-based lookup would return several rows instead of a single string.
    strings = dataframe[column_name].dropna().reset_index(drop=True)
    if strings.empty:
        raise ValueError(f"Column '{column_name}' has no non-null values.")
    return strings.loc[strings.str.len().idxmin()]

starts_with_count `classmethod` ¤

starts_with_count(dataframe: DataFrame, prefix: str, column_name: str = 'value_string') -> int

Counts how many strings start with the specified prefix.

Source code in src/ts_shape/features/stats/string_stats.py

@classmethod
def starts_with_count(cls, dataframe: pd.DataFrame, prefix: str, column_name: str = 'value_string') -> int:
    """Returns the number of non-null strings whose beginning matches the given prefix."""
    present = dataframe[column_name].dropna()
    matches = present.str.startswith(prefix)
    return matches.sum()

string_length_summary `classmethod` ¤

string_length_summary(dataframe: DataFrame, column_name: str = 'value_string') -> DataFrame

Returns a summary of string lengths, including min, max, and average lengths.

Source code in src/ts_shape/features/stats/string_stats.py

@classmethod
def string_length_summary(cls, dataframe: pd.DataFrame, column_name: str = 'value_string') -> pd.DataFrame:
    """Builds a one-row DataFrame with the min, max, and average length of the non-null strings."""
    char_counts = dataframe[column_name].dropna().str.len()
    shortest = char_counts.min()
    longest = char_counts.max()
    average = char_counts.mean()
    summary = {
        'Min Length': [shortest],
        'Max Length': [longest],
        'Average Length': [average],
    }
    return pd.DataFrame(summary)

summary_as_dataframe `classmethod` ¤

summary_as_dataframe(dataframe: DataFrame, column_name: str) -> DataFrame

Returns a DataFrame with comprehensive string statistics for the specified column.

Source code in src/ts_shape/features/stats/string_stats.py

@classmethod
def summary_as_dataframe(cls, dataframe: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Wraps the statistics from ``summary_as_dict`` in a one-row DataFrame."""
    stats = cls.summary_as_dict(dataframe, column_name)
    rows = [stats]
    return pd.DataFrame(rows)

summary_as_dict `classmethod` ¤

summary_as_dict(dataframe: DataFrame, column_name: str) -> Dict[str, Union[int, str, float]]

Returns a dictionary with comprehensive string statistics for the specified column.

Source code in src/ts_shape/features/stats/string_stats.py

@classmethod
def summary_as_dict(cls, dataframe: pd.DataFrame, column_name: str) -> Dict[str, Union[int, str, float]]:
    """
    Returns a dictionary with comprehensive string statistics for the specified column.

    Args:
        dataframe (pd.DataFrame): The DataFrame holding the column.
        column_name (str): The column to summarise.

    Returns:
        Dict[str, Union[int, str, float]]: One entry per statistic. When the column
        has no non-null values, counts and percentages are 0, the average length is
        NaN, and the string-valued entries are None.
    """
    value_counts = dataframe[column_name].value_counts()

    # Handle the empty case up front: most_frequent / longest_string /
    # shortest_string cannot produce a value without at least one non-null
    # entry, so calling them here would raise before any result is built.
    if value_counts.empty:
        return {
            'unique_values': 0,
            'most_frequent': None,
            'count_most_frequent': 0,
            # value_counts() skips nulls, so an empty result means every row is null.
            'count_null': len(dataframe[column_name]),
            'average_string_length': float('nan'),
            'longest_string': None,
            'shortest_string': None,
            'uppercase_percentage': 0.0,
            'lowercase_percentage': 0.0,
            'contains_digit_count': 0,
            'least_common': None,
            'frequency_least_common': 0
        }

    most_frequent = cls.most_frequent(dataframe, column_name)

    return {
        'unique_values': cls.count_unique(dataframe, column_name),
        'most_frequent': most_frequent,
        'count_most_frequent': cls.count_most_frequent(dataframe, column_name),
        'count_null': cls.count_null(dataframe, column_name),
        'average_string_length': cls.average_string_length(dataframe, column_name),
        'longest_string': cls.longest_string(dataframe, column_name),
        'shortest_string': cls.shortest_string(dataframe, column_name),
        'uppercase_percentage': cls.uppercase_percentage(dataframe, column_name),
        'lowercase_percentage': cls.lowercase_percentage(dataframe, column_name),
        'contains_digit_count': cls.contains_digit_count(dataframe, column_name),
        # value_counts is known to be non-empty here, so no guard is needed.
        'least_common': value_counts.idxmin(),
        'frequency_least_common': value_counts.min()
    }

uppercase_percentage `classmethod` ¤

uppercase_percentage(dataframe: DataFrame, column_name: str = 'value_string') -> float

Returns the percentage of strings that are fully uppercase.

Source code in src/ts_shape/features/stats/string_stats.py

@classmethod
def uppercase_percentage(cls, dataframe: pd.DataFrame, column_name: str = 'value_string') -> float:
    """Computes the share (0-100) of non-null strings that are entirely uppercase."""
    column = dataframe[column_name]
    non_null_total = column.notna().sum()
    if non_null_total > 0:
        upper_hits = column.dropna().str.isupper().sum()
        ratio = upper_hits / non_null_total
        return ratio * 100
    # No non-null values: report zero instead of dividing by zero.
    return 0.0

ts_shape.features.stats.string_stats ¤

StringStatistics ¤

`dataframe` ¤

`column_name` ¤

average_string_length `classmethod` ¤

contains_digit_count `classmethod` ¤

contains_substring_count `classmethod` ¤

count_most_frequent `classmethod` ¤

count_null `classmethod` ¤

count_unique `classmethod` ¤

ends_with_count `classmethod` ¤

get_dataframe ¤

longest_string `classmethod` ¤

lowercase_percentage `classmethod` ¤

most_common_n_strings `classmethod` ¤

most_frequent `classmethod` ¤

shortest_string `classmethod` ¤

starts_with_count `classmethod` ¤

string_length_summary `classmethod` ¤

summary_as_dataframe `classmethod` ¤

summary_as_dict `classmethod` ¤

uppercase_percentage `classmethod` ¤

ts_shape.features.stats.string_stats ¤

StringStatistics ¤

dataframe ¤

column_name ¤

average_string_length classmethod ¤

contains_digit_count classmethod ¤

contains_substring_count classmethod ¤

count_most_frequent classmethod ¤

count_null classmethod ¤

count_unique classmethod ¤

ends_with_count classmethod ¤

get_dataframe ¤

longest_string classmethod ¤

lowercase_percentage classmethod ¤

most_common_n_strings classmethod ¤

most_frequent classmethod ¤

shortest_string classmethod ¤

starts_with_count classmethod ¤

string_length_summary classmethod ¤

summary_as_dataframe classmethod ¤

summary_as_dict classmethod ¤

uppercase_percentage classmethod ¤

`dataframe` ¤

`column_name` ¤

average_string_length `classmethod` ¤

contains_digit_count `classmethod` ¤

contains_substring_count `classmethod` ¤

count_most_frequent `classmethod` ¤

count_null `classmethod` ¤

count_unique `classmethod` ¤

ends_with_count `classmethod` ¤

longest_string `classmethod` ¤

lowercase_percentage `classmethod` ¤

most_common_n_strings `classmethod` ¤

most_frequent `classmethod` ¤

shortest_string `classmethod` ¤

starts_with_count `classmethod` ¤

string_length_summary `classmethod` ¤

summary_as_dataframe `classmethod` ¤

summary_as_dict `classmethod` ¤

uppercase_percentage `classmethod` ¤