# Source code for unravel.tabular.filter_rows

#!/usr/bin/env python3

"""
Use ``tabular_filter_rows`` or ``filter_rows`` from UNRAVEL to filter tabular data by values in a specified column.

Usage to keep rows where 'region' contains 'AUDp':
--------------------------------------------------
tabular_filter_rows -i data.csv -col region -p AUDp [--exact] [-o output_dir] [-v]

Usage to exclude rows where 'region' contains 'layer1':
-------------------------------------------------------
tabular_filter_rows -i data.csv -col region -p layer1 -f exclude [--exact] [-o output_dir] [-v]
"""

import pandas as pd
from pathlib import Path
from rich import print
from rich.traceback import install

from unravel.core.help_formatter import RichArgumentParser, SuppressMetavar, SM
from unravel.core.config import Configuration
from unravel.core.utils import log_command, match_files, verbose_start_msg, verbose_end_msg
from unravel.tabular.utils import load_tabular_file, save_tabular_file


[docs]
def parse_args():
    """
    Build the command-line parser for this script and parse the command line.

    Returns:
    --------
    Namespace-like object returned by ``parser.parse_args()``
        Carries the attributes ``input``, ``column``, ``patterns``, ``filter``, ``exact``, ``output``, and ``verbose``.
    """
    parser = RichArgumentParser(formatter_class=SuppressMetavar, add_help=False, docstring=__doc__)

    reqs = parser.add_argument_group('Required arguments')
    # nargs='+' (not '*'): a bare -i or -p with no values is rejected at parse time
    # instead of silently producing an empty list that filters nothing (or everything).
    reqs.add_argument('-i', '--input', help="One or more CSV/XLSX file paths or glob patterns (space-separated), e.g., 'data/*.csv'.", required=True, nargs='+', action=SM)
    reqs.add_argument('-col', '--column', help="Column to filter", required=True, action=SM)
    reqs.add_argument('-p', '--patterns', help="List of substring patterns to filter column values (use --exact for full-value matching).", nargs='+', required=True, action=SM)

    opts = parser.add_argument_group('Optional arguments')
    opts.add_argument('-f', '--filter', help="Filtering mode ('include' to keep rows with specified values, exclude to remove them). Default: include", choices=['include', 'exclude'], default='include', action=SM)
    opts.add_argument('-e', '--exact', help="Use exact matching instead of substring matching.", action='store_true', default=False)
    opts.add_argument('-o', '--output', help="Directory to save filtered files. Default: filtered_files", default="filtered_files", action=SM)
    opts.add_argument('-v', '--verbose', help='Increase verbosity. Default: False', action='store_true', default=False)
    return parser.parse_args()



[docs]
def filter_table(df, column, patterns, mode='include', exact=False):
    """
    Filter the rows of a DataFrame by the values found in one column.

    Column values and patterns are both compared as strings. The input DataFrame is
    left unmodified, and the returned rows keep the column's original dtype.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing the tabular data.
    column : str
        Column name to filter by.
    patterns : list
        Values to match against the column (each is converted with ``str``). With ``exact=False`` a row matches
        when any pattern is a substring of its value; with ``exact=True`` the value must equal one of the patterns.
    mode : str
        'include' (default) keeps the matching rows; 'exclude' keeps the non-matching rows.
        Any value other than 'exclude' behaves like 'include'.
    exact : bool
        If True, filter by exact matches of the values in the column. If False, filter by substring matches.

    Returns:
    --------
    pandas.DataFrame or None
        Filtered DataFrame if any rows remain, otherwise None. None is also returned
        (after printing the available columns) when ``column`` is not in ``df``.
    """
    if column not in df.columns:
        print(f"[red]Column '{column}' not found in the DataFrame.")
        print(f"[dim]Available columns: {df.columns.tolist()}[/dim]")
        return None

    # Match against a stringified copy of the column; assigning it back to ``df``
    # would mutate the caller's data and stringify the values that get returned.
    values = df[column].astype(str)
    patterns = [str(p) for p in patterns]

    if exact:
        mask = values.isin(patterns)
    else:
        # astype(bool) keeps the mask boolean even when the column is empty,
        # so the inversion below is always a logical NOT.
        mask = values.apply(lambda value: any(p in value for p in patterns)).astype(bool)

    if mode == 'exclude':
        mask = ~mask

    filtered_df = df[mask]
    return filtered_df if not filtered_df.empty else None




[docs]
@log_command
def main():
    """
    Command-line entry point: filter every matched input file and save the rows that remain.

    For each path returned by ``match_files``, the table is loaded, filtered with
    ``filter_table`` using the parsed CLI options, and, when a filtered table is returned,
    written to ``<output>/<stem>_filtered<ext>``. Inputs for which ``filter_table``
    returns None produce no output file.
    """
    install()
    args = parse_args()
    Configuration.verbose = args.verbose
    verbose_start_msg()

    destination = Path(args.output)

    for source in match_files(args.input):
        source_path = Path(source)

        # Skip temporary files that start with ~
        if source_path.name.startswith("~"):
            continue

        table, extension = load_tabular_file(source)
        kept_rows = filter_table(
            table,
            column=args.column,
            patterns=args.patterns,
            mode=args.filter,
            exact=args.exact,
        )
        if kept_rows is None:
            continue

        # The output directory is only created once there is something to write into it.
        destination.mkdir(parents=True, exist_ok=True)
        target = destination / f"{source_path.stem}_filtered{extension}"
        save_tabular_file(kept_rows, target, verbose=args.verbose)

    verbose_end_msg()


# Run the CLI entry point only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()