Source code for unravel.tabular.unique_values

#!/usr/bin/env python3

"""
Use ``tabular_unique_values`` or ``uniq_vals`` from UNRAVEL to print unique values in specified column(s) of a CSV file.

Usage:
------
    tabular_unique_values -i input.csv -c column1 column2 [-k keyword1 keyword2 ...] [--exact] [-v]
"""

import pandas as pd
from rich import print
from rich.traceback import install
from rich.table import Table
from rich import box

from unravel.core.help_formatter import RichArgumentParser, SuppressMetavar, SM
from unravel.core.config import Configuration
from unravel.core.utils import log_command, verbose_start_msg, verbose_end_msg
from unravel.tabular.utils import load_tabular_file, save_tabular_file



[docs]
def parse_args():
    parser = RichArgumentParser(formatter_class=SuppressMetavar, add_help=False, docstring=__doc__)

    reqs = parser.add_argument_group('Required arguments')
    reqs.add_argument('-i', '--input', help='Path to the input CSV file.', required=True, action=SM)
    reqs.add_argument('-c', '--column', help='Column name(s) to process (space-separated for multiple).', required=True, nargs='*', action=SM)

    opts = parser.add_argument_group('Optional arguments')
    opts.add_argument('-n', '--count', help='Print the count for each unique value.', action='store_true', default=False)
    opts.add_argument('-k', '--keyword', help='Keyword(s) to filter unique values. For partial match use "gene", for exact match use --exact.', nargs='*', default=None, action=SM)
    opts.add_argument('-e', '--exact', help='Use exact match instead of partial substring match.', action='store_true', default=False)
    opts.add_argument('-o', '--output', help='Output file path to save the results. Default: None (prints to console).', default=None, action=SM)
    opts.add_argument('-s', '--space', help='Print unique values as a space-separated list for easy copy-pasting.', action='store_true', default=False)

    general = parser.add_argument_group('General arguments')
    general.add_argument('-v', '--verbose', help='Increase verbosity. Default: False', action='store_true', default=False)

    return parser.parse_args()


# TODO: Note in Allen docs that this can be useful for checking unique values in large CSV files (e.g., 'region_of_interest_acronym' for scRNA-seq data).
# TODO: Add option to print unique values as a space-separated list for easy copy-pasting to other tools.


[docs]
def filter_values(values, keywords, exact):
    if not keywords:
        return values
    filtered = []

    for val in values:
        val_str = str(val).lower()
        if any((val_str == kw.lower() if exact else kw.lower() in val_str) for kw in keywords):
            filtered.append(val)

    return filtered



[docs]
def print_values(column, values, keywords=None, value_counts=None, show_count=False, space_mode=False):
    if space_mode:
        list_str = " ".join(str(v) for v in values)
        print(list_str)
        return list_str
    
    count = len(values)
    plural = "value" if count == 1 else "values"

    if keywords:
        print(f"\n Filtered {count} unique {plural} in column '{column}' (matching {keywords}):")
    else:
        print(f"\n Unique {count} {plural} in column '{column}':")

    if show_count and value_counts is not None:
        df_out = pd.DataFrame({
            'Value': values,
            'Count': [value_counts[val] for val in values],
        })
        total = df_out['Count'].sum()
        df_out['Percent'] = df_out['Count'] / total * 100
    else:
        df_out = pd.DataFrame({'Value': values})

    # Rich table display
    table = Table(title=None, box=box.SIMPLE)
    table.add_column("Value", style="green", no_wrap=True)
    if show_count and value_counts is not None:
        table.add_column("Count", justify="right", style="cyan")
        table.add_column("Percent", justify="right", style="magenta")

        for _, row in df_out.iterrows():
            table.add_row(str(row["Value"]), str(row["Count"]), f"{row['Percent']:.2f}%")
    else:
        for val in df_out['Value']:
            table.add_row(str(val))

    print(table)
    return df_out




[docs]
@log_command
def main():
    install()
    args = parse_args()
    Configuration.verbose = args.verbose
    verbose_start_msg()

    df, _ = load_tabular_file(args.input)

    missing = [col for col in args.column if col not in df.columns]
    if missing:
        print(f"[bold yellow]Warning:[/bold yellow] Column(s) not found in CSV: {missing}")

    valid_columns = [col for col in args.column if col in df.columns]
    if not valid_columns:
        print("No valid columns found in the CSV file.")
        return

    for column in valid_columns:
        col_series = df[column].dropna()
        value_counts = col_series.value_counts()
        unique_values = sorted(df[column].dropna().unique(), key=lambda x: str(x).lower())

        if not unique_values:
            print(f"[dim]No matching values found in column '{column}'.[/dim]")
            continue

        filtered_values = filter_values(unique_values, args.keyword, args.exact)
        filtered_value_counts = value_counts[filtered_values] if args.count else None

        output_df_or_list = print_values(
            column,
            filtered_values,
            keywords=args.keyword,
            value_counts=filtered_value_counts,
            show_count=args.count,
            space_mode=args.space
        )

        if args.output and output_df_or_list is not None:
            if args.space:
                with open(args.output, 'w') as f:
                    f.write(output_df_or_list+ '\n')
                    if args.verbose:
                        print(f"\n[green]Data saved to: {args.output}[/green]")
            else:
                save_tabular_file(output_df_or_list, args.output, index=False, verbose=args.verbose)

    verbose_end_msg()



if __name__ == '__main__':
    main()