Source code for unravel.abca.sunburst.sunburst_filter

#!/usr/bin/env python3

"""
Use ``abca_sunburst_filter`` or ``sbf`` from UNRAVEL to filter ABCA sunburst expression data, keeping cells with high expression at any level (class, subclass, supertype, cluster).

Prereqs:
    - ``abca_sunburst_expression``

Notes:
    - Use LUTs from ``abca_sunburst_expression`` for coloring the sunburst plot.

Usage for first run:
--------------------
abca_sunburst_filter -i path/main_ABCA_sunburst_expression.csv [-t 6] [-l supertype] [-o ABCA_sunburst_filtered/] [-v]

Usage to apply filtering from another dataset:
----------------------------------------------
abca_sunburst_filter -i path/_main_ABCA_sunburst_filter.csv -a path/secondary_ABCA_sunburst_expression.csv [-t 6] [-l supertype] [-o new_output] [-v]
"""

import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import numpy as np
import pandas as pd
import shutil
from pathlib import Path
from rich import print
from rich.traceback import install

from unravel.core.help_formatter import RichArgumentParser, SuppressMetavar, SM
from unravel.core.config import Configuration 
from unravel.core.utils import log_command, verbose_start_msg, verbose_end_msg



[docs]
def parse_args():
    """Build the command-line parser for this script and return the parsed arguments.

    The module docstring is handed to the parser as its description text.
    Options are registered in three groups: required, optional, and general.
    """
    cli = RichArgumentParser(
        formatter_class=SuppressMetavar,
        add_help=False,
        docstring=__doc__,
    )

    # Primary expression table that drives the filtering.
    required_group = cli.add_argument_group('Required arguments')
    required_group.add_argument(
        '-i', '--input',
        help='path/main_ABCA_sunburst_expression.csv (the primary file for determining filtering by expression)',
        required=True,
        action=SM,
    )

    # Filtering knobs, output location, and the optional secondary table.
    optional_group = cli.add_argument_group('Optional args')
    optional_group.add_argument(
        '-t', '--threshold',
        help='Log2(CPM+1) threshold for percent gene expression (rec: use same thresh as for ABCA_sunburst_expression.py). Default: 6',
        default=6,
        type=float,
        action=SM,
    )
    optional_group.add_argument(
        '-l', '--level',
        help='Level to filter on. Default: supertype',
        default='supertype',
        action=SM,
    )
    optional_group.add_argument(
        '-o', '--output',
        help='Output dir path. Default: ABCA_sunburst_filtered_thr6_<level>',
        default=None,
        action=SM,
    )
    optional_group.add_argument(
        '-a', '--apply_to',
        help='path/secondary_ABCA_sunburst_expression.csv (the secondary file to filter based on the primary file)',
        action=SM,
    )

    general_group = cli.add_argument_group('General arguments')
    general_group.add_argument(
        '-v', '--verbose',
        help='Increase verbosity. Default: False',
        action='store_true',
        default=False,
    )

    return cli.parse_args()




[docs]
def _compact_threshold(threshold):
    """Render a threshold for file names without a trailing '.0' (6.0 -> '6', 6.5 -> '6.5')."""
    as_float = float(threshold)
    return str(int(as_float)) if as_float.is_integer() else str(threshold)


def _threshold_suffixes(threshold):
    """Return the 'expression_thr<T>.csv' endings for each spelling of the threshold (e.g. '6' and '6.0')."""
    spellings = [str(threshold)]
    as_float = float(threshold)
    if as_float.is_integer():
        # The int default (6) and a float parsed from the CLI (6.0) print differently,
        # so accept both spellings of an integral threshold.
        spellings += [s for s in (str(int(as_float)), str(as_float)) if s not in spellings]
    return [f'expression_thr{s}.csv' for s in spellings]


def _copy_mean_lut(csv_path, suffixes, output_dir):
    """Copy the mean expression LUT derived from `csv_path` into `output_dir`, if the LUT file exists.

    The LUT path is built by swapping the first matching ending in `suffixes`
    for 'mean_expression_lut.txt'. Nothing is copied when no ending matches.
    """
    csv_str = str(csv_path)
    for suffix in suffixes:
        if not csv_str.endswith(suffix):
            continue
        lut_path = Path(csv_str[:-len(suffix)] + 'mean_expression_lut.txt')
        if lut_path.exists():
            shutil.copy(lut_path, output_dir)
        return


@log_command
def main():
    """Filter the main sunburst expression CSV by mean expression and optionally apply the result to a second CSV.

    Steps:
        1. Read ``--input`` and keep rows whose ``<level>_mean`` column is >= ``--threshold``.
        2. Write the kept rows to ``<output_dir>/<input stem>_filtered.csv``.
        3. Copy the matching ``*mean_expression_lut.txt`` into the output dir when it exists.
        4. With ``--apply_to``, keep the rows of the secondary CSV whose ``cluster`` value
           survived step 1, save them, and copy the secondary LUT when it exists.

    Raises:
        KeyError: If the ``<level>_mean`` column is missing from the input CSV.
    """
    install()
    args = parse_args()
    Configuration.verbose = args.verbose
    verbose_start_msg()

    main_sunburst_exp_df = pd.read_csv(args.input)
    print('\nMain sunburst expression data:')
    print(main_sunburst_exp_df)

    # Keep cells that have high mean expression (e.g., >= 6) of the gene of interest at the specified level
    cell_mean_col = f'{args.level}_mean'
    if cell_mean_col not in main_sunburst_exp_df.columns:
        raise KeyError(
            f"Column '{cell_mean_col}' not found in {args.input}; check --level. "
            f"Available columns: {list(main_sunburst_exp_df.columns)}"
        )
    cells_df_filtered = main_sunburst_exp_df[main_sunburst_exp_df[cell_mean_col] >= args.threshold]

    print('\nMain sunburst expression data after filtering:')
    print(cells_df_filtered)

    # Save the filtered results
    if args.output:
        output_dir = Path(args.output)
    else:
        # Compact form so `-t 6` gives "thr6" (as the help text states) rather than "thr6.0"
        output_dir = Path(f'ABCA_sunburst_filtered_thr{_compact_threshold(args.threshold)}_{args.level}')
    output_dir.mkdir(parents=True, exist_ok=True)
    input_path = Path(args.input)
    output_path = output_dir / f'{input_path.stem}_filtered.csv'
    cells_df_filtered.to_csv(output_path, index=False)

    # Copy the mean expression LUT (only when the input name has the expected ending,
    # so the input CSV itself is never copied in place of a LUT)
    thr_suffixes = _threshold_suffixes(args.threshold)
    _copy_mean_lut(args.input, thr_suffixes, output_dir)

    if args.apply_to:
        # Get list of clusters with high expression
        filtered_clusters = cells_df_filtered['cluster'].unique()
        print(f"\nClusters with high expression: {filtered_clusters}")

        # Load the secondary dataset
        secondary_sunburst_exp_df = pd.read_csv(args.apply_to)
        print('\nSecondary sunburst expression data:')
        print(secondary_sunburst_exp_df)

        # Filter the secondary dataset
        keep_rows = secondary_sunburst_exp_df['cluster'].isin(filtered_clusters)
        secondary_cells_df_filtered = secondary_sunburst_exp_df[keep_rows]
        print('\nSecondary sunburst expression data after filtering:')
        print(secondary_cells_df_filtered)

        # Save the filtered results
        output_path = output_dir / f'{Path(args.apply_to).stem}_filtered_with_{input_path.name}'
        secondary_cells_df_filtered.to_csv(output_path, index=False)

        # Copy the mean expression LUT: threshold-named files are scRNA-seq data,
        # 'expression_summary.csv' files are MERFISH data. Any other name copies nothing.
        _copy_mean_lut(args.apply_to, thr_suffixes + ['expression_summary.csv'], output_dir)

    verbose_end_msg()



# Run the command-line workflow only when this file is executed as a script (not on import).
if __name__ == '__main__':
    main()