Source code for unravel.allen_institute.mapmysections.csv_concat_with_source

#!/usr/bin/env python3

"""
Use ``mms_concat_with_source`` or ``mms_c`` from UNRAVEL to concatenate multiple CSV files, include a 'source_file' column, and sort rows by that column.

Prereqs:
    - ``mms_soma_ratio`` or ``mms_seg_summary``
    - Aggregate their outputs
    - For ``mms_seg_summary``, use ``agg`` to aggregate results across samples and cd to the target directory.

Note:
    - This command loads all matching CSV files.
    - It adds a 'source_file' column (file stem).
    - It handles empty files by filling rows with 0s for all expected columns.
    - It sorts all rows by 'source_file'.

Usage:
------
    mms_concat_with_source [-i '<asterisk>.csv'] [-o output.csv] [-v]
"""

import pandas as pd
from pathlib import Path
from rich import print
from rich.traceback import install

from unravel.core.help_formatter import RichArgumentParser, SuppressMetavar, SM
from unravel.core.utils import log_command, match_files, verbose_start_msg, verbose_end_msg


def parse_args():
    """Build the command-line parser and return the parsed arguments.

    Options:
        -i / --input:   one or more paths or glob patterns (default: '*.csv').
        -o / --output:  destination CSV path (default: concatenated_output.csv).
        -v / --verbose: flag that enables verbose output.

    Returns:
        The namespace produced by ``parser.parse_args()``.
    """
    parser = RichArgumentParser(
        formatter_class=SuppressMetavar,
        add_help=False,
        docstring=__doc__,
    )

    # Input / output options.
    optional_group = parser.add_argument_group('Optional arguments')
    optional_group.add_argument(
        '-i', '--input',
        help="Path(s) or glob pattern(s) for input CSV files. Default: '*.csv'",
        default='*.csv',
        nargs='*',
        action=SM,
    )
    optional_group.add_argument(
        '-o', '--output',
        help="Output CSV path. Default: concatenated_output.csv",
        default='concatenated_output.csv',
        action=SM,
    )

    # Flags of the "General arguments" group.
    general_group = parser.add_argument_group('General arguments')
    general_group.add_argument(
        '-v', '--verbose',
        help='Verbose output.',
        action='store_true',
    )

    return parser.parse_args()
@log_command
def main():
    """Concatenate the matched CSV files into a single CSV tagged by source file.

    Every input file is read with pandas. Each resulting row gets a
    'source_file' column holding the file stem. A file with no data (either
    completely empty or without any rows) contributes one placeholder row of
    0s spanning the union of columns found in the non-empty files. Missing
    values are filled with 0, columns are ordered as 'source_file' followed by
    the remaining columns alphabetically, rows are sorted by 'source_file',
    and the result is written to the output path (parent directories are
    created as needed).

    Raises:
        ValueError: If no non-empty CSV file was loaded.
    """
    install()
    args = parse_args()
    verbose_start_msg()

    input_files = match_files(args.input)

    # Pass 1: read everything. Placeholders for empty files are deferred until
    # the full set of columns is known, so an empty file that sorts before the
    # first non-empty file is no longer dropped.
    loaded = []  # (path, DataFrame or None) in processing order; None marks an empty file
    all_columns = set()
    for file in sorted(input_files):
        if args.verbose:
            print(f'📂 Reading: {file}')

        try:
            df = pd.read_csv(file)
        except pd.errors.EmptyDataError:
            df = None

        if df is None or df.empty:
            print(f'[yellow]⚠️ Empty file detected:[/yellow] {file.name}')
            loaded.append((file, None))
            continue

        # Collected before 'source_file' is added, so that column is not part
        # of the inferred data columns unless an input already contains it.
        all_columns.update(df.columns)
        loaded.append((file, df))

    if not all_columns:
        if loaded:
            print("[red]❌ Cannot infer columns: no non-empty file was loaded.[/red]")
        raise ValueError("No valid dataframes loaded.")

    # Pass 2: build one frame per file, in the same order the files were read.
    file_dfs = []
    for file, df in loaded:
        if df is None:
            # One row of 0s over every known column. Using the full union
            # keeps integer columns from being upcast to float by NaN filling.
            df = pd.DataFrame([dict.fromkeys(all_columns, 0)])
        df['source_file'] = file.stem
        file_dfs.append(df)

    df_concat = pd.concat(file_dfs, ignore_index=True).fillna(0)

    # Reorder: source_file first, then alphabetized rest
    cols = [c for c in df_concat.columns if c != 'source_file']
    df_concat = df_concat[['source_file'] + sorted(cols)]

    # Sort by source_file. mergesort is stable, so rows coming from the same
    # file keep their original relative order (the default quicksort does not
    # guarantee that).
    df_concat = df_concat.sort_values(by='source_file', kind='mergesort').reset_index(drop=True)

    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df_concat.to_csv(output_path, index=False)

    print(f'\n[bold green]✅ Saved concatenated CSV to:[/bold green] {output_path}\n')

    verbose_end_msg()
# Run the command-line entry point only when this file is executed as a script, not when it is imported.
if __name__ == '__main__': main()