Source code for unravel.allen_institute.genetic_tools_atlas.org_samples

#!/usr/bin/env python3

"""
Use ``gta_org_samples`` (``gta_os``) from UNRAVEL to organize GTA data across samples for batch processing.

Prereqs:
    - ``gta_download`` (``gta_dl``) must be run first to download .zarr data at a set resolution.
    - ``io_convert_img`` (``conv``) must be run to convert the .zarr data to TIFF series.

Inputs:
    - `*.zarr` files from ``gta_download`` (``gta_dl``) at a set resolution (e.g., level 3).

Outputs:
    - Root dir: TIFFs/
    - Directories created based on the fluorescent channel (e.g., red, green, dual).
    - Relevant sample directories created in each channel directory (e.g., ID_<Image Series ID>).
    - Sample directories contain 'green' and 'red' directories with TIFF files for each channel.

Note:
    - Key SpecimenMetadata.csv columns: 'Image Series ID' 'Donor Genotype' 'Cargo'
    - Run from GTA_level_3 directory
    
Next steps:
    - ... 

Usage:
------
    gta_os [-d red green] [-i "path/to/SpecimenMetadata.csv"] [-o output_dir] [-p sample_prefix] [-v]
"""

import shutil
import pandas as pd
from pathlib import Path
from rich import print
from rich.traceback import install

from unravel.core.help_formatter import RichArgumentParser, SuppressMetavar, SM
from unravel.core.config import Configuration
from unravel.core.utils import log_command, verbose_start_msg, verbose_end_msg

# SpecimenMetadata.csv columns loaded by main() (passed to pd.read_csv via usecols).
COLUMNS = ['Image Series ID', 'Donor Genotype', 'Cargo']

# Maps a 'Cargo' value to the fluorescent channel group ('green', 'red', or 'dual').
# Logic determined when there were 3902 GTA STPT records (as of 2025-08-19).
# The 'NA', 'iCre(R297T)', and 'iCre' entries hold human-readable notes rather than channel names;
# main() derives the channel for those rows from 'Donor Genotype'.
# NOTE(review): pandas.read_csv parses the text 'NA' as a missing value by default, so the 'NA' key
# is presumably never matched directly (missing cargo falls back to the .get() default in main()) - confirm.
CARGO_MAP = { 
    'NA': 'USE Tg LOGIC',
    'iCre(R297T)': 'green if Ai193 is in "Donor Genotype" else red',
    'SYFP2': 'green',
    'iCre': 'green if Ai193 is in "Donor Genotype" else red',
    'FlpO': 'red',
    'dTomato': 'red',
    'jGCaMP8m': 'green',
    'SYFP2 | iCre(R297T)': 'green',
    'iCre(R297T) | EGFP': 'dual',
    'SYFP2 | mScarlet': 'dual',
    'iCre(R297T) | SYFP2': 'green',
    'ChR2(H134R) | dTomato | EYFP': 'dual',
    'EYFP': 'green',
    'tdTomato | SYFP2': 'dual',
    'iCre | FlpO': 'dual',
    'FlpO | iCre': 'dual',
    'iCre(R297T) | tdTomato': 'red',
    'EGFP | iCre(R297T)': 'dual',
    'EYFP | ChR2(H134R)': 'green',
    'CreN': 'red',
    'tdTomato | iCre(R297T)': 'red',
    'mScarlet | SYFP2': 'dual',
    'ChR2(H134R) | EYFP | dTomato': 'dual',
    'SYFP2 | tdTomato': 'dual'
}

# If 'Cargo' is 'NA', use Tg logic: check 'Donor Genotype' for the presence of these substrings to
# determine the channel. A GREEN_TG match is tested first, then RED_TG; no match means 'dual'.
GREEN_TG =  ['GFP', 'Ai210', 'Ai195', 'RCE-FRT']
RED_TG = ['tdTomato', 'Ai223', 'Ai65F']


[docs]
def parse_args():
    """
    Define the ``gta_org_samples`` command-line options and parse the command line.

    Returns:
    --------
    The object returned by ``RichArgumentParser.parse_args()``, carrying the attributes
    ``directories``, ``input``, ``output_dir``, ``prefix``, and ``verbose``.
    """
    cli = RichArgumentParser(formatter_class=SuppressMetavar, add_help=False, docstring=__doc__)

    # Options controlling what is organized and where it goes
    optional = cli.add_argument_group('Optional arguments')
    optional.add_argument(
        '-d', '--directories',
        help='Space-separated list of tif directory names to organize. Default: "red green"',
        default=['red', 'green'],
        nargs='*',
        action=SM,
    )
    optional.add_argument(
        '-i', '--input',
        help='path/SpecimenMetadata.csv. Default: unravel/allen_institute/genetic_tools_atlas/SpecimenMetadata_subset.csv',
        default=None,
        action=SM,
    )
    optional.add_argument(
        '-o', '--output_dir',
        help='Output directory for organized samples. Default: TIFFs',
        default='TIFFs',
        action=SM,
    )
    optional.add_argument(
        '-p', '--prefix',
        help='Prefix for sample directories (useful for batch processing). Default: "ID_"',
        default='ID_',
        action=SM,
    )

    # Options shared across UNRAVEL commands
    common = cli.add_argument_group('General arguments')
    common.add_argument(
        '-v', '--verbose',
        help='Increase verbosity. Default: False',
        action='store_true',
        default=False,
    )

    parsed = cli.parse_args()
    return parsed



[docs]
def _remove_dir_if_empty(path):
    """Remove ``path`` if it contains no entries other than hidden (dot-prefixed) ones.

    ``Path.rmdir`` still raises ``OSError`` when only hidden entries remain; callers handle that.
    """
    if not any(p for p in path.iterdir() if not p.name.startswith('.')):
        path.rmdir()


def org_samples(df, target_dir, prefix, tif_dirs):
    """
    Organize samples into directories based on the SpecimenMetadata DataFrame.

    For every ID in ``df['Image Series ID']`` and every entry of ``tif_dirs``, the ``.tif`` files in
    ``<tif_dir>/<ID>/`` are moved to ``<target_dir>/<prefix><ID>/<tif_dir name>/``. Source sample
    directories, and then the source channel directories themselves, are removed once they hold no
    non-hidden entries.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame containing 'Image Series ID' with the sample IDs to organize.
    target_dir : Path
        Directory where the sample directories will be created.
    prefix : str
        Prefix for sample directories.
    tif_dirs : list of str
        List of TIFF directory names to organize (e.g., ['red', 'green']). Paths are accepted too;
        only the final path component is used to name the destination sub-directory.
    """
    target_dir = Path(target_dir)
    source_roots = [Path(d) for d in tif_dirs]

    # Iterate the ID column directly: DataFrame.iterrows() can upcast integer IDs to float
    # (e.g. 101 -> 101.0) when the frame has mixed dtypes, which would break the directory names.
    for series_id in df['Image Series ID']:
        sample_dir = target_dir / f'{prefix}{series_id}'

        for source_root in source_roots:
            tif_dir = source_root / str(series_id)
            if not tif_dir.is_dir():
                continue

            # Use the basename: joining an absolute path onto sample_dir would discard sample_dir
            # and send the files back into the source channel directory.
            dest = sample_dir / source_root.name
            dest.mkdir(parents=True, exist_ok=True)

            # Move all TIFFs from tif_dir to dest (list first so we never move while iterating)
            tifs = [item for item in tif_dir.iterdir() if item.suffix == '.tif']
            for item in tifs:
                shutil.move(str(item), str(dest))

            # Remove the tif_dir if empty
            try:
                _remove_dir_if_empty(tif_dir)
            except OSError:
                print(f'Warning: {tif_dir} not empty or failed to remove.')

    # Remove parent directories ('red', 'green') if they are now empty
    for source_root in source_roots:
        if not source_root.exists():
            continue  # It's already deleted - no warning needed
        try:
            _remove_dir_if_empty(source_root)
        except OSError as e:
            print(f'Warning: {source_root} could not be removed ({e})')




[docs]
def _assign_channels(df):
    """Return a copy of ``df`` with a 'Channel' column ('green', 'red', 'dual', or 'NA').

    The channel comes from CARGO_MAP keyed on 'Cargo'. Two groups are resolved from
    'Donor Genotype' instead:
      - 'iCre(R297T)' / 'iCre' cargo: 'green' if the genotype contains 'Ai193', else 'red'.
      - Cargo that is missing or has no concrete channel in CARGO_MAP (Tg logic): 'green' if any
        GREEN_TG item is in the genotype, else 'red' if any RED_TG item is, else 'dual'.
    Rows in either group whose 'Donor Genotype' is missing cannot be classified and get 'NA'.
    """
    df = df.copy()

    # A missing genotype is read by pandas as NaN (a float); substring tests on it raise TypeError,
    # so remember which rows are missing and test against an empty string instead.
    genotype_known = df['Donor Genotype'].notna()
    genotype = df['Donor Genotype'].fillna('').astype(str)

    channel = df['Cargo'].apply(lambda cargo: CARGO_MAP.get(cargo, 'NA'))  # Set 'Channel' using 'Cargo' as the key
    is_cre = df['Cargo'].isin(['iCre(R297T)', 'iCre'])
    needs_tg = ~channel.isin(['green', 'red', 'dual']) & ~is_cre
    df['Channel'] = channel

    # Conditional logic for Cre cargo
    has_ai193 = genotype.str.contains('Ai193', regex=False).astype(bool)
    df.loc[is_cre & has_ai193, 'Channel'] = 'green'
    df.loc[is_cre & ~has_ai193, 'Channel'] = 'red'

    # Tg logic (green is checked before red; no match means dual)
    green_tg = genotype.apply(lambda g: any(tg in g for tg in GREEN_TG)).astype(bool)
    red_tg = genotype.apply(lambda g: any(tg in g for tg in RED_TG)).astype(bool)
    df.loc[needs_tg & green_tg, 'Channel'] = 'green'
    df.loc[needs_tg & ~green_tg & red_tg, 'Channel'] = 'red'
    df.loc[needs_tg & ~green_tg & ~red_tg, 'Channel'] = 'dual'

    # Do not guess a channel when the genotype it depends on is missing
    df.loc[(is_cre | needs_tg) & ~genotype_known, 'Channel'] = 'NA'

    return df


@log_command
def main():
    """
    Sort downloaded GTA TIFF directories into green / red / dual sample folders.

    Reads the SpecimenMetadata CSV (``-i``, or the bundled SpecimenMetadata_subset.csv), assigns each
    unique 'Image Series ID' a channel group, and calls ``org_samples`` once per group so that the
    TIFFs end up under ``<output_dir>/<group>/<prefix><ID>/<tif dir>/``. Rows whose channel cannot be
    determined are reported and skipped. Returns early (after printing a message) if the CSV is missing.
    """
    install()
    args = parse_args()
    Configuration.verbose = args.verbose
    verbose_start_msg()

    if args.input is None:
        input_path = Path(__file__).parent / 'SpecimenMetadata_subset.csv'
    else:
        input_path = Path(args.input)

    print(f"\n[bold green]SpecimenMetadata.csv file:\n  [/bold green]{input_path}\n")

    if not input_path.is_file():
        print(f"[bold red]SpecimenMetadata file not found: [/bold red]{input_path}")
        return

    df = pd.read_csv(input_path, usecols=COLUMNS)

    # Drop rows with duplicate values in 'Image Series ID'
    df = df.drop_duplicates(subset='Image Series ID')

    # CARGO_MAP was built from a snapshot of the atlas, so flag cargo values it does not know about
    # instead of letting them fall through to the Tg logic unnoticed.
    unknown_cargo = df.loc[df['Cargo'].notna() & ~df['Cargo'].isin(list(CARGO_MAP)), 'Cargo'].unique()
    if len(unknown_cargo) > 0:
        cargo_list = ', '.join(str(cargo) for cargo in unknown_cargo)
        print(f"[bold yellow]Warning: Unrecognized 'Cargo' value(s); using 'Donor Genotype' (Tg) logic for these rows: [/bold yellow]{cargo_list}")

    # Add a 'Channel' column based on 'Cargo' and 'Donor Genotype'
    df = _assign_channels(df)

    # Print rows with 'Channel' as 'NA' (if any)
    na_channel_rows = df[df['Channel'] == 'NA']
    if not na_channel_rows.empty:
        print(f"[bold yellow]Warning: Found {len(na_channel_rows)} rows with 'Channel' as 'NA'. These will be skipped.[/bold yellow]")
        print(na_channel_rows)

    # Create output directories
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    channel_dirs = {channel: output_dir / channel for channel in ('green', 'red', 'dual')}
    for channel_dir in channel_dirs.values():
        channel_dir.mkdir(parents=True, exist_ok=True)

    # Organize the samples of each channel group (sorted by 'Image Series ID')
    for channel, channel_dir in channel_dirs.items():
        channel_df = df.loc[df['Channel'] == channel, ['Image Series ID']].sort_values(by='Image Series ID')
        org_samples(channel_df, channel_dir, args.prefix, args.directories)

    # Delete empty directories in the output directory
    for channel_dir in channel_dirs.values():
        try:
            if not any(p for p in channel_dir.iterdir() if not p.name.startswith('.')):
                channel_dir.rmdir()
        except OSError:
            print(f'Warning: {channel_dir} not empty or failed to remove.')

    # Print output directory structure
    print(f"\n[bold green]Organized samples saved to: {output_dir}[/bold green]")

    verbose_end_msg()

    

# Run the command-line entry point only when this file is executed as a script (not on import).
if __name__ == "__main__":
    main()