Source code for unravel.allen_institute.genetic_tools_atlas.org_samples

#!/usr/bin/env python3

"""
Use ``gta_org_samples`` (``gta_os``) from UNRAVEL to organize GTA data across samples for batch processing.

Prereqs:
    - ``gta_download`` (``gta_dl``) must be run first to download .zarr data at a set resolution.
    - ``io_convert_img`` (``conv``) must be run to convert the .zarr data to TIFF series.

Inputs:
    - `*.zarr` files from ``gta_download`` (``gta_dl``) at a set resolution (e.g., level 3).

Outputs:
    - Root dir: TIFFs/
    - Directories created based on the fluorescent channel (e.g., red, green, dual).
    - Relevant sample directories created in each channel directory (e.g., ID_<Image Series ID>).
    - Sample directories contain 'green' and 'red' directories with TIFF files for each channel.

Note:
    - Key SpecimenMetadata.csv columns: 'Image Series ID' 'Donor Genotype' 'Cargo'
    - Run from GTA_level_3 directory
    
Next steps:
    - ... 

Usage:
------
    gta_os [-d red green] [-i "path/to/SpecimenMetadata.csv"] [-o output_dir] [-p sample_prefix] [-v]
"""

import shutil
import pandas as pd
from pathlib import Path
from rich import print
from rich.traceback import install

from unravel.core.help_formatter import RichArgumentParser, SuppressMetavar, SM
from unravel.core.config import Configuration
from unravel.core.utils import log_command, verbose_start_msg, verbose_end_msg

# Columns read from SpecimenMetadata.csv (passed to pd.read_csv via usecols).
COLUMNS = ['Image Series ID', 'Donor Genotype', 'Cargo']

# Maps a 'Cargo' value to the channel directory its samples belong in
# ('green', 'red', or 'dual').
# Entries whose value is a sentence rather than a channel name are placeholders
# that document a rule resolved from 'Donor Genotype' instead of from this table.
# Logic determined when there were 3902 GTA STPT records (as of 2025-08-19):
CARGO_MAP = { 
    'NA': 'USE Tg LOGIC',  # Placeholder: no cargo, channel comes from GREEN_TG / RED_TG below
    'iCre(R297T)': 'green if Ai193 is in "Donor Genotype" else red',  # Placeholder: resolved from 'Donor Genotype'
    'SYFP2': 'green',
    'iCre': 'green if Ai193 is in "Donor Genotype" else red',  # Placeholder: resolved from 'Donor Genotype'
    'FlpO': 'red',
    'dTomato': 'red',
    'jGCaMP8m': 'green',
    'SYFP2 | iCre(R297T)': 'green',
    'iCre(R297T) | EGFP': 'dual',
    'SYFP2 | mScarlet': 'dual',
    'iCre(R297T) | SYFP2': 'green',
    'ChR2(H134R) | dTomato | EYFP': 'dual',
    'EYFP': 'green',
    'tdTomato | SYFP2': 'dual',
    'iCre | FlpO': 'dual',
    'FlpO | iCre': 'dual',
    'iCre(R297T) | tdTomato': 'red',
    'EGFP | iCre(R297T)': 'dual',
    'EYFP | ChR2(H134R)': 'green',
    'CreN': 'red',
    'tdTomato | iCre(R297T)': 'red',
    'mScarlet | SYFP2': 'dual',
    'ChR2(H134R) | EYFP | dTomato': 'dual',
    'SYFP2 | tdTomato': 'dual'
}

# If 'Cargo' is 'NA', use Tg logic: 'Donor Genotype' is searched for these substrings.
# A GREEN_TG match gives 'green' (checked first), otherwise a RED_TG match gives 'red',
# otherwise the channel is 'dual'.
GREEN_TG =  ['GFP', 'Ai210', 'Ai195', 'RCE-FRT']
RED_TG = ['tdTomato', 'Ai223', 'Ai65F']

def parse_args():
    """Define the ``gta_os`` command-line options and parse the command line.

    Returns
    -------
    The object returned by the parser's ``parse_args()``, carrying the
    attributes ``directories``, ``input``, ``output_dir``, ``prefix`` and
    ``verbose``.
    """
    cli = RichArgumentParser(formatter_class=SuppressMetavar, add_help=False, docstring=__doc__)

    # (flags, keyword arguments) for each optional argument, in help order.
    optional_specs = (
        (
            ('-d', '--directories'),
            {
                'help': 'Space-separated list of tif directory names to organize. Default: "red green"',
                'default': ['red', 'green'],
                'nargs': '*',
                'action': SM,
            },
        ),
        (
            ('-i', '--input'),
            {
                'help': 'path/SpecimenMetadata.csv. Default: unravel/allen_institute/genetic_tools_atlas/SpecimenMetadata_subset.csv',
                'default': None,
                'action': SM,
            },
        ),
        (
            ('-o', '--output_dir'),
            {
                'help': 'Output directory for organized samples. Default: TIFFs',
                'default': 'TIFFs',
                'action': SM,
            },
        ),
        (
            ('-p', '--prefix'),
            {
                'help': 'Prefix for sample directories (useful for batch processing). Default: "ID_"',
                'default': 'ID_',
                'action': SM,
            },
        ),
    )

    optional_group = cli.add_argument_group('Optional arguments')
    for flags, settings in optional_specs:
        optional_group.add_argument(*flags, **settings)

    general_group = cli.add_argument_group('General arguments')
    general_group.add_argument(
        '-v', '--verbose',
        help='Increase verbosity. Default: False',
        action='store_true',
        default=False,
    )

    return cli.parse_args()
def _normalize_series_id(value):
    """Return `value` in the form used for directory names, or None if it is missing.

    pandas stores an integer column as float once it contains a missing value,
    so an ID such as 101 arrives as 101.0; it is converted back to int so that
    ``str()`` yields '101' rather than '101.0'.
    """
    if pd.isna(value):
        return None
    if isinstance(value, float) and value.is_integer():
        return int(value)
    return value


def _has_visible_entries(directory):
    """Return True if `directory` contains an entry whose name does not start with '.'."""
    return any(not entry.name.startswith('.') for entry in directory.iterdir())


def org_samples(df, target_dir, prefix, tif_dirs):
    """
    Organize samples into directories based on the SpecimenMetadata DataFrame.

    For every sample ID, the ``.tif`` files in ``<tif_dir>/<ID>`` (relative to the
    current working directory) are moved to ``<target_dir>/<prefix><ID>/<tif_dir>``.
    Source directories that end up without visible entries are removed.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame containing 'Image Series ID' with the sample IDs to organize.
        Rows with a missing ID are skipped.

    target_dir : Path
        Directory where the sample directories will be created.

    prefix : str
        Prefix for sample directories.

    tif_dirs : list of str
        List of TIFF directory names to organize (e.g., ['red', 'green']).
    """
    # Iterate the column itself: iterrows() upcasts integer IDs to float
    # whenever the frame also holds float columns.
    for raw_id in df['Image Series ID']:
        series_id = _normalize_series_id(raw_id)
        if series_id is None:
            continue

        sample_dir = target_dir / f'{prefix}{series_id}'

        for channel_name in tif_dirs:
            source_dir = Path(channel_name) / str(series_id)
            if not source_dir.is_dir():
                continue

            dest = sample_dir / str(channel_name)
            dest.mkdir(parents=True, exist_ok=True)

            # Move all TIFFs from source_dir to dest (snapshot the listing first,
            # since the directory is modified while it is being processed)
            for item in list(source_dir.iterdir()):
                if item.suffix == '.tif':
                    shutil.move(str(item), str(dest))

            # Remove the source directory if nothing visible is left in it
            try:
                if not _has_visible_entries(source_dir):
                    source_dir.rmdir()
            except OSError:
                print(f'Warning: {source_dir} not empty or failed to remove.')

    # Remove parent directories ('red', 'green') if they are now empty
    for channel_name in tif_dirs:
        parent_dir = Path(channel_name)
        if not parent_dir.exists():
            continue  # It's already deleted — no warning needed
        try:
            if not _has_visible_entries(parent_dir):
                parent_dir.rmdir()
        except OSError as e:
            print(f'Warning: {parent_dir} could not be removed ({e})')
# Channel names that have an output directory, in processing order.
_CHANNELS = ('green', 'red', 'dual')


def _channel_from_transgenes(genotype):
    """Return the channel implied by the transgenic lines named in `genotype`."""
    genotype = str(genotype)
    if any(tag in genotype for tag in GREEN_TG):
        return 'green'
    if any(tag in genotype for tag in RED_TG):
        return 'red'
    return 'dual'


def _assign_channels(df):
    """Return a copy of `df` with a 'Channel' column of 'green', 'red', 'dual' or 'NA'.

    'NA' marks rows whose channel depends on 'Donor Genotype' while that value is missing.
    """
    df = df.copy()
    genotype = df['Donor Genotype']
    has_genotype = genotype.notna()

    # Set 'Channel' using 'Cargo' as the key
    df['Channel'] = df['Cargo'].apply(lambda cargo: CARGO_MAP.get(cargo, 'NA'))

    # Cre-only cargo: the reporter in the genotype decides the channel.
    # Only rows that actually have a genotype are evaluated, so a missing
    # value elsewhere in the table cannot raise a TypeError.
    conditional = df['Cargo'].isin(['iCre(R297T)', 'iCre']) & has_genotype
    df.loc[conditional, 'Channel'] = genotype[conditional].map(
        lambda g: 'green' if 'Ai193' in str(g) else 'red'
    )

    # Anything still without a real channel name (missing or unmapped cargo,
    # or the CARGO_MAP placeholder text) falls back to the Tg logic.
    needs_tg = ~df['Channel'].isin(_CHANNELS) & has_genotype
    df.loc[needs_tg, 'Channel'] = genotype[needs_tg].map(_channel_from_transgenes)

    # Whatever is left could not be resolved (no genotype to inspect).
    df.loc[~df['Channel'].isin(_CHANNELS), 'Channel'] = 'NA'
    return df


@log_command
def main():
    """Sort converted GTA TIFF directories into per-channel, per-sample directories.

    Reads the specimen metadata CSV, assigns each 'Image Series ID' to the
    'green', 'red' or 'dual' channel from its 'Cargo' and 'Donor Genotype',
    then moves the sample's TIFF directories under ``<output_dir>/<channel>/``.
    Rows whose channel cannot be determined are reported and skipped.
    """
    install()
    args = parse_args()
    Configuration.verbose = args.verbose
    verbose_start_msg()

    if args.input is None:
        input_path = Path(__file__).parent / 'SpecimenMetadata_subset.csv'
    else:
        input_path = Path(args.input)
    print(f"\n[bold green]SpecimenMetadata.csv file:\n [/bold green]{input_path}\n")

    if not input_path.is_file():
        print(f"[bold red]SpecimenMetadata file not found: [/bold red]{input_path}")
        return

    df = pd.read_csv(input_path, usecols=COLUMNS)

    # Drop rows with duplicate values in 'Image Series ID'
    df = df.drop_duplicates(subset='Image Series ID')

    # Add a 'Channel' column based on 'Cargo' and 'Donor Genotype'
    df = _assign_channels(df)

    # Print rows with 'Channel' as 'NA' (if any)
    na_channel_rows = df[df['Channel'] == 'NA']
    if not na_channel_rows.empty:
        print(f"[bold yellow]Warning: Found {len(na_channel_rows)} rows with 'Channel' as 'NA'. These will be skipped.[/bold yellow]")
        print(na_channel_rows)

    # Create output directories
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    channel_dirs = {channel: output_dir / channel for channel in _CHANNELS}
    for channel_dir in channel_dirs.values():
        channel_dir.mkdir(parents=True, exist_ok=True)

    # Organize the samples of each channel (green, then red, then dual)
    for channel, channel_dir in channel_dirs.items():
        channel_df = df[df['Channel'] == channel][['Image Series ID']].sort_values(by='Image Series ID')
        org_samples(channel_df, channel_dir, args.prefix, args.directories)

    # Delete empty directories in the output directory
    for channel_dir in channel_dirs.values():
        try:
            if not any(p for p in channel_dir.iterdir() if not p.name.startswith('.')):
                channel_dir.rmdir()
        except OSError:
            print(f'Warning: {channel_dir} not empty or failed to remove.')

    # Print output directory structure
    print(f"\n[bold green]Organized samples saved to: {output_dir}[/bold green]")

    verbose_end_msg()


if __name__ == "__main__":
    main()