Source code for unravel.allen_institute.genetic_tools_atlas.simplify_metadata
#!/usr/bin/env python3
"""
Use ``gta_simplify_metadata`` (``gta_sm``) from UNRAVEL to simplify metadata from the Genetic Tools Atlas (GTA).
Prereqs:
- Visit the GTA: https://portal.brain-map.org/genetic-tools/genetic-tools-atlas
- Click "Access Genetic Tools Atlas"
- Filter by 'Data Modality' = 'STPT'
- 'Download Data' → 'Metadata Table'
- Unzip the downloaded file, find SpecimenMetadata.csv, and use it as the input.
Note:
- Default columns to keep: 'Image Series ID' 'Donor Genotype' 'Vector Full Name' 'Targeted Cell Population' 'Cargo' 'Vector Delivery Method'
- Rows with a duplicate 'Image Series ID' are dropped (the first occurrence is kept).
- The output file is saved as 'SpecimenMetadata_subset.csv' in the current directory.
- If you want to keep other columns, use the -col option with a space-separated list of column names.
- If you want to change the output file name, use the -o option.
Next steps:
- Run ``gta_org_samples`` (``gta_os``) to organize the GTA data across samples for batch processing.
Usage:
------
gta_sm -i "path/to/SpecimenMetadata.csv" [-col col1 col2 ...] [-o output] [-v]
"""
from pathlib import Path
import pandas as pd
from rich import print
from rich.traceback import install
from unravel.core.help_formatter import RichArgumentParser, SuppressMetavar, SM
from unravel.core.config import Configuration
from unravel.core.utils import log_command, verbose_start_msg, verbose_end_msg
# Columns kept by default; used as the default value of the -col/--columns option in parse_args().
COLUMNS = ['Image Series ID', 'Donor Genotype', 'Vector Full Name', 'Targeted Cell Population', 'Cargo', 'Vector Delivery Method']
[docs]
def parse_args():
    """Build the command-line parser for this script and parse the arguments.

    Returns:
        The namespace produced by ``parse_args()``, carrying the ``input``,
        ``columns``, ``output`` and ``verbose`` attributes.
    """
    cli = RichArgumentParser(formatter_class=SuppressMetavar, add_help=False, docstring=__doc__)

    # Every option in this group has a default, so the script can run with no flags at all.
    optional_group = cli.add_argument_group('Optional arguments')
    optional_group.add_argument(
        '-i', '--input',
        default='SpecimenMetadata.csv',
        action=SM,
        help='Path to the SpecimenMetadata.csv. Default: SpecimenMetadata.csv in the current directory',
    )
    optional_group.add_argument(
        '-col', '--columns',
        nargs='*',
        default=COLUMNS,
        action=SM,
        help='CSV columns to keep. See notes for default columns.',
    )
    optional_group.add_argument(
        '-o', '--output',
        default='SpecimenMetadata_subset.csv',
        action=SM,
        help='Output CSV file path. Default: SpecimenMetadata_subset.csv',
    )

    general_group = cli.add_argument_group('General arguments')
    general_group.add_argument(
        '-v', '--verbose',
        default=False,
        action='store_true',
        help='Increase verbosity. Default: False',
    )

    return cli.parse_args()
[docs]
@log_command
def main():
    """Subset a GTA SpecimenMetadata.csv and save the simplified table.

    Steps:
        1. Read the CSV given by ``-i``.
        2. Warn about requested columns that are not in the CSV and ignore them.
        3. Drop rows whose 'Image Series ID' repeats an earlier row.
        4. Keep only the requested columns (all columns if the list is empty).
        5. Fill blank cells with 'NA' and write the result to the ``-o`` path,
           creating parent directories as needed.
    """
    install()
    args = parse_args()
    Configuration.verbose = args.verbose

    verbose_start_msg()

    df = pd.read_csv(args.input)

    # Requested columns that the CSV does not have are reported and then ignored.
    missing = [c for c in args.columns if c not in df.columns]
    if missing:
        print(f"[yellow]Warning: Missing columns: {missing}[/yellow]")
        args.columns = [c for c in args.columns if c in df.columns]

    # Drop rows with duplicate values in 'Image Series ID'.
    # This happens BEFORE the column subset so that it still works when the ID
    # column is not among the columns being kept (doing it afterwards raised a
    # KeyError in that case). When the ID column is kept, the result is the same.
    id_column = 'Image Series ID'
    if id_column in df.columns:
        df = df.drop_duplicates(subset=id_column)
    else:
        print(f"[yellow]Warning: '{id_column}' column not found; duplicate rows were not dropped[/yellow]")

    # Keep specified columns (an empty list leaves every column in place)
    if args.columns:
        df = df[args.columns]

    # Fill blank cells with 'NA'
    df = df.fillna('NA')

    # Save the edited DataFrame
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)

    verbose_end_msg()
# Script entry point: run the CLI only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()