Source code for unravel.cluster_stats.legend

#!/usr/bin/env python3

"""
Use ``cstats_legend`` from UNRAVEL to summarize regional abbreviations from _valid_clusters_table.xlsx files.

Inputs:
    - *_valid_clusters_table.xlsx files in the working directory output from ``cstats_table``

Outputs:
    - legend.xlsx
    - fig_legend.txt (text for the figure legend)

Note: 
    - CCFv3-2020_info.csv is in UNRAVEL/unravel/core/csvs/
    - It has columns: structure_id_path,very_general_region,collapsed_region_name,abbreviation,collapsed_region,other_abbreviation,other_abbreviation_defined,layer,sunburst
    - Alternatively, use CCFv3-2017_info.csv or provide a custom CSV with the same columns.

Usage:
------
    cstats_legend [-p path/dir/with/xlsx_files] [-csv CCFv3-2020_info.csv] [-v]
"""

import sys
from glob import glob
from pathlib import Path

import openpyxl
import pandas as pd
from openpyxl.styles import Alignment, Border, Font, PatternFill, Side
from openpyxl.utils.dataframe import dataframe_to_rows
from rich import print
from rich.traceback import install

from unravel.core.help_formatter import RichArgumentParser, SuppressMetavar, SM

from unravel.core.config import Configuration
from unravel.core.utils import log_command, verbose_start_msg, verbose_end_msg


def parse_args():
    parser = RichArgumentParser(formatter_class=SuppressMetavar, add_help=False, docstring=__doc__)

    opts = parser.add_argument_group('Optional args')
    opts.add_argument('-p', '--path', help='Path to the directory containing the *_valid_clusters_table.xlsx files. Default: current working directory', action=SM)
    opts.add_argument('-csv', '--csv_path', help='CSV name or path/name.csv. Default: CCFv3-2020_info.csv', default='CCFv3-2020_info.csv', action=SM)

    general = parser.add_argument_group('General arguments')
    general.add_argument('-v', '--verbose', help='Increase verbosity. Default: False', action='store_true', default=False)

    return parser.parse_args()
def extract_unique_regions_from_file(file_path):
    # Load the Excel file into a DataFrame, specifying that the header is in the second row (index 1)
    df = pd.read_excel(file_path, header=1, engine='openpyxl')

    unique_regions = set()

    # Iterate through columns and find those that start with "Top_Region"
    for col in df.columns:
        if col.startswith("Top_Region"):
            # Extract the regions from the column, remove percent volumes, and add to the set
            unique_regions.update(df[col].dropna().apply(lambda x: ' '.join(x.split()[:-1])))

    return unique_regions
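# Illustrative note (the cell values below are hypothetical, not from the source data):
# entries in the "Top_Region" columns are expected to end with a percent-volume token,
# which the lambda above strips by dropping the last whitespace-separated field, e.g.:
#
#   "ACB 55.2%"  -> "ACB"
#   "MOp5 12.1%" -> "MOp5"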
def apply_rgb_to_cell(ws, df_w_rgb, col_w_labels, col_num):
    """Apply RGB fill colors to cells in the worksheet.

    Parameters:
        ws (openpyxl worksheet)
        df_w_rgb (DataFrame with 'R', 'G', and 'B' columns)
        col_w_labels (name of the column in df_w_rgb with region labels)
        col_num (column number to apply the RGB values to, starting from 0)
    """
    for row in ws.iter_rows(min_row=3, min_col=2, max_col=4, max_row=ws.max_row - 1):
        region_cell = row[col_num]
        r = df_w_rgb.loc[df_w_rgb[col_w_labels] == region_cell.value, 'R'].values[0]
        g = df_w_rgb.loc[df_w_rgb[col_w_labels] == region_cell.value, 'G'].values[0]
        b = df_w_rgb.loc[df_w_rgb[col_w_labels] == region_cell.value, 'B'].values[0]
        hex_color = "{:02x}{:02x}{:02x}".format(r, g, b)
        fill = PatternFill(start_color=hex_color, end_color=hex_color, fill_type='solid')
        region_cell.fill = fill
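# Usage note: apply_rgb_to_cell() expects df_w_rgb to already contain numeric 'R', 'G', and 'B'
# columns. In main() these are parsed out of the 'sunburst' column of the CCFv3 info CSV, which
# stores colors as strings of the form "rgb(R,G,B)" (e.g., "rgb(255,0,0)" as an illustrative
# value), via a regular expression.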
@log_command
def main():
    install()
    args = parse_args()
    Configuration.verbose = args.verbose
    verbose_start_msg()

    path = Path(args.path) if args.path else Path.cwd()

    # Find the *_valid_clusters_table.xlsx files in the specified directory
    xlsx_files = path.glob('*_valid_clusters_table.xlsx')

    # Filter out temporary files starting with '~$'
    xlsx_files = [f for f in xlsx_files if not f.name.startswith('~$')]

    # Filter out files that start with legend
    xlsx_files = [f for f in xlsx_files if not f.name.startswith('legend')]

    if xlsx_files == []:
        print("\n    [red1]No *_valid_clusters_table.xlsx files found in the specified directory. Exiting...\n")
        sys.exit()
    else:
        print('\nProcessing:')
        for file in xlsx_files:
            print(f'    {file}')

    # Initialize a set to store unique regions from all files (headers are expected in the second row)
    all_unique_regions = set()  # Using a set to avoid duplicates

    # Iterate through each file and extract its unique regions
    for file_path in xlsx_files:
        unique_regions = extract_unique_regions_from_file(file_path)
        all_unique_regions.update(unique_regions)

    # Convert the set to a sorted list for easier reading
    all_unique_regions = sorted(list(all_unique_regions))

    if len(all_unique_regions) == 0:
        print("\n    [red1]No regions found in the xlsx files. Headers expected in the second row. Exiting...\n")
        sys.exit()

    print(f'\nRegions: {all_unique_regions}\n')

    # Specify the column names to load
    columns_to_load = ['structure_id_path', 'very_general_region', 'collapsed_region_name', 'abbreviation',
                       'collapsed_region', 'other_abbreviation', 'other_abbreviation_defined', 'layer', 'sunburst']

    # Load the specified columns from the CSV with CCFv3 info
    if args.csv_path == 'CCFv3-2017_info.csv' or args.csv_path == 'CCFv3-2020_info.csv':
        ccfv3_info_df = pd.read_csv(Path(__file__).parent.parent / 'core' / 'csvs' / args.csv_path, usecols=columns_to_load)
    else:
        ccfv3_info_df = pd.read_csv(args.csv_path, usecols=columns_to_load)

    # Create a dictionary mapping each region abbreviation to its collapsed region abbreviation
    abbreviation_to_collapsed_dict = dict(zip(ccfv3_info_df['abbreviation'], ccfv3_info_df['collapsed_region']))

    # Collapse the regions in all_unique_regions
    unique_regions_collapsed = {abbreviation_to_collapsed_dict.get(region, region) for region in all_unique_regions}
    unique_regions_collapsed = sorted(list(unique_regions_collapsed))
    print(f'{unique_regions_collapsed=}\n')

    # If a region in all_unique_regions has a digit in it, check if the 'layer' column is defined for it.
    # Then, add unique layers to a set
    layers_set = set()
    for region in all_unique_regions:
        if any(char.isdigit() for char in region):
            layer = ccfv3_info_df.loc[ccfv3_info_df['abbreviation'] == region, 'layer'].values
            if len(layer) > 0:
                layers_set.add(str(layer[0]))  # Convert float to string

    # Sort the layers and drop NaN entries
    layers_set = sorted(list(layers_set))
    layers_set = [layer for layer in layers_set if str(layer) != 'nan']

    # Get all regions with digits that are not defined as layers
    other_regions_w_digits = [
        region for region in all_unique_regions
        if any(char.isdigit() for char in region)
        and not ccfv3_info_df[ccfv3_info_df['abbreviation'] == region]['layer'].notna().any()
    ]

    # Print the cortical layers and any regions with digits that are not defined as layers
    if len(other_regions_w_digits) > 0:
        print(f"Numbers ({layers_set}) = cortical layers (with these exceptions {other_regions_w_digits})\n")
    else:
        print(f"Numbers ({layers_set}) = cortical layers\n")

    # For regions in all_unique_regions, determine abbreviations to offload from the table
    # (i.e., abbreviations mentioned in 'other_abbreviation' and defined in 'other_abbreviation_defined')
    list_of_regions_w_other_abbreviation_in_all_unique_regions = [
        region for region in all_unique_regions
        if ccfv3_info_df.loc[ccfv3_info_df['abbreviation'] == region, 'other_abbreviation'].notna().any()
    ]

    # Initialize an empty dictionary to hold the mapping of other_abbreviations to their definitions
    other_abbreviation_to_definitions = {}

    for region in list_of_regions_w_other_abbreviation_in_all_unique_regions:
        # Extract 'other_abbreviation' and 'other_abbreviation_defined' for the current region
        rows = ccfv3_info_df[ccfv3_info_df['abbreviation'] == region]
        for _, row in rows.iterrows():
            other_abbreviation = row['other_abbreviation']
            other_abbreviation_defined = row['other_abbreviation_defined']
            if pd.notna(other_abbreviation) and pd.notna(other_abbreviation_defined):
                # Initialize the set for this abbreviation if it doesn't exist
                if other_abbreviation not in other_abbreviation_to_definitions:
                    other_abbreviation_to_definitions[other_abbreviation] = set()
                # Add the current definition to the set of definitions for this abbreviation
                other_abbreviation_to_definitions[other_abbreviation].add(other_abbreviation_defined)

    # Convert sets to strings with " or " as the separator
    for abbreviation, definitions_set in other_abbreviation_to_definitions.items():
        other_abbreviation_to_definitions[abbreviation] = " or ".join(definitions_set)

    # Sort the dictionary by key
    other_abbreviation_to_definitions = dict(sorted(other_abbreviation_to_definitions.items()))
    print(f'Additional abbreviations not shown in the region abbreviation legend (SI table): {other_abbreviation_to_definitions}')

    # Use the 'very_general_region' column from the CCFv3 info CSV to get the 'very_general_region' for each region in unique_regions_collapsed
    very_general_region_dict = dict(zip(ccfv3_info_df['collapsed_region'], ccfv3_info_df['very_general_region']))
    very_general_regions = [very_general_region_dict.get(region, '') for region in unique_regions_collapsed]

    # If the same string is in both 'very_general_regions' and 'unique_regions_collapsed', remove it from both at the same index
    for i, region in enumerate(unique_regions_collapsed):
        if very_general_regions[i] == region:
            very_general_regions[i] = ''
            unique_regions_collapsed[i] = ''

    # Drop the emptied entries from both lists
    very_general_regions = [region for region in very_general_regions if region != '']
    unique_regions_collapsed = [region for region in unique_regions_collapsed if region != '']

    # Make a dataframe with the 'very_general_regions' and 'unique_regions_collapsed' lists
    legend_df = pd.DataFrame({'Region': very_general_regions, 'Abbrev.': unique_regions_collapsed})

    # Add the 'Subregion' column to the dataframe
    legend_df['Subregion'] = [ccfv3_info_df.loc[ccfv3_info_df['collapsed_region'] == region, 'collapsed_region_name'].values[0] for region in unique_regions_collapsed]

    # Add the 'structure_id_path' column to the dataframe
    legend_df['structure_id_path'] = [ccfv3_info_df.loc[ccfv3_info_df['collapsed_region'] == region, 'structure_id_path'].values[0] for region in unique_regions_collapsed]

    # Sort the dataframe by the 'structure_id_path' column in descending order
    legend_df.sort_values(by='structure_id_path', ascending=False, inplace=True)

    # Reset the index of the dataframe
    legend_df.reset_index(drop=True, inplace=True)

    # Drop the 'structure_id_path' column
    legend_df.drop(columns='structure_id_path', inplace=True)

    # Create an Excel workbook and select the active worksheet
    wb = openpyxl.Workbook()
    ws = wb.active
    thin_border = Border(left=Side(style='thin'), right=Side(style='thin'), top=Side(style='thin'), bottom=Side(style='thin'))

    # Write the dataframe to the worksheet
    for region in dataframe_to_rows(legend_df, index=False, header=True):
        ws.append(region)

    # Apply a thin border style and font to cells with content
    for row in ws.iter_rows():
        for cell in row:
            if cell.value:
                cell.border = thin_border
                cell.font = Font(name='Arial', size=11)

    # Apply a bold font to the header row
    header_font = Font(name='Arial', bold=True)
    for cell in ws['1:1']:
        cell.font = header_font

    # Make the first column bold
    for cell in ws['A']:
        cell.font = Font(name='Arial', bold=True)

    # Insert a padding row at the top and a padding column at the left
    ws.insert_rows(1)
    ws.insert_cols(1)

    # Fill cells in the first column with white
    for cell in ws['A']:
        cell.fill = PatternFill(start_color='FFFFFF', end_color='FFFFFF', fill_type='solid')

    # Fill cells in the fifth column with white
    for cell in ws['E']:
        cell.fill = PatternFill(start_color='FFFFFF', end_color='FFFFFF', fill_type='solid')

    # Fill cells in the first row with white
    for cell in ws[1]:
        cell.fill = PatternFill(start_color='FFFFFF', end_color='FFFFFF', fill_type='solid')

    # Fill cells in the last (padding) row with white
    num_rows = legend_df.shape[0] + 3
    for cell in ws[num_rows]:
        cell.fill = PatternFill(start_color='FFFFFF', end_color='FFFFFF', fill_type='solid')

    # Adjust the column width to fit the content
    for col in ws.columns:
        max_length = 0
        for cell in col:
            if cell.value:
                max_length = max(max_length, len(str(cell.value)))
        if max_length > 0:
            adjusted_width = max_length + 2
            column_letter = openpyxl.utils.get_column_letter(col[0].column)
            ws.column_dimensions[column_letter].width = adjusted_width

    # Add columns for R, G, and B values to ccfv3_info_df
    ccfv3_info_df[['R', 'G', 'B']] = ccfv3_info_df['sunburst'].str.extract(r'rgb\((\d+),(\d+),(\d+)\)')
    ccfv3_info_df[['R', 'G', 'B']] = ccfv3_info_df[['R', 'G', 'B']].apply(pd.to_numeric)

    # Apply RGB values to cells
    apply_rgb_to_cell(ws, ccfv3_info_df, 'very_general_region', 0)
    apply_rgb_to_cell(ws, ccfv3_info_df, 'collapsed_region', 1)
    apply_rgb_to_cell(ws, ccfv3_info_df, 'collapsed_region_name', 2)

    # Iterate through the cells and merge cells with the same value in column B
    current_region = None
    first_row = None
    # min_row=2 and min_col/max_col=merge_column account for the added padding row and column
    merge_column = 2
    for row in ws.iter_rows(min_row=2, max_row=ws.max_row - 1, min_col=merge_column, max_col=merge_column):
        cell = row[0]  # row[0] since we're only looking at one column, and iter_rows yields tuples
        if cell.value != current_region:
            # If the cell value changes, merge the previous cells if more than one share the same value
            if first_row and first_row < cell.row - 1:
                ws.merge_cells(start_row=first_row, start_column=merge_column, end_row=cell.row - 1, end_column=merge_column)
                # After merging, set the alignment for the merged cells
                merged_cell = ws.cell(row=first_row, column=merge_column)
                merged_cell.alignment = Alignment(vertical='center')
            # Update the current region and reset first_row to the current cell's row
            current_region = cell.value
            first_row = cell.row

    # After the loop, check and merge the last set of cells if needed
    if first_row and first_row < ws.max_row:
        ws.merge_cells(start_row=first_row, start_column=merge_column, end_row=ws.max_row - 1, end_column=merge_column)
        # Align the last merged cell as well
        merged_cell = ws.cell(row=first_row, column=merge_column)
        merged_cell.alignment = Alignment(vertical='center')

    # Center align the content
    for row in ws.iter_rows(min_row=1, min_col=1):
        for cell in row:
            cell.alignment = Alignment(horizontal='center', vertical='center')

    # Ensure that fonts are black
    for row in ws.iter_rows(min_col=2):
        for cell in row:
            if cell.font:  # If the cell already has font settings applied
                cell.font = Font(name='Arial', size=cell.font.size, bold=cell.font.bold, color='FF000000')
            else:
                cell.font = Font(name='Arial', color='FF000000')

    # Save the workbook to a file
    excel_file_path = path / 'legend.xlsx'
    wb.save(excel_file_path)

    # Save text for the figure legend
    fig_legend_txt = path / "fig_legend.txt"
    with open(fig_legend_txt, "w") as file:
        file.write(f'\n{all_unique_regions=}\n')
        file.write(f'\n{unique_regions_collapsed=}\n')
        if len(other_regions_w_digits) > 0:
            file.write(f"\nNumbers ({layers_set}) = cortical layers (with these exceptions {other_regions_w_digits})\n")
        else:
            file.write(f"\nNumbers ({layers_set}) = cortical layers\n")
        file.write(f'\nAdditional abbreviations not shown in the region abbreviation legend (SI table): {other_abbreviation_to_definitions}\n')

    verbose_end_msg()
if __name__ == '__main__':
    main()