#!/usr/bin/python # ------------------------------------------- # Load AD430 / MR and write each as mtx files # - write raw chuks, merge after # Updated 03/10/2023 # ------------------------------------------- import re import gc import pandas as pd import numpy as np import scanpy as sc from tqdm import tqdm import gzip import time # Set directories (TODO: Separate func mimicking R/config for this) # ----------------------------------------------------------------- maindir = '/home/cboix/data/' topdir = maindir + 'perturbseq/' dbdir = topdir + 'db/' sdbdir = topdir + 'db/snPFC/' anndir = dbdir + 'Annotation/' # Load in the preprocessed dataset (w/ UMAP, etc) # ----------------------------------------------- # Consensus dataset; simplest to present: prefix = 'consensus_annot_snPFC.joint_annotation' h5ad_file = sdbdir + prefix + '.h5ad' adata = sc.read(h5ad_file) # Get the matrix margin: marg = np.sum(adata.X, axis=1) marg = np.array(marg).T[0] # NOTE: Should be equivalent to n_counts # Read in gene signatures: # ------------------------ sigfile = maindir + 'DEVTRAJ/db/Annotation/' + 'fusion_signature_lists.tsv' sgdf = pd.read_csv(sigfile, sep="\t") # Filter out genes not in adata sgdf = sgdf[np.isin(sgdf.gene, adata.var_names.tolist())] # Get the signature gene sets: sigdict = sgdf.groupby('kwd')['gene'].apply(list).to_dict() # Score the cells by each gene set: scoredf = pd.DataFrame({'obsnames': adata.obs_names, 'marg': marg}) for kwd in tqdm(sigdict.keys()): genes = sigdict[kwd] ind = np.isin(adata.var_names, genes) score = np.sum(adata[:, ind].X, axis=1) score = np.array(score).T[0] scoredf[[kwd]] = score # Write out these signatures: fusdir = maindir + 'DEVTRAJ/db/fusions_AD430/' scorefile = fusdir + 'fusions_AD430_signature_scores.tsv.gz' scoredf.to_csv(scorefile, sep="\t")