#!/usr/bin/python
# -------------------------------------------
# Load AD430 / MR and write each as mtx files
# - write raw chuks, merge after
# Updated 03/10/2023
# -------------------------------------------
import re
import gc
import pandas as pd
import numpy as np
import scanpy as sc
from tqdm import tqdm
import gzip
import time


# Set directories (TODO: Separate func mimicking R/config for this)
# -----------------------------------------------------------------
maindir = '/home/cboix/data/'
topdir = maindir + 'perturbseq/'
dbdir = topdir + 'db/'
sdbdir = topdir + 'db/snPFC/'
anndir = dbdir + 'Annotation/'


# Load in the preprocessed dataset (w/ UMAP, etc)
# -----------------------------------------------
# Consensus dataset; simplest to present:
prefix = 'consensus_annot_snPFC.joint_annotation'
h5ad_file = sdbdir + prefix + '.h5ad'
adata = sc.read(h5ad_file)

# Get the matrix margin:
marg = np.sum(adata.X, axis=1)
marg = np.array(marg).T[0]  # NOTE: Should be equivalent to n_counts


# Read in gene signatures:
# ------------------------
sigfile = maindir + 'DEVTRAJ/db/Annotation/' + 'fusion_signature_lists.tsv'
sgdf = pd.read_csv(sigfile, sep="\t")
# Filter out genes not in adata
sgdf = sgdf[np.isin(sgdf.gene, adata.var_names.tolist())]

# Get the signature gene sets:
sigdict = sgdf.groupby('kwd')['gene'].apply(list).to_dict()

# Score the cells by each gene set:
scoredf = pd.DataFrame({'obsnames': adata.obs_names, 'marg': marg})
for kwd in tqdm(sigdict.keys()):
    genes = sigdict[kwd]
    ind = np.isin(adata.var_names, genes)
    score = np.sum(adata[:, ind].X, axis=1)
    score = np.array(score).T[0]
    scoredf[[kwd]] = score


# Write out these signatures:
fusdir = maindir + 'DEVTRAJ/db/fusions_AD430/'
scorefile = fusdir + 'fusions_AD430_signature_scores.tsv.gz'
scoredf.to_csv(scorefile, sep="\t")