Comparative Analysis of Cell Atlas Concordance and Perturbation Responses
In [ ]:
!date
Mon Aug 17 20:20:34 UTC 2020
Download Data¶
In [ ]:
import requests
from tqdm import tnrange, tqdm_notebook
def download_file(doi,ext):
url = 'https://api.datacite.org/dois/'+doi+'/media'
r = requests.get(url).json()
netcdf_url = r['data'][0]['attributes']['url']
r = requests.get(netcdf_url,stream=True)
#Set file name
fname = doi.split('/')[-1]+ext
#Download file with progress bar
if r.status_code == 403:
print("File Unavailable")
if 'content-length' not in r.headers:
print("Did not get file")
else:
with open(fname, 'wb') as f:
total_length = int(r.headers.get('content-length'))
pbar = tnrange(int(total_length/1024), unit="B")
for chunk in r.iter_content(chunk_size=1024):
if chunk:
pbar.update()
f.write(chunk)
return fname
In [ ]:
#Kallisto bus clustered starvation data, h5ad
download_file('10.22002/D1.1796','.gz')
#CellRanger Starvation h5ad data
download_file('10.22002/D1.1798','.gz')
#Import previously saved, clustered, & filtered stimulation data
download_file('10.22002/D1.1821','.gz')
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:18: TqdmDeprecationWarning: Please use `tqdm.notebook.trange` instead of `tqdm.tnrange`
Out[ ]:
'D1.1821.gz'
In [ ]:
#Starvation h5ad data, all nonzero genes included, filtered for 'real cells' from de-multiplexing
download_file('10.22002/D1.1797','.gz')
#Import raw, unclustered stimulation data
download_file('10.22002/D1.1814','.gz')
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:18: TqdmDeprecationWarning: Please use `tqdm.notebook.trange` instead of `tqdm.tnrange`
Out[ ]:
'D1.1814.gz'
In [ ]:
#Import merged data with knn clusters
download_file('10.22002/D1.1823','.gz')
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:18: TqdmDeprecationWarning: Please use `tqdm.notebook.trange` instead of `tqdm.tnrange`
Out[ ]:
'D1.1823.gz'
In [ ]:
!gunzip *.gz
In [ ]:
!pip install --quiet anndata
!pip install --quiet scanpy
!pip install --quiet louvain
|████████████████████████████████| 122kB 4.3MB/s |████████████████████████████████| 10.2MB 4.0MB/s |████████████████████████████████| 71kB 7.6MB/s |████████████████████████████████| 51kB 6.1MB/s |████████████████████████████████| 71kB 7.9MB/s Building wheel for sinfo (setup.py) ... done Building wheel for umap-learn (setup.py) ... done |████████████████████████████████| 2.2MB 4.3MB/s |████████████████████████████████| 3.2MB 44.6MB/s
Import packages¶
In [ ]:
import pandas as pd
import anndata
import scanpy as sc
import numpy as np
import scipy.sparse
import warnings
warnings.filterwarnings('ignore')
from sklearn.neighbors import (KNeighborsClassifier,NeighborhoodComponentsAnalysis)
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
sc.set_figure_params(dpi=125)
import seaborn as sns
sns.set(style="whitegrid")
Do Distance Based Analysis for Inter- and Intra- Cluster Distances¶
Read in previously saved data
In [ ]:
# Description: bus_fs_combo data (filtered + clustered starvation (fed/starved --> fs) data)
# Description: bus_combo_noZero (filtered + clustered stimulation data) from Kallisto bus
#bus_fs_combo
bus_fs_combo = anndata.read("D1.1796")
print(bus_fs_combo)
bus_combo_noZero = anndata.read('D1.1821')
print(bus_combo_noZero)
#Previously saved overlap, merged data (from both experiments)
overlap_combo = anndata.read("D1.1823")
overlap_combo
AnnData object with n_obs × n_vars = 13673 × 8696 obs: 'batch', 'n_counts', 'n_countslog', 'louvain', 'leiden', 'orgID', 'fed', 'starved', 'fed_neighbor_score', 'cellRanger_louvain', 'annos', 'new_cellRanger_louvain', 'annosSub' var: 'n_counts', 'mean', 'std' uns: 'annosSub_colors', 'annos_colors', 'cellRanger_louvain_colors', 'cellRanger_louvain_sizes', "dendrogram_['new_cellRanger_louvain']", 'dendrogram_new_cellRanger_louvain', 'fed_colors', 'fed_neighbor_score_colors', 'leiden', 'leiden_colors', 'louvain', 'louvain_colors', 'neighbors', 'new_cellRanger_louvain_colors', 'orgID_colors', 'paga', 'pca', 'rank_genes_groups', 'umap' obsm: 'X_nca', 'X_pca', 'X_tsne', 'X_umap' varm: 'PCs' obsp: 'connectivities', 'distances' AnnData object with n_obs × n_vars = 18921 × 10260 obs: 'batch', 'n_counts', 'n_countslog', 'louvain', 'condition', 'orgID', 'cellRanger_louvain' var: 'n_counts', 'mean', 'std' uns: 'cellRanger_louvain_colors', 'cellRanger_louvain_sizes', 'condition_colors', 'louvain', 'neighbors', 'paga', 'pca', 'umap' obsm: 'X_pca', 'X_tsne', 'X_umap' varm: 'PCs' obsp: 'connectivities', 'distances'
Out[ ]:
AnnData object with n_obs × n_vars = 32594 × 6756 obs: 'batch', 'n_counts', 'n_countslog', 'louvain', 'leiden', 'orgID', 'fed', 'starved', 'fed_neighbor_score', 'cellRanger_louvain', 'annos', 'new_cellRanger_louvain', 'annosSub', 'condition', 'cell_origin', 'knn_clus' var: 'n_counts-0', 'mean-0', 'std-0', 'n_counts-1', 'mean-1', 'std-1', 'mean', 'std' uns: 'pca' obsm: 'X_nca', 'X_pca', 'X_tsne', 'X_umap' varm: 'PCs'
In [ ]:
#How raw overlap adata was made (no genes filtered out)
bus_fs_raw = anndata.read("D1.1797")
bus_stim_raw = anndata.read("D1.1814")
#Merge datasets
bus_fs_raw.obs_names = [i+'_fs' for i in bus_fs_raw.obs_names]
bus_stim_raw.obs_names = [i+'_ieg' for i in bus_stim_raw.obs_names]
raw_overlap_combo = bus_fs_raw.concatenate(bus_stim_raw,join='outer', index_unique=None)
Predict labels for stimulation data
In [ ]:
#Make X_pca with merged, use top n PC's
def getPredLabels(overlap_fs,overlap_combo, n_PC):
numFS = len(overlap_fs.obs_names)
X_train = overlap_combo.obsm['X_pca'][0:numFS,0:n_PC]
#X_pca at stim rows = X_test
X_test = overlap_combo.obsm['X_pca'][numFS:,0:n_PC]
#y_train is f/s louvain labels
y_train = overlap_fs.obs['cellRanger_louvain']
#Fit and predict
classifier = KNeighborsClassifier(n_neighbors=15)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
labels = list(y_train)+list(y_pred)
print(len(labels),' labels created')
return labels
How merged dataset (joint-PC space) with stimulation and starvation experiments was made (= overlap_combo)¶
In [ ]:
#Get intersection of var names between the two (making overlap_combo)
overlap = list(set(bus_combo_noZero.var_names).intersection(bus_fs_combo.var_names))
overlap_fs = bus_fs_combo[:,overlap]
overlap_stim = bus_combo_noZero[:,overlap]
#Merge datasets
overlap_fs.obs_names = [i+'_fs' for i in overlap_fs.obs_names]
overlap_stim.obs_names = [i+'_ieg' for i in overlap_stim.obs_names]
origin = list(np.repeat('FS',len(overlap_fs.obs_names))) + list(np.repeat('Stim',len(overlap_stim.obs_names)))
overlap_combo = overlap_fs.concatenate(overlap_stim,join='outer', index_unique=None)
overlap_combo.obs['cell_origin'] = pd.Categorical(origin)
sc.pp.scale(overlap_combo, max_value=10)
#Do PCA on merged dataset + plot variance explained
sc.tl.pca(overlap_combo, n_comps=60)
#sc.pl.pca_variance_ratio(overlap_combo, log=True)
#overlap_combo
labels = getPredLabels(overlap_fs,overlap_combo,n_PC = 60)
overlap_combo.obs['knn_clus'] = pd.Categorical(labels)
32594 labels created
In [ ]:
sc.pp.neighbors(overlap_combo,n_neighbors=150, n_pcs=60,random_state=42)
sc.tl.paga(overlap_combo, groups='knn_clus')
sc.pl.paga(overlap_combo, color=['knn_clus'])
In [ ]:
sc.tl.tsne(overlap_combo,random_state=42,n_pcs=60,early_exaggeration=5)
WARNING: Consider installing the package MulticoreTSNE (https://github.com/DmitryUlyanov/Multicore-TSNE). Even for n_jobs=1 this speeds up the computation considerably and might yield better converged results.
In [ ]:
sc.tl.umap(overlap_combo,random_state=42,spread=2.5,min_dist = 0.8,init_pos='paga')
In [ ]:
# sc.pl.tsne(overlap_combo,color=['knn_clus'])
In [ ]:
sc.pl.umap(overlap_combo,color=['knn_clus'],color_map='viridis')
In [ ]:
#overlap_combo.write('overlap_combo.h5ad')
Plot percent of K neighbors for each stim cell that are in one unique cluster¶
In [ ]:
from sklearn.neighbors import NearestNeighbors
import random
#Read in previously saved data
overlap_combo = anndata.read("D1.1823")
n = 15
neigh = NearestNeighbors(n_neighbors=n)
n_PC = 60
numFS = len(overlap_fs.obs_names)
X_train = overlap_combo.obsm['X_pca'][0:numFS,0:n_PC]
#X_pca at stim rows = X_test
X_test = overlap_combo.obsm['X_pca'][numFS:,0:n_PC]
#y_train is f/s louvain labels
y_train = overlap_fs.obs['cellRanger_louvain']
#Fit and predict
#neigh.fit(X_train)
#Subsample 70% of fed/starved data for training, test on remaining 30% and stim data
numSamp = np.int(0.7*X_train.shape[0])
ind = random.sample(range(0,X_train.shape[0]),numSamp)
allInd = range(0,X_train.shape[0])
rest = [i for i in allInd if i not in ind]
#Fit and predict
neigh.fit(X_train[ind,:])
X_rest = X_train[rest,:]
fsNeigh = neigh.kneighbors(X_rest)
stimNeigh = neigh.kneighbors(X_test)
In [ ]:
y_train = y_train[ind]
In [ ]:
stim_neighbors = stimNeigh[1]
fs_neighbors = fsNeigh[1]
#y_train = y_train[ind]
stim_perc = np.zeros(stim_neighbors.shape[0])
for i in range(0,len(stim_perc)):
#How many of top neighbors come from same cluster in fed/starved data (out of n=15)
stim_perc[i] = np.max(y_train[stim_neighbors[i]].value_counts())/n
fs_perc = np.zeros(fs_neighbors.shape[0])
#y_train = y_train[ind]
for i in range(0,len(fs_perc)):
fs_perc[i] = np.max(y_train[fs_neighbors[i]].value_counts())/n
In [ ]:
plt.hist(stim_perc,density=False,range=(0,1),alpha=0.6 ,bins=15,weights=np.ones(len(stim_perc))/len(stim_perc),label='stim-neighbors')
plt.hist(fs_perc,density=False,range=(0,1),alpha=0.6 ,bins=15 ,weights=np.ones(len(fs_perc))/len(fs_perc),label='fs-neighbors')
plt.legend(loc='upper left')
plt.title('Histogram of Nearest Neighbor Specificity',fontsize=11)
plt.grid(b=None)
plt.xlabel("Fraction of n=15 nearest neighbors in unique cluster",fontsize=10)
plt.ylabel("Frequency",fontsize=10)
plt.show()
Determine Inter and Intra Cluster Distances (perturbation effect vs cell type distance)¶
In [ ]:
#Plot distances between centroid of stim-starved and fed-starved for each cluster
def changeBool(fed):
if fed == 'True':
return 'fed'
elif fed == 'False':
return 'starved'
else:
return 'stim'
def addConds(overlap_combo):
fs_conds = [changeBool(i) for i in overlap_combo.obs['fed']]
overlap_combo.obs['centroid_conds'] = pd.Categorical(fs_conds)
overlap_combo.obs['centroid_conds']
In [ ]:
from sklearn.metrics import pairwise_distances
#Centroids of cell atlas/defined clusters
def getClusCentroids(overlap_combo,pcs=60,clusType='knn_clus'):
clusters = np.unique(overlap_combo.obs[clusType])
centroids = np.zeros((len(clusters),pcs))
for c in clusters:
sub_data = overlap_combo[overlap_combo.obs[clusType] == c]
pca_embed = sub_data.obsm['X_pca'][:,0:pcs]
centroid = pca_embed.mean(axis=0)
centroids[c,:] = list(centroid)
return centroids
In [ ]:
#Distance between conditions within clusters
def getCentroidDists(overlap_combo,pcs=60,clusType='knn_clus'):
fedStarv = []
fsStim = []
cluster = []
#Initialize output matrix
addConds(overlap_combo)
clusters = np.unique(overlap_combo.obs[clusType])
centroid_dist = pd.DataFrame(columns =['fedStarv','fsStim','cluster','clus_color'])
for c in clusters:
#Get cells in cluster and their conditions
sub_data = overlap_combo[overlap_combo.obs[clusType] == c]
centroid_conds = sub_data.obs['centroid_conds']
#Get 2D tsne embedding
pca_embed = sub_data.obsm['X_pca'][:,0:pcs] #PCA
#Get location of condition cells
stim_pos = list(centroid_conds == 'stim')
allFS_pos = list(centroid_conds != 'stim')
fed_pos = list(centroid_conds == 'fed')
starved_pos = list(centroid_conds == 'starved')
#Get column means for x,y coords
meanFed = pca_embed[fed_pos,].mean(axis=0)
meanStarv = pca_embed[starved_pos,].mean(axis=0)
meanStim = pca_embed[stim_pos,].mean(axis=0)
meanFS = pca_embed[allFS_pos,].mean(axis=0)
cluster += [c]
#Dist between fed and starved
fs_dist = np.linalg.norm(meanFed - meanStarv,1)
#Dist between (all) starved experiment and stim
starvStim_dist = np.linalg.norm(meanFS - meanStim,1)
fedStarv += [fs_dist]
fsStim += [starvStim_dist]
centroid_dist['fedStarv'] = fedStarv
centroid_dist['fsStim'] = fsStim
centroid_dist['cluster'] = cluster
return centroid_dist
In [ ]:
#Distance between conditions within clusters
def getStimCentroidDists(overlap_combo,pcs=60,clusType='knn_clus'):
swKCl = []
swDI = []
cluster = []
#Initialize output matrix
#addConds(overlap_combo)
clusters = np.unique(overlap_combo.obs[clusType])
centroid_dist = pd.DataFrame(columns =['swKCl','swDI','cluster','clus_color'])
for c in clusters:
#Get cells in cluster and their conditions
sub_data = overlap_combo[overlap_combo.obs[clusType] == c]
centroid_conds = sub_data.obs['condition']
#Get 2D tsne embedding
pca_embed = sub_data.obsm['X_pca'][:,0:pcs] #PCA
#Get location of condition cells
sw_pos = list(centroid_conds == 'SW')
kcl_pos = list(centroid_conds == 'KCl')
di_pos = list(centroid_conds == 'DI')
#Get column means for x,y coords
meanSW = pca_embed[sw_pos,].mean(axis=0)
meanKCl = pca_embed[kcl_pos,].mean(axis=0)
meanDI = pca_embed[di_pos,].mean(axis=0)
cluster += [c]
#Dist between fed and starved
swKCl_dist = np.linalg.norm(meanSW - meanKCl,1)
#Dist between (all) starved experiment and stim
swDI_dist = np.linalg.norm(meanSW - meanDI,1)
swKCl += [swKCl_dist]
swDI += [swDI_dist]
centroid_dist['swKCl'] = swKCl
centroid_dist['swDI'] = swDI
centroid_dist['cluster'] = cluster
return centroid_dist
Fed/Starved L1 Analysis¶
Cell type v state plots
In [ ]:
withinFS_Dists = getCentroidDists(bus_fs_combo,60,'cellRanger_louvain')
withinFS_Dists.head()
Out[ ]:
fedStarv | fsStim | cluster | clus_color | |
---|---|---|---|---|
0 | 11.357661 | NaN | 0 | NaN |
1 | 17.776861 | NaN | 1 | NaN |
2 | 12.411180 | NaN | 2 | NaN |
3 | 18.126976 | NaN | 3 | NaN |
4 | 13.799874 | NaN | 4 | NaN |
In [ ]:
#Compare to pairwise distances between cell atlas clusters
centroids = getClusCentroids(bus_fs_combo,60,'cellRanger_louvain')
#centroids_arr = centroids['centroid'].to_numpy()
pairCentroid_dists = pairwise_distances(centroids, metric = 'l1')
pairCentroid_dists.shape
print(np.mean(pairCentroid_dists))
150.34457080272998
In [ ]:
np.unique(bus_fs_combo.obs['cellRanger_louvain'])
Out[ ]:
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35])
In [ ]:
#Early cnidocytes, germ cells, stem cells have distances that overlap with intra-F/S distances for 14&19
np.where(pairCentroid_dists < 62)
Out[ ]:
(array([ 0, 0, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 8, 9, 10, 11, 12, 12, 13, 14, 15, 16, 17, 18, 19, 20, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]), array([ 0, 12, 1, 2, 20, 3, 7, 4, 5, 6, 3, 7, 8, 9, 10, 11, 0, 12, 13, 14, 15, 16, 17, 18, 19, 2, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]))
In [ ]:
max(pairCentroid_dists.reshape((36*36,)))
reshape_noZero = [i for i in pairCentroid_dists.reshape((36*36,)) if i != 0]
print(np.min(reshape_noZero))
matplotlib.rc('axes',edgecolor='black')
plt.figure(figsize=(16,10))
plt.hist(withinFS_Dists['fedStarv'],color='r',bins=15,alpha=0.6,label='fed-starved',density=True)
plt.hist(reshape_noZero, bins=36,color = 'orange',alpha=0.6,label='pairwise clusters',density=True)
plt.legend(loc='upper right')
plt.title('Comparison to pairwise distribution (pairwise on clusters with fed/starved cells)')
plt.grid(b=None)
plt.xlabel("L1 Distance with 60 PC's")
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.show()
49.54310559108853
In [ ]:
#Area where line between cell state/type is blurred seems to be only in digestive endodermal clusters 14,19 (strong metabolic responses to starvation)
outlier_clus = withinFS_Dists[withinFS_Dists.fedStarv> 46.69]['cluster']
outlier_clus
Out[ ]:
14 14 19 19 Name: cluster, dtype: int64
Compare internal distances to differences in numbers of fed & starved cells (Spearman Rank correlation)
In [ ]:
#Composition of broad cell types between fed and starved , cell counts
counts = pd.DataFrame(columns =['count', 'orgID','condition','cluster'])
clusters = np.unique(bus_fs_combo.obs['cellRanger_louvain'])
c = []
org = []
cond = []
clus = []
orgs = np.unique(bus_fs_combo.obs['orgID'])
conds = ['True', 'False']
for cl in clusters:
data = bus_fs_combo[bus_fs_combo.obs['cellRanger_louvain'].isin([cl])]
for cd in conds:
#c_data = data[data.obs['condition'].isin([cd])]
for o in orgs:
pos = (data.obs['fed'].isin([cd])) & (data.obs['orgID'].isin([o]))
org_data = data[pos]
c += [org_data.n_obs]
org += [o]
if cd == 'True':
cond += ['fed']
else:
cond += ['starved']
clus += [cl]
print(len(c))
counts['count'] = c
counts['orgID'] = org
counts['condition'] = cond
counts['cluster'] = clus
720
In [ ]:
counts = counts[counts['count']> 0 ]
counts.head()
Out[ ]:
count | orgID | condition | cluster | |
---|---|---|---|---|
0 | 118 | 1 | fed | 0 |
2 | 83 | 2 | fed | 0 |
3 | 102 | 3 | fed | 0 |
4 | 102 | 4 | fed | 0 |
5 | 118 | 5 | fed | 0 |
In [ ]:
#One-way anova for each cluster
pvalsFTest = {}
for cl in clusters:
sub = counts[counts.cluster == cl]
subFed = sub[sub.condition == 'fed']
subStarv = sub[sub.condition == 'starved']
fedVals = list(subFed['count'])
starvVals = list(subStarv['count'])
pvalsFTest[cl] = scipy.stats.f_oneway(fedVals,starvVals)
In [ ]:
import statsmodels.stats as sm
pvals = []
# sizes = []
for c in clusters:
pvals += [pvalsFTest[c][1]]
# sizes += [len(bus_fs_combo[bus_fs_combo.obs['cellRanger_louvain'].isin([c])].obs_names)]
pvals = sm.multitest.fdrcorrection(pvals,alpha=0.05, method='indep')
pvals
Out[ ]:
(array([False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]), array([1.64253441e-01, 9.35522778e-01, 5.21386720e-01, 8.07361578e-01, 9.97826979e-01, 3.31498471e-01, 8.07361578e-01, 5.68508228e-01, 6.91908915e-01, 5.34369937e-01, 4.49408286e-04, 1.99877010e-01, 9.63535360e-01, 7.78106033e-01, 2.25187596e-01, 4.82880690e-01, 6.42595657e-01, 4.19543898e-01, 6.87698796e-01, 9.35522778e-01, 3.31498471e-01, 9.63535360e-01, 9.63535360e-01, 3.31498471e-01, 3.31498471e-01, 9.97826979e-01, 4.82880690e-01, 1.64253441e-01, 4.82880690e-01, 2.09527206e-01, 4.82880690e-01, 3.31498471e-01, 9.63535360e-01, 1.64253441e-01, 9.35522778e-01, 1.00000000e+00]))
In [ ]:
#counts = counts[counts['count']> 0 ]
#https://stackoverflow.com/questions/36578458/how-does-one-insert-statistical-annotations-stars-or-p-values-into-matplotlib/37518947#37518947
counts['cluster'] = [int(i) for i in counts['cluster']]
plt.figure(figsize=(16, 6))
ax = sns.boxplot(x="cluster", y="count", hue="condition", data=counts,linewidth=0.5,palette=["darkorange", "b"])
ax.set(ylabel='Normalized Cell Count')
ax.set(xlabel='Louvain Cluster')
# statistical annotation
# alpha < 0.05
for i in [11]:
w = 0.5
x1, x2 = i-w,i+w
y, h, col = counts['count'][counts['cluster'] == i].max() + 2, 4, 'k'
plt.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c=col)
plt.text((x1+x2)*.5, y+h, "*", ha='center', va='bottom', color=col)
plt.show()
In [ ]:
internalDist = list(withinFS_Dists['fedStarv'])
In [ ]:
scipy.stats.spearmanr(internalDist,pvals[1])
Out[ ]:
SpearmanrResult(correlation=-0.050871795043468955, pvalue=0.768261688241116)
In [ ]:
plt.scatter(internalDist,pvals[1])
plt.xlabel('Internal Distances')
plt.ylabel('P-value: Cell Count Difference')
plt.grid(None)
In [ ]:
counts.to_csv('individ_cellCounts.csv',index=None)
Repeat for Stimulation Data¶
In [ ]:
overlap_combo = anndata.read("D1.1823")
bus_combo_noZero.obs['cellRanger_louvain'] = pd.Categorical(overlap_combo[overlap_combo.obs['cell_origin'] == 'Stim'].obs['knn_clus'])
In [ ]:
withinStim_Dists = getStimCentroidDists(bus_combo_noZero,60,'cellRanger_louvain')
withinStim_Dists
Out[ ]:
swKCl | swDI | cluster | clus_color | |
---|---|---|---|---|
0 | 7.350214 | 8.503921 | 0 | NaN |
1 | 21.065245 | 36.732548 | 1 | NaN |
2 | 20.948408 | 20.759415 | 2 | NaN |
3 | 13.442171 | 34.598927 | 3 | NaN |
4 | 14.205286 | 25.923903 | 4 | NaN |
5 | 22.041147 | 31.476810 | 5 | NaN |
6 | 14.477929 | 25.217993 | 6 | NaN |
7 | 11.867729 | 17.921000 | 7 | NaN |
8 | 12.292484 | 11.644725 | 8 | NaN |
9 | 22.288679 | 22.918928 | 9 | NaN |
10 | 10.736828 | 11.939169 | 10 | NaN |
11 | 9.275630 | 8.065190 | 11 | NaN |
12 | 5.801665 | 22.793827 | 12 | NaN |
13 | 31.335329 | 31.087091 | 13 | NaN |
14 | 25.940889 | 28.084023 | 14 | NaN |
15 | 10.881435 | 18.185766 | 15 | NaN |
16 | 25.211300 | 17.230129 | 16 | NaN |
17 | 22.916479 | 12.188016 | 17 | NaN |
18 | 21.876759 | 14.847662 | 18 | NaN |
19 | 23.442184 | 24.017313 | 19 | NaN |
20 | 56.649334 | 65.165184 | 20 | NaN |
21 | 24.023872 | 16.950150 | 21 | NaN |
22 | 13.108215 | 19.507177 | 22 | NaN |
23 | 6.248736 | 12.076812 | 23 | NaN |
24 | 32.759808 | 45.112389 | 24 | NaN |
25 | 11.500583 | 18.344841 | 25 | NaN |
26 | 34.523087 | 26.619263 | 26 | NaN |
27 | 25.315067 | 31.565296 | 27 | NaN |
28 | 46.271626 | 31.329525 | 28 | NaN |
29 | 13.501001 | 10.425589 | 29 | NaN |
30 | 18.720173 | 38.476334 | 30 | NaN |
31 | 38.197033 | 57.892250 | 31 | NaN |
32 | 31.816713 | 17.769464 | 32 | NaN |
33 | 40.618649 | 44.847408 | 33 | NaN |
34 | 11.309300 | 12.301980 | 34 | NaN |
35 | 16.585102 | 11.396259 | 35 | NaN |
In [ ]:
#Compare to pairwise distances between cell atlas clusters
centroids = getClusCentroids(bus_combo_noZero,60,'cellRanger_louvain')
#centroids_arr = centroids['centroid'].to_numpy()
pairCentroid_dists = pairwise_distances(centroids, metric = 'l1')
pairCentroid_dists.shape
print(np.mean(pairCentroid_dists))
173.6400492134319
In [ ]:
#withinStim_Dists['swDI']
print(np.max(withinStim_Dists))
swKCl 56.649334 swDI 65.165184 cluster 35.000000 clus_color NaN dtype: float64
In [ ]:
max(pairCentroid_dists.reshape((36*36,)))
reshape_noZero = [i for i in pairCentroid_dists.reshape((36*36,)) if i != 0]
print(np.min(reshape_noZero))
plt.figure(figsize=(16,10))
plt.hist(withinStim_Dists['swKCl'],color='r',bins=15,alpha=0.6,label='SW-KCl',density=True)
plt.hist(withinStim_Dists['swDI'],color='b',bins=15,alpha=0.6,label='SW-DI',density=True)
plt.hist(reshape_noZero, bins=36,color = 'orange',alpha=0.6,label='pairwise clusters',density=True)
plt.legend(loc='upper right')
plt.title('Comparison to pairwise distribution (pairwise on clusters with SW/KCl/DI cells)')
plt.xlabel("L1 Distance with 60 PC's")
plt.ylabel("Normalized Frequency")
plt.grid(b=None)
plt.show()
65.46425357810222
In [ ]:
print(np.where(pairCentroid_dists < 65.46))
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]), array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]))
Organismal Distances, Fed/Starved + Stimulation¶
Batch effect plots
In [ ]:
#condition -- Stim, fed -- Fed/Starved data
overlap_combo = anndata.read("D1.1823")
overlap_combo
Out[ ]:
AnnData object with n_obs × n_vars = 32594 × 6756 obs: 'batch', 'n_counts', 'n_countslog', 'louvain', 'leiden', 'orgID', 'fed', 'starved', 'fed_neighbor_score', 'cellRanger_louvain', 'annos', 'new_cellRanger_louvain', 'annosSub', 'condition', 'cell_origin', 'knn_clus' var: 'n_counts-0', 'mean-0', 'std-0', 'n_counts-1', 'mean-1', 'std-1', 'mean', 'std' uns: 'pca' obsm: 'X_nca', 'X_pca', 'X_tsne', 'X_umap' varm: 'PCs'
In [ ]:
#Use overlap_combo
#Go through each clustee
clus = np.unique(overlap_combo.obs['knn_clus'])
fed_data = overlap_combo[overlap_combo.obs['fed'] == 'True']
sw_data = overlap_combo[overlap_combo.obs['condition'] == 'SW']
fed_orgs = np.unique(fed_data.obs['orgID'])
sw_orgs = np.unique(sw_data.obs['orgID'])
pcs = 60
org_metadata = pd.DataFrame(columns =['orgID','cluster','condition']) #centroid = row mean for all cells in organism
num_points = (len(fed_orgs)+len(sw_orgs))*len(clus)
#Get centroid in PCA space in each cluster for each organism
org_centroids = np.zeros((num_points,pcs))
condition = []
cluster = []
orgID = []
count = 0
toRemove = []
for c in clus:
#Get all cells in chosen condition
fed_sub = fed_data[fed_data.obs['knn_clus'] == c]
fed_pca_embed = fed_sub.obsm['X_pca'][:,0:pcs] #PCA matrix
sw_sub = sw_data[sw_data.obs['knn_clus'] == c]
sw_pca_embed = sw_sub.obsm['X_pca'][:,0:pcs] #PCA matrix
#For each org, get centroid of cells (X_pca)
for sw in sw_orgs:
condition += ['SW']
cluster += [c]
orgID += [sw]
#Get location of org cells
org_pos = list(sw_sub.obs['orgID'] == sw)
#Sum down rows of cells for each PC
if len(sw_pca_embed[org_pos,]) == 0:
toRemove += [count]
org_centroids[count,:] = list(sw_pca_embed[org_pos,].mean(axis=0))
count += 1
for fed in fed_orgs:
condition += ['Fed']
cluster += [c]
orgID += [fed]
#Get location of org cells
org_pos = list(fed_sub.obs['orgID'] == fed)
#Sum down rows of cells for each PC
if len(fed_pca_embed[org_pos,]) == 0:
toRemove += [count]
org_centroids[count,:] = list(fed_pca_embed[org_pos,].mean(axis=0))
count += 1
#For each org, get centroid of cells (X_pca)
#Average pairwise distance between org centroid
org_metadata['orgID'] = orgID
org_metadata['cluster'] = cluster
org_metadata['condition'] = condition
In [ ]:
toRemove
Out[ ]:
[]
In [ ]:
org_metadata = org_metadata.drop(org_metadata.index[toRemove])
org_metadata.head()
org_centroids= np.delete(org_centroids, toRemove, 0)
org_centroids[0,:]
Out[ ]:
array([-2.25471210e+00, -1.51177585e+00, -1.93993020e+00, -9.54584032e-02, -2.38718843e+00, -1.51001954e+00, -2.81468654e+00, 1.05100930e+00, 1.54143572e+00, -7.58547902e-01, -1.50774598e+00, -2.47600722e+00, 7.53088653e-01, -9.25398111e-01, 5.30199647e-01, -2.36053783e-02, 9.48955193e-02, -6.50646329e-01, 1.05686748e+00, -2.67228413e+00, 2.98014688e+00, 5.93579225e-02, 5.69488704e-01, -1.05491745e+00, 1.12574625e+00, -1.49573892e-01, 5.68022847e-01, -1.29234409e+00, 1.40452191e-01, -3.84621084e-01, 2.95532733e-01, 1.13393039e-01, 2.88706899e-01, -5.50938368e-01, -1.17623888e-01, -1.51531205e-01, -7.78403163e-01, -7.72583902e-01, 9.31857646e-01, 9.21139657e-01, -4.21358228e-01, 9.92068797e-02, -3.57610792e-01, 1.40223965e-01, 4.30851281e-02, 2.75345147e-01, 5.04794896e-01, -1.59878790e-01, 1.80910379e-01, 2.61838585e-01, -2.14510873e-01, 1.25861779e-01, -3.76004130e-01, 2.04274207e-01, -1.61955715e-03, -9.25957318e-03, 5.98372109e-02, -2.70651639e-01, 2.62535036e-01, 1.59801412e-02])
In [ ]:
org_pairDists = pd.DataFrame(columns =['cluster','pair_fed','pair_sw','pair_mix'])
cluster = []
pair_fed = []
pair_sw = []
pair_mix = []
for c in clus:
#Fed org centroids
where_fed = (org_metadata.cluster == c) & (org_metadata.condition == 'Fed')
fed_mat = org_centroids[where_fed,:]
pair_fed_dist = pairwise_distances(fed_mat, metric = 'l1')
pair_fed_mean = np.mean(pair_fed_dist)
#SW org centroids
where_sw = (org_metadata.cluster == c) & (org_metadata.condition == 'SW')
sw_mat = org_centroids[where_sw,:]
pair_sw_dist = pairwise_distances(sw_mat, metric = 'l1')
pair_sw_mean = np.mean(pair_sw_dist)
pair_mix_dist = pairwise_distances(fed_mat,sw_mat, metric = 'l1')
pair_mix_mean = np.mean(pair_mix_dist)
cluster += [c]
pair_fed += [pair_fed_mean]
pair_sw += [pair_sw_mean]
pair_mix += [pair_mix_mean]
org_pairDists['cluster'] = cluster
org_pairDists['pair_fed'] = pair_fed
org_pairDists['pair_sw'] = pair_sw
org_pairDists['pair_mix'] = pair_mix
In [ ]:
org_pairDists.head()
Out[ ]:
cluster | pair_fed | pair_sw | pair_mix | |
---|---|---|---|---|
0 | 0 | 8.005131 | 7.995117 | 23.570756 |
1 | 1 | 15.629137 | 32.140367 | 75.364971 |
2 | 2 | 9.598802 | 14.187616 | 49.332659 |
3 | 3 | 19.421792 | 22.127487 | 42.742088 |
4 | 4 | 16.030268 | 26.287836 | 36.652727 |
In [ ]:
plt.figure(figsize=(16,10))
plt.hist(org_pairDists['pair_fed'],color='r',bins=15,alpha=0.6,label='control org dist',density=True)
plt.hist(org_pairDists['pair_sw'],color='blue',bins=15,alpha=0.6,label='sw org dist',density=True)
plt.hist(org_pairDists['pair_mix'], bins=15,color = 'orange',alpha=0.6,label='mix dist',density=True)
plt.legend(loc='upper right')
plt.title('Inter-Organism Distances (L1)')
plt.grid(b=None)
plt.xlabel("Average L1 Distance with 60 PC's")
plt.ylabel("Normalized Frequency")
plt.show()
In [ ]:
overlap_combo
Out[ ]:
AnnData object with n_obs × n_vars = 32594 × 6756 obs: 'batch', 'n_counts', 'n_countslog', 'louvain', 'leiden', 'orgID', 'fed', 'starved', 'fed_neighbor_score', 'cellRanger_louvain', 'annos', 'new_cellRanger_louvain', 'annosSub', 'condition', 'cell_origin', 'knn_clus' var: 'n_counts-0', 'mean-0', 'std-0', 'n_counts-1', 'mean-1', 'std-1', 'mean', 'std' uns: 'pca' obsm: 'X_nca', 'X_pca', 'X_tsne', 'X_umap' varm: 'PCs'
In [ ]:
overlap_combo.write('overlap_combo.h5ad')
... storing 'louvain' as categorical ... storing 'leiden' as categorical ... storing 'orgID' as categorical ... storing 'fed' as categorical ... storing 'starved' as categorical ... storing 'cellRanger_louvain' as categorical ... storing 'annos' as categorical ... storing 'new_cellRanger_louvain' as categorical ... storing 'annosSub' as categorical ... storing 'condition' as categorical
In [ ]: