Cell Atlas Clustering and Analysis from kallisto-bustools

!date

Sat Nov 21 21:38:02 UTC 2020

Download Data¶

import requests
from tqdm import tnrange, tqdm_notebook
def download_file(doi,ext):
    url = 'https://api.datacite.org/dois/'+doi+'/media'
    r = requests.get(url).json()
    netcdf_url = r['data'][0]['attributes']['url']
    r = requests.get(netcdf_url,stream=True)
    #Set file name
    fname = doi.split('/')[-1]+ext
    #Download file with progress bar
    if r.status_code == 403:
        print("File Unavailable")
    if 'content-length' not in r.headers:
        print("Did not get file")
    else:
        with open(fname, 'wb') as f:
            total_length = int(r.headers.get('content-length'))
            pbar = tnrange(int(total_length/1024), unit="B")
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    pbar.update()
                    f.write(chunk)
        return fname

1 2	`#Starvation h5ad data, all nonzero genes included, filtered for 'real cells' from de-multiplexing download_file('10.22002/D1.1797','.gz')`

/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:18: TqdmDeprecationWarning: Please use `tqdm.notebook.trange` instead of `tqdm.tnrange`



HBox(children=(FloatProgress(value=0.0, max=26058.0), HTML(value='')))





'D1.1797.gz'

1 2	`#CellRanger Starvation h5ad data download_file('10.22002/D1.1798','.gz')`

/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:18: TqdmDeprecationWarning: Please use `tqdm.notebook.trange` instead of `tqdm.tnrange`



HBox(children=(FloatProgress(value=0.0, max=45376.0), HTML(value='')))





'D1.1798.gz'

1 2	`#Kallisto bus clustered starvation data, h5ad download_file('10.22002/D1.1796','.gz')`

/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:18: TqdmDeprecationWarning: Please use `tqdm.notebook.trange` instead of `tqdm.tnrange`



HBox(children=(FloatProgress(value=0.0, max=479630.0), HTML(value='')))





'D1.1796.gz'

#Human ortholog annotations
download_file('10.22002/D1.1819','.gz')

#Panther annotations
download_file('10.22002/D1.1820','.gz')

#GO Terms
download_file('10.22002/D1.1822','.gz')

#Saved DeSeq2 Results for Fed/Starved (Differentially expressed under starvation --> perturbed genes)
download_file('10.22002/D1.1810','.gz')

#Saved gene modules adata
download_file('10.22002/D1.1813','.gz')

#Gene Markers to plot (for cell atlas) --> Fig 2 heatmap
download_file('10.22002/D1.1809','.gz')

/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:18: TqdmDeprecationWarning: Please use `tqdm.notebook.trange` instead of `tqdm.tnrange`



HBox(children=(FloatProgress(value=0.0, max=528.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=515.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=227.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, max=224.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=59338.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))





'D1.1809.gz'

1	`!gunzip *.gz`

1
2
3

!pip install --quiet anndata
!pip install --quiet scanpy
!pip install --quiet louvain

[K     |████████████████████████████████| 122kB 8.1MB/s 
[K     |████████████████████████████████| 10.2MB 7.5MB/s 
[K     |████████████████████████████████| 51kB 4.5MB/s 
[K     |████████████████████████████████| 71kB 6.8MB/s 
[?25h  Building wheel for sinfo (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 2.2MB 9.3MB/s 
[K     |████████████████████████████████| 3.2MB 41.7MB/s 
[?25h

1	`!pip3 install --quiet rpy2`

Import Packages¶

import pandas as pd
import anndata
import scanpy as sc
import numpy as np
import scipy.sparse

import warnings
warnings.filterwarnings('ignore')

from sklearn.neighbors import (KNeighborsClassifier,NeighborhoodComponentsAnalysis)
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from sklearn.preprocessing import scale

import random

import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
%matplotlib inline
sc.set_figure_params(dpi=125)

import seaborn as sns
sns.set(style="whitegrid")
%load_ext rpy2.ipython

1 2	`# # See version of all installed packages, last done 01/11/2021 # !pip list -v > pkg_vers_20210111.txt`

#Read in annotations
from io import StringIO

hg_ortho_df = pd.read_csv(StringIO(''.join(l.replace('|', '\t') for l in open('D1.1819'))),
            sep="\t",header=None,skiprows=[0,1,2,3])

hg_ortho_df[['XLOC','TCONS']] = hg_ortho_df[13].str.split(expand=True) 
hg_ortho_df[['Gene','gi']] = hg_ortho_df[3].str.split(expand=True) 
hg_ortho_df['Description']= hg_ortho_df[11]


panther_df = pd.read_csv('D1.1820',
            sep="\t",header=None) #skiprows=[0,1,2,3]



goTerm_df = pd.read_csv('D1.1822',
            sep=" ",header=None) #skiprows=[0,1,2,3]

How Gene Filtering & Clustering Was Done for Kallisto Processed Data¶

#Read in starvation data

#Kallisto bus h5ad file with no gene filtering 
bus_fs_raw = anndata.read("D1.1797")
print(bus_fs_raw )



#CellRanger h5ad file with old louvain clustering
cellRanger_fs = anndata.read("D1.1798")
print(cellRanger_fs)

AnnData object with n_obs × n_vars = 13673 × 46716
    obs: 'batch'
AnnData object with n_obs × n_vars = 13673 × 2657
    obs: 'n_counts', 'n_countslog', 'louvain', 'orgID', 'fed', 'starved', 'fed_ord', 'starved_ord', 'new_fed', 'fed_neighbor_score'
    var: 'n_counts', 'n_cells'
    uns: 'fed_ord_colors', 'louvain', 'louvain_colors', 'louvain_sizes', 'neighbors', 'new_fed_colors', 'orgID_colors', 'paga', 'pca', 'rank_genes_groups', 'starved_ord_colors'
    obsm: 'X_pca', 'X_tsne', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

Apply labels from ClickTags (organism number and condition)

bus_fs_raw.obs['orgID'] = pd.Categorical(cellRanger_fs.obs['orgID'])
bus_fs_raw.obs['fed'] = pd.Categorical(cellRanger_fs.obs['fed'])
bus_fs_raw.obs['starved'] = pd.Categorical(cellRanger_fs.obs['starved'])
bus_fs_raw.obs['cellRanger_louvain'] = pd.Categorical(cellRanger_fs.obs['louvain'])
bus_fs_raw

AnnData object with n_obs × n_vars = 13673 × 46716
    obs: 'batch', 'orgID', 'fed', 'starved', 'cellRanger_louvain'

1 2	`sc.pp.filter_cells(bus_fs_raw, min_counts=0) #1 sc.pp.filter_genes(bus_fs_raw, min_counts=0)`

1
2
3

bus_fs_raw_sub = bus_fs_raw[bus_fs_raw.obs['fed'] == True]
bus_fs_raw_sub2 = bus_fs_raw[bus_fs_raw.obs['fed'] == False]
bus_fs_raw_sub2

View of AnnData object with n_obs × n_vars = 7062 × 46716
    obs: 'batch', 'orgID', 'fed', 'starved', 'cellRanger_louvain', 'n_counts'
    var: 'n_counts'

1 2	`print('Fed reads:' + str(sum(bus_fs_raw_sub.obs['n_counts']))) print('Starved reads:' + str(sum(bus_fs_raw_sub2.obs['n_counts'])))`

Fed reads:37119861.0
Starved reads:30254873.0

1 2	`print('Fed cells: '+str(len(bus_fs_raw_sub.obs_names))) print('Starved cells: '+str(len(bus_fs_raw_sub2.obs_names)))`

Fed cells: 6611
Starved cells: 7062

#Median Genes/cell
nonZero = bus_fs_raw.X.todense() != 0.0
nonZeroCounts = np.sum(nonZero,axis=1)
nonZeroCounts.shape
print('Median genes/cell:' + str(np.median(list(nonZeroCounts))))

Median genes/cell:676.0

1 2	`#Median UMIs/cell print('Median UMIs/cell:' + str(np.median(bus_fs_raw.obs['n_counts'])))`

Median UMIs/cell:1802.0

Filter for highly variable genes and apply cluster labels

#Find highly variable genes from all nonzero genes

#How annotated/filtered adata was made

#sc.pp.filter_cells(bus_fs_raw, min_counts=0) #1
#sc.pp.filter_genes(bus_fs_raw, min_counts=0)
bus_fs_raw.obs['n_countslog']=np.log10(bus_fs_raw.obs['n_counts'])

bus_fs_raw.raw = sc.pp.log1p(bus_fs_raw, copy=True)
sc.pp.normalize_per_cell(bus_fs_raw, counts_per_cell_after=1e4)
filter_result = sc.pp.filter_genes_dispersion(
    bus_fs_raw.X, min_mean=0.0125, max_mean=4.5, min_disp=0.2)
sc.pl.filter_genes_dispersion(filter_result)

#Filter genes from anndata
bus_fs_raw = bus_fs_raw[:, filter_result.gene_subset]

print(bus_fs_raw)

#How to get PAGA/UMAP embedding (creates bus_fs_clus)

sc.pp.scale(bus_fs_raw, max_value=10)
sc.tl.pca(bus_fs_raw, n_comps=60)
sc.pp.neighbors(bus_fs_raw,n_neighbors=150, n_pcs=60,random_state=42) #use_rep='X_nca'
sc.tl.paga(bus_fs_raw, groups='cellRanger_louvain')
sc.pl.paga(bus_fs_raw, color=['cellRanger_louvain'])

sc.tl.umap(bus_fs_raw,random_state=42,spread=2.5,min_dist = 0.8,init_pos='paga') #min_dist=0.5,spread=3,   min_dist=0.3

Cell Atlas Analysis for previously saved clustered and labeled data¶

Use for Cell Atlas markers, labels, perturbation response analysis, etc

1
2
3

#Read in PREVIOUSLY SAVED clustered + labeled data
bus_fs_clus = anndata.read("D1.1796")
print(bus_fs_clus )

AnnData object with n_obs × n_vars = 13673 × 8696
    obs: 'batch', 'n_counts', 'n_countslog', 'louvain', 'leiden', 'orgID', 'fed', 'starved', 'fed_neighbor_score', 'cellRanger_louvain', 'annos', 'new_cellRanger_louvain', 'annosSub'
    var: 'n_counts', 'mean', 'std'
    uns: 'annosSub_colors', 'annos_colors', 'cellRanger_louvain_colors', 'cellRanger_louvain_sizes', "dendrogram_['new_cellRanger_louvain']", 'dendrogram_new_cellRanger_louvain', 'fed_colors', 'fed_neighbor_score_colors', 'leiden', 'leiden_colors', 'louvain', 'louvain_colors', 'neighbors', 'new_cellRanger_louvain_colors', 'orgID_colors', 'paga', 'pca', 'rank_genes_groups', 'umap'
    obsm: 'X_nca', 'X_pca', 'X_tsne', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

#Code here for neighbor score
#Calculate number of neighbors (out of top 15) with same starvation condition
#Input adata object and if score is for fed or starved (True or False)
def neighborScores(adata,conditionBool):
  sc.pp.neighbors(adata,n_neighbors=15)
  neighborDists = adata.uns['neighbors']['distances'].todense()
  counts = []

  for i in range(0,len(adata.obs_names)):
    cellNames = adata.obs_names

    #get fed observation for this cell
    cellObs = adata.obs['fed'][cellNames[i]]

    #get row for cell
    nonZero = neighborDists[i,:]>0
    l = nonZero.tolist()[0]

    cellRow = neighborDists[i,:]
    cellRow = cellRow[:,l]


    #get 'fed' observations
    obs = adata.obs['fed'][l]

    # count # in 'fed' observations == cell obs
    count = 0
    #if cellObs == conditionBool:
    for j in obs:
      if j == conditionBool:
        count += 1


    counts += [count]

  print(len(counts))

  return counts

Use PAGA embedding of cells to visualize umi counts, cell types, and animal conditions/labels

#Exact values used to generate PAGA image below
# sc.pp.neighbors(bus_fs_clus,n_neighbors=150, n_pcs=60,random_state=42) #use_rep='X_nca'
# sc.tl.paga(bus_fs_clus, groups='cellRanger_louvain',)

sc.pl.paga(bus_fs_clus, color=['cellRanger_louvain'])

png

1	`bus_fs_clus`

AnnData object with n_obs × n_vars = 13673 × 8696
    obs: 'batch', 'n_counts', 'n_countslog', 'louvain', 'leiden', 'orgID', 'fed', 'starved', 'fed_neighbor_score', 'cellRanger_louvain', 'annos', 'new_cellRanger_louvain', 'annosSub'
    var: 'n_counts', 'mean', 'std'
    uns: 'annosSub_colors', 'annos_colors', 'cellRanger_louvain_colors', 'cellRanger_louvain_sizes', "dendrogram_['new_cellRanger_louvain']", 'dendrogram_new_cellRanger_louvain', 'fed_colors', 'fed_neighbor_score_colors', 'leiden', 'leiden_colors', 'louvain', 'louvain_colors', 'neighbors', 'new_cellRanger_louvain_colors', 'orgID_colors', 'paga', 'pca', 'rank_genes_groups', 'umap'
    obsm: 'X_nca', 'X_pca', 'X_tsne', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

1 2	`# Add fed_neighbor_score to adata # bus_fs_clus.obs['fed_neighbor_score'] = neighborScores(bus_fs_clus,'True') #Uncomment to run`

#Exact parameters for umap embedding below
#sc.tl.umap(bus_fs_clus,random_state=42,spread=2.5,min_dist = 0.8,init_pos='paga') #min_dist=0.5,spread=3,   min_dist=0.3

#sc.pl.umap(bus_fs_clus,color=['annos'])
sc.pl.umap(bus_fs_clus,color=['n_countslog'],color_map='viridis')

png

#Change colormap range
rgb_list = [tuple([67/256,114/256,196/256]),tuple([237/256,125/256,49/256])]
float_list = list(np.linspace(0,1,len(rgb_list)))

cdict = dict()
for num, col in enumerate(['red','green','blue']):
  col_list = [[float_list[i],rgb_list[i][num],rgb_list[i][num]] for i in range(len(float_list))]
  cdict[col] = col_list

cmp = mcolors.LinearSegmentedColormap('starv',segmentdata=cdict,N=256)

1	`sc.pl.umap(bus_fs_clus,color=['fed_neighbor_score'],color_map=cmp,save='score.pdf')`

WARNING: saving figure to file figures/umapscore.pdf

png

1 2	`bus_fs_clus.obs['Individual'] = pd.Categorical(bus_fs_clus.obs['orgID']) bus_fs_clus.obs['Individual'].cat.categories`

Index(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'], dtype='object')

bus_fs_clus.uns['Individual_colors'] = np.array(['#B00005','#8F5DFD','#FED354',
                                                 '#7D98D3','#FD4F53','#5FA137',
                                                 '#F0A171','#BBD9BB','#D18085',
                                                 '#16A53F'])

1 2	`#Assign color to orgIDs sc.pl.umap(bus_fs_clus,color=['Individual'],save='orgID.pdf')`

WARNING: saving figure to file figures/umaporgID.pdf

png

How Cluster/Cell Type Labels are Assigned¶

#Defining the eight main classes

def annotateLouvain(bus_fs_clus):
  cluster_types = []

  for i in bus_fs_clus.obs_names:

    if bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [35,13,2,0]:
      cluster_types += ['Stem Cell/Germ Cell']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [12,11,23,17]:
      cluster_types += ['Nematocyte']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [5,10,21]:
      cluster_types += ['Mechanosensory']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [6,9,26,31]:
      cluster_types += ['Neural']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [34,22,27,32,25]:
      cluster_types += ['Gland Cell']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [3,7,14,15,19,24,16,33]:
      cluster_types += ['Gastroderm']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [28]:
      cluster_types += ['Bioluminescent Cells']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [1,4,29,18,8,30,20]:
      cluster_types += ['Epidermal/Muscle']


  bus_fs_clus.obs['annos'] = pd.Categorical(cluster_types)

annotateLouvain(bus_fs_clus)
bus_fs_clus

AnnData object with n_obs × n_vars = 13673 × 8696
    obs: 'batch', 'n_counts', 'n_countslog', 'louvain', 'leiden', 'orgID', 'fed', 'starved', 'fed_neighbor_score', 'cellRanger_louvain', 'annos', 'new_cellRanger_louvain', 'annosSub'
    var: 'n_counts', 'mean', 'std'
    uns: 'annosSub_colors', 'annos_colors', 'cellRanger_louvain_colors', 'cellRanger_louvain_sizes', "dendrogram_['new_cellRanger_louvain']", 'dendrogram_new_cellRanger_louvain', 'fed_colors', 'fed_neighbor_score_colors', 'leiden', 'leiden_colors', 'louvain', 'louvain_colors', 'neighbors', 'new_cellRanger_louvain_colors', 'orgID_colors', 'paga', 'pca', 'rank_genes_groups', 'umap'
    obsm: 'X_nca', 'X_pca', 'X_tsne', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

1	`bus_fs_clus.obs["annos"].cat.categories`

Index(['Bioluminescent Cells', 'Epidermal/Muscle', 'Gastroderm', 'Gland Cell',
       'Mechanosensory', 'Nematocyte', 'Neural', 'Stem Cell/Germ Cell'],
      dtype='object')

1	`bus_fs_clus.uns['annos_colors'] = ['#707B7C','#AF7AC5','#BF124F','#1A5276','#5499C7','#1D8348','#7DCEA0','#F1C40F']`

1	`sc.pl.umap(bus_fs_clus,color="annos")`

png

#Get colors for broad class annotations, and apply colors to 36 subclusters
colors = bus_fs_clus.uns['annos_colors']
colorsCellR = bus_fs_clus.uns['cellRanger_louvain_colors']

annos_names = bus_fs_clus.obs["annos"].cat.categories
cellR_names = bus_fs_clus.obs["cellRanger_louvain"].cat.categories

annos_dict = {}
annos_dict['Stem Cell/Germ Cell'] = [35,13,2,0]
annos_dict['Nematocyte'] = [12,11,23,17]
annos_dict['Mechanosensory'] = [5,10,21]
annos_dict['Neural'] = [6,9,26,31]
annos_dict['Gland Cell'] = [34,22,27,32,25]
annos_dict['Gastroderm'] = [3,7,14,15,19,24,16,33]
annos_dict['Bioluminescent Cells'] = [28]
annos_dict['Epidermal/Muscle'] = [1,4,29,18,8,30,20]

#annos_dict

#Update 36 Louvain cluster colors with main class colors
for n in annos_names:
  pos = annos_dict[n]
  for p in pos:
    #
    pos_2 = np.where(annos_names == n)[0][0]
    colorsCellR[p] = colors[pos_2]

#colorsCellR
colorsCopy = colorsCellR.copy()

#Append color information to adata
bus_fs_clus.obs['new_cellRanger_louvain'] = bus_fs_clus.obs['cellRanger_louvain']
bus_fs_clus.uns['new_cellRanger_louvain_colors'] = colorsCellR

#Convert labels to string format for plotting - and update order of colors
#Make dictionary with cellRanger_louvain 36 clusters and new names --> new_cellRanger_louvain
bus_fs_clus.obs['new_cellRanger_louvain'] = [str(i) for i in bus_fs_clus.obs['new_cellRanger_louvain']]
bus_fs_clus.obs['new_cellRanger_louvain'] = pd.Categorical(bus_fs_clus.obs['new_cellRanger_louvain'])

new_cats = bus_fs_clus.obs['new_cellRanger_louvain'].cat.categories

new_colors = bus_fs_clus.uns['new_cellRanger_louvain_colors']
for i in range(0,len(new_cats)):
  new_colors[i] = colorsCopy[int(new_cats[i])]

bus_fs_clus.uns['new_cellRanger_louvain_colors'] = new_colors

#Create dendrogram for subclusters
sc.tl.dendrogram(bus_fs_clus,'new_cellRanger_louvain',linkage_method='ward')


bus_fs_clus.uns["dendrogram_new_cellRanger_louvain"] = bus_fs_clus.uns["dendrogram_['new_cellRanger_louvain']"]

1	`bus_fs_clus.uns['new_cellRanger_louvain_colors']`

array(['#F1C40F', '#AF7AC5', '#5499C7', '#1D8348', '#1D8348', '#F1C40F',
       '#BF124F', '#BF124F', '#BF124F', '#1D8348', '#AF7AC5', '#BF124F',
       '#F1C40F', '#AF7AC5', '#5499C7', '#1A5276', '#1D8348', '#BF124F',
       '#1A5276', '#7DCEA0', '#1A5276', '#707B7C', '#AF7AC5', '#BF124F',
       '#AF7AC5', '#7DCEA0', '#1A5276', '#BF124F', '#1A5276', '#F1C40F',
       '#AF7AC5', '#5499C7', '#7DCEA0', '#BF124F', '#AF7AC5', '#7DCEA0'],
      dtype=object)

#Make plot for 8 broad classes of cell types
colors = bus_fs_clus.uns['annos_colors']
colors
fig, ax = plt.subplots(figsize=(10,10))

c = np.unique(bus_fs_clus.obs["annos"].values)
cmap = [i+'70' for i in colors]#plt.cm.get_cmap("tab20")


names = c

for idx, (cluster, name) in enumerate(zip(c, names)):
    XX = bus_fs_clus[bus_fs_clus.obs.annos == cluster,:].obsm["X_umap"]

    x = XX[:,0]
    y = XX[:,1]
    ax.scatter(x, y, color = cmap[idx], label=cluster,s=5)
    if name == 'Bioluminescent Cells':
      ax.annotate(name, 
                  (np.median(x)+7, np.median(y)),
                  horizontalalignment='center',
                  verticalalignment='bottom',
                  size=14, weight='bold',
                  color="black",
                  backgroundcolor=cmap[idx])
    else:
      ax.annotate(name, 
                  (np.median(x), np.median(y)),
                  horizontalalignment='center',
                  verticalalignment='bottom',
                  size=14, weight='bold',
                  color="black",
                  backgroundcolor=cmap[idx])

ax.set_xlabel('UMAP1')
ax.set_ylabel('UMAP2')
ax.grid(False)
ax.get_xaxis().set_ticks([])
ax.get_yaxis().set_ticks([])
for edge_i in ['bottom','left']:
    ax.spines[edge_i].set_edgecolor("black")
for edge_i in ['top', 'right']:
    ax.spines[edge_i].set_edgecolor("white")

plt.savefig('broadAtlas.pdf') 
plt.show()

png

#Names for all 36 clusters/cell types
def annotateLouvainSub(bus_fs_clus):
  cluster_types = []

  for i in bus_fs_clus.obs_names:

    if bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [12]:
      cluster_types += ['Nematocyte Precursors']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [17]:
      cluster_types += ['Maturing/mature Nematocytes']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [11]:
      cluster_types += ['Early Nematocytes']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [23]:
      cluster_types += ['Late Nematocytes']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [2]:
      cluster_types += ['Medium Oocytes']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [13]:
      cluster_types += ['Small Oocytes']

    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [7]:
      cluster_types += ['GastroDigestive-B']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [15]:
      cluster_types += ['GastroDigestive-D']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [3]:
      cluster_types += ['GastroDigestive-A']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [14]:
      cluster_types += ['GastroDigestive-C']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [19]:
      cluster_types += ['GastroDigestive-E']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [24]:
      cluster_types += ['GastroDigestive-F']


    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [5]:
      cluster_types += ['Mechanosensory Cells Early Stages']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [10]:
      cluster_types += ['Mechanosensory Cells-A']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [21]:
      cluster_types += ['Mechanosensory Cells-B']

    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [0]:
      cluster_types += ['i-Cells']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [1]:
      cluster_types += ['Exumbrella Epidermis']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [4]:
      cluster_types += ['Manubrium Epidermis']

    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [6]:
      cluster_types += ['Neural Cells Early Stages']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [9]:
      cluster_types += ['Neural Cells-A (incl. GLWa, MIH cells)']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [26]:
      cluster_types += ['Neural Cells-B (incl. RFamide cells)']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [31]:
      cluster_types += ['Neural Cells-C (incl. YFamide cells)']

    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [8]:
      cluster_types += ['Striated Muscle of Subumbrella']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [16]:
      cluster_types += ['Tentacle Bulb Distal Gastroderm']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [18]:
      cluster_types += ['Radial Smooth Muscles']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [20]:
      cluster_types += ['Tentacle Epidermis']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [22]:
      cluster_types += ['Gland Cells-A']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [25]:
      cluster_types += ['Gland Cells-C']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [27]:
      cluster_types += ['Gland Cells-B']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [28]:
      cluster_types += ['Tentacle GFP Cells']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [29]:
      cluster_types += ['Gonad Epidermis']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [30]:
      cluster_types += ['Striated Muscle of Velum']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [32]:
      cluster_types += ['Gland Cells-D']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [33]:
      cluster_types += ['Endodermal Plate']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [34]:
      cluster_types += ['Gland Cells-E']
    elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [35]:
      cluster_types += ['Very Early Oocytes']

  bus_fs_clus.obs['annosSub'] = pd.Categorical(cluster_types)

annotateLouvainSub(bus_fs_clus)
bus_fs_clus

1	`sc.pl.umap(bus_fs_clus,color='annosSub')`

png

colors = bus_fs_clus.uns['annosSub_colors']
colors
fig, ax = plt.subplots(figsize=(10,10))

c = np.unique(bus_fs_clus.obs["cellRanger_louvain"].values)
cmap = [i+'70' for i in colors]


names = c

for idx, (cluster, name) in enumerate(zip(c, names)):
    XX = bus_fs_clus[bus_fs_clus.obs.cellRanger_louvain.isin([cluster]),:].obsm["X_umap"]
    text = list(bus_fs_clus[bus_fs_clus.obs.cellRanger_louvain.isin([cluster]),:].obs.annosSub)[0]
    x = XX[:,0]
    y = XX[:,1]
    ax.scatter(x, y, color = cmap[idx], label=str(cluster)+': '+text,s=5)
    ax.annotate(name, 
             (np.mean(x), np.mean(y)),
             horizontalalignment='center',
             verticalalignment='bottom',
             size=20, weight='bold',
             color="black",
               backgroundcolor=cmap[idx]) 


ax.legend(loc='center left',bbox_to_anchor=(1, 0.5),prop={'size': 10},frameon=False)
ax.set_axis_off()
#plt.savefig('36ClusAtlas.pdf')  
plt.show()

png

1 2	`# This is the saved output used in the notebook bus_fs_clus.write('fedStarved_withUMAPPaga.h5ad')`

Heatmaps for Top Markers¶

How marker genes are selected and annotated for the 36 cell types¶

#Get top n marker genes for each cluster

#Keep top 100 genes, 'louvain_neur' is label for neuron clusters determined using Louvain clustering algorithm
sc.tl.rank_genes_groups(bus_fs_clus, 'cellRanger_louvain',n_genes = 100,method='wilcoxon') #Using non-parametric test for significance

#Make dataframe, with 100 marker genes for each cluster + annotations
clusters = np.unique(bus_fs_clus.obs['cellRanger_louvain'])
markers = pd.DataFrame()

clus = []
markerGene = []
padj = []
orthoGene = []
orthoDescr = []

pantherNum = []
pantherDescr = []

goTerms = []

for i in clusters:
  genes = bus_fs_clus.uns['rank_genes_groups']['names'][str(i)]

  clus += list(np.repeat(i,len(genes)))
  markerGene += list(genes)
  padj += list(bus_fs_clus.uns['rank_genes_groups']['pvals_adj'][str(i)])

  for g in genes:

    sub_df = hg_ortho_df[hg_ortho_df.XLOC.isin([g])]
    panth_df = panther_df[panther_df[0].isin([g])]
    go_df = goTerm_df[goTerm_df[0].isin([g])]

    if len(sub_df) > 0:
      #Save first result for gene/description
      orthoGene += [list(sub_df.Gene)[0]]
      orthoDescr += [list(sub_df.Description)[0]]
    else:
      orthoGene += ['NA']
      orthoDescr += ['NA']


    if len(panth_df) > 0:
      pantherNum += [list(panth_df[1])]
      pantherDescr += [list(panth_df[2])]
    else:
      pantherNum += ['NA']
      pantherDescr += ['NA']


    if len(go_df) > 0:
      goTerms += [list(go_df[1])]
    else:
      goTerms += ['NA']


markers['clus'] = clus
markers['markerGene'] = markerGene
markers['padj'] = padj

markers['orthoGene'] = orthoGene
markers['orthoDescr'] = orthoDescr

markers['pantherID'] = pantherNum
markers['pantherDescr'] = pantherDescr

markers['goTerms'] = goTerms

markers.head()
#list(neurons.uns['rank_genes_groups']['names']['1'])

	markerGene	orthoGene	orthoDescr	pantherID	pantherDescr	goTerms
0	XLOC_010813	DDX39B	spliceosome RNA helicase DDX39B [Homo sapiens]	[PTHR47958:SF10]	[ATP-DEPENDENT RNA HELICASE DDX39A]	[GO:0015931,GO:0051169,GO:0006807,GO:0030529,G...
1	XLOC_043536	CCT7	T-complex protein 1 subunit eta isoform a [Ho...	[PTHR11353:SF22]	[T-COMPLEX PROTEIN 1 SUBUNIT ETA]	[GO:0019538,GO:0044238,GO:0006461,GO:0006457,G...
2	XLOC_008430	PA2G4	proliferation-associated protein 2G4 [Homo sa...	[PTHR10804:SF11]	[PROLIFERATION-ASSOCIATED PROTEIN 2G4]	[GO:0016787,GO:0044238,GO:0008152,GO:0006508,G...
3	XLOC_017967	NA	NA	NA	NA	[nan]
4	XLOC_000657	RPS12	40S ribosomal protein S12 [Homo sapiens]	[PTHR11843:SF19]	[40S RIBOSOMAL PROTEIN S12]	[GO:0005737,GO:0005622,GO:0032991,GO:0005840,G...

#Write to csv
markers.to_csv('fs_marker_annotations.csv')

#Read in csv (previously saved version, uploaded to Box)
markers = pd.read_csv('fs_marker_annotations.csv',
            sep=",")
markers.head()

	Unnamed: 0	markerGene	orthoGene	orthoDescr	pantherID	pantherDescr	goTerms
0	0	XLOC_010813	DDX39B	spliceosome RNA helicase DDX39B [Homo sapiens]	['PTHR24031:SF521']	['SPLICEOSOME RNA HELICASE DDX39B']	['GO:0015931,GO:0051169,GO:0006807,GO:0030529,...
1	1	XLOC_043536	CCT7	T-complex protein 1 subunit eta isoform a [Ho...	['PTHR11353:SF139']	['T-COMPLEX PROTEIN 1 SUBUNIT ETA']	['GO:0019538,GO:0044238,GO:0006461,GO:0006457,...
2	2	XLOC_008430	PA2G4	proliferation-associated protein 2G4 [Homo sa...	['PTHR10804:SF125']	['PROLIFERATION-ASSOCIATED 2G4 ,B']	['GO:0016787,GO:0044238,GO:0008152,GO:0006508,...
3	3	XLOC_017967	NaN	NaN	NaN	NaN	[nan]
4	4	XLOC_000657	RPS12	40S ribosomal protein S12 [Homo sapiens]	['PTHR11843:SF5']	['40S RIBOSOMAL PROTEIN S12']	['GO:0005737,GO:0005622,GO:0032991,GO:0005840,...

topGenes = []
clusts = []
names = []
var_groups = []
var_labels = []

top5 = pd.DataFrame()

ind = 0
n_genes = 5
for j in np.unique(markers.clus):
  sub = markers[markers.clus == j]
  sub.sort_values(by='padj',ascending=True)

  noDups = [i for i in sub.markerGene if i not in topGenes] #Remove duplicate genes
  topGenes += list(noDups[0:n_genes])
  clusts += [j]*n_genes


top5['marker'] = topGenes
top5['clus'] = clusts

top5.to_csv('top5markersAtlas.csv')

Read in saved markers for figure¶

1
2
3

#List of top markers for Figure 2, from subset of top5markers
topMarkers = pd.read_csv('D1.1809',sep=",")
topMarkers.head()

	clus	markerGene	annot	orthoGene	orthoDescr	pantherID	pantherDescr
0	0	XLOC_010813	DDX39A/B	DDX39B	spliceosome RNA helicase DDX39B [Homo sapiens]	['PTHR24031:SF521']	['SPLICEOSOME RNA HELICASE DDX39B']
1	0	XLOC_008430	PA2G4	PA2G4	proliferation-associated protein 2G4 [Homo sa...	['PTHR10804:SF125']	['PROLIFERATION-ASSOCIATED 2G4 ,B']
2	1	XLOC_016073	ZP-containing-1	NaN	NaN	['PTHR11576']	['ZONA PELLUCIDA SPERM-BINDING PROTEIN 3']
3	2	XLOC_006164	FMN-reductiase	NaN	NaN	['PTHR30543:SF12']	['NAD(P)H-DEPENDENT FMN REDUCTASE LOT6']
4	3	XLOC_013735	Innexin-like	NaN	NaN	['PTHR11893:SF41']	['INNEXIN INX7']

1	`topMarkers = topMarkers[0:51]`

topGenes = []
names = []
var_groups = []
var_labels = []
ind = 0

#Add cell type labels for gene markers
for i in np.unique(topMarkers.clus):
  sub = topMarkers[topMarkers.clus == i]


  topGenes += list(sub.markerGene)
  names += list(sub.annot)

  var_groups += [(ind,ind+len(list(sub.annot))-1)]
  var_labels += [str(int(i))] 
  ind += len(list(sub.annot))

#Add to raw data so any genes can be plotted
#Kallisto bus h5ad file with no gene filtering 
bus_fs_raw = anndata.read("D1.1797")
print(bus_fs_raw )

sc.pp.filter_cells(bus_fs_raw, min_counts=0) #1
sc.pp.filter_genes(bus_fs_raw, min_counts=0)

bus_fs_raw.obs['new_cellRanger_louvain'] = bus_fs_clus.obs['new_cellRanger_louvain']
bus_fs_raw.uns["dendrogram_new_cellRanger_louvain"] = bus_fs_clus.uns["dendrogram_new_cellRanger_louvain"]
bus_fs_raw.uns['new_cellRanger_louvain_colors'] = bus_fs_clus.uns['new_cellRanger_louvain_colors']
bus_fs_raw.obs['fed'] = pd.Categorical(bus_fs_clus.obs['fed'])
bus_fs_raw.obs['cellRanger_louvain'] = pd.Categorical(bus_fs_clus.obs['cellRanger_louvain'])

AnnData object with n_obs × n_vars = 13673 × 46716
    obs: 'batch'

#Plot data with names attached to gene XLOCs
toPlot = bus_fs_raw[:,topGenes]
toPlot.var['names'] = names

sc.pp.log1p(toPlot)

Trying to set attribute `.var` of view, copying.

#Subsample for clusters > 100 in size
#Subsample from full dataset, across each cluster
def subSample(adata):
  groups = np.unique(adata.obs['cellRanger_louvain'])
  subSample = 100
  cellNames = np.array(adata.obs_names)

  allCells = []
  for i in groups:

      cells = np.array(list(adata.obs['cellRanger_louvain'].isin([i])))
      cellLocs = list(np.where(cells)[0])

      if len(cellLocs) > 100:
      #Take all cells if < subSample
        choice = random.sample(cellLocs,subSample)
      else:
        choice = cellLocs

      pos = list(choice)
      #print(len(pos))

      allCells += list(cellNames[pos])


  sub = adata[allCells,:]
  return sub

1 2	`toPlotSub = subSample(toPlot) toPlotSub`

View of AnnData object with n_obs × n_vars = 3404 × 51
    obs: 'batch', 'n_counts', 'new_cellRanger_louvain', 'fed', 'cellRanger_louvain'
    var: 'n_counts', 'names'
    uns: 'dendrogram_new_cellRanger_louvain', 'new_cellRanger_louvain_colors', 'log1p'

1	`bus_fs_raw.uns['new_cellRanger_louvain_colors']`

array(['#F1C40F', '#AF7AC5', '#5499C7', '#1D8348', '#1D8348', '#F1C40F',
       '#BF124F', '#BF124F', '#BF124F', '#1D8348', '#AF7AC5', '#BF124F',
       '#F1C40F', '#AF7AC5', '#5499C7', '#1A5276', '#1D8348', '#BF124F',
       '#1A5276', '#7DCEA0', '#1A5276', '#707B7C', '#AF7AC5', '#BF124F',
       '#AF7AC5', '#7DCEA0', '#1A5276', '#BF124F', '#1A5276', '#F1C40F',
       '#AF7AC5', '#5499C7', '#7DCEA0', '#BF124F', '#AF7AC5', '#7DCEA0'],
      dtype=object)

sc.set_figure_params(scanpy=True, fontsize=30,dpi=150)
#bus_fs_clus.obs['cellRanger_louvain'] =  pd.Categorical([str(i) for i in bus_fs_clus.obs['cellRanger_louvain']])
sc.pl.heatmap(toPlotSub, names, groupby='new_cellRanger_louvain', dendrogram=True,show_gene_labels=True,
              var_group_positions=var_groups,var_group_labels=var_labels,cmap='PuBuGn',gene_symbols='names',
              standard_scale='var',use_raw=False,swap_axes=True,figsize = (30,30),save='cellAtlas.pdf')

WARNING: saving figure to file figures/heatmapcellAtlas.pdf

png

DE Gene Analysis Across Clusters (After Extracting Perturbed Genes)¶

#Remove clusters with < 10 cells per condition

#Read in previously saved data
bus_fs_clus = anndata.read("D1.1796")
print(bus_fs_clus )

bus_fs_raw = anndata.read("D1.1797")

bus_fs_raw = bus_fs_raw[bus_fs_clus.obs_names,]
#bus_fs_raw.obs['orgID'] = bus_fs_clus.obs['orgID']
bus_fs_raw.obs['fed'] = bus_fs_clus.obs['fed']
bus_fs_raw.obs['cellRanger_louvain'] = bus_fs_clus.obs['cellRanger_louvain']
bus_fs_raw


#clusSize

Trying to set attribute `.obs` of view, copying.


AnnData object with n_obs × n_vars = 13673 × 8696
    obs: 'batch', 'n_counts', 'n_countslog', 'louvain', 'leiden', 'orgID', 'fed', 'starved', 'fed_neighbor_score', 'cellRanger_louvain', 'annos', 'new_cellRanger_louvain', 'annosSub'
    var: 'n_counts', 'mean', 'std'
    uns: 'annosSub_colors', 'annos_colors', 'cellRanger_louvain_colors', 'cellRanger_louvain_sizes', "dendrogram_['new_cellRanger_louvain']", 'dendrogram_new_cellRanger_louvain', 'fed_colors', 'fed_neighbor_score_colors', 'leiden', 'leiden_colors', 'louvain', 'louvain_colors', 'neighbors', 'new_cellRanger_louvain_colors', 'orgID_colors', 'paga', 'pca', 'rank_genes_groups', 'umap'
    obsm: 'X_nca', 'X_pca', 'X_tsne', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'





AnnData object with n_obs × n_vars = 13673 × 46716
    obs: 'batch', 'fed', 'cellRanger_louvain'

Cluster DE Genes by Expression and Run TopGO Analysis¶

Use output from DeSeq2 analysis in separate notebook: Genes in each cell type with differential expression under starvation

1 2	`deGenesDF = pd.read_csv('D1.1810') #deSeq2_deGenesDF_log2FCof1_singleCellReplicates_noShrinkage_subSample_annotations.csv from DeSeq2 analysis deGenesDF.head()`

	Unnamed: 0	Unnamed: 0.1	Unnamed: 0.1.1	Genes	Condition	padj	padjClus	log2FC	orthoGene	orthoDescr	pantherID	pantherDescr	goTerms	geneClus
0	0	0	1	XLOC_028699	Starved	5.554489e-16	1.832981e-14	-1.284301	NaN	NaN	NaN	NaN	NaN	4
1	1	1	2	XLOC_010635	Starved	2.528288e-14	8.343350e-13	-1.492625	SRSF1	serine/arginine-rich splicing factor 1 isofor...	['PTHR24012:SF650']	['SERINE/ARGININE-RICH SPLICING FACTOR 1']	[nan]	4
2	2	2	3	XLOC_011294	Starved	8.348790e-14	2.755101e-12	-1.441413	NaN	NaN	NaN	NaN	[nan]	0
3	3	3	4	XLOC_034889	Starved	1.786565e-13	5.895663e-12	-1.448216	NaN	NaN	['PTHR13680:SF29']	['CDGSH IRON-SULFUR DOMAIN-CONTAINING PROTEIN ...	[nan]	1
4	4	4	5	XLOC_030861	Starved	8.598653e-12	2.837556e-10	-1.570453	SRSF1	serine/arginine-rich splicing factor 1 isofor...	['PTHR24012:SF650']	['SERINE/ARGININE-RICH SPLICING FACTOR 1']	[nan]	4

1	`deGenesDF_sig = deGenesDF[deGenesDF.padjClus < 0.05]`

Cluster gene x cell matrix for 'perturbed' genes

#Filter raw count dataset for only perturbed genes
bus_fs_raw = anndata.read("D1.1797")
bus_fs_raw = bus_fs_raw [:,np.unique(deGenesDF_sig.Genes)]
bus_fs_raw =bus_fs_raw[bus_fs_clus.obs_names,:]
bus_fs_raw.obs['cellRanger_louvain'] = bus_fs_clus.obs['cellRanger_louvain']
bus_fs_raw.obs['fed'] = bus_fs_clus.obs['fed']

Trying to set attribute `.obs` of view, copying.

de_gene_adata = anndata.AnnData(X=bus_fs_raw.X.T)
de_gene_adata.var_names = bus_fs_raw.obs_names
de_gene_adata.obs_names = bus_fs_raw.var_names

de_gene_adata_orig = de_gene_adata.copy()
#de_gene_adata

numIntersects = deGenesDF['Genes'].value_counts()
num_intersect = []
for g in de_gene_adata.obs_names: 
    if g in list(deGenesDF['Genes']):
      num_intersect += [numIntersects[g]]
    else:
      num_intersect += [0]

de_gene_adata.obs['numIntersects'] = pd.Categorical(num_intersect)

#de_gene_adata

de_gene_adata

AnnData object with n_obs × n_vars = 953 × 13673
    obs: 'numIntersects'

#Normalize and scale data
sc.pp.filter_genes(de_gene_adata, min_counts=0)

sc.pp.normalize_per_cell(de_gene_adata, counts_per_cell_after=1e4)
de_gene_adata.raw = sc.pp.log1p(de_gene_adata, copy=True)

sc.pp.scale(de_gene_adata, max_value=10)
sc.tl.pca(de_gene_adata, n_comps=60,random_state=42)
#sc.pl.pca_variance_ratio(bus_combo, log=True)

#Determine neighbors for clustering
sc.pp.neighbors(de_gene_adata,n_neighbors=20, n_pcs=15) #n_neighbors=5, n_pcs=15,  20, n_pcs=15
sc.tl.louvain(de_gene_adata,resolution=2)

sc.tl.tsne(de_gene_adata, n_pcs=15,random_state=42)
sc.pl.tsne(de_gene_adata,color=['louvain','numIntersects'])

WARNING: Consider installing the package MulticoreTSNE (https://github.com/DmitryUlyanov/Multicore-TSNE). Even for n_jobs=1 this speeds up the computation considerably and might yield better converged results.

png

1 2	`sc.tl.tsne(de_gene_adata, n_pcs=10,random_state=42,perplexity=15) # sc.pl.tsne(de_gene_adata,color=['louvain'])`

WARNING: Consider installing the package MulticoreTSNE (https://github.com/DmitryUlyanov/Multicore-TSNE). Even for n_jobs=1 this speeds up the computation considerably and might yield better converged results.

png

# Add which gene modules the pertubed genes are in
clusters = []
for g in deGenesDF.Genes:
  if g in list(de_gene_adata.obs_names):
    clus = de_gene_adata[g,:].obs['louvain'][0]
    clusters += [clus]
  else:
    clusters += ['padjClus_not_sig']

deGenesDF['geneClus'] = clusters
deGenesDF.head()

	Unnamed: 0	Unnamed: 0.1	Genes	Condition	padj	padjClus	log2FC	orthoGene	orthoDescr	pantherID	pantherDescr	goTerms	geneClus
0	0	1	XLOC_028699	Starved	5.554489e-16	1.832981e-14	-1.284301	NaN	NaN	NaN	NaN	NaN	4
1	1	2	XLOC_010635	Starved	2.528288e-14	8.343350e-13	-1.492625	SRSF1	serine/arginine-rich splicing factor 1 isofor...	['PTHR24012:SF650']	['SERINE/ARGININE-RICH SPLICING FACTOR 1']	[nan]	4
2	2	3	XLOC_011294	Starved	8.348790e-14	2.755101e-12	-1.441413	NaN	NaN	NaN	NaN	[nan]	0
3	3	4	XLOC_034889	Starved	1.786565e-13	5.895663e-12	-1.448216	NaN	NaN	['PTHR13680:SF29']	['CDGSH IRON-SULFUR DOMAIN-CONTAINING PROTEIN ...	[nan]	1
4	4	5	XLOC_030861	Starved	8.598653e-12	2.837556e-10	-1.570453	SRSF1	serine/arginine-rich splicing factor 1 isofor...	['PTHR24012:SF650']	['SERINE/ARGININE-RICH SPLICING FACTOR 1']	[nan]	4

1	`deGenesDF.to_csv('atlas_deSeq2_deGenesDF_log2FCof1_singleCellReplicates_noShrinkage_subSample_annotations.csv')`

1 2	`deGenesDF = pd.read_csv('D1.1810') deGenesDF.head()`

	Unnamed: 0	Unnamed: 0.1	Unnamed: 0.1.1	Genes	Condition	padj	padjClus	log2FC	orthoGene	orthoDescr	pantherID	pantherDescr	goTerms	geneClus
0	0	0	1	XLOC_028699	Starved	5.554489e-16	1.832981e-14	-1.284301	NaN	NaN	NaN	NaN	NaN	4
1	1	1	2	XLOC_010635	Starved	2.528288e-14	8.343350e-13	-1.492625	SRSF1	serine/arginine-rich splicing factor 1 isofor...	['PTHR24012:SF650']	['SERINE/ARGININE-RICH SPLICING FACTOR 1']	[nan]	4
2	2	2	3	XLOC_011294	Starved	8.348790e-14	2.755101e-12	-1.441413	NaN	NaN	NaN	NaN	[nan]	0
3	3	3	4	XLOC_034889	Starved	1.786565e-13	5.895663e-12	-1.448216	NaN	NaN	['PTHR13680:SF29']	['CDGSH IRON-SULFUR DOMAIN-CONTAINING PROTEIN ...	[nan]	1
4	4	4	5	XLOC_030861	Starved	8.598653e-12	2.837556e-10	-1.570453	SRSF1	serine/arginine-rich splicing factor 1 isofor...	['PTHR24012:SF650']	['SERINE/ARGININE-RICH SPLICING FACTOR 1']	[nan]	4

Run TopGO analysis for gene modules to find GO Term enrichment

#For topGO analysis (To Assign Labels to Modules)

def returnVal(i):
  if i == i:
    i= i.replace("[","")
    i = i.replace("]","")
    i= i.replace("'","")
    i = i.replace("'","")
    return i 
  else:
    return 'nan'

deGenesDF.goTerms = [returnVal(i) for i in list(deGenesDF.goTerms)]
deGenesDF = deGenesDF[deGenesDF.goTerms != 'nan']
deGenesDF = deGenesDF[deGenesDF.geneClus != 'padjClus_not_sig']

deGenesDF.to_csv('atlas_deseq2_genes_fortopGO_metadata.txt',sep='\t',columns=['Genes','goTerms','geneClus'],
             header=None,index_label=False,index=False)
deGenesDF.to_csv('atlas_deseq2_genes_fortopGO.txt',sep='\t',columns=['Genes','goTerms'],
             header=None,index_label=False,index=False)

%%R 

install.packages('rlang')
if (!requireNamespace("BiocManager", quietly=TRUE)){
  install.packages("BiocManager")
}

BiocManager::install("topGO")

%%R

library(topGO)
library(readr)
#Read in DE genes (XLOC's) with GO Terms
geneID2GO <- readMappings(file = "atlas_deseq2_genes_fortopGO.txt")
str(head(geneID2GO ))

#Add gene modules as factor 
atlas_deseq2_genes_fortopGO_metadata <- read_delim("atlas_deseq2_genes_fortopGO_metadata.txt", 
                                                   "\t", escape_double = FALSE, col_names = FALSE, 
                                                   trim_ws = TRUE)

#Set variables
allMods = unique(atlas_deseq2_genes_fortopGO_metadata$X3)
alpha = 0.05/length(allMods) #Bonferroni correction, could correct for all pairwise comparisons?


getEnrichTerms <- function(geneID2GO, modMetadata, clus){
  mods <- factor(as.integer(modMetadata$X3 == clus)) #Choose gene module to make 'interesting'
  names(mods) <- names(geneID2GO)


  #Get genes only in module of interest
  clusGenes <- function(mods) {
    return(mods == 1)
  }
  subMods <- clusGenes(mods)

  #Make GO data
  GOdata <- new("topGOdata", ontology = "BP", allGenes = mods,
                geneSel = subMods, annot = annFUN.gene2GO, gene2GO = geneID2GO)

  #GOdata
  #sigGenes(GOdata)

  resultFis <- runTest(GOdata, algorithm = "classic", statistic = "fisher")

  resultWeight <- runTest(GOdata, statistic = "fisher")

  #P-values from Weight Algorithm
  pvalsWeight <- score(resultWeight)

  #hist(pvalsWeight, 50, xlab = "p-values")

  allRes <- GenTable(GOdata, classic = resultFis, weight = resultWeight, 
                     orderBy = "weight", ranksOf = "classic", topNodes = 20)

  subRes <- allRes[as.numeric(allRes$weight) < alpha,]

  #Write output
  write.csv(subRes,file=paste('mod',clus,'_GOTerms.csv',sep=""))
}

#Run for all modules and write outputs
for(c in allMods){

  getEnrichTerms(geneID2GO = geneID2GO,modMetadata = atlas_deseq2_genes_fortopGO_metadata, clus = c)

}

List of 6
 $ XLOC_007052: chr [1:5] "GO:0019538" "GO:0044238" "GO:0006461" "GO:0006457" ...
 $ XLOC_045583: chr [1:17] "GO:0005488" "GO:0006139" "GO:0007166" "GO:0044238" ...
 $ XLOC_004670: chr [1:12] "GO:0016070" "GO:0006139" "GO:0044238" "GO:0019219" ...
 $ XLOC_025064: chr [1:3] "GO:0003824" "GO:0008152" "GO:0016491"
 $ XLOC_045734: chr [1:16] "GO:0016070" "GO:0016787" "GO:0016072" "GO:0044238" ...
 $ XLOC_042552: chr [1:31] "GO:0019220" "GO:0005085" "GO:0006807" "GO:0007165" ...


R[write to console]: 
── Column specification ────────────────────────────────────────────────────────
cols(
  X1 = col_character(),
  X2 = col_character(),
  X3 = col_double()
)


R[write to console]: 
Building most specific GOs .....

R[write to console]:    ( 176 GO terms found. )

R[write to console]: 
Build GO DAG topology ..........

R[write to console]:    ( 442 GO terms and 780 relations. )

R[write to console]: 
Annotating nodes ...............

R[write to console]:    ( 440 genes annotated to the GO terms. )

R[write to console]: 
             -- Classic Algorithm --

         the algorithm is scoring 193 nontrivial nodes
         parameters: 
             test statistic: fisher

R[write to console]: 
             -- Weight01 Algorithm --

         the algorithm is scoring 193 nontrivial nodes
         parameters: 
             test statistic: fisher

R[write to console]: 
     Level 12:  4 nodes to be scored    (0 eliminated genes)

R[write to console]: 
     Level 11:  5 nodes to be scored    (0 eliminated genes)

R[write to console]: 
     Level 10:  5 nodes to be scored    (38 eliminated genes)

R[write to console]: 
     Level 9:   11 nodes to be scored   (42 eliminated genes)

R[write to console]: 
     Level 8:   15 nodes to be scored   (63 eliminated genes)

R[write to console]: 
     Level 7:   17 nodes to be scored   (123 eliminated genes)

R[write to console]: 
     Level 6:   31 nodes to be scored   (174 eliminated genes)

R[write to console]: 
     Level 5:   41 nodes to be scored   (228 eliminated genes)

R[write to console]: 
     Level 4:   31 nodes to be scored   (278 eliminated genes)

R[write to console]: 
     Level 3:   24 nodes to be scored   (340 eliminated genes)

R[write to console]: 
     Level 2:   8 nodes to be scored    (374 eliminated genes)

R[write to console]: 
     Level 1:   1 nodes to be scored    (408 eliminated genes)

R[write to console]: 
Building most specific GOs .....

R[write to console]:    ( 176 GO terms found. )

R[write to console]: 
Build GO DAG topology ..........

R[write to console]:    ( 442 GO terms and 780 relations. )

R[write to console]: 
Annotating nodes ...............

R[write to console]:    ( 440 genes annotated to the GO terms. )

R[write to console]: 
             -- Classic Algorithm --

         the algorithm is scoring 221 nontrivial nodes
         parameters: 
             test statistic: fisher

R[write to console]: 
             -- Weight01 Algorithm --

         the algorithm is scoring 221 nontrivial nodes
         parameters: 
             test statistic: fisher

R[write to console]: 
     Level 13:  1 nodes to be scored    (0 eliminated genes)

R[write to console]: 
     Level 12:  3 nodes to be scored    (0 eliminated genes)

R[write to console]: 
     Level 11:  4 nodes to be scored    (4 eliminated genes)

R[write to console]: 
     Level 10:  5 nodes to be scored    (29 eliminated genes)

R[write to console]: 
     Level 9:   11 nodes to be scored   (37 eliminated genes)

R[write to console]: 
     Level 8:   18 nodes to be scored   (50 eliminated genes)

R[write to console]: 
     Level 7:   22 nodes to be scored   (105 eliminated genes)

R[write to console]: 
     Level 6:   28 nodes to be scored   (149 eliminated genes)

R[write to console]: 
     Level 5:   48 nodes to be scored   (222 eliminated genes)

R[write to console]: 
     Level 4:   40 nodes to be scored   (286 eliminated genes)

R[write to console]: 
     Level 3:   30 nodes to be scored   (360 eliminated genes)

R[write to console]: 
     Level 2:   10 nodes to be scored   (385 eliminated genes)

R[write to console]: 
     Level 1:   1 nodes to be scored    (409 eliminated genes)

R[write to console]: 
Building most specific GOs .....

R[write to console]:    ( 176 GO terms found. )

R[write to console]: 
Build GO DAG topology ..........

R[write to console]:    ( 442 GO terms and 780 relations. )

R[write to console]: 
Annotating nodes ...............

R[write to console]:    ( 440 genes annotated to the GO terms. )

R[write to console]: 
             -- Classic Algorithm --

         the algorithm is scoring 208 nontrivial nodes
         parameters: 
             test statistic: fisher

R[write to console]: 
             -- Weight01 Algorithm --

         the algorithm is scoring 208 nontrivial nodes
         parameters: 
             test statistic: fisher

R[write to console]: 
     Level 13:  1 nodes to be scored    (0 eliminated genes)

R[write to console]: 
     Level 12:  3 nodes to be scored    (0 eliminated genes)

R[write to console]: 
     Level 11:  5 nodes to be scored    (4 eliminated genes)

R[write to console]: 
     Level 10:  7 nodes to be scored    (38 eliminated genes)

R[write to console]: 
     Level 9:   15 nodes to be scored   (45 eliminated genes)

R[write to console]: 
     Level 8:   19 nodes to be scored   (67 eliminated genes)

R[write to console]: 
     Level 7:   15 nodes to be scored   (124 eliminated genes)

R[write to console]: 
     Level 6:   30 nodes to be scored   (176 eliminated genes)

R[write to console]: 
     Level 5:   45 nodes to be scored   (223 eliminated genes)

R[write to console]: 
     Level 4:   32 nodes to be scored   (284 eliminated genes)

R[write to console]: 
     Level 3:   26 nodes to be scored   (347 eliminated genes)

R[write to console]: 
     Level 2:   9 nodes to be scored    (383 eliminated genes)

R[write to console]: 
     Level 1:   1 nodes to be scored    (407 eliminated genes)

R[write to console]: 
Building most specific GOs .....

R[write to console]:    ( 176 GO terms found. )

R[write to console]: 
Build GO DAG topology ..........

R[write to console]:    ( 442 GO terms and 780 relations. )

R[write to console]: 
Annotating nodes ...............

R[write to console]:    ( 440 genes annotated to the GO terms. )

R[write to console]: 
             -- Classic Algorithm --

         the algorithm is scoring 165 nontrivial nodes
         parameters: 
             test statistic: fisher

R[write to console]: 
             -- Weight01 Algorithm --

         the algorithm is scoring 165 nontrivial nodes
         parameters: 
             test statistic: fisher

R[write to console]: 
     Level 13:  1 nodes to be scored    (0 eliminated genes)

R[write to console]: 
     Level 12:  3 nodes to be scored    (0 eliminated genes)

R[write to console]: 
     Level 11:  4 nodes to be scored    (4 eliminated genes)

R[write to console]: 
     Level 10:  5 nodes to be scored    (29 eliminated genes)

R[write to console]: 
     Level 9:   10 nodes to be scored   (35 eliminated genes)

R[write to console]: 
     Level 8:   12 nodes to be scored   (52 eliminated genes)

R[write to console]: 
     Level 7:   12 nodes to be scored   (107 eliminated genes)

R[write to console]: 
     Level 6:   23 nodes to be scored   (158 eliminated genes)

R[write to console]: 
     Level 5:   38 nodes to be scored   (205 eliminated genes)

R[write to console]: 
     Level 4:   27 nodes to be scored   (264 eliminated genes)

R[write to console]: 
     Level 3:   22 nodes to be scored   (341 eliminated genes)

R[write to console]: 
     Level 2:   7 nodes to be scored    (379 eliminated genes)

R[write to console]: 
     Level 1:   1 nodes to be scored    (402 eliminated genes)

R[write to console]: 
Building most specific GOs .....

R[write to console]:    ( 176 GO terms found. )

R[write to console]: 
Build GO DAG topology ..........

R[write to console]:    ( 442 GO terms and 780 relations. )

R[write to console]: 
Annotating nodes ...............

R[write to console]:    ( 440 genes annotated to the GO terms. )

R[write to console]: 
             -- Classic Algorithm --

         the algorithm is scoring 165 nontrivial nodes
         parameters: 
             test statistic: fisher

R[write to console]: 
             -- Weight01 Algorithm --

         the algorithm is scoring 165 nontrivial nodes
         parameters: 
             test statistic: fisher

R[write to console]: 
     Level 12:  2 nodes to be scored    (0 eliminated genes)

R[write to console]: 
     Level 11:  3 nodes to be scored    (0 eliminated genes)

R[write to console]: 
     Level 10:  3 nodes to be scored    (23 eliminated genes)

R[write to console]: 
     Level 9:   6 nodes to be scored    (31 eliminated genes)

R[write to console]: 
     Level 8:   8 nodes to be scored    (46 eliminated genes)

R[write to console]: 
     Level 7:   13 nodes to be scored   (101 eliminated genes)

R[write to console]: 
     Level 6:   27 nodes to be scored   (151 eliminated genes)

R[write to console]: 
     Level 5:   37 nodes to be scored   (199 eliminated genes)

R[write to console]: 
     Level 4:   27 nodes to be scored   (263 eliminated genes)

R[write to console]: 
     Level 3:   28 nodes to be scored   (337 eliminated genes)

R[write to console]: 
     Level 2:   10 nodes to be scored   (379 eliminated genes)

R[write to console]: 
     Level 1:   1 nodes to be scored    (411 eliminated genes)

R[write to console]: 
Building most specific GOs .....

R[write to console]:    ( 176 GO terms found. )

R[write to console]: 
Build GO DAG topology ..........

R[write to console]:    ( 442 GO terms and 780 relations. )

R[write to console]: 
Annotating nodes ...............

R[write to console]:    ( 440 genes annotated to the GO terms. )

R[write to console]: 
             -- Classic Algorithm --

         the algorithm is scoring 191 nontrivial nodes
         parameters: 
             test statistic: fisher

R[write to console]: 
             -- Weight01 Algorithm --

         the algorithm is scoring 191 nontrivial nodes
         parameters: 
             test statistic: fisher

R[write to console]: 
     Level 12:  1 nodes to be scored    (0 eliminated genes)

R[write to console]: 
     Level 11:  2 nodes to be scored    (0 eliminated genes)

R[write to console]: 
     Level 10:  4 nodes to be scored    (23 eliminated genes)

R[write to console]: 
     Level 9:   6 nodes to be scored    (31 eliminated genes)

R[write to console]: 
     Level 8:   10 nodes to be scored   (49 eliminated genes)

R[write to console]: 
     Level 7:   17 nodes to be scored   (92 eliminated genes)

R[write to console]: 
     Level 6:   25 nodes to be scored   (124 eliminated genes)

R[write to console]: 
     Level 5:   48 nodes to be scored   (198 eliminated genes)

R[write to console]: 
     Level 4:   37 nodes to be scored   (265 eliminated genes)

R[write to console]: 
     Level 3:   30 nodes to be scored   (355 eliminated genes)

R[write to console]: 
     Level 2:   10 nodes to be scored   (385 eliminated genes)

R[write to console]: 
     Level 1:   1 nodes to be scored    (410 eliminated genes)

R[write to console]: 
Building most specific GOs .....

R[write to console]:    ( 176 GO terms found. )

R[write to console]: 
Build GO DAG topology ..........

R[write to console]:    ( 442 GO terms and 780 relations. )

R[write to console]: 
Annotating nodes ...............

R[write to console]:    ( 440 genes annotated to the GO terms. )

R[write to console]: 
             -- Classic Algorithm --

         the algorithm is scoring 183 nontrivial nodes
         parameters: 
             test statistic: fisher

R[write to console]: 
             -- Weight01 Algorithm --

         the algorithm is scoring 183 nontrivial nodes
         parameters: 
             test statistic: fisher

R[write to console]: 
     Level 12:  1 nodes to be scored    (0 eliminated genes)

R[write to console]: 
     Level 11:  1 nodes to be scored    (0 eliminated genes)

R[write to console]: 
     Level 10:  1 nodes to be scored    (13 eliminated genes)

R[write to console]: 
     Level 9:   4 nodes to be scored    (13 eliminated genes)

R[write to console]: 
     Level 8:   10 nodes to be scored   (13 eliminated genes)

R[write to console]: 
     Level 7:   15 nodes to be scored   (71 eliminated genes)

R[write to console]: 
     Level 6:   31 nodes to be scored   (126 eliminated genes)

R[write to console]: 
     Level 5:   45 nodes to be scored   (207 eliminated genes)

R[write to console]: 
     Level 4:   35 nodes to be scored   (289 eliminated genes)

R[write to console]: 
     Level 3:   29 nodes to be scored   (348 eliminated genes)

R[write to console]: 
     Level 2:   10 nodes to be scored   (383 eliminated genes)

R[write to console]: 
     Level 1:   1 nodes to be scored    (410 eliminated genes)

R[write to console]: 
Building most specific GOs .....

R[write to console]:    ( 176 GO terms found. )

R[write to console]: 
Build GO DAG topology ..........

R[write to console]:    ( 442 GO terms and 780 relations. )

R[write to console]: 
Annotating nodes ...............

R[write to console]:    ( 440 genes annotated to the GO terms. )

R[write to console]: 
             -- Classic Algorithm --

         the algorithm is scoring 180 nontrivial nodes
         parameters: 
             test statistic: fisher

R[write to console]: 
             -- Weight01 Algorithm --

         the algorithm is scoring 180 nontrivial nodes
         parameters: 
             test statistic: fisher

R[write to console]: 
     Level 12:  1 nodes to be scored    (0 eliminated genes)

R[write to console]: 
     Level 11:  2 nodes to be scored    (0 eliminated genes)

R[write to console]: 
     Level 10:  4 nodes to be scored    (23 eliminated genes)

R[write to console]: 
     Level 9:   7 nodes to be scored    (31 eliminated genes)

R[write to console]: 
     Level 8:   11 nodes to be scored   (47 eliminated genes)

R[write to console]: 
     Level 7:   17 nodes to be scored   (102 eliminated genes)

R[write to console]: 
     Level 6:   26 nodes to be scored   (137 eliminated genes)

R[write to console]: 
     Level 5:   44 nodes to be scored   (222 eliminated genes)

R[write to console]: 
     Level 4:   33 nodes to be scored   (274 eliminated genes)

R[write to console]: 
     Level 3:   25 nodes to be scored   (361 eliminated genes)

R[write to console]: 
     Level 2:   9 nodes to be scored    (383 eliminated genes)

R[write to console]: 
     Level 1:   1 nodes to be scored    (410 eliminated genes)

R[write to console]: 
Building most specific GOs .....

R[write to console]:    ( 176 GO terms found. )

R[write to console]: 
Build GO DAG topology ..........

R[write to console]:    ( 442 GO terms and 780 relations. )

R[write to console]: 
Annotating nodes ...............

R[write to console]:    ( 440 genes annotated to the GO terms. )

R[write to console]: 
             -- Classic Algorithm --

         the algorithm is scoring 76 nontrivial nodes
         parameters: 
             test statistic: fisher

R[write to console]: 
             -- Weight01 Algorithm --

         the algorithm is scoring 76 nontrivial nodes
         parameters: 
             test statistic: fisher

R[write to console]: 
     Level 9:   1 nodes to be scored    (0 eliminated genes)

R[write to console]: 
     Level 8:   2 nodes to be scored    (0 eliminated genes)

R[write to console]: 
     Level 7:   4 nodes to be scored    (54 eliminated genes)

R[write to console]: 
     Level 6:   7 nodes to be scored    (85 eliminated genes)

R[write to console]: 
     Level 5:   20 nodes to be scored   (90 eliminated genes)

R[write to console]: 
     Level 4:   17 nodes to be scored   (175 eliminated genes)

R[write to console]: 
     Level 3:   16 nodes to be scored   (316 eliminated genes)

R[write to console]: 
     Level 2:   8 nodes to be scored    (349 eliminated genes)

R[write to console]: 
     Level 1:   1 nodes to be scored    (391 eliminated genes)

R[write to console]: 
Building most specific GOs .....

R[write to console]:    ( 176 GO terms found. )

R[write to console]: 
Build GO DAG topology ..........

R[write to console]:    ( 442 GO terms and 780 relations. )

R[write to console]: 
Annotating nodes ...............

R[write to console]:    ( 440 genes annotated to the GO terms. )

R[write to console]: 
             -- Classic Algorithm --

         the algorithm is scoring 101 nontrivial nodes
         parameters: 
             test statistic: fisher

R[write to console]: 
             -- Weight01 Algorithm --

         the algorithm is scoring 101 nontrivial nodes
         parameters: 
             test statistic: fisher

R[write to console]: 
     Level 9:   2 nodes to be scored    (0 eliminated genes)

R[write to console]: 
     Level 8:   6 nodes to be scored    (0 eliminated genes)

R[write to console]: 
     Level 7:   10 nodes to be scored   (55 eliminated genes)

R[write to console]: 
     Level 6:   14 nodes to be scored   (67 eliminated genes)

R[write to console]: 
     Level 5:   23 nodes to be scored   (126 eliminated genes)

R[write to console]: 
     Level 4:   19 nodes to be scored   (229 eliminated genes)

R[write to console]: 
     Level 3:   18 nodes to be scored   (291 eliminated genes)

R[write to console]: 
     Level 2:   8 nodes to be scored    (367 eliminated genes)

R[write to console]: 
     Level 1:   1 nodes to be scored    (393 eliminated genes)

R[write to console]: 
Building most specific GOs .....

R[write to console]:    ( 176 GO terms found. )

R[write to console]: 
Build GO DAG topology ..........

R[write to console]:    ( 442 GO terms and 780 relations. )

R[write to console]: 
Annotating nodes ...............

R[write to console]:    ( 440 genes annotated to the GO terms. )

R[write to console]: 
             -- Classic Algorithm --

         the algorithm is scoring 152 nontrivial nodes
         parameters: 
             test statistic: fisher

R[write to console]: 
             -- Weight01 Algorithm --

         the algorithm is scoring 152 nontrivial nodes
         parameters: 
             test statistic: fisher

R[write to console]: 
     Level 12:  1 nodes to be scored    (0 eliminated genes)

R[write to console]: 
     Level 11:  2 nodes to be scored    (0 eliminated genes)

R[write to console]: 
     Level 10:  3 nodes to be scored    (23 eliminated genes)

R[write to console]: 
     Level 9:   4 nodes to be scored    (31 eliminated genes)

R[write to console]: 
     Level 8:   7 nodes to be scored    (41 eliminated genes)

R[write to console]: 
     Level 7:   11 nodes to be scored   (91 eliminated genes)

R[write to console]: 
     Level 6:   26 nodes to be scored   (108 eliminated genes)

R[write to console]: 
     Level 5:   37 nodes to be scored   (176 eliminated genes)

R[write to console]: 
     Level 4:   27 nodes to be scored   (252 eliminated genes)

R[write to console]: 
     Level 3:   23 nodes to be scored   (324 eliminated genes)

R[write to console]: 
     Level 2:   10 nodes to be scored   (361 eliminated genes)

R[write to console]: 
     Level 1:   1 nodes to be scored    (401 eliminated genes)

R[write to console]: 
Building most specific GOs .....

R[write to console]:    ( 176 GO terms found. )

R[write to console]: 
Build GO DAG topology ..........

R[write to console]:    ( 442 GO terms and 780 relations. )

R[write to console]: 
Annotating nodes ...............

R[write to console]:    ( 440 genes annotated to the GO terms. )

R[write to console]: 
             -- Classic Algorithm --

         the algorithm is scoring 139 nontrivial nodes
         parameters: 
             test statistic: fisher

R[write to console]: 
             -- Weight01 Algorithm --

         the algorithm is scoring 139 nontrivial nodes
         parameters: 
             test statistic: fisher

R[write to console]: 
     Level 10:  1 nodes to be scored    (0 eliminated genes)

R[write to console]: 
     Level 9:   4 nodes to be scored    (0 eliminated genes)

R[write to console]: 
     Level 8:   8 nodes to be scored    (7 eliminated genes)

R[write to console]: 
     Level 7:   13 nodes to be scored   (69 eliminated genes)

R[write to console]: 
     Level 6:   21 nodes to be scored   (121 eliminated genes)

R[write to console]: 
     Level 5:   29 nodes to be scored   (213 eliminated genes)

R[write to console]: 
     Level 4:   28 nodes to be scored   (286 eliminated genes)

R[write to console]: 
     Level 3:   26 nodes to be scored   (349 eliminated genes)

R[write to console]: 
     Level 2:   8 nodes to be scored    (381 eliminated genes)

R[write to console]: 
     Level 1:   1 nodes to be scored    (408 eliminated genes)

R[write to console]: 
Building most specific GOs .....

R[write to console]:    ( 176 GO terms found. )

R[write to console]: 
Build GO DAG topology ..........

R[write to console]:    ( 442 GO terms and 780 relations. )

R[write to console]: 
Annotating nodes ...............

R[write to console]:    ( 440 genes annotated to the GO terms. )

R[write to console]: 
             -- Classic Algorithm --

         the algorithm is scoring 157 nontrivial nodes
         parameters: 
             test statistic: fisher

R[write to console]: 
             -- Weight01 Algorithm --

         the algorithm is scoring 157 nontrivial nodes
         parameters: 
             test statistic: fisher

R[write to console]: 
     Level 12:  1 nodes to be scored    (0 eliminated genes)

R[write to console]: 
     Level 11:  2 nodes to be scored    (0 eliminated genes)

R[write to console]: 
     Level 10:  2 nodes to be scored    (23 eliminated genes)

R[write to console]: 
     Level 9:   3 nodes to be scored    (31 eliminated genes)

R[write to console]: 
     Level 8:   5 nodes to be scored    (39 eliminated genes)

R[write to console]: 
     Level 7:   12 nodes to be scored   (90 eliminated genes)

R[write to console]: 
     Level 6:   24 nodes to be scored   (126 eliminated genes)

R[write to console]: 
     Level 5:   40 nodes to be scored   (168 eliminated genes)

R[write to console]: 
     Level 4:   32 nodes to be scored   (249 eliminated genes)

R[write to console]: 
     Level 3:   26 nodes to be scored   (340 eliminated genes)

R[write to console]: 
     Level 2:   9 nodes to be scored    (380 eliminated genes)

R[write to console]: 
     Level 1:   1 nodes to be scored    (403 eliminated genes)

R[write to console]: 
Building most specific GOs .....

R[write to console]:    ( 176 GO terms found. )

R[write to console]: 
Build GO DAG topology ..........

R[write to console]:    ( 442 GO terms and 780 relations. )

R[write to console]: 
Annotating nodes ...............

R[write to console]:    ( 440 genes annotated to the GO terms. )

R[write to console]: 
             -- Classic Algorithm --

         the algorithm is scoring 227 nontrivial nodes
         parameters: 
             test statistic: fisher

R[write to console]: 
             -- Weight01 Algorithm --

         the algorithm is scoring 227 nontrivial nodes
         parameters: 
             test statistic: fisher

R[write to console]: 
     Level 12:  3 nodes to be scored    (0 eliminated genes)

R[write to console]: 
     Level 11:  4 nodes to be scored    (0 eliminated genes)

R[write to console]: 
     Level 10:  6 nodes to be scored    (36 eliminated genes)

R[write to console]: 
     Level 9:   11 nodes to be scored   (40 eliminated genes)

R[write to console]: 
     Level 8:   21 nodes to be scored   (59 eliminated genes)

R[write to console]: 
     Level 7:   21 nodes to be scored   (106 eliminated genes)

R[write to console]: 
     Level 6:   36 nodes to be scored   (161 eliminated genes)

R[write to console]: 
     Level 5:   51 nodes to be scored   (203 eliminated genes)

R[write to console]: 
     Level 4:   34 nodes to be scored   (267 eliminated genes)

R[write to console]: 
     Level 3:   28 nodes to be scored   (338 eliminated genes)

R[write to console]: 
     Level 2:   11 nodes to be scored   (384 eliminated genes)

R[write to console]: 
     Level 1:   1 nodes to be scored    (410 eliminated genes)

DE Genes Across Cell Types¶

We analyze clustered perturbed genes with GO term analysis output by looking at the sharing of gene modules between cell types and differential expression of these genes under starvation

deGenesDF = pd.read_csv('D1.1810')
deGenesDF.head()

#Read in saved de_gene_adata here
de_gene_adata = anndata.read('D1.1813')
de_gene_adata

AnnData object with n_obs × n_vars = 953 × 13673
    obs: 'numIntersects', 'n_counts', 'louvain', 'clus35', 'clus14', 'clus19'
    var: 'n_counts', 'mean', 'std'
    uns: 'louvain', 'louvain_colors', 'neighbors', 'numIntersects_colors', 'pca'
    obsm: 'X_pca', 'X_tsne'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

1 2	`sc.set_figure_params(dpi=125) sc.pl.tsne(de_gene_adata,color=['louvain','numIntersects'])`

png

1
2
3

#Based on saved topGO output (See script....)
c = np.unique(de_gene_adata.obs["louvain"].values)
print(c)

['0' '1' '10' '11' '12' '13' '2' '3' '4' '5' '6' '7' '8' '9']

names = ['Protein Synthesis & Stress Response','Oxidative Phosphorylation','Cytoskeletal','Synaptic Transmission & Transport','NA','NA',
         'Phosphate-Containing Metabolic Process','Cell Cycle & Development (Early Oocytes)','Protein Synthesis',
         'Proteolysis & Cell Physiology','Cell Cycle','Protein Synthesis & Transport','Cell-Matrix Adhesion','Metabolic Process']

geneClusNames = dict(zip(c, names))

addNames = []
deGenesDF_sig = deGenesDF[deGenesDF.geneClus != 'padjClus_not_sig']

for i in deGenesDF_sig.geneClus:
  addNames += [geneClusNames[i]]

deGenesDF_sig['names'] = addNames
deGenesDF_sig.head()

	Unnamed: 0	Unnamed: 0.1	Unnamed: 0.1.1	Genes	Condition	padj	padjClus	log2FC	orthoGene	orthoDescr	pantherID	pantherDescr	goTerms	geneClus	names
0	0	0	1	XLOC_028699	Starved	5.554489e-16	1.832981e-14	-1.284301	NaN	NaN	NaN	NaN	NaN	4	Protein Synthesis
1	1	1	2	XLOC_010635	Starved	2.528288e-14	8.343350e-13	-1.492625	SRSF1	serine/arginine-rich splicing factor 1 isofor...	['PTHR24012:SF650']	['SERINE/ARGININE-RICH SPLICING FACTOR 1']	[nan]	4	Protein Synthesis
2	2	2	3	XLOC_011294	Starved	8.348790e-14	2.755101e-12	-1.441413	NaN	NaN	NaN	NaN	[nan]	0	Protein Synthesis & Stress Response
3	3	3	4	XLOC_034889	Starved	1.786565e-13	5.895663e-12	-1.448216	NaN	NaN	['PTHR13680:SF29']	['CDGSH IRON-SULFUR DOMAIN-CONTAINING PROTEIN ...	[nan]	1	Oxidative Phosphorylation
4	4	4	5	XLOC_030861	Starved	8.598653e-12	2.837556e-10	-1.570453	SRSF1	serine/arginine-rich splicing factor 1 isofor...	['PTHR24012:SF650']	['SERINE/ARGININE-RICH SPLICING FACTOR 1']	[nan]	4	Protein Synthesis

#https://stackoverflow.com/questions/29530355/plotting-multiple-histograms-in-grid

def draw_plots(df, variables, n_rows, n_cols):
    fig=plt.figure(figsize=(20,20))
    for i, var_name in enumerate(variables):
        ax=fig.add_subplot(n_rows,n_cols,i+1)
        name = geneClusNames[var_name]
        sub = df[df['names'] == name]
        sub.Cluster.value_counts().sort_values().plot(kind = 'barh',ax=ax,fontsize = 10)
        #sub['Cluster'].hist(bins=10,ax=ax)
        ax.set_title(var_name+': '+name+" Distribution",fontsize = 12)
        ax.set_ylabel("Cell Atlas Clusters",fontsize = 12)
        ax.set_xlabel("Number of Perturbed Genes from Atlas Cluster",fontsize = 12)
    fig.tight_layout()  # Improves appearance a bit.

    plt.show()

draw_plots(deGenesDF_sig, np.unique(deGenesDF_sig.geneClus), 5, 4)

png

Gene Module Plots¶

Plot gene modules colored by how genes are shared between cell types

#Mark DE/Perturbed Genes
def returnDE(i,names):
  if i in list(names):
    return 'DE'
  else:
    return 'nonSig'

#Mark genes with no GO Terms
def returnVal(i):
  if i == i:
    i= i.replace("[","")
    i = i.replace("]","")
    i= i.replace("'","")
    i = i.replace("'","")
    return i 
  else:
    return 'nan'

def returnGO(i,names):
  if i in names:
    return "withGO"
  else:
    return "n/a"

deGenesDF.goTerms = [returnVal(i) for i in list(deGenesDF.goTerms)]

deGenesDF_sub = deGenesDF[deGenesDF.geneClus != 'padjClus_not_sig']
deGenesDF_sub = deGenesDF_sub[deGenesDF_sub.goTerms != 'nan'] # Has GO Term
withGO_names = list(deGenesDF_sub.Genes)

print(len(np.unique(withGO_names)))

goLabels = [returnGO(i,withGO_names) for i in de_gene_adata.obs_names]
de_gene_adata.obs['withGO'] = pd.Categorical(goLabels)
de_gene_adata

472





AnnData object with n_obs × n_vars = 953 × 13673
    obs: 'numIntersects', 'n_counts', 'louvain', 'clus35', 'clus14', 'clus19', 'withGO'
    var: 'n_counts', 'mean', 'std'
    uns: 'louvain', 'louvain_colors', 'neighbors', 'numIntersects_colors', 'pca'
    obsm: 'X_pca', 'X_tsne'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

1
2
3

clus14Genes = deGenesDF[deGenesDF.Cluster == 14]
clus19Genes = deGenesDF[deGenesDF.Cluster == 19]
clus35Genes = deGenesDF[deGenesDF.Cluster == 35]

# Label DE/notDE genes
clus35Labels = [returnDE(i,np.unique(clus35Genes.Genes)) for i in de_gene_adata.obs_names]
de_gene_adata.obs['clus35'] = pd.Categorical(clus35Labels)

clus14Labels = [returnDE(i,np.unique(clus14Genes.Genes)) for i in de_gene_adata.obs_names]
de_gene_adata.obs['clus14'] = pd.Categorical(clus14Labels)

#sc.pl.tsne(de_gene_adata,groups=['DE'],color=['clus14'])

# Label DE/notDE genes
clus19Labels = [returnDE(i,np.unique(clus19Genes.Genes)) for i in de_gene_adata.obs_names]
de_gene_adata.obs['clus19'] = pd.Categorical(clus19Labels)

#sc.pl.tsne(de_gene_adata,groups=['DE'],color=['clus19'])

1 2	`c = list(np.unique(de_gene_adata.obs["louvain"].values)) c`

['0', '1', '10', '11', '12', '13', '2', '3', '4', '5', '6', '7', '8', '9']

#Add labels to each cluster in de_gene_adata (from GO terms) on tSNE plot
fig, ax = plt.subplots(figsize=(6,6))

#c = np.unique(de_gene_adata.obs["louvain"].values)
cmap = plt.cm.get_cmap("tab20")


for idx, (cluster, cluster) in enumerate(zip(c, c)):
    XX = de_gene_adata[de_gene_adata.obs.louvain == cluster,:].obsm["X_tsne"]

    x = XX[:,0]
    y = XX[:,1]
    ax.scatter(x, y, color = cmap(idx), label=cluster,s=25,alpha=0.7)


    ax.annotate(cluster, 
                (np.median(x), np.median(y)),
                horizontalalignment='right',
                verticalalignment='bottom',
                size=12, weight='bold',
                color="black",
                backgroundcolor=cmap(idx) )



#ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
#ax.set_axis_off()
ax.set_xlabel('tSNE1')
ax.set_ylabel('tSNE2')
ax.grid(False)
ax.get_xaxis().set_ticks([])
ax.get_yaxis().set_ticks([])
for edge_i in ['bottom','left']:
    ax.spines[edge_i].set_edgecolor("black")
for edge_i in ['top', 'right']:
    ax.spines[edge_i].set_edgecolor("white")
plt.show()

png

# Density plot of perturbed genes with two cell types
def multDensityofDE(de_gene_adata,clusName1,clusName2,label1,label2):
  s = 13
  # Add labels to each cluster in de_gene_adata (from GO terms) on tSNE plot
  fig, ax = plt.subplots(figsize=(3,3))

  XX = de_gene_adata.obsm["X_tsne"]

  de1 = np.array([de_gene_adata.obs[clusName1] == 'DE'])
  de2 = np.array([de_gene_adata.obs[clusName2] == 'DE'])

  overlap = list(np.where(de1 & de2)[1])
  only1 = [i for i in list(np.where(de1)[1]) if i not in overlap]
  only2 = [i for i in list(np.where(de2)[1]) if i not in overlap]

  nonsig = [i for i in range(0,len(XX[:,0])) if i not in overlap+only1+only2]


  x = XX[nonsig,0]
  y = XX[nonsig,1]
  ax.scatter(x, y, color = 'grey',s=25,alpha=0.1,edgecolors='none') #cmap(idx),label=cluster

  x_DE1 = XX[only1,0]
  y_DE1 = XX[only1,1]
  ax.scatter(x_DE1, y_DE1, color = 'navy',s=25,alpha=0.3,label=label1) #label=cluster

  x_DE2 = XX[only2,0]
  y_DE2 = XX[only2,1]
  ax.scatter(x_DE2, y_DE2, color = 'orange',s=25,alpha=0.3,label=label2) #label=cluster

  x_DE3 = XX[overlap,0]
  y_DE3 = XX[overlap,1]
  ax.scatter(x_DE3, y_DE3, color = 'green',s=25,alpha=0.3,label='Both') #label=cluster

  ax.set_axis_off()
  ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
  plt.show()

# Density plot of perturbed genes with three cell types
def tripleDensityofDE(de_gene_adata,clusName1,clusName2,clusName3,label1,label2):
  s = 13
  # Add labels to each cluster in de_gene_adata (from GO terms) on tSNE plot
  fig, ax = plt.subplots(figsize=(3,3))

  XX = de_gene_adata.obsm["X_tsne"]

  de1 = np.array([de_gene_adata.obs[clusName1] == 'DE'])
  de2 = np.array([de_gene_adata.obs[clusName2] == 'DE'])
  de3 = np.array([de_gene_adata.obs[clusName3] == 'DE'])

  overlap = list(np.where((de1 & de2) | (de1 & de3))[1])
  only1 = [i for i in list(np.where(de1)[1]) if i not in overlap]
  other = [i for i in list(np.where(de2 | de3)[1]) if i not in overlap]


  nonsig = [i for i in range(0,len(XX[:,0])) if i not in overlap+only1+other]


  x = XX[nonsig,0]
  y = XX[nonsig,1]
  ax.scatter(x, y, color = 'grey',s=25,alpha=0.1,edgecolors='none') #cmap(idx),label=cluster

  x_DE1 = XX[only1,0]
  y_DE1 = XX[only1,1]
  ax.scatter(x_DE1, y_DE1, color = 'purple',s=25,alpha=0.3,label=label1) #label=cluster

  x_DE2 = XX[other,0]
  y_DE2 = XX[other,1]
  ax.scatter(x_DE2, y_DE2, color = 'lightcoral',s=25,alpha=0.3,label=label2) #label=cluster

  x_DE3 = XX[overlap,0]
  y_DE3 = XX[overlap,1]
  ax.scatter(x_DE3, y_DE3, color = 'green',s=25,alpha=0.3,label='Shared') #label=cluster

  ax.set_axis_off()
  ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
  plt.show()

1	`multDensityofDE(de_gene_adata,'clus14','clus19','Cluster 14','Cluster 19')`

png

1	`tripleDensityofDE(de_gene_adata,'clus35','clus14','clus19','Cluster 35','Digestive Types (14/19)')`

png

1 2	`#Saved version used for this analysis #de_gene_adata.write('de_gene_clus_adata.h5ad')`

For Gene Cluster SplitsTree Conversion¶

from sklearn.metrics import pairwise_distances

#Centroids of cell atlas/defined clusters
def getClusCentroids(overlap_combo,pcs=60,clusType='knn_clus'):
    clusters = np.unique(overlap_combo.obs[clusType])
    centroids = np.zeros((len(clusters),pcs))

    for c in clusters:

        sub_data = overlap_combo[overlap_combo.obs[clusType] == c]
        pca_embed = sub_data.obsm['X_pca'][:,0:pcs]
        centroid = pca_embed.mean(axis=0)

        centroids[int(c),:] = list(centroid)

    return centroids

#Compare to pairwise distances between cell atlas clusters
centroids = getClusCentroids(de_gene_adata,60,'louvain')
#centroids_arr = centroids['centroid'].to_numpy()
pairCentroid_dists = pairwise_distances(centroids, metric = 'l1')
pairCentroid_dists.shape

(14, 14)

df = pd.DataFrame(pairCentroid_dists)
clus = np.unique(de_gene_adata.obs['louvain'])
df['cluster'] = range(0,len(clus))

clusters = [int(i) for i in de_gene_adata.obs['louvain']]

names = [geneClusDict[str(i)] for i in df['cluster']]

df['annos'] = names

df.to_csv('geneClustDist_SplitsTree.csv')
df.head()

	0	1	2	3	4	5	6	7	8	9	10	11	12	13	cluster	annos
0	0.000000	115.627783	143.492221	154.445199	139.531540	161.270522	154.411614	154.479290	179.288620	144.515184	118.180061	187.660104	178.807665	202.399721	0	Protein Synthesis & Stress Response
1	115.627783	0.000000	98.075214	143.125559	77.157201	132.257530	104.986413	100.198085	154.236652	120.262093	92.372829	162.726511	162.104749	182.189908	1	Oxidative Phosphorylation
2	143.492221	98.075214	0.000000	134.376313	89.768823	148.965495	132.329147	117.734078	154.317732	126.807212	122.691857	161.371515	178.094517	187.628485	2	Phosphate-Containing Metabolic Process
3	154.445199	143.125559	134.376313	0.000000	131.398811	150.699813	164.020334	161.338567	142.324225	152.467793	129.686223	162.602245	181.032459	206.935751	3	Development & Reproduction
4	139.531540	77.157201	89.768823	131.398811	0.000000	143.979940	120.886121	79.866331	160.604428	123.758153	118.502382	167.547313	167.604706	193.690030	4	Protein Synthesis

Gene expression Plots & Cluster 14 & 19 DE Gene Comparisons/Violin Plots¶

Look at expression profiles of perturbed genes from different cell types

#Kallisto bus h5ad file with no gene filtering 
bus_fs_raw = anndata.read("D1.1797")
print(bus_fs_raw )

#Read in PREVIOUSLY SAVED clustered + labeled data (cells x gene)
bus_fs_clus = anndata.read("D1.1796")
print(bus_fs_clus)
bus_fs_clus.obs['Fed'] = pd.Categorical(bus_fs_clus.obs['fed'])
#sc.pl.umap(bus_fs_clus, color='Fed')


bus_fs_raw.obs['cellRanger_louvain'] = pd.Categorical(bus_fs_clus.obs['cellRanger_louvain'])
bus_fs_raw

AnnData object with n_obs × n_vars = 13673 × 46716
    obs: 'batch'
AnnData object with n_obs × n_vars = 13673 × 8696
    obs: 'batch', 'n_counts', 'n_countslog', 'louvain', 'leiden', 'orgID', 'fed', 'starved', 'fed_neighbor_score', 'cellRanger_louvain', 'annos', 'new_cellRanger_louvain', 'annosSub'
    var: 'n_counts', 'mean', 'std'
    uns: 'annosSub_colors', 'annos_colors', 'cellRanger_louvain_colors', 'cellRanger_louvain_sizes', "dendrogram_['new_cellRanger_louvain']", 'dendrogram_new_cellRanger_louvain', 'fed_colors', 'fed_neighbor_score_colors', 'leiden', 'leiden_colors', 'louvain', 'louvain_colors', 'neighbors', 'new_cellRanger_louvain_colors', 'orgID_colors', 'paga', 'pca', 'rank_genes_groups', 'umap'
    obsm: 'X_nca', 'X_pca', 'X_tsne', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'





AnnData object with n_obs × n_vars = 13673 × 46716
    obs: 'batch', 'cellRanger_louvain'

1
2
3

deGenesDF_sig = deGenesDF[deGenesDF.padjClus < 0.05] 
clusGenes = list(deGenesDF_sig.Genes[deGenesDF_sig.Cluster == 3])
len(clusGenes)

#Expression profiles for perturbed genes in each cell type (all high expression or not)

cellTypes = np.unique(deGenesDF_sig.Cluster)
colors = bus_fs_clus.uns['annosSub_colors']
cmap = [i for i in colors]
vals = []
types = []
num = []
cmap_l = []

exprPlots = pd.DataFrame()
for t in cellTypes:
  clusGenes = list(deGenesDF_sig.Genes[deGenesDF_sig.Cluster == t])
  sub = bus_fs_raw[:,clusGenes]
  sub = sub[sub.obs['cellRanger_louvain'].isin([t])]
  toAdd = sub.X.todense().mean(0)
  #Calculate average expression for each gene across cells in type t
  means = toAdd.flatten().tolist()[0]
  vals += means

  types += list(np.repeat(t,len(means)))
  num += list(np.repeat(len(means),len(means)))
  cmap_l  += [cmap[t]]


exprPlots['cellType'] = types
exprPlots['expr'] = np.log1p(vals)
exprPlots['numGenes'] = num

1	`exprPlots.head()`

	expr	numGenes
0	2.297710	99
1	1.577077	99
2	2.441016	99
3	1.346493	99
4	1.190470	99

palette = sns.color_palette(cmap_l)
fig, ax = plt.subplots(figsize=(10,6))
ax = sns.boxplot(x="numGenes", y="expr", hue='cellType',palette=palette, data=exprPlots,order=np.arange(190),dodge=False)


for i,artist in enumerate(ax.artists):
    # Set the linecolor on the artist to the facecolor, and set the facecolor to None
    col = artist.get_facecolor()
    artist.set_edgecolor(col)
    artist.set_facecolor('None')

    # Each box has 6 associated Line2D objects (to make the whiskers, fliers, etc.)
    # Loop over them here, and use the same colour as above
    for j in range(i*6,i*6+6):
        line = ax.lines[j]
        if j == (i*6+4):
          line.set_color('black')
          line.set_mfc('black')
          line.set_mec('black')
        else:
          line.set_color(col)
          line.set_mfc(col)
          line.set_mec(col)


ax.set_xlabel('Number of Perturbed Genes')
ax.set_ylabel('log(Expression)')
ax.grid(False)

for edge_i in ['bottom','left']:
    ax.spines[edge_i].set_edgecolor("black")
for edge_i in ['top', 'right']:
    ax.spines[edge_i].set_edgecolor("white")
ax.legend(loc='center left',bbox_to_anchor=(1, 0.5),prop={'size': 10},frameon=False,ncol=2,title="Cell Type")


for i in range(190):
     if i % 25 != 0:
       ax.xaxis.get_major_ticks()[i].draw = lambda *args:None


plt.show()

png

clus14 = bus_fs_clus[bus_fs_clus.obs['cellRanger_louvain'].isin([14])]
clus14.obs['Condition2'] = pd.Categorical(clus14.obs['fed'])
clus19 = bus_fs_clus[bus_fs_clus.obs['cellRanger_louvain'].isin([19])]
clus19.obs['Condition2'] = pd.Categorical(clus19.obs['fed'])

*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.

png

Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.

1	`newOfInterest = ['XLOC_043836','XLOC_043846','XLOC_007437','XLOC_009798','XLOC_008632','XLOC_033541','XLOC_004011']`

axes = sc.pl.violin(clus14, newOfInterest, groupby='Condition2',ylabel=['Oxidoreductase','Dioxygenase',
                                                                        'POSTN-related','KRTAP5','Uncharacterized','NPC2','NPC2'],show=False)
for ax in axes:
  ax.set_ylim(0, 5)
  ax.grid(False)
  ax.spines['right'].set_visible(False)
  ax.spines['top'].set_visible(False)
  ax.spines['bottom'].set_color('black')
  ax.spines['left'].set_color('black') 
axes = sc.pl.violin(clus19, newOfInterest, groupby='Condition2',ylabel=['Oxidoreductase','Dioxygenase',
                                                                        'POSTN-related','KRTAP5','Uncharacterized','NPC2','NPC2'],show=False)
for ax in axes:
  ax.set_ylim(0, 5)
  ax.grid(False)
  ax.spines['right'].set_visible(False)
  ax.spines['top'].set_visible(False)
  ax.spines['bottom'].set_color('black')
  ax.spines['left'].set_color('black')

png

newOfInterest = ['XLOC_035232','XLOC_007437'] 
axes = sc.pl.violin(clus14, newOfInterest, groupby='Condition2',ylabel=['TGFB-like',
                                                                        'POSTN-like'],show=False)
for ax in axes:
  ax.set_ylim(0, 7)
  ax.grid(False)
  ax.spines['right'].set_visible(False)
  ax.spines['top'].set_visible(False)
  ax.spines['bottom'].set_color('black')
  ax.spines['left'].set_color('black') 

axes = sc.pl.violin(clus19, newOfInterest, groupby='Condition2',ylabel=['TGFB-like',
                                                                        'POSTN-like'],show=False)
for ax in axes:
  ax.set_ylim(0, 7)
  ax.grid(False)
  ax.spines['right'].set_visible(False)
  ax.spines['top'].set_visible(False)
  ax.spines['bottom'].set_color('black')
  ax.spines['left'].set_color('black')

png

#Cluster 35, early oocytes Violin Plots

bus_fs_clus.obs['fed'] = pd.Categorical(bus_fs_clus.obs['fed'])
clus35 = bus_fs_clus[bus_fs_clus.obs['cellRanger_louvain'].isin([35])]

newOfInterest = ['XLOC_012655','XLOC_004306','XLOC_037567','XLOC_006902','XLOC_016286','XLOC_037771',
                 'XLOC_035249','XLOC_012527','XLOC_001916','XLOC_015039']

clus35.obs['Condition2'] = pd.Categorical(clus35.obs['fed'])
#sc.pl.umap(clus35, color='Condition')
sc.pl.violin(clus35, newOfInterest, groupby='Condition2',ylabel=['DNAJ','MCM8','TF AP4','CAF-1','SPO11','MOV10L1',
                                                                 'SIN3A','FAF1','CDK9','TMBIM4'])



newOfInterest = ['XLOC_016286','XLOC_012527']
#sc.pl.umap(clus35, color='Condition')
axes = sc.pl.violin(clus35, newOfInterest, groupby='Condition2',ylabel=['MEIOTIC RECOMBINATION PROTEIN','FAF1'],show=False)

for ax in axes:
  ax.set_ylim(0, 5)
  ax.grid(False)
  ax.spines['right'].set_visible(False)
  ax.spines['top'].set_visible(False)
  ax.spines['bottom'].set_color('black')
  ax.spines['left'].set_color('black')

Trying to set attribute `.obs` of view, copying.

png

Overlap with Original CellRanger Clusters¶

Use Jaccard Index to quantify overlap of markers for the 36 cell types between the Kallisto-processed and initial, Cell Ranger processed data

n=100
bus_fs_clus = anndata.read("D1.1796")
cellRanger_fs = anndata.read("D1.1798")

bus_fs_clus.obs['cellRanger_louvain'] = pd.Categorical(bus_fs_clus.obs['cellRanger_louvain'])
sc.tl.rank_genes_groups(cellRanger_fs,groupby='louvain',n_genes=n,method='wilcoxon')
sc.tl.rank_genes_groups(bus_fs_clus,groupby='cellRanger_louvain',n_genes=n,method='wilcoxon')

#Show pairwise overlap in top 100 names between all clusters, 36x36 grid
clus = np.unique(bus_fs_clus.obs['cellRanger_louvain'])

cellR = [[]]*len(clus) #np array of top 100 genes for each of 36 clusters
busFS = [[]]*len(clus)#np array of top 100 genes for each of 36 clusters

for c in clus:
  cellR[c] =  list(cellRanger_fs.uns['rank_genes_groups']['names'][str(c)])
  busFS[c] = list(bus_fs_clus.uns['rank_genes_groups']['names'][str(c)])

1	`#https://stackoverflow.com/questions/52408910/python-pairwise-intersection-of-multiple-lists-then-sum-up-all-duplicates`

from itertools import combinations_with_replacement

#Changed to calculate Jaccard Index
def intersect(i,j):
  return len(set(cellR[i]).intersection(busFS[j]))/len(set(cellR[i]).union(busFS[j]))

def pairwise(clus):        
  # Initialise precomputed matrix (num of clusters, 36x36)
  precomputed = np.zeros((len(clus),len(clus)), dtype='float')
  # Initialise iterator over objects in X
  iterator = combinations_with_replacement(range(0,len(clus)), 2)
  # Perform the operation on each pair
  for i,j in iterator:
    precomputed[i,j] = intersect(i,j)          
  # Make symmetric and return
  return precomputed + precomputed.T - np.diag(np.diag(precomputed))

1	`pairCorrs = pairwise(clus)`

plt.figure(figsize=(7,7))
plt.imshow(pairCorrs, cmap='viridis')
plt.colorbar()
plt.xlabel('kallisto Clusters')
plt.ylabel('Cell Ranger Clusters')

plt.xticks(np.arange(0, 36, 1),fontsize=6)
plt.yticks(np.arange(0, 36, 1),fontsize=6)
plt.grid(color='black',linewidth=0.3)
plt.show()

png

Cell Type Data for SplitsTree¶

print(bus_fs_clus)

from sklearn.metrics import pairwise_distances

#Centroids of cell atlas/defined clusters
def getClusCentroids(overlap_combo,pcs=60,clusType='knn_clus'):
    clusters = np.unique(overlap_combo.obs[clusType])
    centroids = np.zeros((len(clusters),pcs))

    for c in clusters:

        sub_data = overlap_combo[overlap_combo.obs[clusType] == c]
        pca_embed = sub_data.obsm['X_pca'][:,0:pcs]
        centroid = pca_embed.mean(axis=0)

        centroids[int(c),:] = list(centroid)

    return centroids

AnnData object with n_obs × n_vars = 13673 × 8696
    obs: 'batch', 'n_counts', 'n_countslog', 'louvain', 'leiden', 'orgID', 'fed', 'starved', 'fed_neighbor_score', 'cellRanger_louvain', 'annos', 'new_cellRanger_louvain', 'annosSub'
    var: 'n_counts', 'mean', 'std'
    uns: 'annosSub_colors', 'annos_colors', 'cellRanger_louvain_colors', 'cellRanger_louvain_sizes', "dendrogram_['new_cellRanger_louvain']", 'dendrogram_new_cellRanger_louvain', 'fed_colors', 'fed_neighbor_score_colors', 'leiden', 'leiden_colors', 'louvain', 'louvain_colors', 'neighbors', 'new_cellRanger_louvain_colors', 'orgID_colors', 'paga', 'pca', 'rank_genes_groups', 'umap'
    obsm: 'X_nca', 'X_pca', 'X_tsne', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

#Compare to pairwise distances between cell atlas clusters
fed_only = bus_fs_clus[bus_fs_clus.obs['fed'] == 'True']
centroids = getClusCentroids(fed_only,60,'cellRanger_louvain')
#centroids_arr = centroids['centroid'].to_numpy()
pairCentroid_dists = pairwise_distances(centroids, metric = 'l1')
pairCentroid_dists.shape

(36, 36)

clus = np.unique(fed_only.obs['cellRanger_louvain'])
df = pd.DataFrame(pairCentroid_dists)
df['cluster'] = range(0,len(clus))

clusters = [int(i) for i in fed_only.obs['cellRanger_louvain']]
annos = fed_only.obs['annosSub']

annosDict = {}
for i in range(0,len(clusters)):
  annosDict[clusters[i]] = annos[i]


names = [annosDict[i] for i in df['cluster']]

df['annos'] = names
df.head()

df.to_csv('cellTypeDist_SplitsTree.csv')

1	`df.head()`

	0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	32	33	34	35	cluster	annos
0	0.000000	73.464909	65.525303	75.936037	74.636204	85.283588	73.824954	82.401569	75.196994	89.087199	99.485741	106.611646	54.108183	120.785654	108.300500	99.363475	95.612621	129.268401	110.133130	142.236416	77.468914	135.696652	128.288597	135.917111	138.019980	128.605249	129.160607	189.208824	96.878118	138.443425	110.750130	182.934958	237.364691	132.279790	226.214219	217.054862	0	Stem Cells
1	73.464909	0.000000	73.655777	70.877597	77.173203	89.054201	79.136972	76.645835	80.323026	87.011464	105.366511	117.057891	80.386629	119.954732	105.483109	96.652534	104.324513	139.127698	102.092035	121.881019	84.260834	138.856221	118.074936	131.726808	123.111721	131.677083	124.471839	171.451975	111.999539	148.526727	113.647093	178.768390	218.127628	135.157440	239.898399	266.314562	1	Exumbrella Epidermis
2	65.525303	73.655777	0.000000	64.303731	68.100740	81.223223	80.838113	77.755090	78.419210	84.807741	98.161895	103.657681	72.123625	96.296266	112.966357	86.755541	99.595890	125.994965	109.128990	145.543685	49.520744	140.668973	110.204552	134.502650	128.123380	119.130684	117.597645	187.189022	94.002531	142.049688	111.176973	174.081892	228.301164	109.351743	233.731906	245.953266	2	Large Oocytes
3	75.936037	70.877597	64.303731	0.000000	72.357871	87.616355	86.604400	60.036329	91.686399	93.482419	105.186968	110.185755	77.286173	122.598505	103.684279	79.500823	93.338071	128.078706	109.438380	131.196984	70.998642	143.392353	121.166188	126.212171	112.271870	130.067005	132.364972	186.111448	96.825716	135.182865	114.170083	186.398904	215.861273	123.974192	231.286331	255.491559	3	Digestive Subtype 1
4	74.636204	77.173203	68.100740	72.357871	0.000000	88.802717	85.075363	80.497037	85.861816	94.622659	105.989449	114.388857	79.916953	124.872919	122.905036	97.065098	101.276803	139.583311	104.866280	145.690134	68.573588	147.198918	125.230667	141.370164	131.855025	131.689243	127.044982	185.315343	103.332901	118.784507	98.033347	178.431799	234.030079	129.508063	237.051490	250.430968	4	Manubrium Epidermis

Data for SplitsTree analysis of all cells within cell types of interest

def convertCond(fed):
  if fed == 'True':
    return 'Control'
  else:
    return 'Starved'

#Save csv for cell type 14 (dig. gastroderm type)
pcs=60
sub = bus_fs_clus[bus_fs_clus.obs['cellRanger_louvain'] == 14]
pca_embed = sub.obsm['X_pca'][:,0:pcs]
cellDists = pairwise_distances(pca_embed, metric = 'l1')

df = pd.DataFrame(cellDists)

conds =  [convertCond(i) for i in sub.obs['fed']]
for i in range(0,len(conds)):
  conds[i] = conds[i]+'_'+str(i)

df['annos'] = conds
df['cluster'] = np.repeat(14,len(conds))

df.to_csv('cells14Dist_SplitsTree.csv')

df.head()

	0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	32	33	34	35	36	37	38	39	...	283	284	285	286	287	288	289	290	291	292	293	294	295	296	297	298	299	300	301	302	303	304	305	306	307	308	309	310	311	312	313	314	315	316	317	318	319	320	annos	cluster
0	0.000000	94.143427	48.626253	101.574068	109.223591	186.350220	190.945339	194.066881	156.403821	218.048962	168.793326	137.715137	179.836035	149.927222	195.354427	233.945757	166.067516	136.096284	216.107262	199.529440	193.207474	182.298597	178.294928	213.548189	172.876985	184.106836	181.708748	212.022591	195.449468	163.745207	163.936970	212.432469	191.489405	229.737238	203.838168	192.304306	189.078393	182.772713	230.196760	200.160259	...	229.297610	302.022273	103.226011	246.180590	79.685577	228.837330	179.028481	175.254564	204.683130	132.183881	190.640601	188.083746	174.172712	127.577098	91.216964	186.637843	220.118893	174.819850	106.416963	90.373066	98.419187	109.399699	181.359886	196.729804	157.085444	133.338628	161.478459	209.860675	186.578729	210.796355	210.508243	210.962729	153.360147	169.810844	214.112565	124.262565	163.718460	196.192738	Control_0	14
1	94.143427	0.000000	86.390540	58.534484	73.824113	153.057557	144.543805	144.581489	177.692855	163.166209	156.282159	101.568765	133.849859	105.531266	158.145493	182.800918	137.467117	150.280211	168.261541	193.167339	137.709932	173.786182	131.830320	164.291988	106.345033	152.030092	135.129226	157.747258	147.633373	122.283538	115.482289	169.319059	150.128244	186.767662	151.456408	152.404536	154.228067	158.826242	181.862659	158.542549	...	191.266843	271.680207	84.215284	199.883731	41.845021	175.484402	126.961714	127.455923	168.441797	123.490639	137.286613	138.587477	126.644105	91.531271	36.178894	129.433266	234.363872	136.274003	84.452477	95.836442	56.339853	64.970466	158.308563	182.310413	154.223460	103.979841	107.624468	153.881173	145.742968	195.334608	169.259788	149.438632	159.529412	131.624746	163.498180	102.808167	118.140950	162.508761	Control_1	14
2	48.626253	86.390540	0.000000	97.736700	98.147891	164.746364	184.626710	178.048935	149.629548	213.443877	147.289196	123.705298	168.429515	143.842020	179.956767	213.739130	143.481182	135.920917	206.268016	174.021207	182.332838	155.756659	164.451219	214.787755	162.804292	164.391505	169.553513	199.173782	185.730899	152.403594	156.011609	193.935450	167.398094	222.720465	201.603039	175.217648	180.261124	152.206669	228.494801	190.030389	...	220.140234	294.730742	85.742543	240.889364	71.882806	222.800527	166.456497	175.917007	181.432335	113.941515	187.045217	176.779761	168.326184	132.838011	80.586767	177.987493	212.425775	164.874481	88.778732	71.622405	83.718424	98.078372	157.648039	186.338032	144.322076	119.795497	149.711772	206.515952	187.489883	219.176888	207.909479	205.898141	131.169522	157.164535	194.733981	111.197437	150.988701	169.121609	Control_2	14
3	101.574068	58.534484	97.736700	0.000000	75.298443	161.961798	143.317353	149.625743	173.866739	142.182168	157.130683	94.061077	128.427204	103.337065	155.178435	177.437357	141.917049	167.117992	154.682007	205.637233	136.567256	176.289161	124.402694	136.016197	93.230688	157.686644	136.545843	139.872165	123.517754	129.625700	111.319758	165.674834	154.912371	177.827547	127.548855	155.486745	145.820134	170.591673	159.679090	152.910245	...	166.074201	277.655783	73.963112	181.608547	69.183016	153.739582	124.156899	106.971703	171.007710	125.184050	128.177026	124.388380	107.059112	87.626529	58.429277	130.799655	245.649857	121.897359	75.102888	91.946107	68.707121	75.724273	171.676829	182.136511	164.883143	99.161451	116.528071	128.388914	129.884990	203.863351	159.806983	140.013943	167.867174	137.436931	160.685175	90.797872	94.354009	176.591642	Control_3	14
4	109.223591	73.824113	98.147891	75.298443	0.000000	132.009158	134.578927	135.770046	143.583777	155.297091	127.364356	77.128204	115.733704	106.414797	133.832323	157.963138	103.322510	149.783598	163.513924	173.290917	130.087097	151.744598	126.224322	160.863930	100.225399	132.604051	119.144223	152.362733	149.329880	117.102320	116.754743	147.826127	124.619254	186.254123	147.426402	140.962448	132.043963	133.042190	182.821648	140.550387	...	175.898428	260.451376	49.450630	171.985034	75.543180	162.706951	113.656639	132.241623	145.007483	105.843326	145.833708	118.861925	119.538196	125.271453	67.955246	129.556370	218.066169	128.970532	62.333552	64.080649	52.578907	85.173873	137.664802	164.840957	128.423699	76.853709	126.766379	149.744109	163.905734	195.949054	150.687379	135.842524	128.599873	134.604387	143.898806	78.894951	112.537008	146.474484	Control_4	14

5 rows × 323 columns

#Save csv for cell type 8 
pcs=60
sub = bus_fs_clus[bus_fs_clus.obs['cellRanger_louvain'] == 8]
pca_embed = sub.obsm['X_pca'][:,0:pcs]
cellDists = pairwise_distances(pca_embed, metric = 'l1')

df = pd.DataFrame(cellDists)

conds =  [convertCond(i) for i in sub.obs['fed']]

for i in range(0,len(conds)):
  conds[i] = conds[i]+'_'+str(i)

df['annos'] = conds
df['cluster'] = np.repeat(8,len(conds))

df.to_csv('cells8Dist_SplitsTree.csv')

df.head()

	0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	32	33	34	35	36	37	38	39	...	551	552	553	554	555	556	557	558	559	560	561	562	563	564	565	566	567	568	569	570	571	572	573	574	575	576	577	578	579	580	581	582	583	584	585	586	587	588	annos	cluster
0	0.000000	60.144983	76.372043	66.494132	56.432907	106.750517	63.259248	64.543560	103.313487	88.505627	62.510605	62.839743	54.117143	60.876416	69.241539	49.890980	63.115900	75.901447	65.833620	46.972126	59.117521	50.828162	64.611214	83.392522	54.228186	60.750958	61.832451	65.970672	56.634726	62.320127	60.930371	94.799349	54.312209	80.935857	56.469866	78.045338	97.414141	47.106535	52.340316	47.393314	...	56.317990	78.665763	102.519642	54.470162	74.206045	71.110467	57.144690	60.212767	94.438412	64.947516	58.463693	96.371474	70.982465	68.470088	55.454763	90.557976	69.057868	57.370530	110.705628	62.855537	55.911682	78.440062	65.798097	64.860006	82.284685	78.548047	60.363950	47.283687	118.235103	61.281831	62.095791	70.993424	63.176492	82.638864	65.643171	49.659580	72.415766	100.235864	Control_0	8
1	60.144983	0.000000	86.442079	83.235973	51.924538	118.189361	69.300209	69.237901	111.011623	96.840631	66.761150	62.787025	64.509542	70.040120	82.535353	57.042035	62.217727	50.519683	50.821090	44.514138	56.116313	44.747514	55.996851	65.429151	58.836193	53.755685	63.995224	73.764828	80.625462	60.245426	70.809966	77.816029	57.881604	93.299843	47.710753	91.230807	118.168571	54.537929	54.853684	44.323889	...	52.669083	87.858019	116.350319	68.076112	84.519127	92.341931	58.762293	77.746750	103.652715	52.284794	71.777005	96.288046	79.784954	84.964503	50.106554	58.830359	46.453382	51.187307	119.734781	60.088097	55.391921	93.297142	56.703107	57.903604	92.084961	57.693223	76.727119	57.380273	123.147671	45.369704	58.613022	58.896148	77.726322	85.332088	79.407758	53.452267	63.079363	112.651485	Control_1	8
2	76.372043	86.442079	0.000000	92.197382	63.272943	105.308902	72.240901	63.892020	70.177240	56.916090	61.509721	60.661871	58.840806	50.095664	52.716062	60.069598	87.517705	86.563938	73.395764	74.351870	69.882013	76.141038	84.990894	105.735411	68.030166	70.006578	48.973458	48.041114	82.686475	90.131740	54.633708	118.670893	56.979855	62.765306	77.565609	46.419926	70.692113	77.078970	61.443398	75.723481	...	90.866300	64.740272	104.697828	50.934523	58.325615	56.903474	76.533149	59.223939	106.632709	87.948440	53.746843	141.567596	51.720590	56.263659	78.111106	105.336517	96.708537	66.323087	80.872255	102.348394	71.995598	51.604319	79.604278	87.046409	102.458091	84.284729	51.868968	60.383201	109.956103	66.412003	88.741395	98.565783	60.074302	75.270098	53.960435	68.212597	83.672812	83.044235	Starved_2	8
3	66.494132	83.235973	92.197382	0.000000	76.283285	100.474915	59.714631	75.692691	105.347625	96.883744	84.592759	77.868867	71.077264	71.623509	85.937109	68.892743	73.558295	102.805204	91.201422	76.869735	84.751180	70.014629	85.987749	104.618756	75.588171	80.724889	82.444481	84.341322	55.293334	76.457045	64.812197	103.164522	66.077887	81.304880	80.937751	86.652107	104.123518	70.264511	65.563247	74.535903	...	71.922489	86.318574	106.602828	77.004801	86.556066	83.766100	66.018927	77.703502	95.756355	84.888723	78.641428	88.892761	91.098623	86.168107	80.344344	109.601149	87.865686	79.999816	106.091861	73.944167	74.653632	94.131515	92.263945	75.156970	68.755884	93.553924	80.329967	69.485267	105.038163	76.858074	79.582475	92.043199	72.041728	92.608716	80.959029	88.158541	74.342947	106.086149	Starved_3	8
4	56.432907	51.924538	63.272943	76.283285	0.000000	113.375482	70.604369	65.810346	99.366576	83.913583	61.056639	59.528658	55.141460	59.658959	64.488933	48.897210	59.044761	62.925647	55.318539	51.081762	57.109698	44.735403	57.856466	82.044829	60.552988	58.173658	60.841931	64.581594	75.487261	62.866149	57.990504	89.923440	46.204944	86.102504	52.315008	66.294960	96.090289	50.348422	52.381180	40.238280	...	63.998809	81.335290	109.842015	49.136437	78.641090	70.909253	63.480354	62.613485	113.272593	63.541737	60.484872	107.174455	65.626185	74.159397	59.472820	83.899269	58.955134	46.782148	106.250577	66.641848	52.169798	76.424168	48.945158	56.051539	96.444591	66.309185	68.701733	45.760853	125.932776	47.310946	64.777106	71.938300	62.361760	88.076779	71.189860	54.868896	66.827119	106.934622	Control_4	8

5 rows × 591 columns

array([0, 0, 0, 0, 0])