import requests
from tqdm import tnrange, tqdm_notebook
def download_file(doi,ext):
url = 'https://api.datacite.org/dois/'+doi+'/media'
r = requests.get(url).json()
netcdf_url = r['data'][0]['attributes']['url']
r = requests.get(netcdf_url,stream=True)
#Set file name
fname = doi.split('/')[-1]+ext
#Download file with progress bar
if r.status_code == 403:
print("File Unavailable")
if 'content-length' not in r.headers:
print("Did not get file")
else:
with open(fname, 'wb') as f:
total_length = int(r.headers.get('content-length'))
pbar = tnrange(int(total_length/1024), unit="B")
for chunk in r.iter_content(chunk_size=1024):
if chunk:
pbar.update()
f.write(chunk)
return fname
1
2
3
4
5
#Import raw, unclustered stimulation data
download_file('10.22002/D1.1814','.gz')
#Import cell barcodes --> individual ID matrix from ClickTag filtering
download_file('10.22002/D1.1817','.gz')
import scanpy as sc
import anndata
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.sparse
import scipy.io as sio
import seaborn as sns
import random
import warnings
warnings.filterwarnings('ignore')
from sklearn.neighbors import (KNeighborsClassifier,NeighborhoodComponentsAnalysis)
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
sc.set_figure_params(dpi=125)
%load_ext rpy2.ipython
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
#Read in annotations
from io import StringIO
hg_ortho_df = pd.read_csv(StringIO(''.join(l.replace('|', '\t') for l in open('D1.1819'))),
sep="\t",header=None,skiprows=[0,1,2,3])
hg_ortho_df[['XLOC','TCONS']] = hg_ortho_df[13].str.split(expand=True)
hg_ortho_df[['Gene','gi']] = hg_ortho_df[3].str.split(expand=True)
hg_ortho_df['Description']= hg_ortho_df[11]
panther_df = pd.read_csv('D1.1820',
sep="\t",header=None) #skiprows=[0,1,2,3]
goTerm_df = pd.read_csv('D1.1822',
sep=" ",header=None) #skiprows=[0,1,2,3]
#Assign labels to cells (condition and organism ID/#)
!mv D1.1817 jelly4stim_individs_tagCells_50k.mat
barcodes_list = sio.loadmat('jelly4stim_individs_tagCells_50k.mat')
barcodes_list.pop('__header__', None)
barcodes_list.pop('__version__', None)
barcodes_list.pop('__globals__', None)
# Add all cell barcodes for each individual
barcodes = []
for b in barcodes_list:
if barcodes_list[b] != "None":
barcodes.append(b)
print(len(barcodes))
barcodes = [s.replace('-1', '-3') for s in barcodes]
barcodes = [s.replace('-2', '-1') for s in barcodes]
barcodes = [s.replace('-3', '-2') for s in barcodes]
#Flip -1 -2 labels (from Miseq/Hiseq lane ordering)
def convertCode(b_list):
b_list = [s.replace('-1', '-3') for s in b_list]
b_list = [s.replace('-2', '-1') for s in b_list]
b_list = [s.replace('-3', '-2') for s in b_list]
return b_list
def convertCode_str(b):
b = b.replace('-1', '-3')
b = b.replace('-2', '-1')
b = b.replace('-3', '-2')
return b
fixed_bars = {}
for b in barcodes:
fixed_bars[b] = barcodes_list[convertCode_str(b)][0]
#Add condition labels to individuals
ids = []
for name in bus_stim.obs_names.tolist():
for barcode, individ in fixed_bars.items():
if (name in barcode):
ids += [int(individ)]
condition = ['SW','SW','SW','SW','DI','DI','DI','DI','KCl','KCl','KCl','KCl']
#Add ID numbers to individuals
labels = []
for name in bus_stim.obs_names.tolist():
for barcode, individ in fixed_bars.items():
if name in barcode:
labels += [condition[int(individ)-1]]
bus_stim.obs['condition'] = pd.Categorical(labels)
bus_stim.obs['orgID'] = pd.Categorical(ids)
bus_stim
colors = bus_stim_clus.uns['annosSub_colors']#bus_fs_clus.uns['annosSub_colors']
colors
fig, ax = plt.subplots(figsize=(10,10))
c = np.unique(bus_stim_clus.obs["cellRanger_louvain"].values)
cmap = [i+'70' for i in colors]#plt.cm.get_cmap("tab20")
names = c
for idx, (cluster, name) in enumerate(zip(c, names)):
XX = bus_stim_clus[bus_stim_clus.obs.cellRanger_louvain.isin([cluster]),:].obsm["X_umap"]
text = list(bus_stim_clus[bus_stim_clus.obs.cellRanger_louvain.isin([cluster]),:].obs.annosSub)[0]
x = XX[:,0]
y = XX[:,1]
ax.scatter(x, y, color = cmap[idx], label=str(cluster)+': '+text,s=5)
ax.annotate(name,
(np.mean(x), np.mean(y)),
horizontalalignment='center',
verticalalignment='bottom',
size=10, weight='bold',
color="black",
backgroundcolor=cmap[idx])
ax.legend(loc='center left',bbox_to_anchor=(1, 0.5),prop={'size': 8},frameon=False)
ax.set_axis_off()
plt.savefig('36ClusAtlas_Stim.pdf')
plt.show()
#Subsample for clusters > 100 in size
#Subsample from full dataset, across each cluster
def subSample(adata):
groups = np.unique(adata.obs['cellRanger_louvain'])
subSample = 100
cellNames = np.array(adata.obs_names)
allCells = []
for i in groups:
#subSample = clusSize[i] # Uncomment if not looking at similar scale cluster sizes
cells = np.array(list(adata.obs['cellRanger_louvain'].isin([i])))
cellLocs = list(np.where(cells)[0])
if len(cellLocs) > 100:
#Take all cells if < subSample
choice = random.sample(cellLocs,subSample)
else:
choice = cellLocs
pos = list(choice)
#print(len(pos))
allCells += list(cellNames[pos])
sub = adata[allCells,:]
return sub
# #Filtered list of top markers
topMarkers = pd.read_csv('D1.1809',sep=",")
#topMarkers.head()
topMarkers = topMarkers[0:51]
topGenes = []
names = []
var_groups = []
var_labels = []
ind = 0
#n_genes = 2
for i in np.unique(topMarkers.clus):
sub = topMarkers[topMarkers.clus == i]
#sub.sort_values(by='padj',ascending=True)
#noDups = [i for i in sub.markerGene if i not in topGenes] #Remove duplicate genes
topGenes += list(sub.markerGene)
names += list(sub.annot)
var_groups += [(ind,ind+len(list(sub.annot))-1)]
var_labels += [str(int(i))] #make i from clusNameDict[i]
ind += len(list(sub.annot))
#Show pairwise overlap in top 100 names between all clusters, 36x36 grid
#https://stackoverflow.com/questions/52408910/python-pairwise-intersection-of-multiple-lists-then-sum-up-all-duplicates
clus = np.unique(bus_fs_clus.obs['cellRanger_louvain'])
busStim = [[]]*len(clus) #np array of top 100 genes for each of 36 clusters
busFS = [[]]*len(clus)#np array of top 100 genes for each of 36 clusters
for c in clus:
busStim[c] = list(bus_stim_clus.uns['rank_genes_groups']['names'][str(c)])
busFS[c] = list(bus_fs_clus.uns['rank_genes_groups']['names'][str(c)])
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
from itertools import combinations_with_replacement
#Changed to calculate Jaccard Index
def intersect(i,j):
return len(set(busStim[i]).intersection(busFS[j]))/len(set(busStim[i]).union(busFS[j]))
def pairwise(clus):
# Initialise precomputed matrix (num of clusters, 36x36)
precomputed = np.zeros((len(clus),len(clus)), dtype='float')
# Initialise iterator over objects in X
iterator = combinations_with_replacement(range(0,len(clus)), 2)
# Perform the operation on each pair
for i,j in iterator:
precomputed[i,j] = intersect(i,j)
# Make symmetric and return
return precomputed + precomputed.T - np.diag(np.diag(precomputed))
#Calculate number of neighbors (out of top 15) with same starvation condition
#Input adata object and if score is for fed or starved (True or False)
def neighborScores(adata,conditionBool):
sc.pp.neighbors(adata,n_neighbors=15)
neighborDists = adata.uns['neighbors']['distances'].todense()
counts = []
for i in range(0,len(adata.obs_names)):
cellNames = adata.obs_names
#get condition observation for this cell
cellObs = adata.obs['condition'][cellNames[i]]
#get row for cell
nonZero = neighborDists[i,:]>0
l = nonZero.tolist()[0]
cellRow = neighborDists[i,:]
cellRow = cellRow[:,l]
#get 'fed' observations
obs = adata.obs['condition'][l]
# count # in 'fed' observations == cell obs
count = 0
#if cellObs == conditionBool:
for j in obs:
if j == conditionBool:
count += 1
counts += [count]
print(len(counts))
return counts