import requests
from tqdm import tnrange, tqdm_notebook
def download_file(doi,ext):
    """Download the first media file registered for a DataCite DOI.

    Queries the DataCite ``/media`` endpoint for ``doi``, takes the URL of
    the first media entry, and streams it to a file in the current working
    directory named ``<last path component of doi><ext>``.

    Args:
        doi: DOI string, e.g. ``"10.22002/D1.1811"``.
        ext: Suffix appended to the output file name (e.g. ``".csv"``).

    Returns:
        The name of the written file, or ``None`` when the file could not be
        fetched (a message is printed in that case).
    """
    chunk_size = 1024

    url = 'https://api.datacite.org/dois/'+doi+'/media'
    r = requests.get(url).json()
    netcdf_url = r['data'][0]['attributes']['url']
    r = requests.get(netcdf_url,stream=True)
    #Set file name
    fname = doi.split('/')[-1]+ext

    try:
        # Bail out on a 403 *before* touching the disk: the error response
        # may carry its own content-length, and would otherwise be saved as
        # if it were the requested data.
        if r.status_code == 403:
            print("File Unavailable")
            return None
        if 'content-length' not in r.headers:
            print("Did not get file")
            return None

        total_length = int(r.headers['content-length'])
        # Ceiling division so a trailing partial chunk is counted too.
        n_chunks = -(-total_length // chunk_size)

        #Download file with progress bar (one tick per 1024-byte chunk)
        pbar = tnrange(n_chunks, unit="KiB")
        try:
            with open(fname, 'wb') as f:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    if chunk:
                        pbar.update()
                        f.write(chunk)
        finally:
            # Always finalize the widget, even if the download is interrupted.
            pbar.close()
    finally:
        # Release the streamed connection back to the pool.
        r.close()

    return fname
# --- Data handling and single-cell analysis stack ---
import pandas as pd
import anndata
import scanpy as sc
import numpy as np
import scipy.sparse
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')
# --- scikit-learn utilities ---
from sklearn.neighbors import (KNeighborsClassifier,NeighborhoodComponentsAnalysis)
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
import random
# import scrublet as scr
# --- Plotting setup ---
import matplotlib.pyplot as plt
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
sc.set_figure_params(dpi=125)
import seaborn as sns
sns.set(style="whitegrid")
# IPython magic: load the rpy2 extension into this notebook session.
%load_ext rpy2.ipython
# Read in original Cell Ranger data for clustering analysis
# Count-based filtering of `jelly3trin` (defined outside this excerpt).
# First pass uses min_counts=0, so no cell or gene can fall below the
# threshold; presumably it is run only so scanpy annotates per-cell and
# per-gene totals (see scanpy filter_cells/filter_genes docs) — TODO confirm.
sc.pp.filter_cells(jelly3trin, min_counts=0)
sc.pp.filter_genes(jelly3trin, min_counts=0)
#The dataset has 1.4M cells and 46,716 genes before any filtering
jelly3trin
# Second pass: keep only cells and genes with at least one count.
sc.pp.filter_cells(jelly3trin, min_counts=1)
sc.pp.filter_genes(jelly3trin, min_counts=1)
#Add n_countslog (log10 of the per-cell 'n_counts' column in .obs) to the data
jelly3trin.obs['n_countslog']=np.log10(jelly3trin.obs['n_counts'])
#Of the possible cells and genes, 9.9M cells and 22149 genes have at least one read
# NOTE(review): 9.9M cells after filtering exceeds the 1.4M cells stated
# before filtering above; one of the two figures is likely a typo — confirm
# against the AnnData shapes displayed by the bare `jelly3trin` expressions.
jelly3trin
#True cells were identified in the filterStarvCells notebook, so we'll load that scanpy file and use its cells
# NOTE(review): the code below reads a CSV (first column is used as the
# barcode list further down), not a scanpy file — confirm the wording above.
# Shell magic: give the file a .csv suffix. The file must already exist in
# the working directory, and the rename fails if the cell is re-run after it
# has already been moved.
!mv D1.1811 D1.1811.csv
HDfilteredbarcodes=pd.read_csv("D1.1811.csv") #DOWNLOAD (file must be fetched beforehand)
# Number of entries in the first column of the table.
len(list(HDfilteredbarcodes.iloc[:,0]))
# Out: 16813
1
2
3
#Filter out low-quality cells as determined from tag sequencing data
# Keep only the observations whose name appears in the first column of
# HDfilteredbarcodes; the result is bound to a new name, so `jelly3trin`
# itself is not reassigned here.
ojelly3=jelly3trin[jelly3trin.obs_names.isin(list(HDfilteredbarcodes.iloc[:,0]))]
# Bare expression: displays the subset when it is the last line of a cell.
ojelly3