Quantification of ClickTag Counts with kallisto-bustools (Stimulation Data)
Download Data¶
In [ ]:
#Install kallisto and bustools
!wget --quiet https://github.com/pachterlab/kallisto/releases/download/v0.46.2/kallisto_linux-v0.46.2.tar.gz
!tar -xf kallisto_linux-v0.46.2.tar.gz
!cp kallisto/kallisto /usr/local/bin/
!wget --quiet https://github.com/BUStools/bustools/releases/download/v0.40.0/bustools_linux-v0.40.0.tar.gz
!tar -xf bustools_linux-v0.40.0.tar.gz
!cp bustools/bustools /usr/local/bin/
In [ ]:
import requests
from tqdm.notebook import trange

def download_file(doi, ext):
    """Download the file behind a DataCite DOI and save it as <record><ext>."""
    url = 'https://api.datacite.org/dois/'+doi+'/media'
    r = requests.get(url).json()
    netcdf_url = r['data'][0]['attributes']['url']
    r = requests.get(netcdf_url, stream=True)
    # Set file name
    fname = doi.split('/')[-1]+ext
    # Download file with progress bar
    if r.status_code == 403:
        print("File Unavailable")
    if 'content-length' not in r.headers:
        print("Did not get file")
    else:
        with open(fname, 'wb') as f:
            total_length = int(r.headers.get('content-length'))
            pbar = trange(int(total_length/1024), unit="B")
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    pbar.update()
                    f.write(chunk)
    return fname
In [ ]:
#Get HiSeq/MiSeq sequencing of ClickTags
download_file('10.22002/D1.1793','.tar.gz')
!tar -xf D1.1793.tar.gz
In [ ]:
# Get ClickTag Barcodes/Sequences
download_file('10.22002/D1.1831','.gz')
#Get saved embedding of ClickTags
download_file('10.22002/D1.1838','.gz')
!gunzip *.gz
In [ ]:
!pip install --quiet anndata
!pip install --quiet scanpy
!pip3 install --quiet leidenalg
!pip install --quiet louvain
!pip3 install --quiet biopython
Import Packages¶
In [ ]:
import pandas as pd
import copy
from sklearn.preprocessing import LabelEncoder
from scipy import sparse
import scipy
import numpy as np
import anndata
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
from collections import OrderedDict
from Bio import SeqIO
import os
from scipy import io
import scipy.io as sio
import time
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
How ClickTag Counts and Clustering Are Generated¶
Use kallisto | bustools to align reads to a kallisto index of the ClickTag barcode whitelist, which consists of the original barcodes plus all sequences within Hamming distance 1 of them.
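The mismatch-generation logic can be summarized compactly. Below is a minimal sketch (not the notebook's `parse_tags` function, which is defined in the next cell): for each position in a tag, substitute the three alternative bases.

def hamming1_variants(tag):
    # All sequences exactly one substitution away from `tag`
    variants = []
    for pos, base in enumerate(tag):
        for alt in "ACGT".replace(base, ""):
            variants.append(tag[:pos] + alt + tag[pos + 1:])
    return variants

# An 11 bp tag gives 3 * 11 = 33 mismatches; together with the original tag, 34 whitelist entries.
print(len(hamming1_variants("ACGTACGTACG")))  # 33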
In [ ]:
## Set parameters - below are the parameters for 10x 3' v3 chemistry
## The cell hashing method uses tags of length 12; we included the variable base B (T, C, or G) at the end to make it length 13
cell_barcode_length = 16
UMI_length = 12
sample_tag_length = 11

def parse_tags(filename):
    """
    Parse the ClickTag FASTA file and return an OrderedDict of tag sequences:
    each sample tag plus all of its single-base mismatches (Hamming distance 1).
    The barcode region of each FASTA record is indexed starting at position 26.
    """
    odict = OrderedDict()
    print('Read the following tags:')
    for record in SeqIO.parse(filename, "fasta"):
        counter = 0
        print(record.seq)
        odict[record.name] = str(record.seq)[26:26+sample_tag_length]
        for pos in range(sample_tag_length):
            letter = str(record.seq)[26+pos]
            barcode = list(str(record.seq)[26:26+sample_tag_length])
            if letter == 'A':
                barcode[pos] = 'T'
                odict[record.name+'-'+str(pos)+'-1'] = "".join(barcode)
                barcode[pos] = 'G'
                odict[record.name+'-'+str(pos)+'-2'] = "".join(barcode)
                barcode[pos] = 'C'
                odict[record.name+'-'+str(pos)+'-3'] = "".join(barcode)
            elif letter == 'G':
                barcode[pos] = 'T'
                odict[record.name+'-'+str(pos)+'-1'] = "".join(barcode)
                barcode[pos] = 'A'
                odict[record.name+'-'+str(pos)+'-2'] = "".join(barcode)
                barcode[pos] = 'C'
                odict[record.name+'-'+str(pos)+'-3'] = "".join(barcode)
            elif letter == 'C':
                barcode[pos] = 'T'
                odict[record.name+'-'+str(pos)+'-1'] = "".join(barcode)
                barcode[pos] = 'G'
                odict[record.name+'-'+str(pos)+'-2'] = "".join(barcode)
                barcode[pos] = 'A'
                odict[record.name+'-'+str(pos)+'-3'] = "".join(barcode)
            else:
                barcode[pos] = 'A'
                odict[record.name+'-'+str(pos)+'-1'] = "".join(barcode)
                barcode[pos] = 'G'
                odict[record.name+'-'+str(pos)+'-2'] = "".join(barcode)
                barcode[pos] = 'C'
                odict[record.name+'-'+str(pos)+'-3'] = "".join(barcode)
    return odict
In [ ]:
# Make kallisto index from ClickTag sequences
!mv D1.1831 70BPbarcodes.fa
tags_file_path = ("70BPbarcodes.fa") #70BPbarcodes.fa
tag_map=parse_tags(tags_file_path)
work_folder = ''
data_folder = ''#/home/jgehring/scRNAseq/clytia/20190405/all_tag_fastqs//
write_folder = ''
#Write the list of barcodes as a fasta, then make a kallisto index
!mkdir barcode_corrected_tags
tagmap_file_path = "barcode_corrected_tags/70BPbarcodes_tagmap.fa"
tagmap_file = open(tagmap_file_path, "w+")
for i in list(tag_map.keys()):
    tagmap_file.write(">" + i + "\n" + tag_map[i] + "\n")
tagmap_file.close()
!kallisto index -i {tagmap_file_path}.idx -k 11 {tagmap_file_path}
!kallisto inspect {tagmap_file_path}.idx
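Each ClickTag therefore expands to 1 + 3 × 11 = 34 whitelist entries (the original 11 bp tag plus 33 single-base mismatches); with the 20 ClickTags used here that gives 680 index targets, matching the target count kallisto reports below. A quick sanity check (a sketch; tag_map and sample_tag_length come from the cells above):

# Sanity check (illustrative): every whitelist entry is 11 bp and there are 34 entries per ClickTag
assert all(len(seq) == sample_tag_length for seq in tag_map.values())
print(len(tag_map))  # expected: 34 entries per ClickTag, i.e. 680 for 20 ClickTags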
Run kallisto bus on ClickTag fastqs
In [ ]:
#Run kallisto bus on ClickTag fastqs
!mkdir jelly4stim_1_tags_HiSeq
write_folder = 'jelly4stim_1_tags_HiSeq'
!kallisto bus -i {tagmap_file_path}.idx -o {write_folder} -x 10xv3 -t 20 {os.path.join(data_folder,'jelly4stim1MiSeqHiSeq_tags_R1.fastq')} {os.path.join(data_folder,'jelly4stim1MiSeqHiSeq_tags_R2.fastq')}
#sort bus file
!bustools sort -o {os.path.join(write_folder,'output_sorted.bus')} {os.path.join(write_folder,'output.bus')}
# convert the sorted busfile to txt
!bustools text -o {os.path.join(write_folder,'output_sorted.txt')} {os.path.join(write_folder,'output_sorted.bus')}
!mkdir jelly4stim_2_tags_HiSeq
write_folder = 'jelly4stim_2_tags_HiSeq'
!kallisto bus -i {tagmap_file_path}.idx -o {write_folder} -x 10xv3 -t 20 {os.path.join(data_folder,'jelly4stim2MiSeqHiSeq_tags_R1.fastq')} {os.path.join(data_folder,'jelly4stim2MiSeqHiSeq_tags_R2.fastq')}
#sort bus file
!bustools sort -o {os.path.join(write_folder,'output_sorted.bus')} {os.path.join(write_folder,'output.bus')}
# convert the sorted busfile to txt
!bustools text -o {os.path.join(write_folder,'output_sorted.txt')} {os.path.join(write_folder,'output_sorted.bus')}
mkdir: cannot create directory ‘jelly4stim_1_tags_HiSeq’: File exists
mv: cannot stat 'tw7h9xi94348gsultf0owb6q4mp1ehlf': No such file or directory
mv: cannot stat 'bmu19su5ouc6vze3p00pobb665q1pi3z': No such file or directory
Warning: you asked for 20, but only 2 cores on the machine
[index] k-mer length: 11
[index] number of targets: 680
[index] number of k-mers: 680
[index] number of equivalence classes: 680
[quant] will process sample 1: jelly4stim1MiSeqHiSeq_tags_R1.fastq jelly4stim1MiSeqHiSeq_tags_R2.fastq
[quant] finding pseudoalignments for the reads ... done
[quant] processed 16,904,784 reads, 8,500,027 reads pseudoaligned
Read in 8500027 BUS records
Read in 7801303 BUS records
mkdir: cannot create directory ‘jelly4stim_2_tags_HiSeq’: File exists
mv: cannot stat 'pmsgi1o664n5gynqggnulstje1djxcwb': No such file or directory
mv: cannot stat '6ogvqwclzev3dsri2zv46yqrlutucrl1': No such file or directory
Warning: you asked for 20, but only 2 cores on the machine
[index] k-mer length: 11
[index] number of targets: 680
[index] number of k-mers: 680
[index] number of equivalence classes: 680
[quant] will process sample 1: jelly4stim2MiSeqHiSeq_tags_R1.fastq jelly4stim2MiSeqHiSeq_tags_R2.fastq
[quant] finding pseudoalignments for the reads ... done
[quant] processed 10,085,603 reads, 4,366,055 reads pseudoaligned
Read in 4366055 BUS records
Read in 4116905 BUS records
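Since both 10x lanes go through the same three commands (kallisto bus, bustools sort, bustools text), the per-lane pipeline could equally be written as a loop. A sketch under the same file-naming assumptions as above:

# Sketch: run kallisto bus -> bustools sort -> bustools text for each lane in one loop.
# Assumes the FASTQ names used above ('jelly4stim{lane}MiSeqHiSeq_tags_R1/R2.fastq').
for lane in ['1', '2']:
    out = f'jelly4stim_{lane}_tags_HiSeq'
    r1 = os.path.join(data_folder, f'jelly4stim{lane}MiSeqHiSeq_tags_R1.fastq')
    r2 = os.path.join(data_folder, f'jelly4stim{lane}MiSeqHiSeq_tags_R2.fastq')
    !mkdir -p {out}
    !kallisto bus -i {tagmap_file_path}.idx -o {out} -x 10xv3 -t 20 {r1} {r2}
    !bustools sort -o {os.path.join(out, 'output_sorted.bus')} {os.path.join(out, 'output.bus')}
    !bustools text -o {os.path.join(out, 'output_sorted.txt')} {os.path.join(out, 'output_sorted.bus')}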
In [ ]:
write_folder = '' #/home/jgehring/scRNAseq/clytia/20190405/
bus_data_jelly1 = pd.read_csv(os.path.join(write_folder,'jelly4stim_1_tags_HiSeq/output_sorted.txt'), delimiter='\t', header=None, names = ['barcode', 'umi', 'tag_eqclass', 'multiplicity'])
bus_data_jelly1.head()
bus_data_jelly1
bus_data_jelly2 = pd.read_csv(os.path.join(write_folder,'jelly4stim_2_tags_HiSeq/output_sorted.txt'), delimiter='\t', header=None, names = ['barcode', 'umi', 'tag_eqclass', 'multiplicity'])
bus_data_jelly2.head()
tag_map_df = pd.DataFrame.from_dict(tag_map, orient = 'index').reset_index()
tag_map_df.columns=['tagname','tagseq']
tag_map_df['ClickTag'] = tag_map_df['tagname'].str.split('-').str.get(0)
tag_map_df.head()
bus_data_jelly1['tag']= bus_data_jelly1['tag_eqclass'].map(tag_map_df['ClickTag'])
bus_data_jelly1.head()
bus_data_jelly2['tag']= bus_data_jelly2['tag_eqclass'].map(tag_map_df['ClickTag'])
bus_data_jelly2.head()
print('Counting UMIs')
counted_data_jelly1 = bus_data_jelly1.groupby(['barcode', 'tag'])['umi'].count().reset_index()
counted_data_jelly1.rename(columns={'umi':'umi_counts'}, inplace = True)
counted_data_jelly1.head()
print('Counting UMIs')
counted_data_jelly2 = bus_data_jelly2.groupby(['barcode', 'tag'])['umi'].count().reset_index()
counted_data_jelly2.rename(columns={'umi':'umi_counts'}, inplace = True)
counted_data_jelly2.head()
Counting UMIs
Counting UMIs
Out[ ]:
barcode | tag | umi_counts | |
---|---|---|---|
0 | AAAAAAAAAAAAAAAA | BC_36 | 1 |
1 | AAAAACAGCTACGTTT | BC_38 | 1 |
2 | AAAAACAGGCACTAGT | BC_30 | 1 |
3 | AAAAACATTTTTTTTT | BC_38 | 1 |
4 | AAAAACCACATGAAAA | BC_38 | 1 |
In [ ]:
!mkdir counted_tag_data
mkdir: cannot create directory ‘counted_tag_data’: File exists
In [ ]:
data_dict={'counted_data_jelly1':counted_data_jelly1, 'counted_data_jelly2':counted_data_jelly2}
counted_data_jelly1['barcode']=[x+'-1' for x in counted_data_jelly1['barcode'].values]
counted_data_jelly2['barcode']=[x+'-2' for x in counted_data_jelly2['barcode'].values]
for counted_data in data_dict:
    le_barcode = LabelEncoder()
    barcode_idx = le_barcode.fit_transform(data_dict[counted_data]['barcode'].values)
    print('Barcode index shape:', barcode_idx.shape)

    le_umi = LabelEncoder()
    umi_idx = le_umi.fit_transform(data_dict[counted_data]['umi_counts'].values)
    print('UMI index shape:', umi_idx.shape)

    le_tag = LabelEncoder()
    tag_idx = le_tag.fit_transform(data_dict[counted_data]['tag'].values)
    print('Tag index shape:', tag_idx.shape)

    # convert data to csr matrix
    csr_matrix_data = scipy.sparse.csr_matrix((data_dict[counted_data]['umi_counts'].values, (barcode_idx, tag_idx)))
    scipy.io.mmwrite(os.path.join(write_folder, 'counted_tag_data/' + counted_data + '.mtx'), csr_matrix_data)
    print('Saved sparse csr matrix')

    pd.DataFrame(le_tag.classes_).to_csv(os.path.join(write_folder, 'counted_tag_data/' + counted_data + '_ClickTag_tag_labels.csv'), index=False, header=False)
    pd.DataFrame(le_barcode.classes_).to_csv(os.path.join(write_folder, 'counted_tag_data/' + counted_data + '_ClickTag_barcode_labels.csv'), index=False, header=False)
    print('Saved cell barcode and hashtag labels')
    print('Number of unique cell barcodes seen:', len(le_barcode.classes_))
Barcode index shape: (1853354,)
UMI index shape: (1853354,)
Tag index shape: (1853354,)
Saved sparse csr matrix
Saved cell barcode and hashtag labels
Number of unique cell barcodes seen: 439363
Barcode index shape: (977213,)
UMI index shape: (977213,)
Tag index shape: (977213,)
Saved sparse csr matrix
Saved cell barcode and hashtag labels
Number of unique cell barcodes seen: 202035
In [ ]:
# ClickTag counts for both 10x lanes, concatenated
ClickTagCountsmat = scipy.io.mmread(os.path.join(write_folder, 'counted_tag_data/counted_data_jelly1.mtx'))
ClickTagCounts = pd.DataFrame(ClickTagCountsmat.toarray(),
                              index=list(pd.read_csv(os.path.join(write_folder, 'counted_tag_data/counted_data_jelly1_ClickTag_barcode_labels.csv'), header=None).loc[:, 0]),
                              columns=list(pd.read_csv(os.path.join(write_folder, 'counted_tag_data/counted_data_jelly1_ClickTag_tag_labels.csv'), header=None).loc[:, 0]))

ClickTagCountsmat = scipy.io.mmread(os.path.join(write_folder, 'counted_tag_data/counted_data_jelly2.mtx'))
ClickTagCounts = ClickTagCounts.append(pd.DataFrame(ClickTagCountsmat.toarray(),
                              index=list(pd.read_csv(os.path.join(write_folder, 'counted_tag_data/counted_data_jelly2_ClickTag_barcode_labels.csv'), header=None).loc[:, 0]),
                              columns=list(pd.read_csv(os.path.join(write_folder, 'counted_tag_data/counted_data_jelly2_ClickTag_tag_labels.csv'), header=None).loc[:, 0])))

ClickTagCounts
Out[ ]:
BC_21 | BC_22 | BC_23 | BC_24 | BC_25 | BC_26 | BC_27 | BC_28 | BC_29 | BC_30 | BC_31 | BC_32 | BC_33 | BC_34 | BC_35 | BC_36 | BC_37 | BC_38 | BC_39 | BC_40 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
AAAAAAAGTTTGAATT-1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
AAAAAACGTTTTGGCC-1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
AAAAACGTCGCAGTCC-1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
AAAAACGTTGTGGCCT-1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
AAAAATCTTTTTTTTT-1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
TTTTTTGAGGCTCTAT-2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
TTTTTTGCAGAATGTA-2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
TTTTTTTTTAACATCC-2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
TTTTTTTTTTGTTTTT-2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
TTTTTTTTTTTTTTTT-2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 5 | 1 | 0 |
641398 rows × 20 columns
Use UMI count knee-plot to filter for high-quality ClickTagged cells
In [ ]:
ClickTag_counts_sorted = copy.deepcopy(ClickTagCounts.T.sum())
ClickTag_counts_sorted = ClickTag_counts_sorted.sort_values(ascending=False)

# Knee plot: log10 total ClickTag counts per barcode vs. log10 rank
plt.plot(ClickTag_counts_sorted.apply(np.log10), np.log10(np.arange(1, len(ClickTag_counts_sorted) + 1)))
plt.axhline(np.log10(50000))
plt.show()
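The horizontal line marks the 50,000-barcode cutoff applied in the next cell. As an alternative to a fixed threshold, the knee could also be estimated programmatically; a rough sketch (not what this notebook does):

# Rough knee estimate (illustrative only): find the rank at which the log-log
# rank-vs-count curve drops most steeply.
totals = ClickTag_counts_sorted.values.astype(float)
log_counts = np.log10(totals + 1)
log_ranks = np.log10(np.arange(1, len(totals) + 1))
knee_rank = int(np.argmin(np.gradient(log_counts, log_ranks)))
print('estimated knee at rank', knee_rank)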
In [ ]:
ClickTag_counts_filtered=ClickTagCounts.loc[list(ClickTag_counts_sorted[:50000].index)]
ClickTag_counts_filtered=ClickTag_counts_filtered.iloc[:,4:]
filtered_ClickTag_counts=ClickTag_counts_filtered.T
filtered_ClickTag_counts
# Order cells (columns) by the ClickTag with their highest count, so cells dominated by the
# same ClickTag group together in the heatmaps below
hits = 0
counter = 1
for barcode in filtered_ClickTag_counts.index:
    for i in filtered_ClickTag_counts:
        if filtered_ClickTag_counts[i].idxmax() == barcode:
            if hits == 0:
                sortedheatmap_dtf = pd.DataFrame({counter: filtered_ClickTag_counts[i]})
                hits += 1
                counter += 1
            else:
                sortedheatmap_dtf = sortedheatmap_dtf.assign(i=filtered_ClickTag_counts[i])
                sortedheatmap_dtf.rename(columns={'i': counter}, inplace=True)
                counter += 1

filtered_ClickTag_counts = copy.deepcopy(sortedheatmap_dtf)

# Per-cell fraction of counts in each ClickTag
percentClickTags_counts = copy.deepcopy(filtered_ClickTag_counts)
for i in filtered_ClickTag_counts:
    percentClickTags_counts[i] = filtered_ClickTag_counts[i]/filtered_ClickTag_counts[i].sum()
In [ ]:
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(np.log1p(filtered_ClickTag_counts), cmap='viridis')
plt.show()
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(percentClickTags_counts, cmap='viridis')
plt.show()
filtered_ClickTag_counts
Out[ ]:
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | ... | 49961 | 49962 | 49963 | 49964 | 49965 | 49966 | 49967 | 49968 | 49969 | 49970 | 49971 | 49972 | 49973 | 49974 | 49975 | 49976 | 49977 | 49978 | 49979 | 49980 | 49981 | 49982 | 49983 | 49984 | 49985 | 49986 | 49987 | 49988 | 49989 | 49990 | 49991 | 49992 | 49993 | 49994 | 49995 | 49996 | 49997 | 49998 | 49999 | 50000 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
BC_25 | 1025 | 435 | 344 | 235 | 231 | 205 | 220 | 204 | 199 | 226 | 171 | 236 | 183 | 172 | 128 | 127 | 163 | 171 | 180 | 135 | 174 | 152 | 188 | 110 | 134 | 185 | 162 | 201 | 172 | 165 | 137 | 171 | 144 | 133 | 146 | 121 | 145 | 137 | 133 | 142 | ... | 1 | 2 | 0 | 1 | 2 | 3 | 1 | 1 | 2 | 0 | 1 | 2 | 3 | 1 | 3 | 0 | 3 | 3 | 3 | 2 | 0 | 0 | 0 | 4 | 1 | 3 | 1 | 0 | 0 | 2 | 1 | 0 | 0 | 2 | 0 | 2 | 1 | 2 | 0 | 4 |
BC_26 | 2 | 9 | 3 | 2 | 3 | 0 | 5 | 3 | 0 | 0 | 0 | 4 | 1 | 1 | 2 | 2 | 3 | 1 | 5 | 7 | 2 | 3 | 4 | 47 | 4 | 3 | 0 | 1 | 2 | 4 | 2 | 3 | 0 | 3 | 4 | 2 | 3 | 0 | 2 | 3 | ... | 1 | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 2 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 0 |
BC_27 | 10 | 4 | 8 | 26 | 11 | 3 | 6 | 14 | 13 | 7 | 0 | 9 | 6 | 14 | 7 | 5 | 6 | 5 | 4 | 5 | 157 | 38 | 1 | 4 | 10 | 2 | 7 | 7 | 5 | 6 | 2 | 3 | 7 | 1 | 2 | 4 | 3 | 4 | 45 | 123 | ... | 1 | 2 | 5 | 3 | 0 | 6 | 1 | 0 | 2 | 0 | 1 | 4 | 1 | 0 | 1 | 1 | 0 | 1 | 1 | 5 | 2 | 1 | 1 | 2 | 1 | 5 | 2 | 2 | 2 | 2 | 3 | 3 | 3 | 0 | 1 | 3 | 3 | 1 | 2 | 2 |
BC_28 | 15 | 11 | 5 | 5 | 4 | 9 | 0 | 5 | 3 | 4 | 3 | 4 | 2 | 6 | 3 | 6 | 11 | 7 | 1 | 1 | 2 | 6 | 1 | 2 | 109 | 3 | 2 | 2 | 4 | 2 | 7 | 6 | 2 | 3 | 4 | 13 | 2 | 4 | 2 | 6 | ... | 1 | 1 | 2 | 1 | 2 | 2 | 1 | 3 | 1 | 1 | 3 | 2 | 1 | 2 | 1 | 2 | 1 | 1 | 0 | 1 | 3 | 1 | 2 | 0 | 6 | 5 | 4 | 1 | 1 | 3 | 2 | 2 | 2 | 1 | 0 | 1 | 1 | 2 | 3 | 0 |
BC_29 | 7 | 5 | 8 | 223 | 11 | 14 | 8 | 1 | 10 | 3 | 7 | 7 | 4 | 3 | 125 | 1 | 3 | 3 | 158 | 6 | 4 | 12 | 136 | 4 | 33 | 131 | 3 | 4 | 4 | 112 | 3 | 115 | 3 | 1 | 50 | 6 | 3 | 3 | 47 | 1 | ... | 0 | 0 | 1 | 1 | 4 | 2 | 1 | 2 | 3 | 3 | 1 | 5 | 0 | 0 | 2 | 1 | 2 | 1 | 2 | 1 | 1 | 2 | 3 | 0 | 0 | 2 | 1 | 1 | 1 | 2 | 0 | 0 | 0 | 1 | 1 | 2 | 3 | 2 | 0 | 0 |
BC_30 | 14 | 4 | 7 | 1 | 2 | 4 | 3 | 5 | 4 | 1 | 4 | 132 | 1 | 4 | 64 | 2 | 2 | 1 | 6 | 7 | 7 | 96 | 3 | 48 | 4 | 0 | 2 | 2 | 1 | 9 | 8 | 3 | 3 | 2 | 6 | 4 | 2 | 1 | 10 | 5 | ... | 1 | 0 | 0 | 1 | 2 | 3 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 2 | 1 | 1 | 2 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 2 | 1 | 1 | 0 | 1 | 0 | 0 | 2 | 1 | 1 | 6 | 0 | 1 | 0 | 0 |
BC_31 | 7 | 9 | 269 | 5 | 225 | 3 | 138 | 2 | 4 | 1 | 5 | 9 | 6 | 4 | 7 | 37 | 1 | 1 | 3 | 4 | 8 | 11 | 5 | 106 | 3 | 5 | 4 | 3 | 0 | 6 | 8 | 13 | 6 | 2 | 81 | 2 | 126 | 1 | 28 | 2 | ... | 1 | 4 | 1 | 1 | 0 | 3 | 1 | 0 | 1 | 2 | 3 | 3 | 5 | 2 | 1 | 2 | 1 | 0 | 2 | 1 | 0 | 0 | 3 | 2 | 2 | 1 | 1 | 0 | 1 | 1 | 3 | 0 | 1 | 1 | 1 | 0 | 1 | 0 | 1 | 2 |
BC_32 | 16 | 20 | 11 | 26 | 41 | 10 | 91 | 5 | 8 | 8 | 18 | 18 | 7 | 5 | 20 | 10 | 7 | 4 | 19 | 21 | 23 | 33 | 19 | 25 | 19 | 16 | 6 | 4 | 5 | 27 | 8 | 17 | 5 | 5 | 19 | 8 | 22 | 4 | 39 | 22 | ... | 3 | 3 | 3 | 4 | 1 | 2 | 0 | 2 | 5 | 5 | 5 | 0 | 2 | 4 | 3 | 2 | 1 | 0 | 0 | 4 | 1 | 3 | 0 | 4 | 6 | 4 | 4 | 5 | 3 | 3 | 7 | 5 | 2 | 1 | 1 | 2 | 3 | 2 | 3 | 2 |
BC_33 | 4 | 4 | 0 | 0 | 0 | 4 | 0 | 2 | 5 | 2 | 1 | 0 | 5 | 0 | 0 | 2 | 2 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 3 | 0 | 0 | 0 | 2 | 0 | 0 | 7 | 0 | 1 | 0 | 0 | ... | 0 | 13 | 15 | 0 | 0 | 1 | 17 | 7 | 1 | 4 | 0 | 0 | 0 | 11 | 2 | 14 | 1 | 8 | 13 | 3 | 1 | 0 | 12 | 13 | 0 | 0 | 0 | 9 | 13 | 1 | 7 | 10 | 15 | 1 | 0 | 0 | 1 | 10 | 8 | 11 |
BC_34 | 10 | 7 | 0 | 0 | 0 | 2 | 0 | 3 | 2 | 2 | 8 | 0 | 1 | 5 | 0 | 4 | 2 | 3 | 0 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 4 | 0 | 2 | 0 | 1 | 0 | 0 | ... | 14 | 2 | 0 | 1 | 13 | 9 | 2 | 1 | 7 | 11 | 11 | 8 | 2 | 0 | 12 | 1 | 2 | 5 | 2 | 1 | 14 | 15 | 0 | 2 | 1 | 9 | 12 | 1 | 2 | 11 | 2 | 2 | 0 | 15 | 15 | 9 | 3 | 0 | 1 | 2 |
BC_35 | 19 | 10 | 0 | 0 | 0 | 6 | 0 | 2 | 5 | 5 | 6 | 0 | 1 | 3 | 0 | 13 | 9 | 6 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 2 | 0 | 5 | 0 | 4 | 2 | 0 | 2 | 0 | 7 | 0 | 0 | ... | 3 | 2 | 1 | 0 | 0 | 4 | 0 | 12 | 4 | 3 | 1 | 4 | 7 | 3 | 0 | 3 | 1 | 2 | 0 | 2 | 4 | 6 | 4 | 2 | 1 | 2 | 2 | 2 | 1 | 1 | 6 | 0 | 0 | 2 | 2 | 1 | 1 | 2 | 3 | 2 |
BC_36 | 5 | 1 | 0 | 0 | 0 | 2 | 0 | 4 | 0 | 2 | 1 | 0 | 0 | 2 | 1 | 11 | 3 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 2 | 0 | 2 | 0 | 5 | 17 | 0 | 4 | 0 | 0 | 0 | 0 | ... | 2 | 0 | 0 | 16 | 3 | 0 | 1 | 1 | 0 | 1 | 4 | 8 | 14 | 1 | 1 | 1 | 13 | 0 | 1 | 10 | 0 | 0 | 1 | 1 | 7 | 0 | 2 | 1 | 1 | 1 | 2 | 2 | 1 | 0 | 0 | 2 | 10 | 0 | 1 | 2 |
BC_37 | 697 | 396 | 0 | 0 | 0 | 181 | 0 | 182 | 182 | 181 | 171 | 0 | 182 | 168 | 0 | 116 | 157 | 171 | 0 | 133 | 0 | 0 | 0 | 0 | 0 | 0 | 137 | 104 | 123 | 0 | 132 | 0 | 132 | 126 | 0 | 108 | 0 | 127 | 0 | 0 | ... | 5 | 4 | 3 | 2 | 1 | 5 | 3 | 1 | 6 | 1 | 4 | 4 | 3 | 3 | 4 | 3 | 3 | 4 | 6 | 7 | 0 | 2 | 3 | 2 | 4 | 2 | 0 | 4 | 6 | 2 | 0 | 8 | 2 | 1 | 4 | 2 | 5 | 4 | 8 | 2 |
BC_38 | 103 | 51 | 0 | 0 | 0 | 70 | 0 | 35 | 25 | 19 | 32 | 0 | 28 | 39 | 0 | 59 | 30 | 26 | 0 | 38 | 0 | 0 | 0 | 0 | 0 | 0 | 20 | 17 | 16 | 0 | 22 | 0 | 19 | 19 | 0 | 36 | 0 | 22 | 0 | 0 | ... | 9 | 11 | 10 | 10 | 11 | 7 | 13 | 13 | 12 | 11 | 10 | 8 | 4 | 13 | 12 | 9 | 12 | 14 | 11 | 9 | 15 | 8 | 11 | 9 | 12 | 6 | 12 | 13 | 9 | 11 | 11 | 8 | 8 | 10 | 14 | 11 | 8 | 14 | 8 | 10 |
BC_39 | 15 | 6 | 0 | 0 | 0 | 4 | 0 | 5 | 8 | 4 | 13 | 0 | 10 | 2 | 0 | 11 | 6 | 2 | 0 | 19 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | 5 | 0 | 5 | 0 | 2 | 9 | 1 | 7 | 0 | 6 | 0 | 0 | ... | 17 | 15 | 17 | 17 | 19 | 12 | 18 | 14 | 13 | 14 | 13 | 10 | 15 | 16 | 13 | 17 | 15 | 16 | 14 | 11 | 16 | 17 | 15 | 16 | 16 | 15 | 14 | 15 | 17 | 14 | 12 | 15 | 20 | 20 | 16 | 14 | 15 | 15 | 9 | 16 |
BC_40 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
16 rows × 50000 columns
Cluster cells to find clear ClickTag 'expression'
In [ ]:
adata = sc.AnnData(ClickTag_counts_filtered)
adata.var_names= list(ClickTag_counts_filtered.columns)
adata.obs_names= list(ClickTag_counts_filtered[:50000].index)
#pd.read_csv(os.path.join(write_folder,'counted_tag_data/counted_data_jelly2_ClickTag_barcode_labels.csv'), header=None, sep=',')[:15000][0].tolist()
In [ ]:
adata
Out[ ]:
AnnData object with n_obs × n_vars = 50000 × 16
In [ ]:
adata
sc.pp.filter_genes(adata, min_counts=0)
sc.pp.normalize_per_cell(adata, counts_per_cell_after=10000)
adata.obs['n_countslog']=np.log(adata.obs['n_counts'])
sc.pp.log1p(adata)
sc.pp.regress_out(adata, ['n_counts'])
sc.tl.tsne(adata, perplexity=30, use_rep=None)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
sc.tl.louvain(adata, resolution=2)
sc.pl.tsne(adata, color='n_countslog')
sc.set_figure_params(dpi=120)
sc.pl.tsne(adata,color=['louvain'])
# sc.pl.tsne(adata, color=adata.var_names)
# for i in range(20):
# sc.pl.violin(adata[adata.obs['louvain'].isin([str(i)])], keys=adata.var_names)
# sc.pl.tsne(adata, color=['louvain'],palette='rainbow',legend_loc='on data')
WARNING: Consider installing the package MulticoreTSNE (https://github.com/DmitryUlyanov/Multicore-TSNE). Even for n_jobs=1 this speeds up the computation considerably and might yield better converged results.
In [ ]:
sc.pl.umap(adata, color='louvain')
sc.pl.umap(adata, color='n_countslog')
sc.pl.umap(adata, color=adata.var_names)
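As a quick sanity check on the clusters, each cell can also be labeled directly by its most abundant ClickTag in the raw filtered counts. This argmax assignment is only a rough sketch (not the cluster-based assignment used downstream), but it should broadly agree with the Louvain clusters.

# Rough per-cell ClickTag call (illustrative only): top tag per cell plus the margin
# over the runner-up as a crude confidence measure. Row order matches adata.obs_names,
# since adata was built from ClickTag_counts_filtered above.
top_tag = ClickTag_counts_filtered.idxmax(axis=1)
sorted_counts = np.sort(ClickTag_counts_filtered.values, axis=1)
adata.obs['top_ClickTag'] = top_tag.values
adata.obs['top_margin'] = sorted_counts[:, -1] - sorted_counts[:, -2]
sc.pl.umap(adata, color='top_ClickTag')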