import requests
from tqdm import tnrange, tqdm_notebook
def download_file(doi,ext):
url = 'https://api.datacite.org/dois/'+doi+'/media'
r = requests.get(url).json()
netcdf_url = r['data'][0]['attributes']['url']
r = requests.get(netcdf_url,stream=True)
#Set file name
fname = doi.split('/')[-1]+ext
#Download file with progress bar
if r.status_code == 403:
print("File Unavailable")
if 'content-length' not in r.headers:
print("Did not get file")
else:
with open(fname, 'wb') as f:
total_length = int(r.headers.get('content-length'))
pbar = tnrange(int(total_length/1024), unit="B")
for chunk in r.iter_content(chunk_size=1024):
if chunk:
pbar.update()
f.write(chunk)
return fname
1
2
3
4
#Get HiSeq/MiSeq sequencing of ClickTags
download_file('10.22002/D1.1793','.tar.gz')
!tar -xf D1.1793.tar.gz
import pandas as pd
import copy
from sklearn.preprocessing import LabelEncoder
from scipy import sparse
import scipy
import numpy as np
import anndata
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
from collections import OrderedDict
from Bio import SeqIO
import os
from scipy import io
import scipy.io as sio
import time
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
Use kallisto-bustools to align reads to kallisto index of ClickTag barcode whitelist consisting of sequences hamming distance 1 away from original barcodes
## Set parameters - below are parameters for 10x 3' v3 chemistry
## The cell hashing method uses tags of length 12, we included the variable B (T,C,G) in the end to make it lenght 13
cell_barcode_length = 16
UMI_length = 12
sample_tag_length=11
"""
This function returns all sample tags and and their single base mismatches (hamming distance 1).
ClickTag sequences are provided as a fasta file, and the script indexes the barcode region of the fasta
"""
"parse the tags file and output the set of tag sequences"
def parse_tags(filename):
odict = OrderedDict()
print('Read the following tags:')
for record in SeqIO.parse(filename, "fasta"):
counter=0
print(record.seq)
odict[record.name] = str(record.seq)[26:26+sample_tag_length]
for pos in range(sample_tag_length):
letter =str(record.seq)[26+pos]
barcode=list(str(record.seq)[26:26+sample_tag_length])
if letter=='A':
barcode[pos]='T'
odict[record.name+'-'+str(pos)+'-1'] = "".join(barcode)
barcode[pos]='G'
odict[record.name+'-'+str(pos)+'-2'] = "".join(barcode)
barcode[pos]='C'
odict[record.name+'-'+str(pos)+'-3'] = "".join(barcode)
elif letter=='G':
barcode[pos]='T'
odict[record.name+'-'+str(pos)+'-1'] = "".join(barcode)
barcode[pos]='A'
odict[record.name+'-'+str(pos)+'-2'] = "".join(barcode)
barcode[pos]='C'
odict[record.name+'-'+str(pos)+'-3'] = "".join(barcode)
elif letter=='C':
barcode[pos]='T'
odict[record.name+'-'+str(pos)+'-1'] = "".join(barcode)
barcode[pos]='G'
odict[record.name+'-'+str(pos)+'-2'] = "".join(barcode)
barcode[pos]='A'
odict[record.name+'-'+str(pos)+'-3'] = "".join(barcode)
else:
barcode[pos]='A'
odict[record.name+'-'+str(pos)+'-1'] = "".join(barcode)
barcode[pos]='G'
odict[record.name+'-'+str(pos)+'-2'] = "".join(barcode)
barcode[pos]='C'
odict[record.name+'-'+str(pos)+'-3'] = "".join(barcode)
return odict
# Make kallisto index from ClickTag sequences
!mv D1.1831 70BPbarcodes.fa
tags_file_path = ("70BPbarcodes.fa") #70BPbarcodes.fa
tag_map=parse_tags(tags_file_path)
work_folder = ''
data_folder = ''#/home/jgehring/scRNAseq/clytia/20190405/all_tag_fastqs//
write_folder = ''
#Write the list of barcodes as a fasta, then make a kallisto index
!mkdir barcode_corrected_tags
tagmap_file_path = "barcode_corrected_tags/70BPbarcodes_tagmap.fa"
tagmap_file = open(tagmap_file_path, "w+")
for i in list(tag_map.keys()):
tagmap_file.write(">" + i + "\n" +tag_map[i] + "\n")
tagmap_file.close()
!kallisto index -i {tagmap_file_path}.idx -k 11 {tagmap_file_path}
!kallisto inspect {tagmap_file_path}.idx
data_dict={'counted_data_jelly1':counted_data_jelly1, 'counted_data_jelly2':counted_data_jelly2}
counted_data_jelly1['barcode']=[x+'-1' for x in counted_data_jelly1['barcode'].values]
counted_data_jelly2['barcode']=[x+'-2' for x in counted_data_jelly2['barcode'].values]
for counted_data in data_dict:
le_barcode = LabelEncoder()
barcode_idx =le_barcode.fit_transform(data_dict[counted_data]['barcode'].values)
print('Barcode index shape:', barcode_idx.shape)
le_umi = LabelEncoder()
umi_idx = le_umi.fit_transform(data_dict[counted_data]['umi_counts'].values)
print('UMI index shape:', umi_idx.shape)
le_tag = LabelEncoder()
tag_idx = le_tag.fit_transform(data_dict[counted_data]['tag'].values)
print('Tag index shape:', tag_idx.shape)
# convert data to csr matrix
csr_matrix_data = scipy.sparse.csr_matrix((data_dict[counted_data]['umi_counts'].values,(barcode_idx,tag_idx)))
scipy.io.mmwrite(os.path.join(write_folder,'counted_tag_data/' + counted_data + '.mtx'),csr_matrix_data)
print('Saved sparse csr matrix')
pd.DataFrame(le_tag.classes_).to_csv(os.path.join(write_folder,'counted_tag_data/' + counted_data + '_ClickTag_tag_labels.csv'), index = False, header = False)
pd.DataFrame(le_barcode.classes_).to_csv(os.path.join(write_folder,'counted_tag_data/' + counted_data + '_ClickTag_barcode_labels.csv'), index = False, header = False)
print('Saved cell barcode and hashtag labels')
print('Number of unique cell barcodes seen:', len(le_barcode.classes_))
Barcode index shape: (1853354,)UMI index shape: (1853354,)Tag index shape: (1853354,)Saved sparse csr matrixSaved cell barcode and hashtag labelsNumber of unique cell barcodes seen: 439363Barcode index shape: (977213,)UMI index shape: (977213,)Tag index shape: (977213,)Saved sparse csr matrixSaved cell barcode and hashtag labelsNumber of unique cell barcodes seen: 202035
ClickTag_counts_filtered=ClickTagCounts.loc[list(ClickTag_counts_sorted[:50000].index)]
ClickTag_counts_filtered=ClickTag_counts_filtered.iloc[:,4:]
filtered_ClickTag_counts=ClickTag_counts_filtered.T
filtered_ClickTag_counts
hits = 0
counter = 1
for barcode in filtered_ClickTag_counts.index:
for i in filtered_ClickTag_counts:
if filtered_ClickTag_counts[i].idxmax() == barcode:
if hits ==0:
sortedheatmap_dtf=pd.DataFrame({counter:filtered_ClickTag_counts[i]})
hits+=1
counter+=1
else:
sortedheatmap_dtf = sortedheatmap_dtf.assign(i = filtered_ClickTag_counts[i])
sortedheatmap_dtf.rename(columns = {'i':counter}, inplace = True)
counter+=1
filtered_ClickTag_counts=copy.deepcopy(sortedheatmap_dtf)
percentClickTags_counts = copy.deepcopy(filtered_ClickTag_counts)
for i in filtered_ClickTag_counts:
percentClickTags_counts[i] = filtered_ClickTag_counts[i]/filtered_ClickTag_counts[i].sum()
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:3: RuntimeWarning: divide by zero encountered in log10 This is separate from the ipykernel package so we can avoid doing imports until