"""
18 Nov 2014
"""
from __future__ import print_function
from warnings import warn, catch_warnings, simplefilter
from collections import OrderedDict
from itertools import islice
from pysam import AlignmentFile
from scipy.stats import norm as sc_norm, skew, kurtosis
from scipy.stats import pearsonr, spearmanr, linregress
from scipy.sparse.linalg import eigsh
from numpy.linalg import eigh
import numpy as np
try:
from matplotlib import rcParams
from matplotlib import pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.colors import LinearSegmentedColormap
except ImportError:
warn('matplotlib not found\n')
from pytadbit import HiC_data
from pytadbit.utils.extraviews import tadbit_savefig, setup_plot
from pytadbit.utils.tadmaths import nozero_log_matrix as nozero_log
from pytadbit.utils.tadmaths import right_double_mad as mad
from pytadbit.parsers.hic_parser import load_hic_data_from_reads
from pytadbit.utils.extraviews import nicer
from pytadbit.utils.file_handling import mkdir
try:
basestring
except NameError:
basestring = str
def hic_map(data, resolution=None, normalized=False, masked=None,
            by_chrom=False, savefig=None, show=False, savedata=None,
            focus=None, clim=None, perc_clim=None, cmap='jet', pdf=False, decay=True,
            perc=20, name=None, decay_resolution=None, **kwargs):
"""
    Function to retrieve data from a HiC-data object. Data can be stored as
    a square matrix, or drawn using matplotlib
:param data: can be either a path to a file with pre-processed reads
(filtered or not), or a Hi-C-data object
:param None resolution: at which to bin the data (try having a dense matrix
with < 10% of cells with zero interaction counts). Note: not necessary
if a hic_data object is passed as 'data'.
    :param False normalized: use normalized data, based on precalculated biases
    :param masked: a list of columns to be removed, usually because of too few
       interactions
    :param False by_chrom: data can be stored in a partitioned way. This
       parameter can take the values of:

       * 'intra': one output per chromosome will be created
       * 'inter': one output per possible pair of chromosomes will be
         created
       * 'all'  : both of the above outputs
:param None savefig: path where to store the output images. Note that, if
the by_chrom option is used, then savefig will be the name of the
directory containing the output files.
    :param None savedata: path where to store the output matrices. Note that, if
       the by_chrom option is used, then savedata will be the name of the
       directory containing the output files.
    :param None focus: can be either two numbers (i.e.: (1, 100)) specifying the
       start and end position of the sub-matrix to display (start and end, along
       the diagonal of the original matrix); or directly a chromosome name; or
       two chromosome names (i.e.: focus=('chr2', 'chrX')), in order to store the
       data corresponding to inter-chromosomal interactions between these two
       chromosomes
:param True decay: plot the correlation between genomic distance and
interactions (usually a decay).
    :param False force_image: force the image to be generated even if the
       matrix is very large
:param None clim: cutoff for the upper and lower bound in the coloring scale
of the heatmap. (perc_clim should be set to None)
:param None perc_clim: cutoff for the upper and lower bound in the coloring scale
of the heatmap; in percentile. (clim should be set to None)
    :param False pdf: when using the by_chrom option, to specify the format of
       the stored images
    :param jet cmap: color map to be used for the heatmap; "tadbit" color map is
       also implemented and will use percentiles of the distribution of
       interactions to define intensities of red.
:param None decay_resolution: chromatin fragment size to consider when
calculating decay of the number of interactions with genomic distance.
Default is equal to resolution of the matrix.
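
    Example (a minimal sketch; the input path and resolution are illustrative)::

        hic_map('sample_reads.tsv', resolution=100000,
                savefig='hic_map.png', decay=True)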
"""
if isinstance(data, basestring):
data = load_hic_data_from_reads(data, resolution=resolution, **kwargs)
        if not kwargs.get('get_sections', True) and decay:
            warn('WARNING: decay is not available when get_sections is off.')
            decay = False
if clim and perc_clim:
raise Exception('ERROR: only one of clim or perc_clim should be set\n')
hic_data = data
resolution = data.resolution
if not decay_resolution:
decay_resolution = resolution
if hic_data.bads and not masked:
masked = hic_data.bads
# save and draw the data
if by_chrom:
if focus:
raise Exception('Incompatible options focus and by_chrom\n')
if savedata:
mkdir(savedata)
if savefig:
mkdir(savefig)
for i, crm1 in enumerate(hic_data.chromosomes):
for crm2 in list(hic_data.chromosomes.keys())[i:]:
if by_chrom == 'intra' and crm1 != crm2:
continue
if by_chrom == 'inter' and crm1 == crm2:
continue
try:
subdata = hic_data.get_matrix(focus=(crm1, crm2), normalized=normalized)
start1, _ = hic_data.section_pos[crm1]
start2, _ = hic_data.section_pos[crm2]
masked1 = {}
masked2 = {}
                    if hic_data.bads:
                        # rescale masked column indices relative to this submatrix
masked1 = dict([(m - start1, hic_data.bads[m])
for m in hic_data.bads])
masked2 = dict([(m - start2, hic_data.bads[m])
for m in hic_data.bads])
if masked1 or masked2:
for i in range(len(subdata)):
if i in masked1:
subdata[i] = [float('nan')
for j in range(len(subdata))]
for j in range(len(subdata)):
if j in masked2:
subdata[i][j] = float('nan')
if savedata:
hic_data.write_matrix('%s/%s.mat' % (
savedata, '_'.join(set((crm1, crm2)))),
focus=(crm1, crm2),
normalized=normalized)
if show or savefig:
if (len(subdata) > 10000
and not kwargs.get('force_image', False)):
warn('WARNING: Matrix image not created, more than '
'10000 rows, use a lower resolution to create images')
continue
draw_map(subdata,
OrderedDict([(k, hic_data.chromosomes[k])
for k in list(hic_data.chromosomes.keys())
if k in [crm1, crm2]]),
hic_data.section_pos,
'%s/%s.%s' % (savefig,
'_'.join(set((crm1, crm2))),
'pdf' if pdf else 'png'),
show, one=True, clim=clim, perc_clim=perc_clim,
cmap=cmap, decay_resolution=decay_resolution,
perc=perc, name=name, cistrans=float('NaN'))
except ValueError as e:
print('Value ERROR: problem with chromosome %s' % crm1)
print(str(e))
except IndexError as e:
print('Index ERROR: problem with chromosome %s' % crm1)
print(str(e))
else:
if savedata:
hic_data.write_matrix(savedata, focus=focus,
normalized=normalized)
if show or savefig:
subdata = hic_data.get_matrix(focus=focus, normalized=normalized)
if (len(subdata) > 10000 and not kwargs.get('force_image', False)):
warn('WARNING: Matrix image not created, more than '
'10000 rows, use a lower resolution to create images')
return
start1 = hic_data._focus_coords(focus)[0]
if focus and masked:
# rescale masked
masked = dict([(m - start1, masked[m]) for m in masked])
if masked:
for i in range(len(subdata)):
if i in masked:
subdata[i] = [float('nan')
for j in range(len(subdata))]
for j in range(len(subdata)):
if j in masked:
subdata[i][j] = float('nan')
draw_map(subdata,
{} if focus else hic_data.chromosomes,
hic_data.section_pos, savefig, show,
one = True if focus else False, decay=decay,
clim=clim, perc_clim=perc_clim, cmap=cmap,
decay_resolution=decay_resolution,
perc=perc, normalized=normalized,
max_diff=kwargs.get('max_diff', None),
name=name, cistrans=float('NaN') if focus else
hic_data.cis_trans_ratio(normalized,
kwargs.get('exclude', None),
kwargs.get('diagonal', True),
kwargs.get('equals', None)))
def draw_map(data, genome_seq, cumcs, savefig, show, one=False, clim=None,
perc_clim=None, cmap='jet', decay=False, perc=20, name=None,
cistrans=None, decay_resolution=10000, normalized=False,
max_diff=None):
_ = plt.figure(figsize=(15.,12.5))
if not max_diff:
max_diff = len(data)
ax1 = plt.axes([0.34, 0.08, 0.6, 0.7205])
ax2 = plt.axes([0.07, 0.65, 0.21, 0.15])
if decay:
ax3 = plt.axes([0.07, 0.42, 0.21, 0.15])
plot_distance_vs_interactions(data, genome_seq=genome_seq, axe=ax3,
resolution=decay_resolution,
max_diff=max_diff, normalized=normalized)
ax4 = plt.axes([0.34, 0.805, 0.6, 0.04], sharex=ax1)
ax5 = plt.axes([0.34, 0.845, 0.6, 0.04], sharex=ax1)
ax6 = plt.axes([0.34, 0.885, 0.6, 0.04], sharex=ax1)
try:
minoridata = np.nanmin(data)
maxoridata = np.nanmax(data)
except AttributeError:
vals = [i for d in data for i in d if not np.isnan(i)]
minoridata = np.min(vals)
maxoridata = np.max(vals)
totaloridata = np.nansum([data[i][j] for i in range(len(data))
for j in range(i, len(data[i]))]) # may not be square
data = nozero_log(data, np.log2)
vals = np.array([i for d in data for i in d])
vals = vals[np.isfinite(vals)]
if perc_clim:
try:
clim = np.percentile(vals, perc_clim[0]), np.percentile(vals, perc_clim[1])
except ValueError:
clim = None
mindata = np.nanmin(vals)
maxdata = np.nanmax(vals)
diff = maxdata - mindata
norm = lambda x: (x - mindata) / diff
    posI = 0.01 if not clim else norm(clim[0]) if clim[0] is not None else 0.01
    posF = 1.0 if not clim else norm(clim[1]) if clim[1] is not None else 1.0
if cmap == 'tadbit':
cuts = perc
cdict = {'red' : [(0.0, 1.0, 1.0)],
'green': [(0.0, 1.0, 1.0)],
'blue' : [(0.0, 1.0, 1.0)]}
for i in np.linspace(posI, posF, cuts, endpoint=False):
prc = (i / (posF - posI)) / 1.75
pos = norm(np.percentile(vals, i * 100.))
# print '%7.4f %7.4f %7.4f %7.4f' % (prc, pos, np.percentile(vals, i * 100.), i)
cdict['red' ].append([pos, 1 , 1 ])
cdict['green'].append([pos, 1 - prc, 1 - prc])
cdict['blue' ].append([pos, 1 - prc, 1 - prc])
cdict['red' ].append([1.0, 1, 1])
cdict['green'].append([1.0, 0, 0])
cdict['blue' ].append([1.0, 0, 0])
cmap = LinearSegmentedColormap(cmap, cdict)
clim = None
else:
cmap = plt.get_cmap(cmap)
cmap.set_bad('darkgrey', 1)
ax1.imshow(data, interpolation='none',
cmap=cmap, vmin=clim[0] if clim else None, vmax=clim[1] if clim else None)
size1 = len(data)
size2 = len(data[0])
if size1 == size2:
for i in range(size1):
for j in range(i, size2):
if np.isnan(data[i][j]):
data[i][j] = 0
data[j][i] = 0
else:
for i in range(size1):
for j in range(size2):
if np.isnan(data[i][j]):
data[i][j] = 0
#data[j][i] = data[i][j]
try:
evals, evect = eigh(data)
sort_perm = evals.argsort()
        evect = evect[:, sort_perm]  # eigenvectors are the columns
    except Exception:
evals, evect = None, None
data = [i for d in data for i in d if np.isfinite(i)]
gradient = np.linspace(np.nanmin(data),
np.nanmax(data), max(size1, size2))
gradient = np.vstack((gradient, gradient))
try:
h = ax2.hist(data, color='darkgrey', linewidth=2,
bins=20, histtype='step', density=True)
except AttributeError:
h = ax2.hist(data, color='darkgrey', linewidth=2,
bins=20, histtype='step', normed=True)
_ = ax2.imshow(gradient, aspect='auto', cmap=cmap,
vmin=clim[0] if clim else None, vmax=clim[1] if clim else None,
extent=(np.nanmin(data), np.nanmax(data) , 0, max(h[0])))
if genome_seq:
for crm in genome_seq:
ax1.vlines([cumcs[crm][0]-.5, cumcs[crm][1]-.5], cumcs[crm][0]-.5, cumcs[crm][1]-.5,
color='w', linestyle='-', linewidth=1, alpha=1)
ax1.hlines([cumcs[crm][1]-.5, cumcs[crm][0]-.5], cumcs[crm][0]-.5, cumcs[crm][1]-.5,
color='w', linestyle='-', linewidth=1, alpha=1)
ax1.vlines([cumcs[crm][0]-.5, cumcs[crm][1]-.5], cumcs[crm][0]-.5, cumcs[crm][1]-.5,
color='k', linestyle='--')
ax1.hlines([cumcs[crm][1]-.5, cumcs[crm][0]-.5], cumcs[crm][0]-.5, cumcs[crm][1]-.5,
color='k', linestyle='--')
if not one:
vals = [0]
keys = ['']
for crm in genome_seq:
vals.append(cumcs[crm][0])
keys.append(crm)
vals.append(cumcs[crm][1])
ax1.set_yticks(vals)
ax1.set_yticklabels('')
ax1.set_yticks([float(vals[i]+vals[i+1])/2
for i in range(len(vals) - 1)], minor=True)
ax1.set_yticklabels(keys, minor=True)
for t in ax1.yaxis.get_minor_ticks():
t.tick1line.set_visible(False)
t.tick2line.set_visible(False)
# totaloridata = ''.join([j + ('' if (i+1)%3 else ',') for i, j in enumerate(str(totaloridata)[::-1])])[::-1].strip(',')
# minoridata = ''.join([j + ('' if (i+1)%3 else ',') for i, j in enumerate(str(minoridata)[::-1])])[::-1].strip(',')
# maxoridata = ''.join([j + ('' if (i+1)%3 else ',') for i, j in enumerate(str(maxoridata)[::-1])])[::-1].strip(',')
plt.figtext(0.05,0.25, ''.join([
(name + '\n') if name else '',
'Number of interactions: %s\n' % str(totaloridata),
('' if np.isnan(cistrans) else
('Percentage of cis interactions: %.0f%%\n' % (cistrans*100))),
'Min interactions: %s\n' % (minoridata),
'Max interactions: %s\n' % (maxoridata)]))
ax2.set_xlim((np.nanmin(data), np.nanmax(data)))
ax2.set_ylim((0, max(h[0])))
    ax1.set_xlim((-0.5, size2 - .5))
    ax1.set_ylim((-0.5, size1 - .5))
ax2.set_xlabel('log interaction count')
# we reduce the number of dots displayed.... we just want to see the shape
subdata = np.array(list(set([float(int(d*100))/100 for d in data])))
try:
normfit = sc_norm.pdf(subdata, np.nanmean(data), np.nanstd(data))
except AttributeError:
normfit = sc_norm.pdf(subdata, np.mean(data), np.std(data))
ax2.plot(subdata, normfit, 'w.', markersize=2.5, alpha=.4)
ax2.plot(subdata, normfit, 'k.', markersize=1.5, alpha=1)
ax2.set_title('skew: %.3f, kurtosis: %.3f' % (skew(data),
kurtosis(data)))
try:
ax4.vlines(list(range(size1)), 0, evect[:,-1], color='k')
except (TypeError, IndexError):
pass
ax4.hlines(0, 0, size2, color='red')
ax4.set_ylabel('E1')
ax4.set_yticklabels([])
try:
ax5.vlines(list(range(size1)), 0, evect[:,-2], color='k')
except (TypeError, IndexError):
pass
ax5.hlines(0, 0, size2, color='red')
ax5.set_ylabel('E2')
ax5.set_yticklabels([])
try:
ax6.vlines(list(range(size1)), 0, evect[:,-3], color='k')
except (TypeError, IndexError):
pass
ax6.hlines(0, 0, size2, color='red')
ax6.set_ylabel('E3')
ax6.set_yticklabels([])
xticklabels = ax4.get_xticklabels() + ax5.get_xticklabels() + ax6.get_xticklabels()
plt.setp(xticklabels, visible=False)
if savefig:
tadbit_savefig(savefig)
elif show:
plt.show()
plt.close('all')
def plot_distance_vs_interactions(data, min_diff=1, max_diff=1000, show=False,
                                  genome_seq=None, resolution=None, axe=None,
                                  savefig=None, normalized=False,
                                  plot_each_cell=False):
"""
Plot the number of interactions observed versus the genomic distance between
the mapped ends of the read. The slope is expected to be around -1, in
logarithmic scale and between 700 kb and 10 Mb (according to the prediction
of the fractal globule model).
:param data: input file name (either tsv or TADbit generated BAM), or
HiC_data object or list of lists
    :param 1 min_diff: lower limit (in number of bins)
    :param 1000 max_diff: upper limit (in number of bins) to look for
    :param None resolution: group reads that are closer than this resolution
       parameter
    :param False plot_each_cell: if False, only the mean distances by bin
will be represented, otherwise each pair of interactions will be plotted.
:param None axe: a matplotlib.axes.Axes object to define the plot
appearance
:param None savefig: path to a file where to save the image generated;
if None, the image will be shown using matplotlib GUI (the extension
of the file name will determine the desired format).
:returns: slope, intercept and R square of each of the 3 correlations
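
    Example (a sketch assuming ``hic_data`` is a HiC_data object)::

        ((a1, b1, r1),
         (a2, b2, r2),
         (a3, b3, r3)) = plot_distance_vs_interactions(hic_data,
                                                       max_diff=1000,
                                                       savefig='decay.png')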
"""
if isinstance(data, basestring):
resolution = resolution or 1
dist_intr = dict([(i, {})
for i in range(min_diff, max_diff)])
fhandler = open(data)
line = next(fhandler)
while line.startswith('#'):
line = next(fhandler)
try:
while True:
_, cr1, ps1, _, _, _, _, cr2, ps2, _ = line.split('\t', 9)
if cr1 != cr2:
line = next(fhandler)
continue
diff = abs(int(ps1) // resolution - int(ps2) // resolution)
if max_diff > diff >= min_diff:
try:
dist_intr[diff][int(ps1) // resolution] += 1.
except KeyError:
dist_intr[diff][int(ps1) // resolution] = 1.
line = next(fhandler)
except StopIteration:
pass
fhandler.close()
for diff in dist_intr:
dist_intr[diff] = [dist_intr[diff].get(k, 0)
for k in range(max(dist_intr[diff]) - diff)]
elif isinstance(data, HiC_data):
resolution = resolution or data.resolution
dist_intr = dict([(i, []) for i in range(min_diff, max_diff)])
if normalized:
get_data = lambda x, y: data[x, y] / data.bias[x] / data.bias[y]
else:
get_data = lambda x, y: data[x, y]
max_diff = min(len(data), max_diff)
if data.section_pos:
for crm in data.section_pos:
for diff in range(min_diff, min(
(max_diff, 1 + data.chromosomes[crm]))):
for i in range(data.section_pos[crm][0],
data.section_pos[crm][1] - diff):
dist_intr[diff].append(get_data(i, i + diff))
else:
for diff in range(min_diff, max_diff):
for i in range(len(data) - diff):
if not np.isnan(data[i, i + diff]):
                        dist_intr[diff].append(get_data(i, i + diff))
    elif isinstance(data, dict):  # decay/expected dictionary passed: average the values over chromosomes
dist_intr = {}
for i in range(min_diff, max_diff):
val = [data[c][i] for c in data
if i in data[c] and data[c][i] != data[c].get(i-1, 0)]
if val:
dist_intr[i] = [sum(val) / float(len(val))]
else:
dist_intr[i] = [0]
else:
dist_intr = dict([(i, []) for i in range(min_diff, max_diff)])
if genome_seq:
max_diff = min(max(genome_seq.values()), max_diff)
cnt = 0
for crm in genome_seq:
for diff in range(min_diff, min(
(max_diff, genome_seq[crm]))):
for i in range(cnt, cnt + genome_seq[crm] - diff):
if not np.isnan(data[i][i + diff]):
dist_intr[diff].append(data[i][i + diff])
cnt += genome_seq[crm]
else:
max_diff = min(len(data), max_diff)
for diff in range(min_diff, max_diff):
for i in range(len(data) - diff):
if not np.isnan(data[i][i + diff]):
dist_intr[diff].append(data[i][i + diff])
resolution = resolution or 1
if not axe:
fig=plt.figure()
axe = fig.add_subplot(111)
    # remove the last part of the plot in case no interactions are counted... reduce max_diff
for diff in range(max_diff - 1, min_diff, -1):
try:
if not dist_intr[diff]:
del(dist_intr[diff])
max_diff -=1
continue
except KeyError:
max_diff -=1
continue
break
    # get the mean values per bin
mean_intr = dict([(i, float(sum(dist_intr[i])) / len(dist_intr[i]))
for i in dist_intr if len(dist_intr[i])])
if plot_each_cell:
xp, yp = [], []
for x, y in sorted(list(dist_intr.items()), key=lambda x:x[0]):
xp.extend([x] * len(y))
yp.extend(y)
x = []
y = []
for k in range(len(xp)):
if yp[k]:
x.append(xp[k])
y.append(yp[k])
axe.plot(x, y, color='grey', marker='.', alpha=0.1, ms=1,
linestyle='None')
xp, yp = list(zip(*sorted(list(mean_intr.items()), key=lambda x:x[0])))
x = []
y = []
for k in range(len(xp)):
if yp[k]:
x.append(xp[k])
y.append(yp[k])
axe.plot(x, y, 'k.', alpha=0.4)
best = (float('-inf'), 0, 0, 0, 0, 0, 0, 0, 0, 0)
logx = np.log(x)
logy = np.log(y)
ntries = 100
# set k for better fit
# for k in xrange(1, ntries/5, ntries/5/5):
if resolution == 1:
k = 1
for i in range(3, ntries-2-k):
            v1 = i * len(x) // ntries
try:
a1, b1, r21, _, _ = linregress(logx[ :v1], logy[ :v1])
except ValueError:
a1 = b1 = r21 = 0
r21 *= r21
for j in range(i + 1 + k, ntries - 2 - k):
                v2 = j * len(x) // ntries
try:
a2, b2, r22, _, _ = linregress(logx[v1+k:v2], logy[v1+k:v2])
a3, b3, r23, _, _ = linregress(logx[v2+k: ], logy[v2+k: ])
except ValueError:
a2 = b2 = r22 = 0
a3 = b3 = r23 = 0
r2 = r21 + r22**2 + r23**2
if r2 > best[0]:
best = (r2, v1, v2, a1, a2, a3,
b1, b2, b3, k)
# plot line of best fit
(v1, v2,
a1, a2, a3,
b1, b2, b3, k) = best[1:]
yfit1 = lambda xx: np.exp(b1 + a1*np.array (np.log(xx)))
yfit2 = lambda xx: np.exp(b2 + a2*np.array (np.log(xx)))
yfit3 = lambda xx: np.exp(b3 + a3*np.array (np.log(xx)))
axe.plot(x[ :v1], yfit1(x[ :v1] ), color= 'yellow', lw=2,
label = r'$\alpha_{%s}=%.2f$' % (
'0-0.7 \mathrm{ Mb}' if resolution != 1 else '1', a1))
#label = r'$\alpha_1=%.2f$ (0-%d)' % (a1, x[v1]))
axe.plot(x[v1+k:v2], yfit2(x[v1+k:v2]), color= 'orange', lw=2,
label = r'$\alpha_{%s}=%.2f$' % (
'0.7-10 \mathrm{ Mb}' if resolution != 1 else '2', a2))
# label = r'$\alpha_2=%.2f$ (%d-%d)' % (a2, x[v1], x[v2]))
axe.plot(x[v2+k: ], yfit3(x[v2+k: ] ), color= 'red' , lw=2,
label = r'$\alpha_{%s}=%.2f$' % (
'10 \mathrm{ Mb}-\infty' if resolution != 1 else '3', a3))
# label = r'$\alpha_3=%.2f$ (%d-$\infty$)' % (a3, x[v2+k]))
else:
# from 0.7 Mb
v1 = 700000 // resolution
# to 10 Mb
v2 = 10000000 // resolution
try:
a1, b1, r21, _, _ = linregress(logx[ :v1], logy[ :v1])
except ValueError:
a1, b1, r21 = 0, 0, 0
try:
a2, b2, r22, _, _ = linregress(logx[v1:v2], logy[v1:v2])
except ValueError:
a2, b2, r22 = 0, 0, 0
try:
a3, b3, r23, _, _ = linregress(logx[v2: ], logy[v2: ])
except ValueError:
a3, b3, r23 = 0, 0, 0
yfit1 = lambda xx: np.exp(b1 + a1*np.array (np.log(xx)))
yfit2 = lambda xx: np.exp(b2 + a2*np.array (np.log(xx)))
yfit3 = lambda xx: np.exp(b3 + a3*np.array (np.log(xx)))
axe.plot(x[ :v1], yfit1(x[ :v1] ), color= 'yellow', lw=2,
label = r'$\alpha_{%s}=%.2f$' % (
'0-0.7 \mathrm{ Mb}' if resolution != 1 else '1', a1))
#label = r'$\alpha_1=%.2f$ (0-%d)' % (a1, x[v1]))
axe.plot(x[v1:v2], yfit2(x[v1:v2]), color= 'orange', lw=2,
label = r'$\alpha_{%s}=%.2f$' % (
'0.7-10 \mathrm{ Mb}' if resolution != 1 else '2', a2))
# label = r'$\alpha_2=%.2f$ (%d-%d)' % (a2, x[v1], x[v2]))
axe.plot(x[v2: ], yfit3(x[v2: ] ), color= 'red' , lw=2,
label = r'$\alpha_{%s}=%.2f$' % (
'10 \mathrm{ Mb}-\infty' if resolution != 1 else '3', a3))
# label = r'$\alpha_3=%.2f$ (%d-$\infty$)' % (a3, x[v2+k]))
axe.set_ylabel('Log interaction count')
axe.set_xlabel('Log genomic distance (resolution: %s)' % nicer(resolution))
axe.legend(loc='lower left', frameon=False)
axe.set_xscale('log')
axe.set_yscale('log')
axe.set_xlim((min_diff, max_diff))
try:
with catch_warnings():
simplefilter("ignore")
axe.set_ylim((0, max(y)))
except ValueError:
pass
if savefig:
tadbit_savefig(savefig)
plt.close('all')
elif show:
plt.show()
plt.close('all')
return (a1, b1, r21), (a2, b2, r22), (a3, b3, r23)
def plot_iterative_mapping(fnam1, fnam2, total_reads=None, axe=None, savefig=None):
"""
    Plots the number of reads mapped at each step of the mapping process (in
    the case of the iterative mapping, each step is a mapping process with a
    given size of fragments).

    :param fnam1: input file name for the first read-end
    :param fnam2: input file name for the second read-end
:param total_reads: total number of reads in the initial FASTQ file
:param None axe: a matplotlib.axes.Axes object to define the plot
appearance
:param None savefig: path to a file where to save the image generated;
if None, the image will be shown using matplotlib GUI (the extension
of the file name will determine the desired format).
:returns: a dictionary with the number of reads per mapped length
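
    Example (a minimal sketch; file names are illustrative)::

        counts = plot_iterative_mapping('read1_mapped.tsv', 'read2_mapped.tsv',
                                        total_reads=10000000,
                                        savefig='mapping_stats.png')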
"""
count_by_len = {}
total_reads = total_reads or 1
if not axe:
fig=plt.figure()
_ = fig.add_subplot(111)
colors = ['olive', 'darkcyan']
iteration = False
for i, fnam in enumerate([fnam1, fnam2]):
fhandler = open(fnam)
line = next(fhandler)
count_by_len[i] = {}
while line.startswith('#'):
if line.startswith('# MAPPED '):
itr, num = line.split()[2:]
count_by_len[i][int(itr)] = int(num)
line = next(fhandler)
if not count_by_len[i]:
iteration = True
try:
while True:
_, length, _, _ = line.rsplit('\t', 3)
try:
count_by_len[i][int(length)] += 1
except KeyError:
count_by_len[i][int(length)] = 1
line = next(fhandler)
except StopIteration:
pass
fhandler.close()
lengths = sorted(count_by_len[i].keys())
for k in lengths[::-1]:
count_by_len[i][k] += sum([count_by_len[i][j]
for j in lengths if j < k])
plt.plot(lengths, [float(count_by_len[i][l]) / total_reads
for l in lengths],
label='read' + str(i + 1), linewidth=2, color=colors[i])
if iteration:
plt.xlabel('read length (bp)')
else:
plt.xlabel('Iteration number')
if total_reads != 1:
plt.ylabel('Proportion of mapped reads')
else:
plt.ylabel('Number of mapped reads')
plt.legend(loc=4)
if savefig:
tadbit_savefig(savefig)
elif not axe:
plt.show()
plt.close('all')
return count_by_len
def fragment_size(fnam, savefig=None, nreads=None, max_size=99.9, axe=None,
show=False, xlog=False, stats=('median', 'perc_max'),
too_large=10000):
"""
Plots the distribution of dangling-ends lengths
:param fnam: input file name
:param None savefig: path where to store the output images.
    :param 99.9 max_size: top percentile of distances to consider; very long
       outliers are usually found in the top 0.1%.
:param False xlog: represent x axis in logarithmic scale
    :param ('median', 'perc_max') stats: returns this set of values calculated from the
       distribution of insert/fragment sizes. Possible values are:

       - 'median' median of the distribution
       - 'mean' mean of the distribution
       - 'perc_max' percentile defined by the other parameter 'max_size'
       - 'first_decay' starting from the median of the distribution, the
         first window where 10 consecutive insert sizes are counted less than
         a given value (this given value is equal to the sum of all
         sizes divided by 100,000)
       - 'MAD' right double median absolute deviation

    :param 10000 too_large: upper bound limit for fragment size to consider
    :param None nreads: number of reads to process (default: all reads)

    :returns: the values of the statistics requested in the 'stats' parameter
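
    Example (a minimal sketch; the input path is illustrative)::

        median_size, perc_max = fragment_size('reads.tsv', nreads=1000000,
                                              stats=('median', 'perc_max'))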
"""
distr = {}
genome_seq = OrderedDict()
pos = 0
fhandler = open(fnam)
for line in fhandler:
if line.startswith('#'):
if line.startswith('# CRM '):
crm, clen = line[6:].split('\t')
genome_seq[crm] = int(clen)
else:
break
pos += len(line)
fhandler.seek(pos)
des = []
for line in fhandler:
(crm1, pos1, dir1, _, re1, _,
crm2, pos2, dir2, _, re2) = line.strip().split('\t')[1:12]
if re1 == re2 and crm1 == crm2 and dir1 == '1' and dir2 == '0':
pos1, pos2 = int(pos1), int(pos2)
des.append(pos2 - pos1)
if len(des) == nreads:
break
des = [i for i in des if i <= too_large]
fhandler.close()
if not des:
raise Exception('ERROR: no dangling-ends found in %s' % (fnam))
max_perc = np.percentile(des, max_size)
perc99 = np.percentile(des, 99)
perc01 = np.percentile(des, 1)
perc50 = np.percentile(des, 50)
meanfr = np.mean(des)
perc95 = np.percentile(des, 95)
perc05 = np.percentile(des, 5)
to_return = {'median': perc50}
cutoff = len(des) / 100000.
count = 0
for v in range(int(perc50), int(max(des))):
if des.count(v) < cutoff:
count += 1
else:
count = 0
if count >= 10:
to_return['first_decay'] = v - 10
break
    else:
        raise ZeroDivisionError('ERROR: could not find the first decay of the distribution')
to_return['perc_max'] = max_perc
to_return['MAD'] = mad(des)
to_return['mean'] = meanfr
if not savefig and not axe and not show:
return [to_return[k] for k in stats]
ax = setup_plot(axe, figsize=(10, 5.5))
desapan = ax.axvspan(perc95, perc99, facecolor='black', alpha=.2,
label='1-99%% DEs\n(%.0f-%.0f nts)' % (perc01, perc99))
ax.axvspan(perc01, perc05, facecolor='black', alpha=.2)
desapan = ax.axvspan(perc05, perc95, facecolor='black', alpha=.4,
label='5-95%% DEs\n(%.0f-%.0f nts)' % (perc05, perc95))
deshist = ax.hist(des, bins=100, range=(0, max_perc), lw=2,
alpha=.5, edgecolor='darkred', facecolor='darkred', label='Dangling-ends')
ylims = ax.get_ylim()
plots = []
ax.set_xlabel('Genomic distance between reads')
ax.set_ylabel('Count')
    ax.set_title('Distribution of dangling-ends ' +
                 'lengths\nmedian: %s (mean: %s), top %.1f%%: %0.f nts' % (
                     int(perc50), int(meanfr), max_size, int(max_perc)))
if xlog:
ax.set_xscale('log')
ax.set_xlim((50, max_perc))
plt.subplots_adjust(left=0.1, right=0.75)
ax.legend(bbox_to_anchor=(1.4, 1), frameon=False)
if savefig:
tadbit_savefig(savefig)
elif show and not axe:
plt.show()
plt.close('all')
return [to_return[k] for k in stats]
def plot_genomic_distribution(fnam, first_read=None, resolution=10000,
                              ylim=None, yscale=None, savefig=None, show=False,
                              savedata=None, chr_names=None, nreads=None):
"""
Plot the number of reads in bins along the genome (or along a given
chromosome).
:param fnam: input file name
    :param None first_read: deprecated, should no longer be used
    :param 10000 resolution: group reads that are closer than this resolution
       parameter
    :param None ylim: a tuple of lower and upper bound for the y axis
    :param None yscale: if set to "log", values will be represented in log2
       scale
:param None savefig: path to a file where to save the image generated;
if None, the image will be shown using matplotlib GUI (the extension
of the file name will determine the desired format).
:param None savedata: path where to store the output read counts per bin.
    :param None chr_names: a list of chromosome names in case only some of
       them need to be plotted (this option may take even longer than the default)
:param None nreads: number of reads to process (default: all reads)
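
    Example (a minimal sketch; path and chromosome names are illustrative)::

        plot_genomic_distribution('reads.tsv', resolution=50000,
                                  chr_names=['chr1', 'chr2'],
                                  savefig='coverage.png')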
"""
if first_read:
        warn('WARNING: first_read parameter should no longer be used.')
distr = {}
genome_seq = OrderedDict()
if chr_names:
chr_names = set(chr_names)
cond1 = lambda x: x not in chr_names
else:
cond1 = lambda x: False
if nreads:
cond2 = lambda x: x >= nreads
else:
cond2 = lambda x: False
cond = lambda x, y: cond1(x) or cond2(y)
count = 0
pos = 0
fhandler = open(fnam)
for line in fhandler:
if line.startswith('#'):
if line.startswith('# CRM '):
crm, clen = line[6:].split('\t')
genome_seq[crm] = int(clen)
else:
break
pos += len(line)
fhandler.seek(pos)
for line in fhandler:
line = line.strip().split('\t')
count += 1
for idx1, idx2 in ((1, 3), (7, 9)):
crm, pos = line[idx1:idx2]
if cond(crm, count):
if cond2(count):
break
continue
pos = int(pos) // resolution
try:
distr[crm][pos] += 1
except KeyError:
try:
distr[crm][pos] = 1
except KeyError:
distr[crm] = {pos: 1}
else:
continue
break
fhandler.close()
if savefig or show:
_ = plt.figure(figsize=(15, 1 + 3 * len(
chr_names if chr_names else list(distr.keys()))))
max_y = max([max(distr[c].values()) for c in distr])
max_x = max([len(list(distr[c].values())) for c in distr])
ncrms = len(chr_names if chr_names else genome_seq if genome_seq else distr)
data = {}
for i, crm in enumerate(chr_names if chr_names else genome_seq
if genome_seq else distr):
try:
# data[crm] = [distr[crm].get(j, 0) for j in xrange(max(distr[crm]))] # genome_seq[crm]
data[crm] = [distr[crm].get(j, 0)
for j in range(genome_seq[crm] // resolution + 1)]
if savefig or show:
plt.subplot(ncrms, 1, i + 1)
plt.plot(list(range(genome_seq[crm] // resolution + 1)), data[crm],
color='red', lw=1.5, alpha=0.7)
if yscale:
plt.yscale(yscale)
except KeyError:
pass
if savefig or show:
if ylim:
plt.vlines(genome_seq[crm] // resolution, ylim[0], ylim[1])
else:
plt.vlines(genome_seq[crm] // resolution, 0, max_y)
plt.xlim((0, max_x))
plt.ylim(ylim or (0, max_y))
plt.title(crm)
if savefig:
tadbit_savefig(savefig)
if not show:
plt.close('all')
elif show:
plt.show()
if savedata:
out = open(savedata, 'w')
out.write('# CRM\tstart-end\tcount\n')
out.write('\n'.join('%s\t%d-%d\t%d' % (c, (i * resolution) + 1,
((i + 1) * resolution), v)
for c in data for i, v in enumerate(data[c])))
out.write('\n')
out.close()
def _unitize(vals):
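    # np.argsort gives the sorting permutation rather than true ranks, but its
    # variance equals that of the scaled ranks, which is all the SCC weights use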
return np.argsort(vals) / float(len(vals))
def correlate_matrices(hic_data1, hic_data2, max_dist=10, intra=False, axe=None,
                       savefig=None, show=False, savedata=None, min_dist=1,
                       normalized=False, remove_bad_columns=True, **kwargs):
"""
Compare the interactions of two Hi-C matrices at a given distance,
with Spearman rank correlation.
    Also computes the SCC reproducibility score as in HiCRep (see
    https://doi.org/10.1101/gr.220640.117). Its implementation is inspired
    by the version implemented in dryhic by Enrique Vidal
    (https://github.com/qenvio/dryhic).
:param hic_data1: Hi-C-data object
:param hic_data2: Hi-C-data object
    :param 10 max_dist: maximum distance from the diagonal (e.g. 10 means we
       will not look further than 10 times the resolution)
    :param 1 min_dist: minimum distance from the diagonal (set to 0 to
       reproduce the result from HiCRep)
:param None savefig: path to save the plot
:param False intra: only takes into account intra-chromosomal contacts
:param False show: displays the plot
:param False normalized: use normalized data
    :param True remove_bad_columns: computes the union of bad columns between
       samples and excludes them from the comparison
:returns: list of correlations, list of genomic distances, SCC and standard
deviation of SCC
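
    Example (a sketch assuming two comparable HiC_data objects)::

        corrs, dists, scc_score, scc_std = correlate_matrices(
            hic_data1, hic_data2, max_dist=10, normalized=True,
            savefig='correlation.png')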
"""
spearmans = []
pearsons = []
dists = []
weigs = []
if normalized:
get_the_guy1 = lambda i, j: (hic_data1[j, i] / hic_data1.bias[i] /
hic_data1.bias[j])
get_the_guy2 = lambda i, j: (hic_data2[j, i] / hic_data2.bias[i] /
hic_data2.bias[j])
else:
get_the_guy1 = lambda i, j: hic_data1[j, i]
get_the_guy2 = lambda i, j: hic_data2[j, i]
if remove_bad_columns:
# union of bad columns
bads = hic_data1.bads.copy()
bads.update(hic_data2.bads)
if (intra and hic_data1.sections and hic_data2.sections and
hic_data1.sections == hic_data2.sections):
        for dist in range(min_dist, max_dist + min_dist):
diag1 = []
diag2 = []
for crm in hic_data1.section_pos:
for j in range(hic_data1.section_pos[crm][0],
hic_data1.section_pos[crm][1] - dist):
i = j + dist
if j in bads or i in bads:
continue
diag1.append(get_the_guy1(i, j))
diag2.append(get_the_guy2(i, j))
spearmans.append(spearmanr(diag1, diag2)[0])
            pearsons.append(pearsonr(diag1, diag2)[0])
r1 = _unitize(diag1)
r2 = _unitize(diag2)
weigs.append((np.var(r1, ddof=1) *
np.var(r2, ddof=1))**0.5 * len(diag1))
dists.append(dist)
else:
if intra:
            warn('WARNING: hic_data does not contain chromosome coordinates, ' +
                 'intra set to False')
for dist in range(min_dist, max_dist + min_dist):
diag1 = []
diag2 = []
for j in range(len(hic_data1) - dist):
i = j + dist
if j in bads or i in bads:
continue
diag1.append(get_the_guy1(i, j))
diag2.append(get_the_guy2(i, j))
spearmans.append(spearmanr(diag1, diag2)[0])
pearsons.append(pearsonr(diag1, diag2)[0])
r1 = _unitize(diag1)
r2 = _unitize(diag2)
weigs.append((np.var(r1, ddof=1) *
np.var(r2, ddof=1))**0.5 * len(diag1))
dists.append(dist)
# compute scc
# print pearsons
# print weigs
tot_weigth = sum(weigs)
scc = sum(pearsons[i] * weigs[i] / tot_weigth
for i in range(len(pearsons)))
var_corr = np.var(pearsons, ddof=1)
std = (sum(weigs[i]**2 for i in range(len(pearsons))) * var_corr /
sum(weigs)**2)**0.5
# plot
if show or savefig or axe:
if not axe:
fig = plt.figure()
axe = fig.add_subplot(111)
given_axe = False
else:
given_axe = True
axe.plot(dists, spearmans, color='orange', linewidth=3, alpha=.8)
axe.set_xlabel('Genomic distance in bins')
axe.set_ylabel('Spearman rank correlation')
axe.set_xlim((0, dists[-1]))
if savefig:
tadbit_savefig(savefig)
if show:
plt.show()
if not given_axe:
plt.close('all')
if savedata:
out = open(savedata, 'w')
out.write('# genomic distance\tSpearman rank correlation\n')
for i in range(len(spearmans)):
out.write('%s\t%s\n' % (dists[i], spearmans[i]))
out.close()
if kwargs.get('get_bads', False):
return spearmans, dists, scc, std, bads
return spearmans, dists, scc, std
def scc(mat1, mat2, max_dist=50, min_dist=1):
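    """
    Compute the SCC reproducibility score (as in HiCrep) between two matrices
    given as lists of lists (cells may be NaN).

    :param mat1: first matrix
    :param mat2: second matrix
    :param 50 max_dist: maximum distance from the diagonal (in bins)
    :param 1 min_dist: minimum distance from the diagonal (in bins)

    :returns: SCC and its standard deviation
    """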
pearsons=[]
dists = []
weigs=[]
for dist in range(min_dist, max_dist + min_dist):
diag1 = []
diag2 = []
for j in range(len(mat1) - dist):
i = j + dist
            if np.isnan(mat1[i][j]) or np.isnan(mat2[i][j]):
continue
diag1.append(mat1[i][j])
diag2.append(mat2[i][j])
if len(diag1) > 1:
with catch_warnings():
simplefilter("ignore")
p_corr = pearsonr(diag1, diag2)[0]
if not np.isnan(p_corr):
pearsons.append(p_corr)
r1 = _unitize(diag1)
r2 = _unitize(diag2)
weigs.append((np.var(r1, ddof=1) *
np.var(r2, ddof=1))**0.5 * len(diag1))
dists.append(dist)
if len(pearsons) == 0:
return 0, 0
tot_weigth = sum(weigs)
scc = sum(pearsons[i] * weigs[i] / tot_weigth
for i in range(len(pearsons)))
var_corr = np.var(pearsons, ddof=1)
std = (sum(weigs[i]**2 for i in range(len(pearsons))) * var_corr /
sum(weigs)**2)**0.5
return scc, std
def _evec_dist(v1,v2):
d1=np.dot(v1-v2,v1-v2)
d2=np.dot(v1+v2,v1+v2)
if d1<d2:
d=d1
else:
d=d2
return np.sqrt(d)
def _get_Laplacian(M):
S=M.sum(1)
i_nz=np.where(S>0)[0]
S=S[i_nz]
M=(M[i_nz].T)[i_nz].T
S=1/np.sqrt(S)
M=S*M
M=(S*M.T).T
n=np.size(S)
M=np.identity(n)-M
M=(M+M.T)/2
return M
def get_ipr(evec):
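    # participation ratio: roughly the number of bins contributing to the
    # eigenvector (used below to filter out overly localized eigenvectors)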
ipr=1.0/(evec*evec*evec*evec).sum()
return ipr
def get_reproducibility(hic_data1, hic_data2, num_evec, verbose=True,
normalized=False, remove_bad_columns=True):
"""
Compute reproducibility score similarly to HiC-spector
(https://doi.org/10.1093/bioinformatics/btx152)
:param hic_data1: Hi-C-data object
:param hic_data2: Hi-C-data object
    :param num_evec: number of eigenvectors to compare (20 is a reasonable
       choice)
    :returns: reproducibility score (below 0.5 ~ different cell types)
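
    Example (a sketch assuming two comparable HiC_data objects)::

        score = get_reproducibility(hic_data1, hic_data2, num_evec=20,
                                    normalized=True)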
"""
M1 = hic_data1.get_matrix(normalized=normalized)
M2 = hic_data2.get_matrix(normalized=normalized)
if remove_bad_columns:
# union of bad columns
bads = hic_data1.bads.copy()
bads.update(hic_data2.bads)
        # remove them from both matrices
for bad in sorted(bads, reverse=True):
del(M1[bad])
del(M2[bad])
for i in range(len(M1)):
_ = M1[i].pop(bad)
_ = M2[i].pop(bad)
M1 = np.matrix(M1)
M2 = np.matrix(M2)
k1=np.sign(M1.A).sum(1)
d1=np.diag(M1.A)
kd1=~((k1==1)*(d1>0))
k2=np.sign(M2.A).sum(1)
d2=np.diag(M2.A)
kd2=~((k2==1)*(d2>0))
iz=np.nonzero((k1+k2>0)*(kd1>0)*(kd2>0))[0]
M1b=(M1[iz].A.T)[iz].T
M2b=(M2[iz].A.T)[iz].T
i_nz1=np.where(M1b.sum(1)>0)[0]
i_nz2=np.where(M2b.sum(1)>0)[0]
i_z1=np.where(M1b.sum(1)==0)[0]
i_z2=np.where(M2b.sum(1)==0)[0]
M1b_L=_get_Laplacian(M1b)
M2b_L=_get_Laplacian(M2b)
a1, b1=eigsh(M1b_L,k=num_evec,which="SM")
a2, b2=eigsh(M2b_L,k=num_evec,which="SM")
b1_extend=np.zeros((np.size(M1b,0),num_evec))
b2_extend=np.zeros((np.size(M2b,0),num_evec))
for i in range(num_evec):
b1_extend[i_nz1,i]=b1[:,i]
b2_extend[i_nz2,i]=b2[:,i]
ipr_cut=5
ipr1=np.zeros(num_evec)
ipr2=np.zeros(num_evec)
for i in range(num_evec):
ipr1[i]=get_ipr(b1_extend[:,i])
ipr2[i]=get_ipr(b2_extend[:,i])
b1_extend_eff=b1_extend[:,ipr1>ipr_cut]
b2_extend_eff=b2_extend[:,ipr2>ipr_cut]
num_evec_eff=min(np.size(b1_extend_eff,1),np.size(b2_extend_eff,1))
evd=np.zeros(num_evec_eff)
for i in range(num_evec_eff):
evd[i]=_evec_dist(b1_extend_eff[:,i],b2_extend_eff[:,i])
Sd=evd.sum()
l=np.sqrt(2)
evs=abs(l-Sd/num_evec_eff)/l
N = float(M1.shape[1])
if verbose:
if (np.sum(ipr1>N/100)<=1)|(np.sum(ipr2>N/100)<=1):
print("at least one of the maps does not look like typical Hi-C maps")
else:
print("size of maps: %d" %(np.size(M1,0)))
print("reproducibility score: %6.3f " %(evs))
print("num_evec_eff: %d" %(num_evec_eff))
return evs
def eig_correlate_matrices(hic_data1, hic_data2, nvect=6, normalized=False,
                           savefig=None, show=False, savedata=None,
                           remove_bad_columns=True, **kwargs):
"""
    Compare the interactions of two Hi-C matrices using their first nvect
    (default 6) eigenvectors, with Pearson correlation
:param hic_data1: Hi-C-data object
:param hic_data2: Hi-C-data object
:param 6 nvect: number of eigenvectors to compare
:param None savefig: path to save the plot
:param False show: displays the plot
:param False normalized: use normalized data
    :param True remove_bad_columns: computes the union of bad columns between
       samples and excludes them from the comparison
:param kwargs: any argument to pass to matplotlib imshow function
:returns: matrix of correlations
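
    Example (a sketch assuming two comparable HiC_data objects)::

        corr = eig_correlate_matrices(hic_data1, hic_data2, nvect=6,
                                      savefig='eigen_correlation.png')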
"""
data1 = hic_data1.get_matrix(normalized=normalized)
data2 = hic_data2.get_matrix(normalized=normalized)
## reduce matrices to remove bad columns
if remove_bad_columns:
# union of bad columns
bads = hic_data1.bads.copy()
bads.update(hic_data2.bads)
        # remove them from both matrices
for bad in sorted(bads, reverse=True):
del(data1[bad])
del(data2[bad])
for i in range(len(data1)):
_ = data1[i].pop(bad)
_ = data2[i].pop(bad)
# get the log
data1 = nozero_log(data1, np.log2)
data2 = nozero_log(data2, np.log2)
# get the eigenvectors
ev1, evect1 = eigh(data1)
ev2, evect2 = eigh(data2)
corr = [[0 for _ in range(nvect)] for _ in range(nvect)]
# sort eigenvectors according to their eigenvalues => first is last!!
    sort_perm = ev1.argsort()
    ev1.sort()
    evect1 = evect1[:, sort_perm]  # eigenvectors are the columns
    sort_perm = ev2.argsort()
    ev2.sort()
    evect2 = evect2[:, sort_perm]
# calculate Pearson correlation
for i in range(nvect):
for j in range(nvect):
corr[i][j] = abs(pearsonr(evect1[:,-i-1],
evect2[:,-j-1])[0])
    # plot
    if show or savefig:
        axe = plt.axes([0.1, 0.1, 0.6, 0.8])
        cbaxes = plt.axes([0.85, 0.1, 0.03, 0.8])
        im = axe.imshow(corr, interpolation="nearest", origin='lower', **kwargs)
axe.set_xlabel('Eigen Vectors exp. 1')
axe.set_ylabel('Eigen Vectors exp. 2')
axe.set_xticks(list(range(nvect)))
axe.set_yticks(list(range(nvect)))
axe.set_xticklabels(list(range(1, nvect + 1)))
axe.set_yticklabels(list(range(1, nvect + 1)))
axe.xaxis.set_tick_params(length=0, width=0)
axe.yaxis.set_tick_params(length=0, width=0)
cbar = plt.colorbar(im, cax = cbaxes )
cbar.ax.set_ylabel('Pearson correlation', rotation=90*3,
verticalalignment='bottom')
axe2 = axe.twinx()
axe2.set_yticks(list(range(nvect)))
axe2.set_yticklabels(['%.1f' % (e) for e in ev2[-nvect:][::-1]])
axe2.set_ylabel('corresponding Eigen Values exp. 2', rotation=90*3,
verticalalignment='bottom')
axe2.set_ylim((-0.5, nvect - 0.5))
axe2.yaxis.set_tick_params(length=0, width=0)
axe3 = axe.twiny()
axe3.set_xticks(list(range(nvect)))
axe3.set_xticklabels(['%.1f' % (e) for e in ev1[-nvect:][::-1]])
axe3.set_xlabel('corresponding Eigen Values exp. 1')
axe3.set_xlim((-0.5, nvect - 0.5))
axe3.xaxis.set_tick_params(length=0, width=0)
axe.set_ylim((-0.5, nvect - 0.5))
axe.set_xlim((-0.5, nvect - 0.5))
if savefig:
tadbit_savefig(savefig)
if show:
plt.show()
plt.close('all')
if savedata:
out = open(savedata, 'w')
out.write('# ' + '\t'.join(['Eigen Vector %s'% i
for i in range(nvect)]) + '\n')
for i in range(nvect):
out.write('\t'.join([str(corr[i][j])
for j in range(nvect)]) + '\n')
out.close()
if kwargs.get('get_bads', False):
return corr, bads
else:
return corr
def plot_rsite_reads_distribution(reads_file, outprefix, window=20,
maxdist=1000):
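    """
    Plot the distribution of read-end positions around restriction sites, and
    write the counts to <outprefix>_count.dat and the histogram to
    <outprefix>_plot.pdf.

    :param reads_file: path to a TADbit TSV file of mapped read pairs
    :param outprefix: prefix for the output count and plot files
    :param 20 window: number of nucleotides around the cut site to consider
    :param 1000 maxdist: only consider read pairs mapped closer than this
       distance (close to the diagonal)
    """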
de_right={}
de_left={}
print("process reads")
fl=open(reads_file)
while True:
line=next(fl)
if not line.startswith('#'):
break
nreads=0
try:
while True:
nreads += 1
if nreads % 1000000 == 0:
print(nreads)
try:
_, n1, sb1, sd1, l1, ru1, rd1, n2, sb2, sd2, l2, ru2, rd2\
= line.split()
sb1, sd1, l1, ru1, rd1, sb2, sd2, l2, ru2, rd2 = \
list(map(int, [sb1, sd1, l1, ru1, rd1, sb2, sd2, l2,
ru2, rd2]))
except ValueError:
print(line)
raise ValueError("line is not the right format!")
if n1 != n2:
line=next(fl)
continue
#read1 ahead of read2
if sb1 > sb2:
sb1, sd1, l1, ru1, rd1, sb2, sd2, l2, ru2, rd2 = \
sb2, sd2, l2, ru2, rd2, sb1, sd1, l1, ru1, rd1
#direction always -> <-
if not (sd1 == 1 and sd2 == 0):
line=next(fl)
continue
#close to the diagonal
if sb2-sb1 > maxdist:
line=next(fl)
continue
#close to RE 1
if abs(sb1-ru1) < abs(sb1-rd1):
rc1=ru1
else:
rc1=rd1
pos=sb1-rc1
if abs(pos)<=window:
if not pos in de_right:
de_right[pos]=0
de_right[pos]+=1
#close to RE 2
if abs(sb2-ru2) < abs(sb2-rd2):
rc2=ru2
else:
rc2=rd2
pos=sb2-rc2
if abs(pos)<=window:
if not pos in de_left:
de_left[pos]=0
de_left[pos]+=1
line=next(fl)
except StopIteration:
pass
print(" finished processing {} reads".format(nreads))
#transform to arrays
ind = list(range(-window,window+1))
de_r = [de_right.get(x,0) for x in ind]
de_l = [de_left.get(x,0) for x in ind]
#write to files
print("write to files")
fl=open(outprefix+'_count.dat','w')
fl.write('#dist\tX~~\t~~X\n')
for i,j,k in zip(ind,de_r, de_l):
fl.write('{}\t{}\t{}\n'.format(i, j, k))
#write plot
rcParams.update({'font.size': 10})
pp = PdfPages(outprefix+'_plot.pdf')
ind = np.array(ind)
width = 1
pr = plt.bar(ind-0.5, de_r, width, color='r')
pl = plt.bar(ind-0.5, de_l, width, bottom=de_r, color='b')
plt.ylabel("Count")
plt.title("Histogram of counts around cut site")
plt.xticks(ind[::2], rotation="vertical")
plt.legend((pl[0], pr[0]), ("~~X", "X~~"))
plt.gca().set_xlim([-window-1,window+1])
pp.savefig()
pp.close()
def moving_average(a, n=3):
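    """
    Simple moving average: e.g. moving_average(np.array([1, 2, 3, 4]), n=2)
    returns array([1.5, 2.5, 3.5]).
    """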
ret = np.cumsum(a, dtype=float)
ret[n:] = ret[n:] - ret[:-n]
return ret[n - 1:] / n
def plot_diagonal_distributions(reads_file, outprefix, ma_window=20,
maxdist=800, de_left=[-2,3], de_right=[0,5]):
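    """
    Plot the distribution of putative DNA molecule lengths close to the
    diagonal, classifying read pairs as dangling ends, random breaks or
    rejoined ends. Counts are written to <outprefix>_count.dat and the plot
    to <outprefix>_plot.pdf.

    :param reads_file: path to a TADbit TSV file of mapped read pairs
    :param outprefix: prefix for the output count and plot files
    :param 20 ma_window: window size used for the moving-average smoothing
    :param 800 maxdist: maximum molecule length (distance between read ends)
    :param [-2,3] de_left: exact offsets from the cut site that flag a
       dangling end on the left read-end
    :param [0,5] de_right: exact offsets from the cut site that flag a
       dangling end on the right read-end
    """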
rbreaks={}
rejoined={}
des={}
print("process reads")
fl=open(reads_file)
while True:
line=next(fl)
if not line.startswith('#'):
break
nreads=0
try:
while True:
nreads += 1
if nreads % 1000000 == 0:
print(nreads)
try:
_, n1, sb1, sd1, _, ru1, rd1, n2, sb2, sd2, _, ru2, rd2\
= line.split()
sb1, sd1, ru1, rd1, sb2, sd2, ru2, rd2 = \
list(map(int, [sb1, sd1, ru1, rd1, sb2, sd2, ru2, rd2]))
except ValueError:
print(line)
raise ValueError("line is not the right format!")
if n1 != n2:
line=next(fl)
continue
#read1 ahead of read2
if sb1 > sb2:
sb1, sd1, ru1, rd1, sb2, sd2, ru2, rd2 = \
sb2, sd2, ru2, rd2, sb1, sd1, ru1, rd1
#direction always -> <-
if not (sd1 == 1 and sd2 == 0):
line=next(fl)
continue
mollen = sb2-sb1
if mollen > maxdist:
line=next(fl)
continue
#DE1
if abs(sb1-ru1) < abs(sb1-rd1):
rc1=ru1
else:
rc1=rd1
pos=sb1-rc1
if pos in de_right:
if not mollen in des:
des[mollen]=0
des[mollen]+=1
line=next(fl)
continue
#DE2
if abs(sb2-ru2) < abs(sb2-rd2):
rc2=ru2
else:
rc2=rd2
pos=sb2-rc2
if pos in de_left:
if not mollen in des:
des[mollen]=0
des[mollen]+=1
line=next(fl)
continue
#random: map on same fragment
if rd1 == rd2:
if not mollen in rbreaks:
rbreaks[mollen]=0
rbreaks[mollen]+=1
line=next(fl)
continue
#rejoined ends
if not mollen in rejoined:
rejoined[mollen]=0
rejoined[mollen]+=1
line=next(fl)
except StopIteration:
pass
print(" finished processing {} reads".format(nreads))
#transform to arrays
maxlen = max(max(rejoined),max(des),max(rbreaks))
ind = list(range(1,maxlen+1))
des = [des.get(x,0) for x in ind]
rbreaks = [rbreaks.get(x,0) for x in ind]
rejoined = [rejoined.get(x,0) for x in ind]
#reweight corner for rejoined
rejoined = [x**.5 * rejoined[x-1]/x for x in ind]
#write to files
print("write to files")
fl=open(outprefix+'_count.dat','w')
fl.write('#dist\trbreaks\tdes\trejoined\n')
for i,j,k,l in zip(ind,rbreaks,des,rejoined):
fl.write('{}\t{}\t{}\t{}\n'.format(i, j, k, l))
#transform data a bit more
ind, des, rbreaks, rejoined = \
[moving_average(np.array(x), ma_window) for x in [ind, des, rbreaks, rejoined]]
des, rbreaks, rejoined = [x/float(x.sum()) for x in [des, rbreaks, rejoined]]
    ind = np.insert(ind, 0, 0)
    des = np.insert(des, 0, 0)
    rbreaks = np.insert(rbreaks, 0, 0)
    rejoined = np.insert(rejoined, 0, 0)
#write plot
pp = PdfPages(outprefix+'_plot.pdf')
rcParams.update({'font.size': 10})
pde = plt.fill_between(ind, des, 0, color='r', alpha=0.5)
prb = plt.fill_between(ind, rbreaks, 0, color='b', alpha=0.5)
prj = plt.fill_between(ind, rejoined, 0, color='y', alpha=0.5)
plt.ylabel("Normalized count")
plt.ylabel("Putative DNA molecule length")
plt.title("Histogram of counts close to the diagonal")
#plt.xticks(ind[::10], rotation="vertical")
plt.legend((prb, pde, prj), ("Random breaks", "Dangling ends",
"Rejoined"))
plt.gca().set_xlim([0,maxlen])
pp.savefig()
pp.close()
def plot_strand_bias_by_distance(fnam, nreads=1000000, valid_pairs=True,
                                 half_step=20, half_len=2000,
                                 full_step=500, full_len=50000, savefig=None):
"""
    Classify reads into four categories depending on the strand on which each
    of its ends is mapped, and plot the proportion of each of these categories
    as a function of the genomic distance between them.

    Only fully mapped reads mapped on two different restriction fragments
    (still on the same chromosome) are considered.

    The four categories are:

    - Both read-ends mapped on the same strand (forward)
    - Both read-ends mapped on the same strand (reverse)
    - Both read-ends mapped on different strands (facing), like extra-dangling-ends
    - Both read-ends mapped on different strands (opposed), like extra-self-circles
    :param fnam: path to tsv file with intersection of mapped ends
    :param True valid_pairs: consider only read-ends mapped
       on different restriction fragments. If False, considers only read-ends
       mapped on the same restriction fragment.
    :param 1000000 nreads: number of reads used to plot (if None, all will be used)
    :param 20 half_step: binning for the first part of the plot
    :param 2000 half_len: maximum distance for the first part of the plot
    :param 500 full_step: binning for the second part of the plot
    :param 50000 full_len: maximum distance for the second part of the plot
    :param None savefig: path to save figure
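
    Example (a minimal sketch; the input path is illustrative)::

        plot_strand_bias_by_distance('reads.tsv', nreads=1000000,
                                     savefig='strand_bias.png')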
"""
max_len = 100000
genome_seq = OrderedDict()
pos = 0
fhandler = open(fnam)
for line in fhandler:
if line.startswith('#'):
if line.startswith('# CRM '):
crm, clen = line[6:].split('\t')
genome_seq[crm] = int(clen)
else:
break
pos += len(line)
fhandler.seek(pos)
names = ['<== <== both reverse',
'<== ==> opposed (Extra-self-circles)',
'==> <== facing (Extra-dangling-ends)',
'==> ==> both forward']
dirs = [[0 for i in range(max_len)],
[0 for i in range(max_len)],
[0 for i in range(max_len)],
[0 for i in range(max_len)]]
    iterator = islice(fhandler, nreads) if nreads else fhandler
if valid_pairs:
comp_re = lambda x, y: x != y
else:
comp_re = lambda x, y: x == y
for line in iterator:
(crm1, pos1, dir1, len1, re1, _,
crm2, pos2, dir2, len2, re2) = line.strip().split('\t')[1:12]
pos1, pos2 = int(pos1), int(pos2)
if pos2 < pos1:
pos2, pos1 = pos1, pos2
dir2, dir1 = dir1, dir2
len2, len1 = len1, len2
dir1, dir2 = int(dir1), int(dir2)
len1, len2 = int(len1), int(len2)
if dir1 == 0:
pos1 -= len1
if dir2 == 1:
pos2 += len2
diff = pos2 - pos1
        # only ligated; same chromosome; below max_dist; not multi-contact
if comp_re(re1, re2) and crm1 == crm2 and diff < max_len and len1 == len2:
dir1, dir2 = dir1 * 2, dir2
dirs[dir1 + dir2][diff] += 1
sum_dirs = [0 for i in range(max_len)]
for i in range(max_len):
sum_dir = float(sum(dirs[d][i] for d in range(4)))
for d in range(4):
try:
dirs[d][i] = dirs[d][i] / sum_dir
except ZeroDivisionError:
dirs[d][i] = 0.
sum_dirs[i] = sum_dir
plt.figure(figsize=(14, 9))
if full_step:
axLp = plt.subplot2grid((3, 2), (0, 0), rowspan=2)
axLb = plt.subplot2grid((3, 2), (2, 0), sharex=axLp)
axRp = plt.subplot2grid((3, 2), (0, 1), rowspan=2, sharey=axLp)
axRb = plt.subplot2grid((3, 2), (2, 1), sharex=axRp, sharey=axLb)
else:
axLp = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
axLb = plt.subplot2grid((3, 1), (2, 0), sharex=axLp)
for d in range(4):
axLp.plot([sum(dirs[d][i:i + half_step]) / half_step
for i in range(0, half_len - half_step, half_step)],
alpha=0.7, label=names[d])
axLp.set_ylim(0, 1)
axLp.set_yticks([0, 0.25, 0.5, 0.75, 1])
axLp.set_xlim(0, half_len / half_step)
axLp.set_xticks(axLp.get_xticks()[:-1])
axLp.set_xticklabels([str(int(i)) for i in axLp.get_xticks() * half_step])
axLp.grid()
if full_step:
axLp.spines['right'].set_visible(False)
plt.setp(axLp.get_xticklabels(), visible=False)
axLb.spines['right'].set_visible(False)
axLp.set_ylabel('Proportion of reads in each category')
axLb.bar(list(range(0, half_len // half_step - 1)),
[sum(sum_dirs[i:i + half_step]) / half_step
for i in range(0, half_len - half_step, half_step)],
alpha=0.5, color='k')
axLb.set_ylabel("Log number of reads\nper genomic position")
axLb.set_yscale('log')
axLb.grid()
axLb.set_xlabel('Distance between mapping position of the two ends\n'
'(averaged in windows of 20 nucleotides)')
if full_step:
for d in range(4):
            axRp.plot([sum(dirs[d][i:i + full_step]) / full_step
for i in range(half_len, full_len + full_step, full_step)],
alpha=0.7, label=names[d])
axRp.spines['left'].set_visible(False)
        axRp.set_xlim(0, full_len // full_step - half_len // full_step)
axRp.set_xticks(list(range((10000 - half_step) // full_step, (full_len + full_step) // full_step, 20)))
axRp.set_xticklabels([int(i) for i in range(10000, full_len + full_step, full_step * 20)])
plt.setp(axRp.get_xticklabels(), visible=False)
axRp.legend(title='Strand on which each read-end is mapped\n(first read-end is always smaller than second)')
axRp.yaxis.tick_right()
axRp.tick_params(labelleft=False)
axRp.tick_params(labelright=False)
axRp.grid()
axRb.bar(list(range(0, full_len // full_step - half_len // full_step + 1)),
                 [sum(sum_dirs[i:i + full_step]) / full_step
for i in range(half_len, full_len + full_step, full_step)],
alpha=0.5, color='k')
axRb.set_ylim(0, max(sum_dirs) * 1.1)
axRb.spines['left'].set_visible(False)
axRb.yaxis.tick_right()
axRb.tick_params(labelleft=False)
axRb.tick_params(labelright=False)
axRb.set_xlabel('Distance between mapping position of the two ends\n'
'(averaged in windows of 500 nucleotide)')
axRb.set_yscale('log')
axRb.grid()
# decorate...
d = .015 # how big to make the diagonal lines in axes coordinates
# arguments to pass to plot, just so we don't keep repeating them
        kwargs = dict(transform=axLp.transAxes, color='k', clip_on=False)
        axLp.plot((1 - d, 1 + d), (1 - d, 1 + d), **kwargs)  # top-right diagonal
        axLp.plot((1 - d, 1 + d), (-d, +d), **kwargs)        # bottom-right diagonal
        kwargs.update(transform=axRp.transAxes)              # switch to the right axes
        axRp.plot((-d, +d), (1 - d, 1 + d), **kwargs)        # top-left diagonal
        axRp.plot((-d, +d), (-d, +d), **kwargs)              # bottom-left diagonal
w = .015
h = .030
# arguments to pass to plot, just so we don't keep repeating them
kwargs = dict(transform=axLb.transAxes, color='k', clip_on=False)
        axLb.plot((1 - w, 1 + w), (1 - h, 1 + h), **kwargs)  # top-right diagonal
        axLb.plot((1 - w, 1 + w), ( - h, + h), **kwargs)     # bottom-right diagonal
        kwargs.update(transform=axRb.transAxes)              # switch to the right axes
        axRb.plot((- w, + w), (1 - h, 1 + h), **kwargs)      # top-left diagonal
        axRb.plot((- w, + w), ( - h, + h), **kwargs)         # bottom-left diagonal
plt.subplots_adjust(wspace=0.05)
plt.subplots_adjust(hspace=0.1)
else:
axLp.legend(title='Strand on which each read-end is mapped\n(first read-end is always smaller than second)')
if savefig:
tadbit_savefig(savefig)
else:
plt.show()
# For backward compatibility
def insert_sizes(fnam, savefig=None, nreads=None, max_size=99.9, axe=None,
                 show=False, xlog=False, stats=('median', 'perc_max'),
                 too_large=10000):
"""
Deprecated function, use fragment_size
"""
warn("WARNING: function has been replaced by fragment_size", category=DeprecationWarning,)
return fragment_size(fnam, savefig=savefig, nreads=nreads, max_size=max_size, axe=axe,
show=show, xlog=xlog, stats=stats,
too_large=too_large)