"""
23 Jan 2013
functions to align contact maps.
Algorithm based on:
Di Lena, P., Fariselli, P., Margara, L., Vassura, M., & Casadio, R. (2010).
Fast overlapping of protein contact maps by alignment of eigenvectors.
Bioinformatics (Oxford, England), 26(18), 2250-8. doi:10.1093/bioinformatics/btq402
"""
from __future__ import print_function
from numpy import array, sqrt
from numpy import min as npmin
from numpy import max as npmax
from numpy import sum as npsum
from numpy import mean
from numpy.linalg import eigh # eigh is for symmetric matrices
from scipy.stats import spearmanr
from itertools import product
from itertools import combinations_with_replacement as combinations
from copy import deepcopy
# for aleigen:
from numpy import median
import re
from subprocess import Popen, PIPE
def _sort_match(matches):
return sorted([(a, b) for a, b in enumerate(matches)],
key=lambda x: x[1], reverse=True)
def core_nw_long(p_scores, penalty, l_p1, l_p2):
"""
Core of the long Needleman-Wunsch algorithm that aligns matrices
"""
scores = virgin_score(penalty, l_p1 + 1, l_p2 + 1)
ins = rmv = 0
lpen = 4
# pens=[penalty-penalty*(1-1/exp(i**2)) for i in xrange(0, 2 * (lpen+ 1),2)]
pens = [penalty - penalty / (2 * lpen) * i for i in range(lpen + 1)] # BEST
# pens = [penalty, penalty, penalty, penalty/1.2, 0]
# pens = [penalty, penalty*1.5, penalty*2, penalty, penalty]
for i in range(1, l_p1 + 1):
for j in range(1, l_p2 + 1):
mmax = max((ins, rmv))
match = p_scores[i - 1][j - 1] + scores[i - 1][j - 1]
insert = scores[i - 1][j] + pens[mmax]
delete = scores[i][j - 1] + pens[mmax]
tmp = _sort_match((match, insert, delete))
if tmp[0][0] == 1:
if ins < lpen:
ins += 1
rmv = 0
elif tmp[0][0] == 2:
if rmv < lpen:
rmv += 1
ins = 0
else:
ins = rmv = 0
scores[i][j] = tmp[0][1]
align1 = []
align2 = []
i = l_p1
j = l_p2
while i and j:
score = scores[i][j]
value = scores[i - 1][j - 1] + p_scores[i - 1][j - 1]
if _equal(score, value):
i -= 1
j -= 1
align1.insert(0, i)
align2.insert(0, j)
ins = rmv = 0
continue
if _equal(score, scores[i - 1][j] + pens[0]):
i -= 1
align1.insert(0, i)
align2.insert(0, '-')
continue
if _equal(score, scores[i][j - 1] + pens[0]):
j -= 1
align1.insert(0, '-')
align2.insert(0, j)
continue
if _equal(score, scores[i - 1][j] + pens[1]):
i -= 1
align1.insert(0, i)
align2.insert(0, '-')
continue
if _equal(score, scores[i][j - 1] + pens[1]):
j -= 1
align1.insert(0, '-')
align2.insert(0, j)
continue
if _equal(score, scores[i - 1][j] + pens[2]):
i -= 1
align1.insert(0, i)
align2.insert(0, '-')
continue
if _equal(score, scores[i][j - 1] + pens[2]):
j -= 1
align1.insert(0, '-')
align2.insert(0, j)
continue
if _equal(score, scores[i - 1][j] + pens[3]):
i -= 1
align1.insert(0, i)
align2.insert(0, '-')
continue
if _equal(score, scores[i][j - 1] + pens[3]):
j -= 1
align1.insert(0, '-')
align2.insert(0, j)
continue
if _equal(score, scores[i - 1][j] + pens[4]):
i -= 1
align1.insert(0, i)
align2.insert(0, '-')
continue
if _equal(score, scores[i][j - 1] + pens[4]):
j -= 1
align1.insert(0, '-')
align2.insert(0, j)
continue
print(score)
print(value)
print(scores[i-1][j])
print(scores[i][j-1])
print(penalty, ins, rmv)
for scr in scores:
print(' '.join(['%7s' % (round(y, 4)) for y in scr]))
raise Exception('Something is failing and it is my fault...')
return align1, align2, scores[i][j]
def core_nw(p_scores, penalty, l_p1, l_p2):
"""
Core of the fast Needleman-Wunsch algorithm that aligns matrices
"""
scores = virgin_score(penalty, l_p1 + 1, l_p2 + 1)
for i in range(1, l_p1 + 1):
for j in range(1, l_p2 + 1):
match = p_scores[i - 1][j - 1] + scores[i - 1][j - 1]
insert = scores[i - 1][j] + penalty
delete = scores[i][j - 1] + penalty
scores[i][j] = max((match, insert, delete))
align1 = []
align2 = []
i = l_p1
j = l_p2
while i and j:
score = scores[i][j]
value = scores[i - 1][j - 1] + p_scores[i - 1][j - 1]
if _equal(score, value):
i -= 1
j -= 1
align1.insert(0, i)
align2.insert(0, j)
elif _equal(score, scores[i - 1][j] + penalty):
i -= 1
align1.insert(0, i)
align2.insert(0, '-')
elif _equal(score, scores[i][j - 1] + penalty):
j -= 1
align1.insert(0, '-')
align2.insert(0, j)
else:
for scr in scores:
print(' '.join(['%7s' % (round(y, 4)) for y in scr]))
raise Exception('Something is failing and it is my fault...')
return align1, align2, scores[i][j]
def _equal(a, b, cut_off=1e-9):
"""
Equality for floats
"""
return abs(a - b) < cut_off
[docs]def optimal_cmo(hic1, hic2, num_v=None, max_num_v=None, verbose=False,
method='frobenius', long_nw=True, long_dist=True):
"""
Calculates the optimal contact map overlap between 2 matrices
TODO: make the selection of number of eigen vectors automatic or relying on
the summed contribution (e.g. select the EVs that sum 80% of the info)
.. note::
penalty is defined as the minimum value of the pre-scoring matrix.
:param hic1: first matrix to align
:param hic2: second matrix to align
:param None num_v: number of eigen vectors to consider, max is:
max(min(len(hic1), len(hic2)))
:param None max_num_v: maximum number of eigen vectors to consider.
:param score method: distance function to use as alignment score. if 'score'
distance will be the result of the last value of the Needleman-Wunsch
algorithm. If 'frobenius' a modification of the Frobenius distance will
be used
:returns: two lists, one per aligned matrix, plus a dict summarizing the
goodness of the alignment with the distance between matrices, their
Spearman correlation Rho value and pvalue.
"""
l_p1 = len(hic1)
l_p2 = len(hic2)
num_v = num_v or min(l_p1, l_p2)
if max_num_v:
num_v = min(max_num_v, num_v)
if num_v > l_p1 or num_v > l_p2:
raise Exception('\nnum_v should be at most %s\n' % (min(l_p1, l_p2)))
val1, vec1 = eigh(hic1)
if npsum(vec1).imag:
raise Exception("ERROR: Hi-C data is not symmetric.\n" +
'%s\n\n%s' % (hic1, vec1))
val2, vec2 = eigh(hic2)
if npsum(vec2).imag:
raise Exception("ERROR: Hi-C data is not symmetric.\n" +
'%s\n\n%s' % (hic2, vec2))
#
val1 = array([sqrt(abs(v)) for v in val1])
val2 = array([sqrt(abs(v)) for v in val2])
idx = val1.argsort()[::-1]
val1 = val1[idx]
vec1 = vec1[idx]
idx = val2.argsort()[::-1]
val2 = val2[idx]
vec2 = vec2[idx]
#
vec1 = array([val1[i] * vec1[:, i] for i in range(num_v)]).transpose()
vec2 = array([val2[i] * vec2[:, i] for i in range(num_v)]).transpose()
nearest = float('inf')
nw = core_nw_long if long_nw else core_nw
dister = _get_dist_long if long_dist else _get_dist
best_alis = []
for num in range(1, num_v + 1):
for factors in product([1, -1], repeat=num):
vec1p = factors * vec1[:, :num]
vec2p = vec2[:, :num]
p_scores = _prescoring(vec1p, vec2p, l_p1, l_p2)
penalty = min([npmin(p_scores)] + [-npmax(p_scores)])
align1, align2, dist = nw(p_scores, penalty, l_p1, l_p2)
try:
if method == 'frobenius':
dist = dister(align1, align2, hic1, hic2)
else:
dist *= -1
if dist < nearest:
if not penalty:
for scr in p_scores:
print(' '.join(['%7s' % (round(y, 2)) for y in scr]))
nearest = dist
best_alis = [align1, align2]
best_pen = penalty
except IndexError as e:
print(e)
try:
align1, align2 = best_alis
except ValueError:
pass
if verbose:
print('\n Alignment (score = %s):' % (nearest))
print('TADS 1: '+'|'.join(['%4s' % (str(int(x)) \
if x!='-' else '-'*3) for x in align1]))
print('TADS 2: '+'|'.join(['%4s' % (str(int(x)) \
if x!='-' else '-'*3) for x in align2]))
rho, pval = _get_score(align1, align2, hic1, hic2)
# print best_pen
if not best_pen:
print('WARNING: penalty NULL!!!\n\n')
return align1, align2, {'dist': nearest, 'rho': rho, 'pval': pval}
def virgin_score(penalty, l_p1, l_p2):
"""
Fill a matrix with zeros, except first row and first column filled with \
multiple values of penalty.
"""
zeros = [0.0 for _ in range(l_p2)]
return [[penalty * j for j in range(l_p2)]] + \
[[penalty * i] + zeros for i in range(1, l_p1)]
def _prescoring(vc1, vc2, l_p1, l_p2):
"""
yes... this is the bottle neck, almost 2/3 of the time spent here
"""
p_score = []
for i in range(l_p1):
vci = vc1[i].__mul__
p_score.append([vci(vc2[j]).sum() for j in range(l_p2)])
return p_score
#return [[sum(vc1[i] * vc2[j]) for j in xrange(l_p2)] for i in xrange(l_p1)]
def _get_dist(align1, align2, tad1, tad2):
"""
Frobenius norm
"""
map1 = []
map2 = []
for i, j in zip(align1, align2):
if i != '-' and j != '-':
map1.append(i)
map2.append(j)
pp1 = [tad1[i][j] for i, j in combinations(map1, 2)]
pp2 = [tad2[i][j] for i, j in combinations(map2, 2)]
return float(sum([(pp-pp2[p])**2 for p, pp in enumerate(pp1)]))\
/ (len(pp1)+1)
def _get_dist_long(align1, align2, tad1, tad2):
"""
Frobenius norm
"""
map1 = []
map2 = []
pen = ((float(mean(tad2)) + mean(tad1)) / 2) * len(align1)
xpen = 0
exti = extd = 1
for i, j in zip(align1, align2):
if i != '-' and j != '-':
map1.append(i)
map2.append(j)
exti = extd = 1
elif i == '-': # favour long gaps: stop penalizing if GAP lon enough
xpen += pen / exti
exti += 1
extd = 1
else:
xpen += pen / extd
extd += 1
exti = 1
pp1 = [tad1[i][j] for i, j in combinations(map1, 2)]
pp2 = [tad2[i][j] for i, j in combinations(map2, 2)]
return float(sum([(pp-pp2[p])**2 for p, pp in enumerate(pp1)]))\
/ (len(pp1)+1) + xpen
def _get_score(align1, align2, tad1, tad2):
"""
Using Spearman Rho correation value, between matrices.
Computed only in half of the matrix, and without the diagonal
"""
map1 = []
map2 = []
for i, j in zip(align1, align2):
if j != '-' and i != '-':
map1.append(i)
map2.append(j)
pp1 = [[tad1[i][j] for j in map1] for i in map1] # do not use
pp2 = [[tad2[i][j] for j in map2] for i in map2] # diagonal?
return spearmanr(pp1, pp2, axis=None)
def _get_OLD_score(align1, align2, p1, p2):
"""
Original scoring function, based on contact map overlap.
"""
map1 = []
map2 = []
for i, j in zip(align1, align2):
if j != '-' and i != '-':
map1.append(i)
map2.append(j)
com = len(map1)
pp1 = [[p1[i][j] for j in map1] for i in map1]
pp2 = [[p2[i][j] for j in map2] for i in map2]
cm1 = sum([pp1[i][j] for i in range(com) for j in range(i + 1, com)])
cm2 = sum([pp2[i][j] for i in range(com) for j in range(i + 1, com)])
cmo = sum([1 - abs(pp2[i][j] - pp1[i][j]) \
for i in range(com) for j in range(i + 1, com)])
return float(2 * cmo)/(cm1+cm2)
###
# Following is for aleigen
def matrix2binnary_contacts(tad1, tad2):
cutoff = median(tad1)
contacts1 = []
contacts2 = []
for i in range(len(tad1)):
for j in range(i, len(tad1)):
if tad1[i][j] > cutoff:
contacts1.append((i, j))
cutoff = median(tad2)
for i in range(len(tad2)):
for j in range(i, len(tad2)):
if tad2[i][j] > cutoff:
contacts2.append((i, j))
return contacts1, contacts2
def _run_aleigen(contacts1, contacts2, num_v):
"""
- c1, c2 = number of contacts of the first and second contact map
(after removing non-matching columns/rows)
- cmo = total number of matching contacts (above the first diagonal)
of the computed overlap
- score = 2*CMO/(C1+C2)
"""
f_string = '/tmp/lala%s.txt'
f_name1 = f_string % (1)
f_name2 = f_string % (2)
write_contacts(contacts1, contacts2, f_string)
sc_str = re.compile('Score\s+C1\s+C2\s+CMO\n([0-9.]+)\s+[0-9]+\s+.*')
out = Popen('aleigen %s %s %s' % (f_name1, f_name2, num_v),
shell=True, stdout=PIPE, universal_newlines=True).communicate()[0]
score = [float(c) for c in re.findall(sc_str, out)]
print(out)
align1 = []
align2 = []
for line in out.split('\n')[2:]:
if not re.match('[0-9]+\s+[0-9]+', line):
continue
el1, el2 = [int(c) for c in line.split()]
align1.append(el1)
align2.append(el2)
return align1, align2, score
def write_contacts(contacts1, contacts2, f_string):
for i, contacts in enumerate([contacts1, contacts2]):
out = open(f_string % (i+1), 'w')
out.write(str(max([max(c) for c in contacts])+1) + '\n')
out.write('\n'.join([str(c1) + ' ' + str(c2) for c1, c2 in contacts]))
out.write('\n')
out.close()
def merge_tads(tad1, tad2, ali1, ali2):
ali = []
ali.append(deepcopy(ali1))
ali.append(deepcopy(ali2))
for i in range(max(ali1[0], ali2[0]) - 1, -1, -1):
if ali[0][0]:
ali1.insert(0, i)
ali2.insert(0, '-')
elif ali[1][0]:
ali1.insert(0, '-')
ali2.insert(0, i)
size = len(ali1)
matrix1 = [[0.1 for _ in range(size)] for _ in range(size)]
matrix2 = [[0.1 for _ in range(size)] for _ in range(size)]
matrixm = [[0.1 for _ in range(size)] for _ in range(size)]
for i in range(size):
if ali1[i] == '-' or ali2[i] == '-':
matrixm[i] = [float('nan') for _ in range(size)]
if ali1[i] == '-':
matrix1[i] = [float('nan') for _ in range(size)]
matrix2[i] = [tad2[ali2[i]][ali2[j]] \
if ali2[j] != '-' else float('nan')\
for j in range(size)]
elif ali2[i] == '-':
matrix2[i] = [float('nan') for _ in range(size)]
matrix1[i] = [tad1[ali1[i]][ali1[j]] \
if ali1[j] != '-' else float('nan')\
for j in range(size)]
continue
for j in range(size):
if ali1[j] == '-' or ali2[j] == '-':
matrixm[i][j] = float('nan')
if ali1[j] == '-':
matrix1[i][j] = float('nan')
matrix2[i][j] = tad2[ali2[i]][ali2[j]]
elif ali2[j] == '-':
matrix2[i][j] = float('nan')
matrix1[i][j] = tad1[ali1[i]][ali1[j]]
continue
matrix1[i][j] = tad1[ali1[i]][ali1[j]]
matrix2[i][j] = tad2[ali2[i]][ali2[j]]
matrixm[i][j] = tad1[ali1[i]][ali1[j]]
matrixm[i][j] += tad2[ali2[i]][ali2[j]]
matrixm[i][j] /= 2
return matrix1, matrix2, matrixm