Source code for pytadbit.tad_clustering.tad_cmo

"""
23 Jan 2013

functions to align contact maps.
Algorithm based on:
Di Lena, P., Fariselli, P., Margara, L., Vassura, M., & Casadio, R. (2010). 
Fast overlapping of protein contact maps by alignment of eigenvectors. 
Bioinformatics (Oxford, England), 26(18), 2250-8. doi:10.1093/bioinformatics/btq402

"""
from __future__ import print_function

from numpy        import array, sqrt
from numpy        import min as npmin
from numpy        import max as npmax
from numpy        import sum as npsum
from numpy        import mean
from numpy.linalg import eigh # eigh is for symmetric matrices
from scipy.stats  import spearmanr
from itertools    import product
from itertools    import combinations_with_replacement as combinations
from copy         import deepcopy

# for aleigen:
from numpy import median

import re
from subprocess import Popen, PIPE


def _sort_match(matches):
    return sorted([(a, b) for a, b in enumerate(matches)],
                  key=lambda x: x[1], reverse=True)

def core_nw_long(p_scores, penalty, l_p1, l_p2):
    """
    Core of the long Needleman-Wunsch algorithm that aligns matrices
    """
    scores = virgin_score(penalty, l_p1 + 1, l_p2 + 1)
    ins = rmv = 0
    lpen = 4
    # pens=[penalty-penalty*(1-1/exp(i**2)) for i in xrange(0, 2 * (lpen+ 1),2)]
    pens = [penalty - penalty / (2 * lpen) * i for i in range(lpen + 1)] # BEST
    # pens = [penalty, penalty, penalty, penalty/1.2, 0]
    # pens = [penalty, penalty*1.5, penalty*2, penalty, penalty]
    for i in range(1, l_p1 + 1):
        for j in range(1, l_p2 + 1):
            mmax = max((ins, rmv))
            match  = p_scores[i - 1][j - 1] + scores[i - 1][j - 1]
            insert = scores[i - 1][j] + pens[mmax]
            delete = scores[i][j - 1] + pens[mmax]
            tmp = _sort_match((match, insert, delete))
            if tmp[0][0] == 1:
                if ins < lpen:
                    ins += 1
                rmv = 0
            elif tmp[0][0] == 2:
                if rmv < lpen:
                    rmv += 1
                ins = 0
            else:
                ins = rmv = 0
            scores[i][j] = tmp[0][1]
    align1 = []
    align2 = []
    i = l_p1 
    j = l_p2
    while i and j:
        score = scores[i][j]
        value = scores[i - 1][j - 1] + p_scores[i - 1][j - 1]
        if _equal(score, value):
            i -= 1
            j -= 1
            align1.insert(0, i)
            align2.insert(0, j)
            ins = rmv = 0
            continue
        if _equal(score, scores[i - 1][j] + pens[0]):
            i -= 1
            align1.insert(0, i)
            align2.insert(0, '-')
            continue
        if _equal(score, scores[i][j - 1] + pens[0]):
            j -= 1
            align1.insert(0, '-')
            align2.insert(0, j)
            continue
        if _equal(score, scores[i - 1][j] + pens[1]):
            i -= 1
            align1.insert(0, i)
            align2.insert(0, '-')
            continue
        if _equal(score, scores[i][j - 1] + pens[1]):
            j -= 1
            align1.insert(0, '-')
            align2.insert(0, j)
            continue
        if _equal(score, scores[i - 1][j] + pens[2]):
            i -= 1
            align1.insert(0, i)
            align2.insert(0, '-')
            continue
        if _equal(score, scores[i][j - 1] + pens[2]):
            j -= 1
            align1.insert(0, '-')
            align2.insert(0, j)
            continue
        if _equal(score, scores[i - 1][j] + pens[3]):
            i -= 1
            align1.insert(0, i)
            align2.insert(0, '-')
            continue
        if _equal(score, scores[i][j - 1] + pens[3]):
            j -= 1
            align1.insert(0, '-')
            align2.insert(0, j)
            continue
        if _equal(score, scores[i - 1][j] + pens[4]):
            i -= 1
            align1.insert(0, i)
            align2.insert(0, '-')
            continue
        if _equal(score, scores[i][j - 1] + pens[4]):
            j -= 1
            align1.insert(0, '-')
            align2.insert(0, j)
            continue
        print(score)
        print(value)
        print(scores[i-1][j])
        print(scores[i][j-1])
        print(penalty, ins, rmv)
        for scr in scores:
            print(' '.join(['%7s' % (round(y, 4)) for y in scr]))
        raise Exception('Something  is failing and it is my fault...')
    return align1, align2, scores[i][j]


def core_nw(p_scores, penalty, l_p1, l_p2):
    """
    Core of the fast Needleman-Wunsch algorithm that aligns matrices
    """
    scores = virgin_score(penalty, l_p1 + 1, l_p2 + 1)
    for i in range(1, l_p1 + 1):
        for j in range(1, l_p2 + 1):
            match  = p_scores[i - 1][j - 1] + scores[i - 1][j - 1]
            insert = scores[i - 1][j] + penalty
            delete = scores[i][j - 1] + penalty
            scores[i][j] = max((match, insert, delete))
    align1 = []
    align2 = []
    i = l_p1 
    j = l_p2 
    while i and j:
        score = scores[i][j]
        value = scores[i - 1][j - 1] + p_scores[i - 1][j - 1]
        if _equal(score, value):
            i -= 1
            j -= 1
            align1.insert(0, i)
            align2.insert(0, j)
        elif _equal(score, scores[i - 1][j] + penalty):
            i -= 1
            align1.insert(0, i)
            align2.insert(0, '-')
        elif _equal(score, scores[i][j - 1] + penalty):
            j -= 1
            align1.insert(0, '-')
            align2.insert(0, j)
        else:
            for scr in scores:
                print(' '.join(['%7s' % (round(y, 4)) for y in scr]))
            raise Exception('Something  is failing and it is my fault...')
    return align1, align2, scores[i][j]


def _equal(a, b, cut_off=1e-9):
    """
    Equality for floats
    """
    return abs(a - b) < cut_off


[docs]def optimal_cmo(hic1, hic2, num_v=None, max_num_v=None, verbose=False,
                method='frobenius', long_nw=True, long_dist=True):
    """
    Calculates the optimal contact map overlap between 2 matrices

    TODO: make the selection of number of eigen vectors automatic or relying on
          the summed contribution (e.g. select the EVs that sum 80% of the info)

    .. note::

      penalty is defined as the minimum value of the pre-scoring matrix.
    
    :param hic1: first matrix to align
    :param hic2: second matrix to align
    :param None num_v: number of eigen vectors to consider, max is:
        max(min(len(hic1), len(hic2)))
    :param None max_num_v: maximum number of eigen vectors to consider.
    :param score method: distance function to use as alignment score. if 'score'
       distance will be the result of the last value of the Needleman-Wunsch
       algorithm. If 'frobenius' a modification of the Frobenius distance will
       be used

    :returns: two lists, one per aligned matrix, plus a dict summarizing the
        goodness of the alignment with the distance between matrices, their 
        Spearman correlation Rho value and pvalue.
    """

    l_p1 = len(hic1)
    l_p2 = len(hic2)
    num_v = num_v or min(l_p1, l_p2)
    if max_num_v:
        num_v = min(max_num_v, num_v)
    if num_v > l_p1 or num_v > l_p2:
        raise Exception('\nnum_v should be at most %s\n' % (min(l_p1, l_p2)))
    val1, vec1 = eigh(hic1)
    if npsum(vec1).imag:
        raise Exception("ERROR: Hi-C data is not symmetric.\n" +
                        '%s\n\n%s' % (hic1, vec1))
    val2, vec2 = eigh(hic2)
    if npsum(vec2).imag:
        raise Exception("ERROR: Hi-C data is not symmetric.\n" +
                        '%s\n\n%s' % (hic2, vec2))
    #
    val1 = array([sqrt(abs(v)) for v in val1])
    val2 = array([sqrt(abs(v)) for v in val2])
    idx = val1.argsort()[::-1]
    val1 = val1[idx]
    vec1 = vec1[idx]
    idx = val2.argsort()[::-1]
    val2 = val2[idx]
    vec2 = vec2[idx]
    #
    vec1 = array([val1[i] * vec1[:, i] for i in range(num_v)]).transpose()
    vec2 = array([val2[i] * vec2[:, i] for i in range(num_v)]).transpose()
    nearest = float('inf')
    nw = core_nw_long if long_nw else core_nw
    dister = _get_dist_long if long_dist else _get_dist
    best_alis = []
    for num in range(1, num_v + 1):
        for factors in product([1, -1], repeat=num):
            vec1p = factors * vec1[:, :num]
            vec2p = vec2[:, :num]
            p_scores = _prescoring(vec1p, vec2p, l_p1, l_p2)
            penalty = min([npmin(p_scores)] + [-npmax(p_scores)])
            align1, align2, dist = nw(p_scores, penalty, l_p1, l_p2)
            try:
                if method == 'frobenius':
                    dist = dister(align1, align2, hic1, hic2)
                else:
                    dist *= -1
                if dist < nearest:
                    if not penalty:
                        for scr in p_scores:
                            print(' '.join(['%7s' % (round(y, 2)) for y in scr]))
                    nearest = dist
                    best_alis = [align1, align2]
                    best_pen = penalty
            except IndexError as e:
                print(e)
    try:
        align1, align2 = best_alis
    except ValueError:
        pass
    if verbose:
        print('\n Alignment (score = %s):' % (nearest))
        print('TADS 1: '+'|'.join(['%4s' % (str(int(x)) \
                                            if x!='-' else '-'*3) for x in align1]))
        print('TADS 2: '+'|'.join(['%4s' % (str(int(x)) \
                                            if x!='-' else '-'*3) for x in align2]))
    rho, pval = _get_score(align1, align2, hic1, hic2)
    # print best_pen
    if not best_pen:
        print('WARNING: penalty NULL!!!\n\n')
    return align1, align2, {'dist': nearest, 'rho': rho, 'pval': pval}
    

def virgin_score(penalty, l_p1, l_p2):
    """
    Fill a matrix with zeros, except first row and first column filled with \
    multiple values of penalty.
    """
    zeros    = [0.0 for _ in range(l_p2)]
    return [[penalty * j for j in range(l_p2)]] + \
           [[penalty * i] + zeros for i in range(1, l_p1)]


def _prescoring(vc1, vc2, l_p1, l_p2):
    """
    yes... this is the bottle neck, almost 2/3 of the time spent here
    """
    p_score = []
    for i in range(l_p1):
        vci = vc1[i].__mul__
        p_score.append([vci(vc2[j]).sum() for j in range(l_p2)])
    return p_score
    #return [[sum(vc1[i] * vc2[j]) for j in xrange(l_p2)] for i in xrange(l_p1)]


def _get_dist(align1, align2, tad1, tad2):
    """
    Frobenius norm
    """
    map1 = []
    map2 = []
    for i, j in zip(align1, align2):
        if i != '-' and j != '-':
            map1.append(i)
            map2.append(j)
    pp1 = [tad1[i][j] for i, j in combinations(map1, 2)]
    pp2 = [tad2[i][j] for i, j in combinations(map2, 2)]
    return float(sum([(pp-pp2[p])**2 for p, pp in enumerate(pp1)]))\
           / (len(pp1)+1)


def _get_dist_long(align1, align2, tad1, tad2):
    """
    Frobenius norm
    """
    map1 = []
    map2 = []
    pen  = ((float(mean(tad2)) + mean(tad1)) / 2) * len(align1)
    xpen = 0
    exti = extd = 1
    for i, j in zip(align1, align2):
        if i != '-' and j != '-':
            map1.append(i)
            map2.append(j)
            exti = extd = 1
        elif i == '-': # favour long gaps: stop penalizing if GAP lon enough
            xpen += pen / exti
            exti += 1
            extd = 1
        else:
            xpen += pen / extd
            extd += 1
            exti = 1
    pp1 = [tad1[i][j] for i, j in combinations(map1, 2)]
    pp2 = [tad2[i][j] for i, j in combinations(map2, 2)]
    return float(sum([(pp-pp2[p])**2 for p, pp in enumerate(pp1)]))\
           / (len(pp1)+1) + xpen


def _get_score(align1, align2, tad1, tad2):
    """
    Using Spearman Rho correation value, between matrices.
    Computed only in half of the matrix, and without the diagonal
    
    """
    map1 = []
    map2 = []
    for i, j in zip(align1, align2):
        if j != '-' and i != '-':
            map1.append(i)
            map2.append(j)
    pp1 = [[tad1[i][j] for j in map1] for i in map1] # do not use 
    pp2 = [[tad2[i][j] for j in map2] for i in map2] # diagonal?
    return spearmanr(pp1, pp2, axis=None)


def _get_OLD_score(align1, align2, p1, p2):
    """
    Original scoring function, based on contact map overlap.
    """
    map1 = []
    map2 = []
    for i, j in zip(align1, align2):
        if j != '-' and i != '-':
            map1.append(i)
            map2.append(j)
    com = len(map1)
    pp1 = [[p1[i][j] for j in map1] for i in map1]
    pp2 = [[p2[i][j] for j in map2] for i in map2]
    cm1 = sum([pp1[i][j] for i in range(com) for j in range(i + 1, com)])
    cm2 = sum([pp2[i][j] for i in range(com) for j in range(i + 1, com)])
    cmo = sum([1 - abs(pp2[i][j] - pp1[i][j]) \
               for i in range(com) for j in range(i + 1, com)])
    return float(2 * cmo)/(cm1+cm2)


###
# Following is for aleigen


def matrix2binnary_contacts(tad1, tad2):
    cutoff = median(tad1)
    contacts1 = []
    contacts2 = []
    for i in range(len(tad1)):
        for j in range(i, len(tad1)):
            if tad1[i][j] > cutoff:
                contacts1.append((i, j))
    cutoff = median(tad2)
    for i in range(len(tad2)):
        for j in range(i, len(tad2)):
            if tad2[i][j] > cutoff:
                contacts2.append((i, j))
    return contacts1, contacts2


def _run_aleigen(contacts1, contacts2, num_v):
    """

        - c1, c2 = number of contacts of the first and second contact map
                   (after removing non-matching columns/rows)
        - cmo = total number of matching contacts (above the first diagonal)
          of the computed overlap
        - score = 2*CMO/(C1+C2)

    """
    f_string = '/tmp/lala%s.txt'
    f_name1 = f_string % (1)
    f_name2 = f_string % (2)
    write_contacts(contacts1, contacts2, f_string)
    sc_str = re.compile('Score\s+C1\s+C2\s+CMO\n([0-9.]+)\s+[0-9]+\s+.*')
    out = Popen('aleigen %s %s %s' % (f_name1, f_name2, num_v),
                shell=True, stdout=PIPE, universal_newlines=True).communicate()[0]
    score = [float(c) for c in re.findall(sc_str, out)]
    print(out)
    align1 = []
    align2 = []
    for line in out.split('\n')[2:]:
        if not re.match('[0-9]+\s+[0-9]+', line):
            continue
        el1, el2 = [int(c) for c in line.split()]
        align1.append(el1)
        align2.append(el2)
    return align1, align2, score


def write_contacts(contacts1, contacts2, f_string):
    for i, contacts in enumerate([contacts1, contacts2]):
        out = open(f_string % (i+1), 'w')
        out.write(str(max([max(c) for c in contacts])+1) + '\n')
        out.write('\n'.join([str(c1) + ' ' + str(c2) for c1, c2 in contacts]))
        out.write('\n')
        out.close()


def merge_tads(tad1, tad2, ali1, ali2):
    ali = []
    ali.append(deepcopy(ali1))
    ali.append(deepcopy(ali2))
    for i in range(max(ali1[0], ali2[0]) - 1, -1, -1):
        if ali[0][0]:
            ali1.insert(0, i)
            ali2.insert(0, '-')
        elif ali[1][0]:
            ali1.insert(0, '-')
            ali2.insert(0, i)
    size = len(ali1)
    matrix1 = [[0.1 for _ in range(size)] for _ in range(size)]
    matrix2 = [[0.1 for _ in range(size)] for _ in range(size)]
    matrixm = [[0.1 for _ in range(size)] for _ in range(size)]
    for i in range(size):
        if ali1[i] == '-' or ali2[i] == '-':
            matrixm[i] = [float('nan') for _ in range(size)]
            if ali1[i] == '-':
                matrix1[i]  = [float('nan') for _ in range(size)]
                matrix2[i]  = [tad2[ali2[i]][ali2[j]] \
                               if ali2[j] != '-' else float('nan')\
                               for j in range(size)]
            elif ali2[i] == '-':
                matrix2[i]  = [float('nan') for _ in range(size)]
                matrix1[i]  = [tad1[ali1[i]][ali1[j]] \
                               if ali1[j] != '-' else float('nan')\
                               for j in range(size)]
            continue
        for j in range(size):
            if ali1[j] == '-' or ali2[j] == '-':
                matrixm[i][j] = float('nan')
                if ali1[j] == '-':
                    matrix1[i][j] = float('nan')
                    matrix2[i][j]  = tad2[ali2[i]][ali2[j]]
                elif ali2[j] == '-':
                    matrix2[i][j] = float('nan')
                    matrix1[i][j] = tad1[ali1[i]][ali1[j]]
                continue
            matrix1[i][j]  = tad1[ali1[i]][ali1[j]]
            matrix2[i][j]  = tad2[ali2[i]][ali2[j]]
            matrixm[i][j]  = tad1[ali1[i]][ali1[j]]
            matrixm[i][j] += tad2[ali2[i]][ali2[j]]
            matrixm[i][j] /= 2
    return matrix1, matrix2, matrixm