Source code for pytadbit.parsers.hic_parser

"""
November 7, 2013.

"""
from __future__ import print_function

from future import standard_library
from builtins import next
standard_library.install_aliases()
from sys                             import stderr, modules
from io                              import IOBase
from collections                     import OrderedDict
from warnings                        import warn
from math                            import sqrt, isnan
try:
    from pickle5                     import load  # python < 3.8
except ImportError:
    from pickle                      import load

from pysam                           import AlignmentFile

import numpy as np
from pytadbit.parsers.gzopen         import gzopen
from pytadbit                        import HiC_data
from pytadbit.parsers.hic_bam_parser import get_matrix
try:
    from pytadbit.parsers.cooler_parser import parse_cooler, is_cooler
except ImportError:
    def is_cooler(*unused):
        stderr.write('WARNING: cannot detect if input is a cooler file. Probably ' +
                     'you need to install h5py\n')
        return False
    pass

try:
    file_types = file, IOBase
except NameError:
    file_types = (IOBase,)

try:
    basestring
except NameError:
    basestring = str

HIC_DATA = True


class AutoReadFail(Exception):
    """
    Exception to handle failed autoreader.
    """
    pass


def is_asymmetric(matrix):
    """
    Helper functions for the autoreader.
    """
    maxn = len(matrix)
    for i in range(maxn):
        maxi = matrix[i] # slightly more efficient
        for j in range(i+1, maxn):
            if maxi[j] != matrix[j][i]:
                if isnan(maxi[j]) and isnan(matrix[j][i]):
                    continue
                return True
    return False


def is_asymmetric_dico(hic):
    """
    Helper functions for the optimal_reader
    """
    ncol = len(hic)
    for i in range(ncol):
        for j in range(i, ncol):
            p1 = i * ncol + j
            p2 = j * ncol + i
            if hic.get(p1, 0) != hic.get(p2, 0):
                return True
    return False


def symmetrize_dico(hic):
    """
    Make an HiC_data object symmetric by summing two halves of the matrix
    """
    ncol = len(hic)
    for i in range(ncol):
        incol = i * ncol
        for j in range(i, ncol):
            p1 = incol + j
            p2 = j * ncol + i
            val = hic.get(p1, 0) + hic.get(p2, 0)
            if val:
                hic[p1] = hic[p2] = val


def symmetrize(matrix):
    """
    Make a matrix symmetric by summing two halves of the matrix
    """
    maxn = len(matrix)
    for i in range(maxn):
        for j in range(i, maxn):
            matrix[i][j] = matrix[j][i] = matrix[i][j] + matrix[j][i]


def optimal_reader(f, normalized=False, resolution=1):
    """
    Reads a matrix generated by TADbit.
    Can be slower than autoreader, but uses almost a third of the memory

    :param f: an iterable (typically an open file).
    :param False normalized: if the matrix is normalized
    :param 1 resolution: resolution of the matrix

    """
    # get masked bins
    masked = {}
    pos = 0
    for line in f:
        if line[0] != '#':
            break
        pos += len(line)
        if line.startswith('# MASKED'):
            masked = dict([(int(n), True) for n in line.split()[2:]])
    f.seek(pos)

    # super fast
    header = [tuple(line.split(None, 2)[:2]) for line in f]

    f.seek(pos)

    ncol = len(header)

    # Get the numeric values and remove extra columns
    num = float if normalized else int
    chromosomes, sections, resolution = _header_to_section(header, resolution)

    #############################################################
    # monkey patch HiC_data to make it faster
    def fast_setitem(self, key, val):
        "Use directly dict setitem"
        super(HiC_data, self).__setitem__(key, val)

    def fast_getitem(self, key):
        "Use directly dict setitem"
        try:
            return super(HiC_data, self).__getitem__(key)
        except KeyError:
            return 0

    original_setitem = HiC_data.__setitem__
    original_getitem = HiC_data.__getitem__
    # apply_async the patch
    HiC_data.__setitem__ = fast_setitem
    HiC_data.__getitem__ = fast_getitem

    hic = HiC_data(((j, num(v))
                    for i, line in enumerate(f)
                    for j, v in enumerate(line.split()[2:], i * ncol)
                    if num(v)), size=ncol, masked=masked,
                   dict_sec=sections, chromosomes=chromosomes,
                   resolution=resolution, symmetricized=False)

    # make it symmetric
    if is_asymmetric_dico(hic):
        hic.symmetricized = True
        symmetrize_dico(hic)

    # undo patching
    HiC_data.__setitem__ = original_setitem
    HiC_data.__getitem__ = original_getitem
    hic.__setitem__ = original_setitem
    hic.__getitem__ = original_getitem
    #############################################################
    return hic


def __read_file_header(f):
    """
    Read file header, inside first commented lines of a file

    :returns masked dict, chromsomes orderedDict, crm, beg, end, resolution:
    """
    masked = {}
    chromosomes = OrderedDict()
    crm, beg, end, reso = None, None, None, None
    fpos = f.tell()
    for line in f:
        if line[0] != '#':
            break
        fpos += len(line)
        if line.startswith('# MASKED'):
            try:
                masked = dict([(int(n), True) for n in line.split()[-1].split(',')])
            except ValueError:  # nothing here
                pass
        elif line.startswith('# CRM'):
            _, _, crm, size = line.split()
            chromosomes[crm] = int(size)
        elif 'resolution:' in line:
            _, coords, reso = line.split()
            try:
                crm, pos = coords.split(':')
                beg, end = list(map(int, pos.split('-')))
            except ValueError:
                crm = coords
                beg, end = None, None
            reso = int(reso.split(':')[1])
    f.seek(fpos)
    if crm == 'full':
        crm = None
    return masked, chromosomes, crm, beg, end, reso


def abc_reader(f):
    """
    Read matrix stored in 3 column format (bin1, bin2, value)

    :param f: an iterable (typically an open file).

    :returns: An iterator to be converted in dictionary, matrix size, raw_names
       as list of tuples (chr, pos), dictionary of masked bins, and boolean
       reporter of symetric transformation
    """
    masked, chroms, crm, beg, end, reso = __read_file_header(f)  # TODO rest of it not used here
    sections = {}
    size = 0
    for c in chroms:
        sections[c] = size
        size += chroms[c] // reso + 1
    if beg:
        header = [(crm, '%d-%d' % (l * reso + 1, (l + 1) * reso))
                  for l in range(beg, end)]
    else:
        header = [(c, '%d-%d' % (l * reso + 1, (l + 1) * reso))
                  for c in chroms
                  for l in range(sections[c], sections[c] + chroms[c] // reso + 1)]
    num = int if HIC_DATA else float
    offset = (beg or 0) * (1 + size)
    def _disect(x):
        a, b, v = x.split()
        return (int(a) + int(b) * size + offset, num(v))
    items = tuple(_disect(line) for line in f)
    return items, size, header, masked, False


def __is_abc(f):
    """
    Only works for matrices with more than 3 bins
    """
    fpos = f.tell()
    count = 0
    for line in f:
        if line.startswith('#'):
            continue
        count += 1
        if len(line.split()) != 3:
            f.seek(fpos)
            return False
        if count > 3:
            f.seek(fpos)
            return True
    f.seek(fpos)
    return False

def autoreader(f):
    """
    Auto-detect matrix format of HiC data file.

    :param f: an iterable (typically an open file).

    :returns: An iterator to be converted in dictionary, matrix size, raw_names
       as list of tuples (chr, pos), dictionary of masked bins, and boolean
       reporter of symetric transformation
    """
    masked = __read_file_header(f)[0]  # TODO rest of it not used here

    # Skip initial comment lines and read in the whole file
    # as a list of lists.
    line = next(f)
    items = [line.split()] + [line.split() for line in f]

    # Count the number of elements per line after the first.
    # Wrapping in a set is a trick to make sure that every line
    # has the same number of elements.
    S = set([len(line) for line in items[1:]])
    ncol = S.pop()
    # If the set 'S' is not empty, at least two lines have a
    # different number of items.
    if S:
        raise AutoReadFail('ERROR: unequal column number')

    # free little memory
    del(S)

    nrow = len(items)
    # Auto-detect the format, there are only 4 cases.
    if ncol == nrow:
        try:
            _ = [float(item) for item in items[0]
                 if not item.lower() in ['na', 'nan']]
            # Case 1: pure number matrix.
            header = False
            trim = 0
        except ValueError:
            # Case 2: matrix with row and column names.
            header = True
            trim = 1
            warn('WARNING: found header')
    else:
        if len(items[0]) == len(items[1]):
            # Case 3: matrix with row information.
            header = False
            trim = ncol - nrow
            # warn('WARNING: found %d colum(s) of row names' % trim)
        else:
            # Case 4: matrix with header and row information.
            header = True
            trim = ncol - nrow + 1
            warn('WARNING: found header and %d colum(s) of row names' % trim)
    # Remove header line if needed.
    if header and not trim:
        header = items.pop(0)
        nrow -= 1
    elif not trim:
        header = list(range(1, nrow + 1))
    elif not header:
        header = [tuple([a for a in line[:trim]]) for line in items]
    else:
        del(items[0])
        nrow -= 1
        header = [tuple([a for a in line[:trim]]) for line in items]
    # Get the numeric values and remove extra columns
    num = int if HIC_DATA else float
    try:
        items = [[num(a) for a in line[trim:]] for line in items]
    except ValueError:
        if not HIC_DATA:
            raise AutoReadFail('ERROR: non numeric values')
        try:
            # Dekker data 2009, uses integer but puts a comma...
            items = [[int(float(a)+.5) for a in line[trim:]] for line in items]
            warn('WARNING: non integer values')
        except ValueError:
            try:
                # Some data may contain 'NaN' or 'NA'
                items = [
                    [0 if a.lower() in ['na', 'nan']
                     else int(float(a)+.5) for a in line[trim:]]
                for line in items]
                warn('WARNING: NA or NaN founds, set to zero')
            except ValueError:
                raise AutoReadFail('ERROR: non numeric values')

    # Check that the matrix is square.
    ncol -= trim
    if ncol != nrow:
        raise AutoReadFail('ERROR: non square matrix')

    symmetricized = False
    if is_asymmetric(items):
        warn('WARNING: matrix not symmetric: summing cell_ij with cell_ji')
        symmetrize(items)
        symmetricized = True
    return (((i + j * ncol, a) for i, line in enumerate(items)
             for j, a in enumerate(line) if a),
            ncol, header, masked, symmetricized)


def _header_to_section(header, resolution):
    """
    converts row-names of the form 'chr12\t1000-2000' into sections, suitable
    to create HiC_data objects. Also creates chromosomes, from the reads
    """
    sections = {}
    chromosomes = None
    if (isinstance(header, list)
        and isinstance(header[0], tuple)
        and len(header[0]) > 1):
        chromosomes = OrderedDict()
        for i, h in enumerate(header):
            if '-' in h[1]:
                a, b = list(map(int, h[1].split('-')))
                if resolution==1:
                    resolution = abs(b - a) + 1
                elif resolution != abs(b - a) + 1:
                    raise Exception('ERROR: found different resolution, ' +
                                    'check headers')
            else:
                a = int(h[1])
                if resolution==1 and i:
                    resolution = abs(a - b) + 1
                elif resolution == 1:
                    b = a
            sections[(h[0], a // resolution)] = i
            chromosomes.setdefault(h[0], 0)
            chromosomes[h[0]] += 1
    return chromosomes, sections, resolution


[docs]def read_matrix(things, parser=None, hic=True, resolution=1, **kwargs):
    """
    Read and checks a matrix from a file (using
    :func:`pytadbit.parser.hic_parser.autoreader`) or a list.

    :param things: might be either a file name, a file handler or a list of
        list (all with same length)
    :param None parser: a parser function that returns a tuple of lists
       representing the data matrix,
       with this file example.tsv:
       ::

         chrT_001    chrT_002    chrT_003    chrT_004
         chrT_001    629    164    88    105
         chrT_002    86    612    175    110
         chrT_003    159    216    437    105
         chrT_004    100    111    146    278

       the output of parser('example.tsv') might be:
       ``([629, 86, 159, 100, 164, 612, 216, 111, 88, 175, 437, 146, 105, 110,
       105, 278])``

    :param 1 resolution: resolution of the matrix
    :param True hic: if False, TADbit assumes that files contains normalized
       data
    :returns: the corresponding matrix concatenated into a huge list, also
       returns number or rows

    """
    one = kwargs.get('one', True)
    global HIC_DATA
    HIC_DATA = hic
    if not isinstance(things, list):
        things = [things]
    matrices = []
    for thing in things:
        if isinstance(thing, HiC_data):
            matrices.append(thing)
        elif isinstance(thing, file_types):
            parser = parser or (abc_reader if __is_abc(thing) else autoreader)
            matrix, size, header, masked, sym = parser(thing)
            thing.close()
            chromosomes, sections, resolution = _header_to_section(header,
                                                                   resolution)
            matrices.append(HiC_data(matrix, size, dict_sec=sections,
                                     chromosomes=chromosomes,
                                     resolution=resolution,
                                     symmetricized=sym, masked=masked))
        elif isinstance(thing, basestring):
            if is_cooler(thing, resolution if resolution > 1 else None):
                matrix, size, header, masked, sym = parse_cooler(thing,
                                                                 resolution if resolution > 1 else None,
                                                                 not hic)
            else:
                try:
                    with gzopen(thing) as f_thing:
                        parser = parser or (abc_reader if __is_abc(f_thing) else autoreader)
                        matrix, size, header, masked, sym = parser(f_thing)
                except IOError:
                    if len(thing.split('\n')) > 1:
                        parser = parser or (abc_reader if __is_abc(thing.split('\n')) else autoreader)
                        matrix, size, header, masked, sym = parser(thing.split('\n'))
                    else:
                        raise IOError('\n   ERROR: file %s not found\n' % thing)
            sections = dict([(h, i) for i, h in enumerate(header)])
            chromosomes, sections, resolution = _header_to_section(header,
                                                                   resolution)
            matrices.append(HiC_data(matrix, size, dict_sec=sections,
                                     chromosomes=chromosomes, masked=masked,
                                     resolution=resolution,
                                     symmetricized=sym))
        elif isinstance(thing, list):
            if all([len(thing)==len(l) for l in thing]):
                size = len(thing)
                matrix  = [(i + j * size, v) for i, l in enumerate(thing)
                           for j, v in enumerate(l) if v]
            else:
                raise Exception('must be list of lists, all with same length.')
            matrices.append(HiC_data(matrix, size))
        elif isinstance(thing, tuple):
            # case we know what we are doing and passing directly list of tuples
            matrix = thing
            siz = sqrt(len(thing))
            if int(siz) != siz:
                raise AttributeError('ERROR: matrix should be square.\n')
            size = int(siz)
            matrices.append(HiC_data(matrix, size))
        elif isinstance(thing, (np.ndarray, np.generic) ):
            try:
                row, col = thing.shape
                if row != col:
                    raise Exception('matrix needs to be square.')
                matrix  = thing.reshape(-1).tolist()[0]
                size = row
            except Exception as exc:
                print('Error found:', exc)
            matrices.append(HiC_data(matrix, size))
        else:
            raise Exception('Unable to read this file or whatever it is :)')
    if one:
        return matrices[0]
    else:
        return matrices


[docs]def load_hic_data_from_reads(fnam, resolution, **kwargs):
    """
    :param fnam: tsv file with reads1 and reads2
    :param resolution: the resolution of the experiment (size of a bin in
       bases)
    :param genome_seq: a dictionary containing the genomic sequence by
       chromosome
    :param False get_sections: for very very high resolution, when the column
       index does not fit in memory
    """
    sections = []
    genome_seq = OrderedDict()
    fhandler = open(fnam)
    line = next(fhandler)
    size = 0
    while line.startswith('#'):
        if line.startswith('# CRM '):
            crm, clen = line[6:].split()
            genome_seq[crm] = int(clen) // resolution + 1
            size += genome_seq[crm]
        line = next(fhandler)
    if kwargs.get('get_sections', True):
        for crm in genome_seq:
            len_crm = genome_seq[crm]
            sections.extend([(crm, i) for i in range(len_crm)])
    dict_sec = dict([(j, i) for i, j in enumerate(sections)])
    imx = HiC_data((), size, genome_seq, dict_sec, resolution=resolution)
    try:
        while True:
            _, cr1, ps1, _, _, _, _, cr2, ps2, _ = line.split('\t', 9)
            try:
                ps1 = dict_sec[(cr1, int(ps1) // resolution)]
                ps2 = dict_sec[(cr2, int(ps2) // resolution)]
            except KeyError:
                ps1 = int(ps1) // resolution
                ps2 = int(ps2) // resolution
            imx[ps1, ps2] += 1
            imx[ps2, ps1] += 1
            line = next(fhandler)
    except StopIteration:
        pass
    fhandler.close()
    imx.symmetricized = True
    return imx


def load_hic_data_from_bam(fnam, resolution, biases=None, tmpdir='.', ncpus=8,
                           filter_exclude=(1, 2, 3, 4, 6, 7, 8, 9, 10),
                           region=None, verbose=True, clean=True):
    """
    :param fnam: TADbit-generated BAM file with read-ends1 and read-ends2
    :param resolution: the resolution of the experiment (size of a bin in
       bases)
    :param None biases: path to pickle file where are stored the biases. Keys
       in this file should be: 'biases', 'badcol', 'decay' and 'resolution'
    :param '.' tmpdir: path to folder where to create temporary files
    :param 8 ncpus:
    :param (1, 2, 3, 4, 6, 7, 8, 9, 10) filter exclude: filters to define the
       set of valid pair of reads.
    :param None region: chromosome name, if None, all genome will be loaded

    :returns: HiC_data object
    """
    bam = AlignmentFile(fnam)
    genome_seq = OrderedDict((c, l) for c, l in
                             zip(bam.references,
                                 [x // resolution + 1 for x in bam.lengths]))
    bam.close()

    sections = []
    if region:
        size = genome_seq[region]
        sections.extend([(region, i) for i in range(size)])
    else:
        for crm in genome_seq:
            len_crm = genome_seq[crm]
            sections.extend([(crm, i) for i in range(len_crm)])
    
        size = sum(genome_seq.values())

    chromosomes = {region: genome_seq[region]} if region else genome_seq
    dict_sec = dict([(j, i) for i, j in enumerate(sections)])
    imx = HiC_data((), size, chromosomes=chromosomes, dict_sec=dict_sec,
                   resolution=resolution)

    if biases:
        if isinstance(biases, basestring):
            biases = load(open(biases,'rb'))
        if biases['resolution'] != resolution:
            raise Exception('ERROR: resolution of biases do not match to the '
                            'one wanted (%d vs %d)' % (
                                biases['resolution'], resolution))
        if region:
            chrom_start = 0
            chrom_end = 0
            for crm in genome_seq:
                if crm == region:
                    chrom_end = chrom_start + genome_seq[crm]
                    break
                len_crm = genome_seq[crm]
                chrom_start += len_crm
            imx.bias  = dict((k - chrom_start, v)
                          for k, v in biases.get('biases', {}).items()
                          if chrom_start <= k < chrom_end)
            imx.bads  = dict((k - chrom_start, v)
                          for k, v in biases.get('badcol', {}).items()
                          if chrom_start <= k < chrom_end)
        else:
            imx.bads     = biases['badcol']
            imx.bias     = biases['biases']
        imx.expected = biases['decay']

    get_matrix(fnam, resolution, biases=None, filter_exclude=filter_exclude,
               normalization='raw', tmpdir=tmpdir, clean=clean,
               ncpus=ncpus, dico=imx, region1=region, verbose=verbose)
    imx._symmetricize()
    imx.symmetricized = True

    return imx