Source code for pytadbit.parsers.hic_parser

"""
November 7, 2013.

"""
from __future__ import print_function

from future import standard_library
from builtins import next
standard_library.install_aliases()
from sys                             import stderr, modules
from io                              import IOBase
from collections                     import OrderedDict
from warnings                        import warn
from math                            import sqrt, isnan
try:
    from pickle5                     import load  # python < 3.8
except ImportError:
    from pickle                      import load

from pysam                           import AlignmentFile

import numpy as np
from pytadbit.parsers.gzopen         import gzopen
from pytadbit                        import HiC_data
from pytadbit.parsers.hic_bam_parser import get_matrix
try:
    from pytadbit.parsers.cooler_parser import parse_cooler, is_cooler
except ImportError:
    def is_cooler(*unused):
        stderr.write('WARNING: cannot detect if input is a cooler file. Probably ' +
                     'you need to install h5py\n')
        return False
    pass

try:
    file_types = file, IOBase
except NameError:
    file_types = (IOBase,)

try:
    basestring
except NameError:
    basestring = str

HIC_DATA = True


class AutoReadFail(Exception):
    """
    Exception to handle failed autoreader.
    """
    pass


def is_asymmetric(matrix):
    """
    Helper functions for the autoreader.
    """
    maxn = len(matrix)
    for i in range(maxn):
        maxi = matrix[i] # slightly more efficient
        for j in range(i+1, maxn):
            if maxi[j] != matrix[j][i]:
                if isnan(maxi[j]) and isnan(matrix[j][i]):
                    continue
                return True
    return False


def is_asymmetric_dico(hic):
    """
    Helper functions for the optimal_reader
    """
    ncol = len(hic)
    for i in range(ncol):
        for j in range(i, ncol):
            p1 = i * ncol + j
            p2 = j * ncol + i
            if hic.get(p1, 0) != hic.get(p2, 0):
                return True
    return False


def symmetrize_dico(hic):
    """
    Make an HiC_data object symmetric by summing two halves of the matrix
    """
    ncol = len(hic)
    for i in range(ncol):
        incol = i * ncol
        for j in range(i, ncol):
            p1 = incol + j
            p2 = j * ncol + i
            val = hic.get(p1, 0) + hic.get(p2, 0)
            if val:
                hic[p1] = hic[p2] = val


def symmetrize(matrix):
    """
    Make a matrix symmetric by summing two halves of the matrix
    """
    maxn = len(matrix)
    for i in range(maxn):
        for j in range(i, maxn):
            matrix[i][j] = matrix[j][i] = matrix[i][j] + matrix[j][i]


def optimal_reader(f, normalized=False, resolution=1):
    """
    Reads a matrix generated by TADbit.
    Can be slower than autoreader, but uses almost a third of the memory

    :param f: an iterable (typically an open file).
    :param False normalized: if the matrix is normalized
    :param 1 resolution: resolution of the matrix

    """
    # get masked bins
    masked = {}
    pos = 0
    for line in f:
        if line[0] != '#':
            break
        pos += len(line)
        if line.startswith('# MASKED'):
            masked = dict([(int(n), True) for n in line.split()[2:]])
    f.seek(pos)

    # super fast
    header = [tuple(line.split(None, 2)[:2]) for line in f]

    f.seek(pos)

    ncol = len(header)

    # Get the numeric values and remove extra columns
    num = float if normalized else int
    chromosomes, sections, resolution = _header_to_section(header, resolution)

    #############################################################
    # monkey patch HiC_data to make it faster
    def fast_setitem(self, key, val):
        "Use directly dict setitem"
        super(HiC_data, self).__setitem__(key, val)

    def fast_getitem(self, key):
        "Use directly dict setitem"
        try:
            return super(HiC_data, self).__getitem__(key)
        except KeyError:
            return 0

    original_setitem = HiC_data.__setitem__
    original_getitem = HiC_data.__getitem__
    # apply_async the patch
    HiC_data.__setitem__ = fast_setitem
    HiC_data.__getitem__ = fast_getitem

    hic = HiC_data(((j, num(v))
                    for i, line in enumerate(f)
                    for j, v in enumerate(line.split()[2:], i * ncol)
                    if num(v)), size=ncol, masked=masked,
                   dict_sec=sections, chromosomes=chromosomes,
                   resolution=resolution, symmetricized=False)

    # make it symmetric
    if is_asymmetric_dico(hic):
        hic.symmetricized = True
        symmetrize_dico(hic)

    # undo patching
    HiC_data.__setitem__ = original_setitem
    HiC_data.__getitem__ = original_getitem
    hic.__setitem__ = original_setitem
    hic.__getitem__ = original_getitem
    #############################################################
    return hic


def __read_file_header(f):
    """
    Read file header, inside first commented lines of a file

    :returns masked dict, chromsomes orderedDict, crm, beg, end, resolution:
    """
    masked = {}
    chromosomes = OrderedDict()
    crm, beg, end, reso = None, None, None, None
    fpos = f.tell()
    for line in f:
        if line[0] != '#':
            break
        fpos += len(line)
        if line.startswith('# MASKED'):
            try:
                masked = dict([(int(n), True) for n in line.split()[-1].split(',')])
            except ValueError:  # nothing here
                pass
        elif line.startswith('# CRM'):
            _, _, crm, size = line.split()
            chromosomes[crm] = int(size)
        elif 'resolution:' in line:
            _, coords, reso = line.split()
            try:
                crm, pos = coords.split(':')
                beg, end = list(map(int, pos.split('-')))
            except ValueError:
                crm = coords
                beg, end = None, None
            reso = int(reso.split(':')[1])
    f.seek(fpos)
    if crm == 'full':
        crm = None
    return masked, chromosomes, crm, beg, end, reso


def abc_reader(f):
    """
    Read matrix stored in 3 column format (bin1, bin2, value)

    :param f: an iterable (typically an open file).

    :returns: An iterator to be converted in dictionary, matrix size, raw_names
       as list of tuples (chr, pos), dictionary of masked bins, and boolean
       reporter of symetric transformation
    """
    masked, chroms, crm, beg, end, reso = __read_file_header(f)  # TODO rest of it not used here
    sections = {}
    size = 0
    for c in chroms:
        sections[c] = size
        size += chroms[c] // reso + 1
    if beg:
        header = [(crm, '%d-%d' % (l * reso + 1, (l + 1) * reso))
                  for l in range(beg, end)]
    else:
        header = [(c, '%d-%d' % (l * reso + 1, (l + 1) * reso))
                  for c in chroms
                  for l in range(sections[c], sections[c] + chroms[c] // reso + 1)]
    num = int if HIC_DATA else float
    offset = (beg or 0) * (1 + size)
    def _disect(x):
        a, b, v = x.split()
        return (int(a) + int(b) * size + offset, num(v))
    items = tuple(_disect(line) for line in f)
    return items, size, header, masked, False


def __is_abc(f):
    """
    Only works for matrices with more than 3 bins
    """
    fpos = f.tell()
    count = 0
    for line in f:
        if line.startswith('#'):
            continue
        count += 1
        if len(line.split()) != 3:
            f.seek(fpos)
            return False
        if count > 3:
            f.seek(fpos)
            return True
    f.seek(fpos)
    return False

def autoreader(f):
    """
    Auto-detect matrix format of HiC data file.

    :param f: an iterable (typically an open file).

    :returns: An iterator to be converted in dictionary, matrix size, raw_names
       as list of tuples (chr, pos), dictionary of masked bins, and boolean
       reporter of symetric transformation
    """
    masked = __read_file_header(f)[0]  # TODO rest of it not used here

    # Skip initial comment lines and read in the whole file
    # as a list of lists.
    line = next(f)
    items = [line.split()] + [line.split() for line in f]

    # Count the number of elements per line after the first.
    # Wrapping in a set is a trick to make sure that every line
    # has the same number of elements.
    S = set([len(line) for line in items[1:]])
    ncol = S.pop()
    # If the set 'S' is not empty, at least two lines have a
    # different number of items.
    if S:
        raise AutoReadFail('ERROR: unequal column number')

    # free little memory
    del(S)

    nrow = len(items)
    # Auto-detect the format, there are only 4 cases.
    if ncol == nrow:
        try:
            _ = [float(item) for item in items[0]
                 if not item.lower() in ['na', 'nan']]
            # Case 1: pure number matrix.
            header = False
            trim = 0
        except ValueError:
            # Case 2: matrix with row and column names.
            header = True
            trim = 1
            warn('WARNING: found header')
    else:
        if len(items[0]) == len(items[1]):
            # Case 3: matrix with row information.
            header = False
            trim = ncol - nrow
            # warn('WARNING: found %d colum(s) of row names' % trim)
        else:
            # Case 4: matrix with header and row information.
            header = True
            trim = ncol - nrow + 1
            warn('WARNING: found header and %d colum(s) of row names' % trim)
    # Remove header line if needed.
    if header and not trim:
        header = items.pop(0)
        nrow -= 1
    elif not trim:
        header = list(range(1, nrow + 1))
    elif not header:
        header = [tuple([a for a in line[:trim]]) for line in items]
    else:
        del(items[0])
        nrow -= 1
        header = [tuple([a for a in line[:trim]]) for line in items]
    # Get the numeric values and remove extra columns
    num = int if HIC_DATA else float
    try:
        items = [[num(a) for a in line[trim:]] for line in items]
    except ValueError:
        if not HIC_DATA:
            raise AutoReadFail('ERROR: non numeric values')
        try:
            # Dekker data 2009, uses integer but puts a comma...
            items = [[int(float(a)+.5) for a in line[trim:]] for line in items]
            warn('WARNING: non integer values')
        except ValueError:
            try:
                # Some data may contain 'NaN' or 'NA'
                items = [
                    [0 if a.lower() in ['na', 'nan']
                     else int(float(a)+.5) for a in line[trim:]]
                for line in items]
                warn('WARNING: NA or NaN founds, set to zero')
            except ValueError:
                raise AutoReadFail('ERROR: non numeric values')

    # Check that the matrix is square.
    ncol -= trim
    if ncol != nrow:
        raise AutoReadFail('ERROR: non square matrix')

    symmetricized = False
    if is_asymmetric(items):
        warn('WARNING: matrix not symmetric: summing cell_ij with cell_ji')
        symmetrize(items)
        symmetricized = True
    return (((i + j * ncol, a) for i, line in enumerate(items)
             for j, a in enumerate(line) if a),
            ncol, header, masked, symmetricized)


def _header_to_section(header, resolution):
    """
    converts row-names of the form 'chr12\t1000-2000' into sections, suitable
    to create HiC_data objects. Also creates chromosomes, from the reads
    """
    sections = {}
    chromosomes = None
    if (isinstance(header, list)
        and isinstance(header[0], tuple)
        and len(header[0]) > 1):
        chromosomes = OrderedDict()
        for i, h in enumerate(header):
            if '-' in h[1]:
                a, b = list(map(int, h[1].split('-')))
                if resolution==1:
                    resolution = abs(b - a) + 1
                elif resolution != abs(b - a) + 1:
                    raise Exception('ERROR: found different resolution, ' +
                                    'check headers')
            else:
                a = int(h[1])
                if resolution==1 and i:
                    resolution = abs(a - b) + 1
                elif resolution == 1:
                    b = a
            sections[(h[0], a // resolution)] = i
            chromosomes.setdefault(h[0], 0)
            chromosomes[h[0]] += 1
    return chromosomes, sections, resolution


[docs]def read_matrix(things, parser=None, hic=True, resolution=1, **kwargs): """ Read and checks a matrix from a file (using :func:`pytadbit.parser.hic_parser.autoreader`) or a list. :param things: might be either a file name, a file handler or a list of list (all with same length) :param None parser: a parser function that returns a tuple of lists representing the data matrix, with this file example.tsv: :: chrT_001 chrT_002 chrT_003 chrT_004 chrT_001 629 164 88 105 chrT_002 86 612 175 110 chrT_003 159 216 437 105 chrT_004 100 111 146 278 the output of parser('example.tsv') might be: ``([629, 86, 159, 100, 164, 612, 216, 111, 88, 175, 437, 146, 105, 110, 105, 278])`` :param 1 resolution: resolution of the matrix :param True hic: if False, TADbit assumes that files contains normalized data :returns: the corresponding matrix concatenated into a huge list, also returns number or rows """ one = kwargs.get('one', True) global HIC_DATA HIC_DATA = hic if not isinstance(things, list): things = [things] matrices = [] for thing in things: if isinstance(thing, HiC_data): matrices.append(thing) elif isinstance(thing, file_types): parser = parser or (abc_reader if __is_abc(thing) else autoreader) matrix, size, header, masked, sym = parser(thing) thing.close() chromosomes, sections, resolution = _header_to_section(header, resolution) matrices.append(HiC_data(matrix, size, dict_sec=sections, chromosomes=chromosomes, resolution=resolution, symmetricized=sym, masked=masked)) elif isinstance(thing, basestring): if is_cooler(thing, resolution if resolution > 1 else None): matrix, size, header, masked, sym = parse_cooler(thing, resolution if resolution > 1 else None, not hic) else: try: with gzopen(thing) as f_thing: parser = parser or (abc_reader if __is_abc(f_thing) else autoreader) matrix, size, header, masked, sym = parser(f_thing) except IOError: if len(thing.split('\n')) > 1: parser = parser or (abc_reader if __is_abc(thing.split('\n')) else autoreader) matrix, size, header, masked, sym = parser(thing.split('\n')) else: raise IOError('\n ERROR: file %s not found\n' % thing) sections = dict([(h, i) for i, h in enumerate(header)]) chromosomes, sections, resolution = _header_to_section(header, resolution) matrices.append(HiC_data(matrix, size, dict_sec=sections, chromosomes=chromosomes, masked=masked, resolution=resolution, symmetricized=sym)) elif isinstance(thing, list): if all([len(thing)==len(l) for l in thing]): size = len(thing) matrix = [(i + j * size, v) for i, l in enumerate(thing) for j, v in enumerate(l) if v] else: raise Exception('must be list of lists, all with same length.') matrices.append(HiC_data(matrix, size)) elif isinstance(thing, tuple): # case we know what we are doing and passing directly list of tuples matrix = thing siz = sqrt(len(thing)) if int(siz) != siz: raise AttributeError('ERROR: matrix should be square.\n') size = int(siz) matrices.append(HiC_data(matrix, size)) elif isinstance(thing, (np.ndarray, np.generic) ): try: row, col = thing.shape if row != col: raise Exception('matrix needs to be square.') matrix = thing.reshape(-1).tolist()[0] size = row except Exception as exc: print('Error found:', exc) matrices.append(HiC_data(matrix, size)) else: raise Exception('Unable to read this file or whatever it is :)') if one: return matrices[0] else: return matrices
[docs]def load_hic_data_from_reads(fnam, resolution, **kwargs): """ :param fnam: tsv file with reads1 and reads2 :param resolution: the resolution of the experiment (size of a bin in bases) :param genome_seq: a dictionary containing the genomic sequence by chromosome :param False get_sections: for very very high resolution, when the column index does not fit in memory """ sections = [] genome_seq = OrderedDict() fhandler = open(fnam) line = next(fhandler) size = 0 while line.startswith('#'): if line.startswith('# CRM '): crm, clen = line[6:].split() genome_seq[crm] = int(clen) // resolution + 1 size += genome_seq[crm] line = next(fhandler) if kwargs.get('get_sections', True): for crm in genome_seq: len_crm = genome_seq[crm] sections.extend([(crm, i) for i in range(len_crm)]) dict_sec = dict([(j, i) for i, j in enumerate(sections)]) imx = HiC_data((), size, genome_seq, dict_sec, resolution=resolution) try: while True: _, cr1, ps1, _, _, _, _, cr2, ps2, _ = line.split('\t', 9) try: ps1 = dict_sec[(cr1, int(ps1) // resolution)] ps2 = dict_sec[(cr2, int(ps2) // resolution)] except KeyError: ps1 = int(ps1) // resolution ps2 = int(ps2) // resolution imx[ps1, ps2] += 1 imx[ps2, ps1] += 1 line = next(fhandler) except StopIteration: pass fhandler.close() imx.symmetricized = True return imx
def load_hic_data_from_bam(fnam, resolution, biases=None, tmpdir='.', ncpus=8, filter_exclude=(1, 2, 3, 4, 6, 7, 8, 9, 10), region=None, verbose=True, clean=True): """ :param fnam: TADbit-generated BAM file with read-ends1 and read-ends2 :param resolution: the resolution of the experiment (size of a bin in bases) :param None biases: path to pickle file where are stored the biases. Keys in this file should be: 'biases', 'badcol', 'decay' and 'resolution' :param '.' tmpdir: path to folder where to create temporary files :param 8 ncpus: :param (1, 2, 3, 4, 6, 7, 8, 9, 10) filter exclude: filters to define the set of valid pair of reads. :param None region: chromosome name, if None, all genome will be loaded :returns: HiC_data object """ bam = AlignmentFile(fnam) genome_seq = OrderedDict((c, l) for c, l in zip(bam.references, [x // resolution + 1 for x in bam.lengths])) bam.close() sections = [] if region: size = genome_seq[region] sections.extend([(region, i) for i in range(size)]) else: for crm in genome_seq: len_crm = genome_seq[crm] sections.extend([(crm, i) for i in range(len_crm)]) size = sum(genome_seq.values()) chromosomes = {region: genome_seq[region]} if region else genome_seq dict_sec = dict([(j, i) for i, j in enumerate(sections)]) imx = HiC_data((), size, chromosomes=chromosomes, dict_sec=dict_sec, resolution=resolution) if biases: if isinstance(biases, basestring): biases = load(open(biases,'rb')) if biases['resolution'] != resolution: raise Exception('ERROR: resolution of biases do not match to the ' 'one wanted (%d vs %d)' % ( biases['resolution'], resolution)) if region: chrom_start = 0 chrom_end = 0 for crm in genome_seq: if crm == region: chrom_end = chrom_start + genome_seq[crm] break len_crm = genome_seq[crm] chrom_start += len_crm imx.bias = dict((k - chrom_start, v) for k, v in biases.get('biases', {}).items() if chrom_start <= k < chrom_end) imx.bads = dict((k - chrom_start, v) for k, v in biases.get('badcol', {}).items() if chrom_start <= k < chrom_end) else: imx.bads = biases['badcol'] imx.bias = biases['biases'] imx.expected = biases['decay'] get_matrix(fnam, resolution, biases=None, filter_exclude=filter_exclude, normalization='raw', tmpdir=tmpdir, clean=clean, ncpus=ncpus, dico=imx, region1=region, verbose=verbose) imx._symmetricize() imx.symmetricized = True return imx