"""
December 12, 2014.
"""
from __future__ import print_function
from future import standard_library
standard_library.install_aliases()
import os
from sys import stderr, modules
from collections import OrderedDict
from warnings import warn
from bisect import bisect_right as bisect
from pickle import HIGHEST_PROTOCOL, dump, load
from numpy.linalg import LinAlgError
from numpy import corrcoef, nansum, array, isnan, mean
from numpy import meshgrid, asarray, exp, linspace, std
from numpy import nanpercentile as npperc, log as nplog
from numpy import nanmax, ma, zeros_like
from scipy.stats import ttest_ind, spearmanr
from scipy.special import gammaincc
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
from scipy.sparse.linalg import eigsh
from scipy.sparse import csr_matrix
from pytadbit.utils.extraviews import plot_compartments
from pytadbit.utils.extraviews import plot_compartments_summary
from pytadbit.utils.hic_filtering import filter_by_mean, filter_by_zero_count
from pytadbit.utils.normalize_hic import iterative, expected
from pytadbit.parsers.genome_parser import parse_fasta
from pytadbit.parsers.bed_parser import parse_bed
from pytadbit.utils.file_handling import mkdir
from pytadbit.utils.hmm import gaussian_prob, best_path, train
from pytadbit.utils.tadmaths import calinski_harabasz
try:
from pytadbit.parsers.cooler_parser import cooler_file
except ImportError:
pass
try:
basestring
except NameError:
basestring = str
def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
"""
https://stackoverflow.com/questions/5595425/what-is-the-best-way-to-compare-floats-for-almost-equality-in-python/33024979#33024979
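
    A minimal sketch (values chosen only for illustration)::

        isclose(0.1 + 0.2, 0.3)   # -> True
        isclose(1.0, 1.1)         # -> False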
"""
return abs(a-b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)
class HiC_data(dict):
"""
    Hi-C matrix stored as a dictionary keyed by flattened position
    (row * size + column). This may also hold the print/write-to-file
    matrix functions
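
    A minimal construction sketch (toy values; keys are flattened positions,
    the resolution is arbitrary)::

        hic = HiC_data([(0, 5), (1, 2)], size=3, resolution=10000)
        hic[0, 1]  # -> 2
        hic[1, 0]  # -> 2 (the matrix is symmetricized at construction)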
"""
def __init__(self, items, size, chromosomes=None, dict_sec=None,
resolution=1, masked=None, symmetricized=False):
super(HiC_data, self).__init__(items)
self.__size = size
self._size2 = size**2
self._symmetricize()
self.bias = None
self.bads = masked or {}
self.chromosomes = chromosomes
self.sections = dict_sec
self.section_pos = {}
self.resolution = resolution
self.expected = None
self.symmetricized = symmetricized
self.compartments = {}
if self.chromosomes:
total = 0
for crm in self.chromosomes:
self.section_pos[crm] = (total, total + self.chromosomes[crm])
total += self.chromosomes[crm]
if self.sections == {}:
self.section_pos = {None: (0, self.__size)}
self.sections = dict([((None, i), i)
for i in range(0, self.__size)])
def _symmetricize(self):
"""
Check if matrix is symmetric (check first 10 non-zero values) and,
if not, make it symmetric
- if matrix is half empty, copy values on one side to the other side
- if matrix is asymmetric, sum non-diagonal values
"""
to_sum = False
symmetric = True
count = 0
for n in self:
i = n // self.__size
j = n % self.__size
            if i == j or self[i, j] == self[j, i] == 0:
continue
if not isclose(self[i, j], self[j, i]):
if self[i, j] != 0 and self[j, i] != 0:
to_sum = True
symmetric = False
break
if count > 10:
return
count += 1
if symmetric: # may not reach 10 values
return
if to_sum:
for n in list(self.keys())[:]:
i = n // self.__size
j = n % self.__size
if i != j:
self[j, i] = self[i, j] = self[j, i] + self[i, j]
else:
for n in list(self.keys())[:]:
i = n // self.__size
j = n % self.__size
self[j, i] = self[i, j] = self[n]
def _update_size(self, size):
self.__size += size
self._size2 = self.__size**2
def __len__(self):
return self.__size
def __getitem__(self, row_col):
"""
slow one... for user
for fast item getting, use self.get()
"""
try:
row, col = row_col
pos = row * self.__size + col
if pos > self._size2:
raise IndexError(
'ERROR: row or column larger than %s' % self.__size)
return self.get(pos, 0)
except TypeError:
if row_col > self._size2:
raise IndexError(
'ERROR: position %d larger than %s^2' % (row_col,
self.__size))
return self.get(row_col, 0)
def __setitem__(self, row_col, val):
"""
slow one... for user
        for fast item setting, use the inherited dict.__setitem__
"""
try:
row, col = row_col
pos = row * self.__size + col
if pos > self._size2:
print(row, col, pos)
raise IndexError(
'ERROR: row or column larger than %s' % self.__size)
super(HiC_data, self).__setitem__(pos, val)
except TypeError:
if hasattr(self, '_size2') and row_col > self._size2:
raise IndexError(
'ERROR: position %d larger than %s^2' % (row_col,
self.__size))
super(HiC_data, self).__setitem__(row_col, val)
    def get_hic_data_as_csr(self):
"""
Returns a scipy sparse matrix in Compressed Sparse Row format of the Hi-C data in the dictionary
:returns: scipy sparse matrix in Compressed Sparse Row format
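
        A small usage sketch (toy values)::

            hic = HiC_data([(1, 3)], size=2)
            csr = hic.get_hic_data_as_csr()
            csr.toarray()  # -> array([[0., 3.], [3., 0.]])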
"""
values = []
cols = []
rows = []
for key, value in self.items():
            row, col = key // self.__size, key % self.__size
values.append(float(value))
cols.append(col)
rows.append(row)
return csr_matrix((values, (rows, cols)), shape=(self.__size,self.__size))
    def add_sections_from_fasta(self, fasta):
        """
        Add genomic coordinates to the HiC_data object, extracting them from a
        FASTA file containing the chromosome sequences
:param fasta: path to a FASTA file
"""
genome = parse_fasta(fasta, verbose=False)
sections = []
genome_seq = OrderedDict()
size = 0
for crm in genome:
genome_seq[crm] = int(len(genome[crm])) // self.resolution + 1
size += genome_seq[crm]
section_sizes = {}
for crm in genome_seq:
len_crm = genome_seq[crm]
section_sizes[(crm,)] = len_crm
sections.extend([(crm, i) for i in range(len_crm)])
dict_sec = dict([(j, i) for i, j in enumerate(sections)])
self.chromosomes = genome_seq
self.sections = dict_sec
if self.chromosomes:
total = 0
for crm in self.chromosomes:
self.section_pos[crm] = (total, total + self.chromosomes[crm])
total += self.chromosomes[crm]
if size != self.__size:
warn('WARNING: different sizes (%d, now:%d), ' % (self.__size, size)
+ 'should adjust the resolution')
self.__size = size
self._size2 = size**2
    def add_sections(self, lengths, chr_names=None, binned=False):
        """
        Add genomic coordinates to the HiC_data object from a list of
        chromosome lengths. Order matters.
:param lengths: list of chromosome lengths
:param None chr_names: list of corresponding chromosome names.
:param False binned: if True, lengths will not be divided by resolution
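
        A minimal sketch (chromosome lengths are made up)::

            hic = HiC_data([], size=242, resolution=1000000)
            hic.add_sections([150000000, 90000000], chr_names=['chr1', 'chr2'])
            hic.section_pos['chr2']  # -> (151, 242)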
"""
sections = []
genome_seq = OrderedDict()
size = 0
resolution = 1 if binned else self.resolution
for crm, length in enumerate(lengths):
cnam = 'chr' + str(crm) if not chr_names else chr_names[crm]
genome_seq[cnam] = int(length) // resolution + 1
size += genome_seq[cnam]
section_sizes = {}
for crm in genome_seq:
len_crm = genome_seq[crm]
section_sizes[(crm,)] = len_crm
sections.extend([(crm, i) for i in range(len_crm)])
dict_sec = dict([(j, i) for i, j in enumerate(sections)])
self.chromosomes = genome_seq
self.sections = dict_sec
if self.chromosomes:
total = 0
for crm in self.chromosomes:
self.section_pos[crm] = (total, total + self.chromosomes[crm])
total += self.chromosomes[crm]
if size != self.__size:
warn('WARNING: different sizes (%d, now:%d), ' % (self.__size, size)
+ 'should adjust the resolution')
self.__size = size
self._size2 = size**2
    def cis_trans_ratio(self, normalized=False, exclude=None, diagonal=True,
equals=None):
"""
Counts the number of interactions occurring within chromosomes (cis) with
respect to the total number of interactions
:param False normalized: used normalized data
:param None exclude: exclude a given list of chromosome from the
ratio (may want to exclude translocated chromosomes)
:param False diagonal: replace values in the diagonal by 0 or 1
:param None equals: can pass a function that would decide if 2 chromosomes
have to be considered as the same. e.g. lambda x, y: x[:4]==y[:4] will
consider chr2L and chr2R as being the same chromosome. WARNING: only
working on consecutive chromosomes.
        :returns: the ratio of cis interactions over the total number of
           interactions. This ratio is expected to be at least 40-60% in
           classic human dilution Hi-C with HindIII as restriction enzyme.
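
        Usage sketch (assuming `hic` is a HiC_data object with chromosome
        information; the `equals` lambda merges chromosome arms such as
        chr2L/chr2R before computing the ratio)::

            ratio = hic.cis_trans_ratio(normalized=False)
            ratio = hic.cis_trans_ratio(equals=lambda x, y: x[:4] == y[:4])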
"""
if normalized and not self.bias:
raise Exception('ERROR: experiment not normalized yet')
if exclude == None:
exclude = []
if equals == None:
equals = lambda x, y: x == y
intra = 0
if not self.chromosomes:
return float('nan')
# define chromosomes to be merged
to_skip = set()
c_prev = ''
for c in self.chromosomes:
if equals(c, c_prev):
to_skip.add(c_prev)
c_prev = c
sections = sorted([-1] + [self.section_pos[c][1]
for c in self.section_pos
if not c in to_skip])
# defines columns to be skipped
bads = set(self.bads.keys())
for c in exclude:
bads.update(i for i in range(*self.section_pos[c]))
# diagonal
if diagonal:
valid = lambda x, y: True
else:
valid = lambda x, y: x != y
# normalization
if normalized:
transform = lambda x, y, z: x / self.bias[y] / self.bias[z]
else:
transform = lambda x, y, z: x
# compute ratio
for k, v in self.items():
i, j = divmod(k, self.__size)
if bisect(sections, i) != bisect(sections, j):
continue
if i in bads or j in bads:
continue
if valid(i, j): # diagonal thing
intra += transform(v, i, j)
try:
return float(intra) / self.sum(bias=self.bias if normalized else None, bads=bads)
except ZeroDivisionError:
return 0.
    def filter_columns(self, draw_hist=False, savefig=None, perc_zero=99,
by_mean=True, min_count=None, silent=False):
"""
Call filtering function, to remove artifactual columns in a given Hi-C
matrix. This function will detect columns with very low interaction
counts. Filtered out columns will be
stored in the dictionary Experiment._zeros.
:param False draw_hist: shows the distribution of mean values by column
the polynomial fit, and the cut applied.
:param None savefig: path to a file where to save the image generated;
if None, the image will be shown using matplotlib GUI (the extension
of the file name will determine the desired format).
        :param 99 perc_zero: maximum percentage of cells with no interactions
allowed.
:param None min_count: minimum number of reads mapped to a bin (recommended
value could be 2500). If set this option overrides the perc_zero
filtering... This option is slightly slower.
:param True by_mean: filter columns by mean column value using
:func:`pytadbit.utils.hic_filtering.filter_by_mean` function
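
        Usage sketch (thresholds are illustrative; `hic` is an existing
        HiC_data object)::

            hic.filter_columns(perc_zero=90, min_count=2500)
            len(hic.bads)  # number of columns flagged as having poor signal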
"""
self.bads = filter_by_zero_count(self, perc_zero, min_count=min_count,
silent=silent)
if by_mean:
self.bads.update(filter_by_mean(
self, draw_hist=draw_hist, silent=silent,
savefig=savefig, bads=self.bads))
if not silent:
print('Found %d of %d columns with poor signal' % (len(self.bads),
len(self)))
    def sum(self, bias=None, bads=None):
"""
Sum Hi-C data matrix
WARNING: parameters are not meant to be used by external users
:params None bias: expects a dictionary of biases to use normalized matrix
:params None bads: extends computed bad columns
:returns: the sum of the Hi-C matrix skipping bad columns
"""
N = self.__size
norm_sum = 0
bads = bads or self.bads
if bias:
for k, v in self.items():
i, j = divmod(k, N)
if i in bads or j in bads:
continue
norm_sum += v / (bias[i] * bias[j])
else:
for k, v in self.items():
i, j = divmod(k, N)
if i in bads or j in bads:
continue
norm_sum += v
return norm_sum
def normalize_expected(self, **kwargs):
self.expected = expected(self, bads=self.bads, **kwargs)
    def normalize_hic(self, iterations=0, max_dev=0.1, silent=False,
sqrt=False, factor=1):
"""
Normalize the Hi-C data.
It fills the Experiment.norm variable with the Hi-C values divided by
the calculated weight.
        :param 0 iterations: number of iterations
        :param 0.1 max_dev: the iterative process stops when the maximum deviation
           between row sums is below this value (0.1 means 10%)
:param False silent: does not warn when overwriting weights
:param False sqrt: uses the square root of the computed biases
        :param 1 factor: target mean number of normalized interactions per cell
           (excluding filtered-out, or bad, columns)
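
        Usage sketch (typically run after column filtering; bin indices are
        illustrative)::

            hic.filter_columns(perc_zero=99, silent=True)
            hic.normalize_hic(iterations=10, max_dev=0.1, factor=1)
            norm_value = hic[0, 1] / hic.bias[0] / hic.bias[1]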
"""
bias = iterative(self, iterations=iterations,
max_dev=max_dev, bads=self.bads,
verbose=not silent)
if sqrt:
bias = dict((b, bias[b]**0.5) for b in bias)
if factor:
if not silent:
print('rescaling to factor %d' % factor)
print(' - getting the sum of the matrix')
# get the sum on half of the matrix
norm_sum = self.sum(bias)
if not silent:
print(' => %.3f' % norm_sum)
print(' - rescaling biases')
# divide biases
target = (norm_sum / float(len(self) * len(self) * factor))**0.5
bias = dict([(b, bias[b] * target) for b in bias])
self.bias = bias
    def save_biases(self, fnam, protocol=None):
"""
Save biases, decay and bad columns in pickle format (to be loaded by
the function load_hic_data_from_bam)
:param fnam: path to output file
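
        Usage sketch ('biases.pickle' is a hypothetical path)::

            hic.save_biases('biases.pickle')
            hic.load_biases('biases.pickle')  # reload into a compatible object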
"""
out = open(fnam, 'wb')
dump({'biases' : self.bias,
'decay' : self.expected,
'badcol' : self.bads,
'resolution': self.resolution}, out,
protocol if protocol else HIGHEST_PROTOCOL)
out.close()
    def load_biases(self, fnam, protocol=None):
"""
Load biases, decay and bad columns from pickle file
:param fnam: path to input pickle file
"""
biases = load(open(fnam,'rb'))
if biases['resolution'] != self.resolution:
raise Exception(('Error: resolution in Pickle (%d) does not match '
'the one of this HiC_data object (%d)') % (
biases['resolution'], self.resolution))
self.bias = biases['biases']
self.expected = biases['decay']
self.bads = biases['badcol']
def get_as_tuple(self):
return tuple([self[i, j]
for j in range(len(self))
for i in range(len(self))])
    def write_coord_table(self, fname, focus=None, diagonal=True,
normalized=False, format='BED'):
"""
writes a coordinate table to a file.
:param None focus: a tuple with the (start, end) position of the desired
window of data (start, starting at 1, and both start and end are
inclusive). Alternatively a chromosome name can be input or a tuple
of chromosome name, in order to retrieve a specific inter-chromosomal
region
:param True diagonal: if False, diagonal is replaced by zeroes
:param False normalized: get normalized data
:param BED format: either "BED"
chr1 \t 111 \t 222 \t chr2:333-444,55 \t 1 \t .
chr2 \t 333 \t 444 \t chr1:111-222,55 \t 2 \t .
or "long-range" format:
chr1:111-222 \t chr2:333-444 \t 55
chr2:333-444 \t chr1:111-222 \t 55
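
        Usage sketch (file names are hypothetical; normalized output assumes
        normalize_hic was run)::

            hic.write_coord_table('interactions.bed', format='BED')
            hic.write_coord_table('chr1_pairs.tsv', focus='chr1',
                                  format='long-range', normalized=True)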
"""
if focus:
if isinstance(focus, tuple) and isinstance(focus[0], int):
if len(focus) == 2:
start1, end1 = focus
start2, end2 = focus
start1 -= 1
start2 -= 1
else:
start1, end1, start2, end2 = focus
start1 -= 1
start2 -= 1
elif isinstance(focus, tuple) and isinstance(focus[0], basestring):
start1, end1 = self.section_pos[focus[0]]
start2, end2 = self.section_pos[focus[1]]
else:
start1, end1 = self.section_pos[focus]
start2, end2 = self.section_pos[focus]
else:
start1 = start2 = 0
end1 = end2 = len(self)
out = open(fname, 'w')
if format == 'long-range':
rownam = ['%s:%d-%d' % (k[0],
k[1] * self.resolution,
(k[1] + 1) * self.resolution)
for k in sorted(self.sections,
key=lambda x: self.sections[x])
if start2 <= self.sections[k] < end2]
if not rownam:
raise Exception('ERROR: HiC data object should have genomic coordinates')
iter_rows = self.yield_matrix(focus=focus, diagonal=diagonal,
normalized=normalized)
pair_string = '%s\t%s\t%f\n' if normalized else '%s\t%s\t%d\n'
for nrow, row in enumerate(rownam, 1):
line = next(iter_rows)
iter_cols = iter(line)
for col in rownam[nrow:]:
val = next(iter_cols)
if not val:
continue
out.write(pair_string % (row, col, val))
elif format == 'BED':
rownam = ['%s\t%d\t%d' % (k[0],
k[1] * self.resolution,
(k[1] + 1) * self.resolution)
for k in sorted(self.sections,
key=lambda x: self.sections[x])
if start2 <= self.sections[k] < end2]
colnam = ['%s:%d-%d' % (k[0],
k[1] * self.resolution,
(k[1] + 1) * self.resolution)
for k in sorted(self.sections,
key=lambda x: self.sections[x])
if start2 <= self.sections[k] < end2]
if not rownam:
raise Exception('ERROR: Hi-C data object should have genomic coordinates')
iter_rows = self.yield_matrix(focus=focus, diagonal=diagonal,
normalized=normalized)
pair_string = '%s\t%s,%f\t%d\t.\n' if normalized else '%s\t%s,%d\t%d\t.\n'
count = 1
for nrow, row in enumerate(rownam, 1):
line = next(iter_rows)
iter_cols = iter(line)
for col in colnam[nrow:]:
val = next(iter_cols)
if not val:
continue
out.write(pair_string % (row, col, val, count))
count += 1
else:
raise Exception('ERROR: format "%s" not found\n' % format)
out.close()
    def write_cooler(self, fname, normalized=False):
"""
writes the hic_data to a cooler file.
:param False normalized: get normalized data
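
        Usage sketch ('matrix.cool' is a hypothetical path; raw integer counts
        are required, and weights are only written if normalize_hic was run)::

            hic.write_cooler('matrix.cool', normalized=True)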
"""
if 'h5py' not in modules:
raise Exception('ERROR: cooler output is not available. Probably ' +
'you need to install h5py\n')
if normalized and not self.bias:
raise Exception('ERROR: data not normalized yet')
if not all(isinstance(val, int) for _, val in self.items()):
raise Exception('ERROR: raw hic data (integer values) is needed for cooler format')
if self.chromosomes:
if len(self.chromosomes) > 1:
sections = OrderedDict((key,val*self.resolution)
for key, val in self.chromosomes.items())
else: # maybe part of a matrix
sections = {next(iter(self.chromosomes)): self.__size*self.resolution}
else: # maybe part of a matrix
sections = {"Unknown": self.__size*self.resolution}
out = cooler_file(fname, self.resolution, sections, list(sections.keys()))
out.create_bins()
out.prepare_matrix()
for key, value in self.items():
row, col = key // self.__size, key % self.__size
if row > col: # only upper triangular
continue
out.write_iter(0, row, col, value)
out.close()
if normalized:
weights = [self.bias[i] if not i in self.bads else 0. for i in range(self.__size)]
out.write_weights(weights, weights)
    def write_matrix(self, fname, focus=None, diagonal=True, normalized=False):
"""
writes the matrix to a file.
:param None focus: a tuple with the (start, end) position of the desired
window of data (start, starting at 1, and both start and end are
inclusive). Alternatively a chromosome name can be input or a tuple
of chromosome name, in order to retrieve a specific inter-chromosomal
region
:param True diagonal: if False, diagonal is replaced by zeroes
:param False normalized: get normalized data
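
        Usage sketch (file names are hypothetical; normalized output assumes
        normalize_hic was run)::

            hic.write_matrix('chr3.tsv', focus='chr3', normalized=True)
            hic.write_matrix('chr1_vs_chr2.tsv', focus=('chr1', 'chr2'))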
"""
if focus:
if isinstance(focus, tuple) and isinstance(focus[0], int):
if len(focus) == 2:
start1, end1 = focus
start2, end2 = focus
start1 -= 1
start2 -= 1
else:
start1, end1, start2, end2 = focus
start1 -= 1
start2 -= 1
elif isinstance(focus, tuple) and isinstance(focus[0], basestring):
start1, end1 = self.section_pos[focus[0]]
start2, end2 = self.section_pos[focus[1]]
else:
start1, end1 = self.section_pos[focus]
start2, end2 = self.section_pos[focus]
else:
start1 = start2 = 0
end1 = end2 = len(self)
out = open(fname, 'w')
out.write('# MASKED %s\n' % (','.join([str(k - start1)
for k in list(self.bads.keys())
if start1 <= k <= end1])))
rownam = ['%s\t%d-%d' % (k[0],
k[1] * self.resolution + 1,
(k[1] + 1) * self.resolution)
for k in sorted(self.sections,
key=lambda x: self.sections[x])
if start2 <= self.sections[k] < end2]
if rownam:
for line in self.yield_matrix(focus=focus, diagonal=diagonal,
normalized=normalized):
out.write(rownam.pop(0) + '\t' +
'\t'.join([str(i) for i in line]) + '\n')
else:
for line in self.yield_matrix(focus=focus, diagonal=diagonal,
normalized=normalized):
out.write('\t'.join([str(i) for i in line]) + '\n')
out.close()
    def get_matrix(self, focus=None, diagonal=True, normalized=False,
masked=False):
"""
returns a matrix.
:param None focus: a tuple with the (start, end) position of the desired
window of data (start, starting at 1, and both start and end are
inclusive). Alternatively a chromosome name can be input or a tuple
of chromosome name, in order to retrieve a specific inter-chromosomal
region
:param True diagonal: if False, diagonal is replaced by ones, or zeroes
if normalized
:param False normalized: get normalized data
:param False masked: return masked arrays using the definition of bad
columns
:returns: matrix (a list of lists of values)
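
        Usage sketch (focus values are illustrative; normalized output assumes
        normalize_hic was run)::

            full = hic.get_matrix()
            chr2 = hic.get_matrix(focus='chr2', normalized=True)
            sub = hic.get_matrix(focus=(1, 100))  # bins 1-100, 1-based, inclusive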
"""
if normalized and not self.bias:
raise Exception('ERROR: experiment not normalized yet')
start1, start2, end1, end2 = self._focus_coords(focus)
if normalized:
if diagonal:
matrix = [[self[i, j] / self.bias[i] / self.bias[j]
for i in range(start2, end2)]
for j in range(start1, end1)]
else:
matrix = [[self[i, j] / self.bias[i] / self.bias[j]
for i in range(start2, end2)]
for j in range(start1, end1)]
if start1 == start2:
for i in range(len(matrix)):
matrix[i][i] = 0
else:
if diagonal:
matrix = [[self[i, j] for i in range(start2, end2)]
for j in range(start1, end1)]
else:
matrix = [[self[i, j] for i in range(start2, end2)]
for j in range(start1, end1)]
if start1 == start2:
for i in range(len(matrix)):
matrix[i][i] = 1 if matrix[i][i] else 0
if masked:
bads1 = [b - start1 for b in self.bads if start1 <= b < end1]
bads2 = [b - start2 for b in self.bads if start2 <= b < end2]
m = zeros_like(matrix)
for bad1 in bads1:
m[:,bad1] = 1
for bad2 in bads2:
m[bad2,:] = 1
matrix = ma.masked_array(matrix, m)
return matrix
def _focus_coords(self, focus):
siz = len(self)
if focus:
if isinstance(focus, tuple) and isinstance(focus[0], int):
if len(focus) == 2:
start1, end1 = focus
start2, end2 = focus
start1 -= 1
start2 -= 1
else:
start1, end1, start2, end2 = focus
start1 -= 1
start2 -= 1
elif isinstance(focus, tuple) and isinstance(focus[0], basestring):
start1, end1 = self.section_pos[focus[0].split(':')[0]]
start2, end2 = self.section_pos[focus[1].split(':')[0]]
if ':' in focus[0]:
pos = focus[0].split(':')[1]
try:
pos1, pos2 = [int(p) // self.resolution
for p in pos.split('-')]
except ValueError:
                        raise Exception('ERROR: should be in format "chr3:10000-20000"')
start1, end1 = start1 + pos1, start1 + pos2
                if ':' in focus[1]:
                    pos = focus[1].split(':')[1]
                    try:
                        pos1, pos2 = [int(p) // self.resolution
                                      for p in pos.split('-')]
                    except ValueError:
                        raise Exception('ERROR: should be in format "chr3:10000-20000"')
                    start2, end2 = start2 + pos1, start2 + pos2
else:
start1, end1 = self.section_pos[focus.split(':')[0]]
if ':' in focus:
pos = focus.split(':')[1]
try:
pos1, pos2 = [int(p) // self.resolution
for p in pos.split('-')]
except ValueError:
                        raise Exception('ERROR: should be in format "chr3:10000-20000"')
start1, end1 = start1 + pos1, start1 + pos2
start2, end2 = start1, end1
else:
start1 = start2 = 0
end1 = end2 = siz
return start1, start2, end1, end2
    def find_compartments(self, crms=None, savefig=None, savedata=None,
savecorr=None, show=False, suffix='', ev_index=None,
rich_in_A=None, format='png', savedir=None,
max_ev=3, show_compartment_labels=False, **kwargs):
"""
Search for A/B compartments in each chromosome of the Hi-C matrix.
        The Hi-C matrix is normalized by the number of interactions expected at a
        given distance, and by visibility (one iteration of ICE). A correlation
        matrix is then calculated from this normalized matrix, and its first
        eigenvector is used to identify compartments. Changes in sign mark
        boundaries between compartments.
Result is stored as a dictionary of compartment boundaries, keys being
chromosome names.
:param 99 perc_zero: to filter bad columns
:param 0.05 signal_to_noise: to calculate expected interaction counts,
if not enough reads are observed at a given distance the observations
of the distance+1 are summed. a signal to noise ratio of < 0.05
corresponds to > 400 reads.
        :param None crms: only run the given list of chromosomes
:param None savefig: path to a directory to store matrices with
compartment predictions, one image per chromosome, stored under
'chromosome-name_EV1.png'.
:param png format: in which to save the figures.
:param False show: show the plot
:param None savedata: path to a new file to store compartment
predictions, one file only.
:param None savedir: path to a directory to store coordinates of each
eigenvector, one per chromosome. Each file contains one eigenvector
per column, the first one being the one used as reference. This
eigenvector is also rotated according to the prediction if a
`rich_in_A` array was given.
:param None savecorr: path to a directory where to save correlation
matrices of each chromosome
:param -1 vmin: for the color scale of the plotted map (use vmin='auto',
and vmax='auto' to color according to the absolute maximum found).
:param 1 vmax: for the color scale of the plotted map (use vmin='auto',
and vmax='auto' to color according to the absolute maximum found).
:param False yield_ev1: if True yields one list per chromosome with the
first eigenvector used to compute compartments.
:param '' suffix: to be placed after file names of compartment images
:param 3 max_ev: maximum number of EV to try
        :param None ev_index: a list of numbers referring to the index of the
eigenvector to be used. By default the first eigenvector is used.
WARNING: index starts at 1, default is thus a list of ones. Note:
if asking for only one chromosome the list should be only of one
element.
:param None rich_in_A: by default compartments are identified using mean
number of intra-interactions (A compartments are expected to have
less). However this measure is not very accurate. Using this
parameter a path to a BED or BED-Graph file with a list of genes or
active epigenetic marks can be passed, and used instead of the mean
interactions.
:param False show_compartment_labels: if True draw A and B compartment blocks.
TODO: this is really slow...
        Note: building the distance matrix using the amount of interactions
        instead of the mean correlation generally gives worse results.
:returns: 1- a dictionary with the N (max_ev) first
eigenvectors in the form:
{Chromosome_name: (Eigenvalue: [Eigenvector])}
           The sign of the eigenvectors is changed in order to match the
           prediction of A/B compartments (positive is A).
2- a dictionary of statistics of enrichment for A compartments
(Spearman rho).
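
        Usage sketch ('genes.bed' and the output paths are hypothetical)::

            firsts, richA_stats = hic.find_compartments(
                crms=['chr1'], rich_in_A='genes.bed',
                savedata='compartments.tsv', savedir='eigenvectors',
                verbose=True)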
"""
if not self.bads:
if kwargs.get('verbose', False):
                print('Filtering bad columns (percentage of zeros > %d)' %
                      kwargs.get('perc_zero', 99))
self.filter_columns(perc_zero=kwargs.get('perc_zero', 99),
by_mean=False, silent=True)
if len(self.bads) == len(self):
self.bads = {}
warn('WARNING: all columns would have been filtered out, '
'filtering disabled')
if not self.expected:
if kwargs.get('verbose', False):
print('Normalizing by expected values')
self.expected = expected(self, bads=self.bads, **kwargs)
if not self.bias:
if kwargs.get('verbose', False):
print('Normalizing by ICE (1 round)')
self.normalize_hic(iterations=0,
silent=not kwargs.get('verbose', False))
if savefig:
mkdir(savefig)
if savecorr:
mkdir(savecorr)
if savedir:
mkdir(savedir)
if suffix != '':
suffix = '_' + suffix
# parse bed file
if rich_in_A and isinstance(rich_in_A, basestring):
rich_in_A = parse_bed(rich_in_A, resolution=self.resolution)
cmprts = {}
firsts = {}
ev_nums = {}
count = 0
richA_stats = dict((sec, None) for sec in self.section_pos)
for sec in self.section_pos:
if crms and sec not in crms:
continue
if kwargs.get('verbose', False):
print('Processing chromosome', sec)
# get chromosomal matrix
try:
matrix = [[(float(self[i,j]) / self.expected[sec][abs(j-i)]
/ self.bias[i] / self.bias[j])
for i in range(*self.section_pos[sec])
if not i in self.bads]
for j in range(*self.section_pos[sec])
if not j in self.bads]
except KeyError:
if sec in self.expected and not self.expected[sec]:
matrix = []
else:
matrix = [[(float(self[i,j]) / self.expected[abs(j-i)]
/ self.bias[i] / self.bias[j])
for i in range(*self.section_pos[sec])
if not i in self.bads]
for j in range(*self.section_pos[sec])
if not j in self.bads]
if not matrix: # MT chromosome will fall there
warn('Chromosome %s is probably MT :)' % (sec))
cmprts[sec] = []
count += 1
continue
# enforce symmetry
for i in range(len(matrix)):
for j in range(i+1, len(matrix)):
matrix[i][j] = matrix[j][i]
# compute correlation coefficient
try:
matrix = [list(m) for m in corrcoef(matrix)]
except TypeError:
# very small chromosome?
warn('Chromosome %s is probably MT :)' % (sec))
cmprts[sec] = []
count += 1
continue
# replace nan in correlation matrix
matrix = [[0. if isnan(v) else v for v in l] for l in matrix]
# write correlation matrix to file. replaces filtered row/columns by NaN
if savecorr:
out = open(os.path.join(savecorr, '%s_corr-matrix%s.tsv' % (sec, suffix)),
'w')
start1, end1 = self.section_pos[sec]
out.write('# MASKED %s\n' % (' '.join([str(k - start1)
for k in list(self.bads.keys())
if start1 <= k <= end1])))
rownam = ['%s\t%d-%d' % (k[0],
k[1] * self.resolution,
(k[1] + 1) * self.resolution)
for k in sorted(self.sections,
key=lambda x: self.sections[x])
if k[0] == sec]
length = self.section_pos[sec][1] - self.section_pos[sec][0]
empty = 'NaN\t' * (length - 1) + 'NaN\n'
badrows = 0
for row, posx in enumerate(range(self.section_pos[sec][0],
self.section_pos[sec][1])):
if posx in self.bads:
out.write(rownam.pop(0) + '\t' + empty)
badrows += 1
continue
vals = []
badcols = 0
for col, posy in enumerate(range(self.section_pos[sec][0],
self.section_pos[sec][1])):
if posy in self.bads:
vals.append('NaN')
badcols += 1
continue
vals.append(str(matrix[row-badrows][col-badcols]))
out.write(rownam.pop(0) + '\t' +'\t'.join(vals) + '\n')
out.close()
# get eigenvectors
try:
                # eigsh is very fast when asked for only a few eigenvectors
evals, evect = eigsh(array(matrix),
k=max_ev if max_ev else (len(matrix) - 1))
except (LinAlgError, ValueError):
warn('Chromosome %s too small to compute PC1' % (sec))
cmprts[sec] = [] # Y chromosome, or so...
count += 1
continue
# define breakpoints, and store first EVs
n_first = [list(evect[:, -i])
for i in range(1, (max_ev + 1)
if max_ev else len(matrix))]
ev_num = (ev_index[count] - 1) if ev_index else 0
breaks = [i for i, (a, b) in
enumerate(zip(n_first[ev_num][1:], n_first[ev_num][:-1]))
if a * b < 0] + [len(n_first[ev_num]) - 1]
breaks = [{'start': breaks[i-1] + 1 if i else 0, 'end': b}
for i, b in enumerate(breaks)]
# rescale EVs, matrix and breaks by inserting NaNs in bad column places
beg, end = self.section_pos[sec]
bads = [k - beg for k in sorted(self.bads) if beg <= k <= end]
for evect in n_first:
_ = [evect.insert(b, float('nan')) for b in bads]
_ = [matrix.insert(b, [float('nan')] * len(matrix[0]))
for b in bads]
_ = [matrix[i].insert(b, float('nan'))
for b in bads for i in range(len(n_first[0]))]
for b in bads: # they are sorted
for brk in breaks:
if brk['start'] >= b:
brk['start'] += 1
brk['end' ] += 1
else:
brk['end' ] += brk['end'] > b
bads = set(bads)
# rescale first EV and change sign according to rich_in_A
richA_stats[sec] = None
sign = 1
if rich_in_A and sec in rich_in_A:
eves = []
gccs = []
for i, v in enumerate(n_first[ev_num]):
if i in bads:
continue
try:
gc = rich_in_A[sec][i]
except KeyError:
continue
gccs.append(gc)
eves.append(v)
r_stat, richA_pval = spearmanr(eves, gccs)
if kwargs.get('verbose', False):
print (' - Spearman correlation between "rich in A" and '
'Eigenvector:\n'
' rho: %.7f p-val:%.7f' % (r_stat, richA_pval))
richA_stats[sec] = r_stat
# switch sign and normalize
sign = 1 if r_stat > 0 else -1
for i in range(len(n_first)):
n_first[i] = [sign * v for v in n_first[i]]
# store it
ev_nums[sec] = ev_num + 1
cmprts[sec] = breaks
if rich_in_A:
for cmprt in cmprts[sec]:
try:
cmprt['dens'] = sum(rich_in_A.get(sec, {None: 0}).get(i, 0)
for i in range(cmprt['start'], cmprt['end'] + 1)
if not i in bads) / float(cmprt['end'] - cmprt['start'])
except ZeroDivisionError:
cmprt['dens'] = float('nan')
                    cmprt['type'] = 'A' if n_first[ev_num][cmprt['start']] > 0 else 'B'
firsts[sec] = (evals[::-1], n_first)
# needed for the plotting
if savefig or show:
vmin = kwargs.get('vmin', -1)
vmax = kwargs.get('vmax', 1)
if vmin == 'auto' == vmax:
vmax = max([abs(npperc(matrix, 99.5)),
abs(npperc(matrix, 0.5))])
vmin = -vmax
try:
if savefig:
fnam = os.path.join(savefig,
'%s_EV%d%s.%s' % (str(sec),
ev_nums[sec],
suffix,
format))
else:
fnam = None
plot_compartments(
sec, n_first[ev_num], cmprts, matrix, show, fnam,
vmin=vmin, vmax=vmax, whichpc=ev_num + 1,
showAB=show_compartment_labels)
except AttributeError:
warn(('WARNING: chromosome %s too small for plotting.'
'Skipping image creation.') % sec)
except ValueError:
warn(('WARNING: chromosome %s too small for plotting.'
'Skipping image creation.') % sec)
self.compartments = cmprts
if savedata:
self.write_compartments(savedata, chroms=list(self.compartments.keys()),
ev_nums=ev_nums)
if savedir:
ncrm = 0
for sec in self.section_pos:
if crms and sec not in crms:
continue
ev_file = open(os.path.join(
savedir, '%s_EigVect%d.tsv' % (
sec, ev_index[ncrm] if ev_index else 1)), 'w')
ev_file.write('# %s\n' % ('\t'.join(
'EV_%d (%.4f)' % (i, v)
for i, v in enumerate(firsts[sec][0], 1))))
ev_file.write('\n'.join(['\t'.join([str(v) for v in vs])
for vs in zip(*firsts[sec][1])]))
ev_file.close()
ncrm += 1
return firsts, richA_stats
    def find_compartments_beta(self, crms=None, savefig=None, savedata=None,
savecorr=None, show=False, suffix='', how='',
label_compartments='hmm', log=None, max_mean_size=10000,
ev_index=None, rich_in_A=None, max_ev=3,show_compartment_labels=False, **kwargs):
"""
Search for A/B compartments in each chromosome of the Hi-C matrix.
        The Hi-C matrix is normalized by the number of interactions expected at a
        given distance, and by visibility (one iteration of ICE). A correlation
        matrix is then calculated from this normalized matrix, and its first
        eigenvector is used to identify compartments. Changes in sign mark
        boundaries between compartments.
Result is stored as a dictionary of compartment boundaries, keys being
chromosome names.
:param 99 perc_zero: to filter bad columns
:param 0.05 signal_to_noise: to calculate expected interaction counts,
if not enough reads are observed at a given distance the observations
of the distance+1 are summed. a signal to noise ratio of < 0.05
corresponds to > 400 reads.
        :param None crms: only run the given list of chromosomes
:param None savefig: path to a directory to store matrices with
compartment predictions, one image per chromosome, stored under
'chromosome-name.png'.
:param False show: show the plot
:param None savedata: path to a new file to store compartment
predictions, one file only.
:param None savecorr: path to a directory where to save correlation
matrices of each chromosome
:param -1 vmin: for the color scale of the plotted map (use vmin='auto',
and vmax='auto' to color according to the absolute maximum found).
:param 1 vmax: for the color scale of the plotted map (use vmin='auto',
and vmax='auto' to color according to the absolute maximum found).
:param False yield_ev1: if True yields one list per chromosome with the
first eigenvector used to compute compartments.
:param '' suffix: to be placed after file names of compartment images
:param 3 max_ev: maximum number of EV to try
        :param None ev_index: a list of numbers referring to the index of the
eigenvector to be used. By default the first eigenvector is used.
WARNING: index starts at 1, default is thus a list of ones. Note:
if asking for only one chromosome the list should be only of one
element.
:param None rich_in_A: by default compartments are identified using mean
number of intra-interactions (A compartments are expected to have
less). However this measure is not very accurate. Using this
parameter a path to a BED or BED-Graph file with a list of genes or
active epigenetic marks can be passed, and used instead of the mean
interactions.
:param None log: path to a folder where to save log of the assignment
of A/B compartments
:param hmm label_compartments: label compartments into A/B categories,
otherwise just find borders (faster). Can be either hmm (default), or
cluster.
        :param 'ratio' how: 'ratio' divides by column, 'subratio' divides by
           compartment, 'diagonal' uses only the diagonal
        :param False show_compartment_labels: if True draw A and B compartment blocks.
TODO: this is really slow...
        Note: building the distance matrix using the amount of interactions
        instead of the mean correlation generally gives worse results.
        :returns: a dictionary with the N (max_ev) first eigenvectors used to define
compartment borders for each chromosome (keys are chromosome names)
"""
if not self.bads:
if kwargs.get('verbose', False):
                print('Filtering bad columns (percentage of zeros > %d)' %
                      kwargs.get('perc_zero', 99))
self.filter_columns(perc_zero=kwargs.get('perc_zero', 99),
by_mean=False, silent=True)
if len(self.bads) == len(self):
self.bads = {}
warn('WARNING: all columns would have been filtered out, '
'filtering disabled')
if not self.expected:
if kwargs.get('verbose', False):
print('Normalizing by expected values')
self.expected = expected(self, bads=self.bads, **kwargs)
if not self.bias:
if kwargs.get('verbose', False):
print('Normalizing by ICE (1 round)')
self.normalize_hic(iterations=0,
silent=not kwargs.get('verbose', False))
if savefig:
mkdir(savefig)
if savecorr:
mkdir(savecorr)
if suffix != '':
suffix = '_' + suffix
# parse bed file
if rich_in_A:
rich_in_A = parse_bed(rich_in_A, resolution=self.resolution)
cmprts = {}
firsts = {}
ev_nums = {}
count = 0
for sec in self.section_pos:
if crms and sec not in crms:
continue
if kwargs.get('verbose', False):
print('Processing chromosome', sec)
matrix = [[(float(self[i,j]) / self.expected[abs(j-i)]
/ self.bias[i] / self.bias[j])
for i in range(*self.section_pos[sec])
if not i in self.bads]
for j in range(*self.section_pos[sec])
if not j in self.bads]
if not matrix: # MT chromosome will fall there
warn('Chromosome %s is probably MT :)' % (sec))
cmprts[sec] = []
count += 1
continue
for i in range(len(matrix)):
for j in range(i+1, len(matrix)):
matrix[i][j] = matrix[j][i]
try:
matrix = [list(m) for m in corrcoef(matrix)]
except TypeError:
# very small chromosome?
warn('Chromosome %s is probably MT :)' % (sec))
cmprts[sec] = []
count += 1
continue
# write correlation matrix to file. replaces filtered row/columns by NaN
if savecorr:
out = open(os.path.join(savecorr, '%s_corr-matrix.tsv' % (sec)),
'w')
start1, end1 = self.section_pos[sec]
out.write('# MASKED %s\n' % (' '.join([str(k - start1)
for k in list(self.bads.keys())
if start1 <= k <= end1])))
rownam = ['%s\t%d-%d' % (k[0],
k[1] * self.resolution,
(k[1] + 1) * self.resolution)
for k in sorted(self.sections,
key=lambda x: self.sections[x])
if k[0] == sec]
length = self.section_pos[sec][1] - self.section_pos[sec][0]
empty = 'NaN\t' * (length - 1) + 'NaN\n'
badrows = 0
for row, posx in enumerate(range(self.section_pos[sec][0],
self.section_pos[sec][1])):
if posx in self.bads:
out.write(rownam.pop(0) + '\t' + empty)
badrows += 1
continue
vals = []
badcols = 0
for col, posy in enumerate(range(self.section_pos[sec][0],
self.section_pos[sec][1])):
if posy in self.bads:
vals.append('NaN')
badcols += 1
continue
vals.append(str(matrix[row-badrows][col-badcols]))
out.write(rownam.pop(0) + '\t' +'\t'.join(vals) + '\n')
out.close()
try:
                # eigsh is very fast when asked for only a few eigenvectors
_, evect = eigsh(array(matrix), k=max_ev)
except (LinAlgError, ValueError):
warn('Chromosome %s too small to compute PC1' % (sec))
cmprts[sec] = [] # Y chromosome, or so...
count += 1
continue
index = ev_index[count] if ev_index else 1
n_first = [list(evect[:, -i]) for i in range(1, max_ev + 1)]
for ev_num in range(index, max_ev + 1):
first = list(evect[:, -ev_num])
breaks = [i for i, (a, b) in
enumerate(zip(first[1:], first[:-1]))
if a * b < 0] + [len(first) - 1]
breaks = [{'start': breaks[i-1] + 1 if i else 0, 'end': b}
for i, b in enumerate(breaks)]
if (self.resolution * (len(breaks) - 1.0) / len(matrix)
> max_mean_size):
warn('WARNING: number of compartments found with the '
'EigenVector number %d is too low (%d compartments '
'in %d rows), for chromosome %s' % (
ev_num, len(breaks), len(matrix), sec))
else:
break
if (self.resolution * (len(breaks) - 1.0) / len(matrix)
> max_mean_size):
warn('WARNING: keeping first eigenvector, for chromosome %s' % (
sec))
ev_num = 1
if ev_index:
ev_num = ev_index[count]
first = list(evect[:, -ev_num])
breaks = [i for i, (a, b) in
enumerate(zip(first[1:], first[:-1]))
if a * b < 0] + [len(first) - 1]
breaks = [{'start': breaks[i-1] + 1 if i else 0, 'end': b}
for i, b in enumerate(breaks)]
ev_nums[sec] = ev_num
beg, end = self.section_pos[sec]
bads = [k - beg for k in sorted(self.bads) if beg <= k <= end]
for evect in n_first:
_ = [evect.insert(b, float('nan')) for b in bads]
_ = [first.insert(b, 0) for b in bads]
_ = [matrix.insert(b, [float('nan')] * len(matrix[0]))
for b in bads]
_ = [matrix[i].insert(b, float('nan'))
for b in bads for i in range(len(first))]
breaks = [i for i, (a, b) in
enumerate(zip(first[1:], first[:-1]))
if a * b < 0] + [len(first) - 1]
breaks = [{'start': breaks[i-1] + 1 if i else 0, 'end': b}
for i, b in enumerate(breaks)]
cmprts[sec] = breaks
firsts[sec] = n_first
# needed for the plotting
self._apply_metric(cmprts, sec, rich_in_A, how=how)
if label_compartments == 'cluster':
if log:
logf = os.path.join(log, sec + suffix + '.log')
else:
logf = None
gammas = {}
for n_clust in range(2, 4):
for gamma in range(0, 101, 1):
scorett, tt, prop = _cluster_ab_compartments(
float(gamma)/100, matrix, breaks, cmprts[sec],
rich_in_A, ev_num=ev_num, log=logf, save=False,
verbose=kwargs.get('verbose', False),
n_clust=n_clust)
gammas[gamma] = scorett, tt, prop
gamma = min(list(gammas.keys()), key=lambda k: gammas[k][0])
if gammas[gamma][0] - gammas[gamma][1] > 7:
                        print(' WARNING: minimum showing very low '
                              'intermingling of A/B compartments, trying '
                              'with 3 clusters, for chromosome %s' % sec)
gammas = {}
continue
if kwargs.get('verbose', False):
print(' ====> minimum:', gamma)
break
_ = _cluster_ab_compartments(float(gamma)/100, matrix, breaks,
cmprts[sec], rich_in_A, save=True,
log=logf, ev_num=ev_num, n_clust=n_clust)
if savefig or show:
vmin = kwargs.get('vmin', -1)
vmax = kwargs.get('vmax', 1)
if vmin == 'auto' == vmax:
vmax = max([abs(npperc(matrix, 99.5)),
abs(npperc(matrix, 0.5))])
vmin = -vmax
plot_compartments(
sec, first, cmprts, matrix, show,
savefig + '/chr' + str(sec) + suffix + '.png' if savefig else None,
vmin=vmin, vmax=vmax, whichpc=ev_num,showAB=show_compartment_labels)
if label_compartments == 'cluster' or label_compartments == 'hmm':
plot_compartments_summary(
sec, cmprts, show,
savefig + '/chr' + str(sec) + suffix + '_summ.png' if savefig else None)
count += 1
if label_compartments == 'hmm':
x = {}
for sec in self.section_pos:
beg, end = self.section_pos[sec]
bads = [k - beg for k in self.bads if beg <= k <= end]
try:
x[sec] = [j for i, j in enumerate(firsts[sec][ev_nums[sec] - 1])
if not i in bads]
except KeyError:
continue
# train two HMMs on the genomic data:
# - one with 2 states A B
# - one with 3 states A B I
# - one with 4 states A a B b
# - one with 5 states A a B b I
models = {}
for n in range(2, 6):
if kwargs.get('verbose', False):
print ('Training HMM for %d categories of '
'compartments' % n)
models[n] = _training(x, n, kwargs.get('verbose', False))
# apply HMM models on each chromosome
results = {}
for sec in self.section_pos:
if not sec in x:
continue
beg, end = self.section_pos[sec]
bads = [k - beg for k in self.bads if beg <= k <= end]
if kwargs.get('verbose', False):
print('Chromosome', sec)
# print 'CMPRTS before ', sec, cmprts[sec]
n_states, breaks = _hmm_refine_compartments(
x[sec], models, bads, kwargs.get('verbose', False))
results[sec] = n_states, breaks
cmprts[sec] = breaks
# print 'CMPRTS after hmm', sec, cmprts[sec]
self._apply_metric(cmprts, sec, rich_in_A, how=how)
if rich_in_A:
test = lambda x: x >= 1
else:
test = lambda x: x < 1
max_type = nanmax([c['type'] for c in cmprts[sec]])
# find which category of compartment has the highest "density"
atyp = 0.
alen = 0.
btyp = 0.
blen = 0.
max_type = nanmax([c['type'] for c in cmprts[sec]])
for typ in range(5):
subset = set([i for i, c in enumerate(cmprts[sec])
if c['type'] == typ])
dens = sum(cmprts[sec][c]['dens'] * (cmprts[sec][c]['end'] - cmprts[sec][c]['start']) for c in subset)
leng = sum((cmprts[sec][c]['end'] - cmprts[sec][c]['start'])**2 / 2. for c in subset)
# leng = sum(1 for c in subset)
val = float(dens) / leng if leng else 0.
#if typ == 0:
if typ < max_type / 2.:
alen += leng
atyp += val * leng
# elif typ == max_type:
elif typ > max_type / 2.:
blen += leng
btyp += val * leng
for i, comp in enumerate(cmprts[sec]):
if comp['type'] < max_type / 2.:
# if mean density of compartments of type 0 is higher than 1
# than label them as 'B', otherwise, as 'A'
if comp['type'] == 0:
comp['type'] = 'A' if test(val) else 'B'
else:
comp['type'] = 'a' if test(val) else 'b'
elif comp['type'] > max_type / 2.:
if comp['type'] == max_type:
comp['type'] = 'B' if test(val) else 'A'
else:
comp['type'] = 'b' if test(val) else 'a'
elif isnan(comp['type']):
comp['type'] = 'NA'
else:
comp['type'] = 'I'
self.compartments = cmprts
if savedata:
self.write_compartments(savedata, chroms=list(self.compartments.keys()),
ev_nums=ev_nums)
return firsts
def _apply_metric(self, cmprts, sec, rich_in_A, how='ratio'):
"""
calculate compartment internal density if no rich_in_A, otherwise
sum this list
"""
# print 'SEGMENTS'
# print sec, self.section_pos[sec]
# for i in range(0, len(cmprts[sec]), 20):
# print ' ' + ''.join(['%5d/%-5d'% (s['start'], s['end']) for s in cmprts[sec][i:i+20]])
# print 'CHROMOSOME', sec
for cmprt in cmprts[sec]:
if rich_in_A:
beg1, end1 = cmprt['start'], cmprt['end'] + 1
sec_matrix = [rich_in_A.get(sec, {None: 0}).get(i, 0)
for i in range(beg1, end1)
if not i in self.bads]
try:
cmprt['dens'] = float(sum(sec_matrix)) / len(sec_matrix)
except ZeroDivisionError:
cmprt['dens'] = 0.
else:
beg, end = self.section_pos[sec]
beg1, end1 = cmprt['start'] + beg, cmprt['end'] + beg + 1
# print 'BEG:%7d, END:%7d, LEN bias:%7d, LEN self:%7d, LEN expected:%7d' % (beg1, end1, len(self.bias),
# len(self), len(self.expected))
if 'diagonal' in how:
sec_matrix = [(self[i,i] / self.expected[0] / self.bias[i]**2)
for i in range(beg1, end1) if not i in self.bads]
else: #if 'compartment' in how:
sec_matrix = [(self[i,j] / self.expected[abs(j-i)]
/ self.bias[i] / self.bias[j])
for i in range(beg1, end1) if not i in self.bads
for j in range(beg1, end1) if not j in self.bads]
if '/compartment' in how: # diagonal / compartment
sec_column = [(self[i,j] / self.expected[abs(j-i)]
/ self.bias[i] / self.bias[j])
for i in range(beg1, end1) if not i in self.bads
for j in range(beg1, end1) if not j in self.bads]
elif '/column' in how:
sec_column = [(self[i,j] / self.expected[abs(j-i)]
/ self.bias[i] / self.bias[j])
for i in range(beg1, end1) if not i in self.bads
for j in range(beg, end)
if not j in self.bads]
else:
sec_column = [1.]
try:
if 'type' in cmprt and isnan(cmprt['type']):
cmprt['dens'] = 1.
else:
cmprt['dens'] = float(sum(sec_matrix)) / sum(sec_column)
except ZeroDivisionError:
cmprt['dens'] = 1.
# normalize to 1.0
try:
            if 'type' in cmprt:  # hmm already run and the types are defined
meanh = (sum(cmprt['dens'] for cmprt in cmprts[sec]
if not isnan(cmprt['type'])) /
sum(1 for cmprt in cmprts[sec]
if not isnan(cmprt['type'])))
else:
meanh = (sum(cmprt['dens'] for cmprt in cmprts[sec]) /
sum(1 for cmprt in cmprts[sec]))
except ZeroDivisionError:
meanh = 1.
for cmprt in cmprts[sec]:
try:
if 'type' in cmprt and isnan(cmprt['type']):
cmprt['dens'] = 1.0
else:
cmprt['dens'] /= meanh
except ZeroDivisionError:
cmprt['dens'] = 1.
    def write_compartments(self, savedata, chroms=None, ev_nums=None):
"""
Write compartments to a file.
:param savedata: path to a file.
:param None chroms: write only the given list of chromosomes (default
all chromosomes are written, note that the first column corresponding
to chromosome name will disappear in non default case)
"""
out = open(savedata, 'w')
sections = chroms if chroms else list(self.compartments.keys())
if ev_nums:
for sec in sections:
try:
out.write('## CHR %s\tEigenvector: %d\n' % (sec, ev_nums[sec]))
except KeyError:
continue
out.write('#%sstart\tend\trich in A\ttype\n'% (
'CHR\t' if len(sections) > 1 else '\t'))
for sec in sections:
for c in self.compartments[sec]:
out.write('%s%d\t%d\t%.2f\t%s\n' % (
(str(sec) + '\t') if sections else '\t',
c['start'] + 1, c['end'] + 1,
c.get('dens', float('nan')), c.get('type', '')))
out.close()
    def yield_matrix(self, focus=None, diagonal=True, normalized=False):
"""
Yields a matrix line by line.
Bad row/columns are returned as null row/columns.
:param None focus: a tuple with the (start, end) position of the desired
window of data (start, starting at 1, and both start and end are
inclusive). Alternatively a chromosome name can be input or a tuple
of chromosome name, in order to retrieve a specific inter-chromosomal
region
:param True diagonal: if False, diagonal is replaced by zeroes
:param False normalized: get normalized data
:yields: matrix line by line (a line being a list of values)
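
        Usage sketch (prints one tab-separated line per bin of an assumed
        'chr1'; normalized output assumes normalize_hic was run)::

            for line in hic.yield_matrix(focus='chr1', normalized=True):
                print('\t'.join(str(v) for v in line))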
"""
siz = len(self)
if normalized and not self.bias:
raise Exception('ERROR: experiment not normalized yet')
if focus:
if isinstance(focus, tuple) and isinstance(focus[0], int):
if len(focus) == 2:
start1, end1 = focus
start2, end2 = focus
start1 -= 1
start2 -= 1
else:
start1, end1, start2, end2 = focus
start1 -= 1
start2 -= 1
elif isinstance(focus, tuple) and isinstance(focus[0], basestring):
start1, end1 = self.section_pos[focus[0]]
start2, end2 = self.section_pos[focus[1]]
else:
start1, end1 = self.section_pos[focus]
start2, end2 = self.section_pos[focus]
else:
start1 = start2 = 0
end1 = end2 = siz
if normalized:
for i in range(start2, end2):
# if bad column:
if i in self.bads:
yield [0.0 for j in range(start1, end1)]
# if we want the diagonal, or we don't but are looking at a
# region that is not symmetric
elif diagonal or start1 != start2:
yield [self[i, j] / self.bias[i] / self.bias[j]
for j in range(start1, end1)]
# diagonal replaced by zeroes
else:
yield ([self[i, j] / self.bias[i] / self.bias[j]
for j in range(start1, i)] +
[0.0] +
[self[i, j] / self.bias[i] / self.bias[j]
for j in range(i + 1, end1)])
else:
for i in range(start2, end2):
# if bad column:
if i in self.bads:
yield [0 for j in range(start1, end1)]
# if we want the diagonal, or we don't but are looking at a
# region that is not symmetric
elif diagonal or start1 != start2:
yield [self[i, j] for j in range(start1, end1)]
# diagonal replaced by zeroes
else:
yield ([self[i, j] for j in range(start1, i)] +
[0] +
[self[i, j] for j in range(i + 1, end1)])
def _hmm_refine_compartments(xsec, models, bads, verbose):
prevll = float('-inf')
prevdf = 0
results = {}
for n in range(2, 6):
E, pi, T = models[n]
probs = gaussian_prob(xsec, E)
pathm, llm = best_path(probs, pi, T)
pathm = asarray(list(map(float, pathm)))
df = n**2 - n + n * 2 + n - 1
len_seq = len(pathm)
lrt = gammaincc((df - prevdf) / 2., (llm - prevll) / 2.)
bic = -2 * llm + df * nplog(len_seq)
aic = 2 * df - 2 * llm
if verbose:
print('Ll for %d states (%d df): %4.0f AIC: %4.0f BIC: %4.0f LRT=%f'% (
n, df, llm, aic, bic, lrt))
prevdf = df
prevll = llm
results[n] = {'AIC': aic,
'BIC': bic,
'LRT': lrt,
'PATH': pathm}
n_states = min(results, key=lambda x: results[x]['BIC'])
results = list(results[n_states]['PATH'])
# print 'RESULTS', results
_ = [results.insert(b, float('nan')) for b in sorted(bads)]
# print 'RESULTS', results
breaks = [(i, b) for i, (a, b) in
enumerate(zip(results[1:], results[:-1]))
if str(a) != str(b)] + [len(results) - 1]
# print 'BREAKS', breaks
breaks[-1] = (breaks[-1], results[-1])
# print 'BREAKS', breaks
breaks = [{'start': breaks[i-1][0] + 1 if i else 0, 'end': b,
'type': a}
for i, (b, a) in enumerate(breaks)]
# print 'BREAKS', breaks
return n_states, breaks
def _training(x, n, verbose):
"""
define default emission transition and initial states, and train the hmm
"""
pi = [0.5 - ((n - 2) * 0.05)**2 if i == 0 or i == n - 1 else ((n - 2)*0.05)**2*2 / (n - 2) for i in range(n)]
T = [[0.9 if i==j else 0.1/(n-1) for i in range(n)] for j in range(n)]
E = asarray(list(zip(linspace(-1, 1, n), [1./n for _ in range(n)])))
# normalize values of the first eigenvector
for c in x:
this_mean = mean(x[c])
this_std = std (x[c])
x[c] = [v - this_mean for v in x[c]]
x[c] = [v / this_std for v in x[c]]
train(pi, T, E, list(x.values()), verbose=verbose, threshold=1e-6, n_iter=1000)
return E, pi, T
def _cluster_ab_compartments(gamma, matrix, breaks, cmprtsec, rich_in_A, save=True,
ev_num=1, log=None, verbose=False, savefig=None,
n_clust=2):
# function to convert correlation into distances
gamma += 1
func = lambda x: -abs(x)**gamma / x
funczero = lambda x: 0.0
# calculate distance_matrix
dist_matrix = [[0 for _ in range(len(breaks))]
for _ in range(len(breaks))]
scores = {}
for k, cmprt in enumerate(cmprtsec):
beg1, end1 = cmprt['start'], cmprt['end'] + 1
diff1 = end1 - beg1
scores[(k,k)] = dist_matrix[k][k] = -1
for l in range(k + 1, len(cmprtsec)):
beg2, end2 = cmprtsec[l]['start'], cmprtsec[l]['end'] + 1
val = nansum([matrix[i][j] for i in range(beg1, end1)
for j in range(beg2, end2)]) / (end2 - beg2) / diff1
try:
scores[(k,l)] = dist_matrix[k][l] = scores[(l,k)] = dist_matrix[l][k] = func(val)
except ZeroDivisionError:
scores[(k,l)] = dist_matrix[k][l] = scores[(l,k)] = dist_matrix[l][k] = funczero(val)
if isnan(scores[(k,l)]):
scores[(k,l)] = dist_matrix[k][l] = scores[(l,k)] = dist_matrix[l][k] = funczero(0)
# cluster compartments according to their correlation score
try:
clust = linkage(dist_matrix, method='ward')
except UnboundLocalError:
warn('WARNING: Chromosome probably too small. Skipping')
return (float('inf'), float('inf'), float('inf'))
# find best place to divide dendrogram (only check 1, 2, 3 or 4 clusters)
solutions = {}
for k in clust[:,2][-3:]:
clusters = {}
_ = [clusters.setdefault(j, []).append(i) for i, j in
enumerate(fcluster(clust, k, criterion='distance'))]
solutions[k] = {'out': clusters}
solutions[k]['score'] = calinski_harabasz(scores, clusters)
# plot
if savefig:
xedges = [b['start'] for b in breaks]
yedges = [b['start'] for b in breaks]
xedges += [breaks[-1]['end']]
yedges += [breaks[-1]['end']]
X, Y = meshgrid(xedges, yedges)
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(10,10))
_ = fig.add_axes([0.09,0.1,0.2,0.6])
Z1 = dendrogram(clust, orientation='left')
idx1 = Z1['leaves']
idx2 = Z1['leaves']
D = asarray(dist_matrix)[idx1,:]
D = D[:,idx2]
axmatrix = fig.add_axes([0.3,0.1,0.6,0.6])
m = axmatrix.pcolormesh(X, Y, D)
axmatrix.set_aspect('equal')
axmatrix.set_yticks([])
axmatrix.set_xlim((0, breaks[-1]['end']))
axmatrix.set_ylim((0, breaks[-1]['end']))
plt.colorbar(m)
plt.savefig(savefig)
try:
# take best cluster according to calinski_harabasz score
clusters = [solutions[s] for s in sorted(
solutions, key=lambda x: solutions[x]['score'])
if solutions[s]['score']>0][1 - n_clust]['out']
except IndexError:
# warn('WARNING1: compartment clustering is not clear. Skipping')
return (float('inf'), float('inf'), float('inf'))
if len(clusters) != n_clust:
# warn('WARNING2: compartment clustering is too clear. Skipping')
return (float('inf'), float('inf'), float('inf'))
# labeling compartments. A compartments shall have lower
# mean intra-interactions
dens = {}
if rich_in_A:
test = lambda x: x >= 1
else:
test = lambda x: x < 1
for k in clusters:
val = sum([cmprtsec[c]['dens']
for c in clusters[k]]) / len(clusters[k])
dens['A' if test(val) else 'B'] = [
cmprtsec[c]['dens'] for c in clusters[k]
if cmprtsec[c]['end'] + 1 - cmprtsec[c]['start'] > 2]
for c in clusters[k]:
cmprtsec[c]['type'] = 'A' if test(val) else 'B'
try:
tt, pval = ttest_ind(dens['A'], dens['B'])
except ZeroDivisionError:
return (float('inf'), float('inf'), float('inf'))
prop = float(len(dens['A'])) / (len(dens['A']) + len(dens['B']))
# to avoid having all A or all B
# score = 5000 * (prop - 0.5)**4 - 2
# to avoid having consecutive As or Bs
score = 0.
prev = None
for cmprt in cmprtsec:
if cmprt.get('type', None) == prev:
score += 1.
prev = cmprt.get('type', prev)
score /= len(cmprtsec)
score = exp(10 * (score - 0.4)) # 5000 * (score - 0.5)**4 - 2
# score = score1 + score2
if verbose:
print ('[EV%d CL%s] g:%5s prop:%5s%% tt:%7s '
'score-interleave:%5s ' # score-proportion:%7s
'final: %7s pv:%7s' % (
ev_num, n_clust, gamma - 1, round(prop * 100, 1),
round(tt, 3), round(score, 3), #round(score2, 3),
round(score + tt, 3), round(pval, 5)))
if log:
log = open(log, 'a')
log.write('[EV%d CL%s] g:%5s prop:%5s%% tt:%6s '
'score-interleave:%6s ' # score-proportion:%7s
'final: %7s pv:%s\n' % (
ev_num, n_clust, gamma - 1, round(prop * 100, 1),
round(tt, 3), round(score, 3), # round(score2, 3),
round(score + tt, 3), round(pval, 4)))
log.close()
if not save:
for cmprt in cmprtsec:
if 'type' in cmprt:
cmprt['type'] = None
return score + tt, tt, prop