# Source code for pytadbit.experiment

"""
20 Feb 2013


"""
from __future__ import print_function

import numpy as np
from copy                                import deepcopy as copy
from sys                                 import stderr
from io                                  import IOBase
from warnings                            import warn
from math                                import isnan
from numpy                               import log2, array
from pytadbit.modelling.IMP_CONFIG       import CONFIG
from pytadbit                            import HiC_data
from pytadbit.parsers.hic_parser         import read_matrix
from pytadbit.utils.extraviews           import nicer
from pytadbit.utils.extraviews           import tadbit_savefig
from pytadbit.utils.tadmaths             import zscore
from pytadbit.utils.normalize_hic        import iterative
from pytadbit.utils.hic_filtering        import hic_filtering_for_modelling
from pytadbit.parsers.tad_parser         import parse_tads
from pytadbit.modelling.structuralmodels import StructuralModels

try:
    from pytadbit.modelling.impoptimizer  import IMPoptimizer
    from pytadbit.modelling.imp_modelling import generate_3d_models
except ImportError:
    pass
    # stderr.write('IMP not found, check PYTHONPATH\n')

try:
    import matplotlib.pyplot as plt
    from matplotlib.cm import jet
except ImportError:
    stderr.write('matplotlib not found\n')

# Python 2/3 compatibility: the ``file`` builtin is looked up first and, when
# that name is undefined (NameError), only io.IOBase is kept as file type.
try:
    file_types = file, IOBase
except NameError:
    file_types = (IOBase,)

# Python 2/3 compatibility: when the ``basestring`` name is undefined
# (NameError) it is aliased to ``str``.
try:
    basestring
except NameError:
    basestring = str

def load_experiment_from_reads(name, fnam, genome_seq, resolution,
                               conditions=None, identifier=None, cell_type=None,
                               enzyme=None, exp_type='Hi-C', **kw_descr):
    """
    Loads an experiment object from TADbit-generated read files, that are lists
    of pairs of reads mapped to a reference genome.

    :param name: name of the experiment
    :param fnam: tsv file with reads1 and reads2; each line is split on tabs
       and columns 2-3 and 8-9 are read as the chromosome name and position of
       each read-end
    :param genome_seq: mapping from chromosome name to an object supporting
       ``len()`` (only the length of each chromosome is used here)
    :param resolution: the resolution of the experiment (size of a bin in
       bases)
    :param None identifier: some identifier relative to the Hi-C data
    :param None cell_type: cell type on which the experiment was done
    :param None enzyme: restriction enzyme used in  the Hi-C experiment
    :param Hi-C exp_type: name of the experiment used (currently only Hi-C is
       supported)
    :param None conditions: :py:func:`list` of experimental conditions, e.g.
       the cell type, the enzyme... (i.e.: ['HindIII', 'cortex', 'treatment']).
       This parameter may be used to compare the effect of this conditions on
       the TADs
    :param None kw_descr: any other argument passed would be stored as
       complementary descriptive field. For example::

           exp  = Experiment('k562_rep2', resolution=100000,
                             identifier='SRX015263', cell_type='K562',
                             enzyme='HindIII', cycle='synchronized')
           print(exp)

           # Experiment k562_rep2:
           #    resolution        : 100Kb
           #    TADs              : None
           #    Hi-C rows         : None
           #    normalized        : None
           #    identifier        : SRX015263
           #    cell type         : K562
           #    restriction enzyme: HindIII
           #    cycle             : synchronized

       *note that these fields may appear in the header of generated out files*

    :returns: an :class:`Experiment` holding the symmetric matrix of read
       counts
    """
    # Each chromosome contributes ``len // resolution + 1`` bins plus one
    # extra trailing bin; this layout is kept as-is so that bin indexes do not
    # change.
    sections = []
    for crm in genome_seq:
        nbins = int(float(len(genome_seq[crm])) // resolution + 1) + 1
        sections.extend((crm, '%04d' % i) for i in range(nbins))
    size = len(sections)
    # (chromosome, zero-padded bin number) -> index in the genome-wide matrix
    dict_sec = {section: index for index, section in enumerate(sections)}
    imx = HiC_data((), size)
    # context manager: the reads file is closed even if a line fails to parse
    with open(fnam) as reads:
        for line in reads:
            _, cr1, ps1, _, _, _, _, cr2, ps2, _ = line.split('\t', 9)
            ps1 = dict_sec[(cr1, '%04d' % (int(ps1) // resolution))]
            ps2 = dict_sec[(cr2, '%04d' % (int(ps2) // resolution))]
            # count the pair in both triangles of the matrix
            imx[ps1 + ps2 * size] += 1
            imx[ps2 + ps1 * size] += 1

    return Experiment(name, resolution=resolution, hic_data=imx,
                      conditions=conditions, identifier=identifier,
                      cell_type=cell_type, enzyme=enzyme, exp_type=exp_type,
                      **kw_descr)

class Experiment(object):
    """
    Hi-C experiment.

    :param name: name of the experiment
    :param resolution: the resolution of the experiment (size of a bin in
       bases)
    :param None identifier: some identifier relative to the Hi-C data
    :param None cell_type: cell type on which the experiment was done
    :param None enzyme: restriction enzyme used in  the Hi-C experiment
    :param Hi-C exp_type: name of the experiment used (currently only Hi-C is
       supported)
    :param None hic_data: whether a file or a list of lists corresponding to
       the Hi-C data
    :param None tad_def: a file or a dict with precomputed TADs for this
       experiment
    :param None parser: a parser function that returns a tuple of lists
       representing the data matrix and the length of a row/column. With
       the file example.tsv:

       ::

         chrT_001    chrT_002    chrT_003    chrT_004
         chrT_001    629    164    88    105
         chrT_002    164    612    175    110
         chrT_003    88    175    437    100
         chrT_004    105    110    100    278

       the output of parser('example.tsv') would be:
       ``[([629, 164, 88, 105, 164, 612, 175, 110, 88, 175, 437, 100, 105,
       110, 100, 278]), 4]``
    :param None conditions: :py:func:`list` of experimental conditions, e.g.
       the cell type, the enzyme... (i.e.: ['HindIII', 'cortex', 'treatment']).
       This parameter may be used to compare the effect of this conditions on
       the TADs
    :param True filter_columns: filter the columns with unexpectedly high
       content of low values
    :param None kw_descr: any other argument passed would be stored as
       complementary descriptive field. For example::

           exp  = Experiment('k562_rep2', resolution=100000,
                             identifier='SRX015263', cell_type='K562',
                             enzyme='HindIII', cylce='synchronized')
           print exp

           # Experiment k562_rep2:
           #    resolution        : 100Kb
           #    TADs              : None
           #    Hi-C rows         : None
           #    normalized        : None
           #    identifier        : SRX015263
           #    cell type         : K562
           #    restriction enzyme: HindIII
           #    cylce             : synchronized

       *note that these fields may appear in the header of generated out files*

    TODO: doc conditions
    TODO: normalization
    """


    def __init__(self, name, resolution, hic_data=None, norm_data=None,
                 tad_def=None, parser=None, no_warn=False, weights=None,
                 conditions=None, identifier=None,
                 cell_type=None, enzyme=None, exp_type='Hi-C', **kw_descr):
        self.name            = name
        self.resolution      = resolution
        self.identifier      = identifier
        self.cell_type       = cell_type
        self.enzyme          = enzyme
        self.description     = kw_descr
        self.exp_type        = exp_type
        self.crm             = None
        self._ori_resolution = resolution
        self.hic_data        = None
        self._ori_hic        = None
        self._ori_norm       = None
        self._ori_size       = None
        self.conditions      = sorted(conditions) if conditions else []
        self.size            = None
        self.tads            = {}
        self.norm            = None
        self._normalization  = None
        self._filtered_cols  = False
        self._zeros          = {}
        self._zscores        = {}
        if hic_data:
            self.load_hic_data(hic_data, parser, **kw_descr)
        if norm_data:
            self.load_norm_data(norm_data, parser, **kw_descr)
        if tad_def:
            self.load_tad_def(tad_def, weights=weights)
        elif not hic_data and not no_warn and not norm_data:
            stderr.write('WARNING: this is an empty shell, no data here.\n')


    def __repr__(self):
        """
        One-line summary: name, resolution, number of TADs, number of rows and
        normalization label.
        """
        template = ('Experiment %s (resolution: %s, TADs: %s, Hi-C rows: %s, '
                    'normalized: %s)')
        n_tads = len(self.tads) or None
        label = self._normalization or 'None'
        return template % (self.name, nicer(self.resolution), n_tads,
                           self.size, label)


    def __str__(self):
        """
        Multi-line description of the experiment, one 'field: value' per line.
        """
        rows = [
            'Experiment %s:\n' % (self.name),
            '   resolution        : %s\n' % (nicer(self.resolution)),
            '   TADs              : %s\n' % (len(self.tads) or None),
            '   Hi-C rows         : %s\n' % (self.size),
            '   normalized        : %s\n' % (self._normalization or None),
        ]
        unknown = 'UNKNOWN'
        # new in version post-CSDM13: these attributes may be missing, in
        # which case the lines gathered so far are returned
        try:
            rows.append('   identifier        : %s\n' %
                        (self.identifier or unknown))
            rows.append('   cell type         : %s\n' %
                        (self.cell_type  or unknown))
            rows.append('   restriction enzyme: %s\n' %
                        (self.enzyme     or unknown))
            for field in self.description:
                rows.append('   %-18s: %s\n' % (field,
                                                self.description[field]))
        except AttributeError:
            pass
        return ''.join(rows)


    def __add__(self, other, silent=False):
        """
        sum Hi-C data of experiments into a new one.
        """
        reso1, reso2 = self.resolution, other.resolution
        if self.resolution == other.resolution:
            resolution = self.resolution
            changed_reso = False
        else:
            resolution = max(reso1, reso2)
            self.set_resolution(resolution)
            other.set_resolution(resolution)
            if not silent:
                stderr.write('WARNING: experiments of different resolution, ' +
                             'setting both resolution of %s, and normalizing ' +
                             'at this resolution\n' % (resolution))
            norm1 = copy(self.norm)
            norm2 = copy(other.norm)
            if self._normalization:
                self.normalize_hic()
            if other._normalization:
                other.normalize_hic()
            changed_reso = True
        if self.hic_data:
            new_hicdata = HiC_data([], size=self.size)
            for i in self.hic_data[0]:
                new_hicdata[i] = self.hic_data[0].get(i)
            for i in other.hic_data[0]:
                new_hicdata[i] += other.hic_data[0].get(i)
        else:
            new_hicdata = None
        xpr = Experiment(name='%s+%s' % (self.name, other.name),
                         resolution=resolution,
                         hic_data=new_hicdata, no_warn=True)
        # check if both experiments are normalized with the same method
        # and sum both normalized data
        if self._normalization != None and other._normalization != None:
            if (self._normalization.split('_factor:')[0] ==
                other._normalization.split('_factor:')[0]):
                xpr.norm = [HiC_data([], size=self.size)]
                for i in self.norm[0]:
                    xpr.norm[0][i] = self.norm[0].get(i)
                for i in other.norm[0]:
                    xpr.norm[0][i] += other.norm[0].get(i)
                # The final value of the factor should be the sum of each
                try:
                    xpr._normalization = (
                        self._normalization.split('_factor:')[0] +
                        '_factor:' +
                        str(int(self._normalization.split('_factor:')[1]) +
                            int(other._normalization.split('_factor:')[1])))
                except IndexError: # no factor there
                    xpr._normalization = (self._normalization)
        elif self.norm or other.norm:
            try:
                if (self.norm[0] or other.norm[0]) != {}:
                    if not silent:
                        raise Exception('ERROR: normalization differs between' +
                                        ' each experiment\n')
                else:
                    if not silent:
                        stderr.write('WARNING: experiments should be ' +
                                     'normalized before being summed\n')
            except TypeError:
                if not silent:
                    stderr.write('WARNING: experiments should be normalized ' +
                                 'before being summed\n')
        else:
            if not silent:
                stderr.write('WARNING: experiments should be normalized ' +
                             'before being summed\n')
        if changed_reso:
            self.set_resolution(reso1)
            self.norm = norm1
            other.set_resolution(reso2)
            other.norm = norm2
        xpr.crm = self.crm
        if not xpr.size:
            xpr.size = len(xpr.norm[0])

        def __merge(own, fgn):
            "internal function to merge descriptions"
            if own == fgn:
                return own
            return '%s+%s' % (own , fgn)

        xpr.identifier  = __merge(self.identifier , other.identifier )
        xpr.cell_type   = __merge(self.cell_type  , other.cell_type  )
        xpr.enzyme      = __merge(self.enzyme     , other.enzyme     )
        xpr.description = __merge(self.description, other.description)
        xpr.exp_type    = __merge(self.exp_type   , other.exp_type   )

        for des in self.description:
            if not des in other.description:
                continue
            xpr.description[des] = __merge(self.description[des],
                                           other.description[des])
        return xpr


    def __div__(self, other, silent=False):
        """
        sum Hi-C data of experiments into a new one.
        """
        reso1, reso2 = self.resolution, other.resolution
        if self.resolution == other.resolution:
            resolution = self.resolution
            changed_reso = False
        else:
            resolution = max(reso1, reso2)
            self.set_resolution(resolution)
            other.set_resolution(resolution)
            if not silent:
                stderr.write('WARNING: experiments of different resolution, ' +
                             'setting both resolution of %s, and normalizing ' +
                             'at this resolution\n' % (resolution))
            norm1 = copy(self.norm)
            norm2 = copy(other.norm)
            if self._normalization:
                self.normalize_hic()
            if other._normalization:
                other.normalize_hic()
            changed_reso = True
        if self.hic_data:
            new_hicdata = HiC_data([], size=self.size)
            for i in self.hic_data[0]:
                new_hicdata[i] = self.hic_data[0].get(i)
            for i in other.hic_data[0]:
                try:
                    new_hicdata[i] /= other.hic_data[0].get(i)
                except ZeroDivisionError:
                    new_hicdata[i] = float('NaN')
        else:
            new_hicdata = None
        xpr = Experiment(name='%s/%s' % (self.name, other.name),
                         resolution=resolution,
                         hic_data=new_hicdata, no_warn=True)
        # check if both experiments are normalized with the same method
        # and sum both normalized data
        if self._normalization != None and other._normalization != None:
            if (self._normalization.split('_factor:')[0] ==
                other._normalization.split('_factor:')[0]):
                xpr.norm = [HiC_data([], size=self.size)]
                for i in self.norm[0]:
                    xpr.norm[0][i] = self.norm[0].get(i)
                for i in other.norm[0]:
                    try:
                        xpr.norm[0][i] /= other.norm[0].get(i)
                    except ZeroDivisionError:
                        xpr.norm[0][i] = float('NaN')
                # The final value of the factor should be the same of each
                try:
                    xpr._normalization = (
                        self._normalization.split('_factor:')[0] +
                        '_factor:' +
                        str(int(self._normalization.split('_factor:')[1]) +
                            int(other._normalization.split('_factor:')[1]))) // 2
                except IndexError: # no factor there
                    xpr._normalization = (self._normalization)
        elif self.norm or other.norm:
            try:
                if (self.norm[0] or other.norm[0]) != {}:
                    if not silent:
                        raise Exception('ERROR: normalization differs between' +
                                        ' each experiment\n')
            except TypeError:
                pass
        if changed_reso:
            self.set_resolution(reso1)
            self.norm = norm1
            other.set_resolution(reso2)
            other.norm = norm2
        xpr.crm = self.crm
        if not xpr.size:
            xpr.size = len(xpr.norm[0])



        def __merge(own, fgn):
            "internal function to merge descriptions"
            if own == fgn:
                return own
            return '%s+%s' % (own , fgn)

        xpr.identifier  = __merge(self.identifier , other.identifier )
        xpr.cell_type   = __merge(self.cell_type  , other.cell_type  )
        xpr.enzyme      = __merge(self.enzyme     , other.enzyme     )
        xpr.description = __merge(self.description, other.description)
        xpr.exp_type    = __merge(self.exp_type   , other.exp_type   )

        for des in self.description:
            if not des in other.description:
                continue
            xpr.description[des] = __merge(self.description[des],
                                           other.description[des])
        return xpr



[docs]    def set_resolution(self, resolution, keep_original=True):
        """
        Set a new value for the resolution. Copy the original data into
        Experiment._ori_hic and replace the Experiment.hic_data
        with the data corresponding to new data
        (:func:`pytadbit.Chromosome.compare_condition`).

        :param resolution: an integer representing the resolution. This number
           must be a multiple of the original resolution, and higher than it
        :param True keep_original: either to keep or not the original data

        """
        if resolution < self._ori_resolution:
            raise Exception('New resolution might be higher than original.')
        if resolution % self._ori_resolution:
            raise Exception('New resolution might be a multiple original.\n' +
                            '  otherwise it is too complicated for me :P')
        if resolution == self.resolution:
            return
        # if we want to go back to original resolution
        if resolution == self._ori_resolution:
            self.hic_data   = self._ori_hic
            self.norm       = self._ori_norm
            self.size       = self._ori_size
            self.resolution = self._ori_resolution
            return
        # if current resolution is the original one
        if self.resolution == self._ori_resolution:
            if self.hic_data:
                self._ori_hic  = copy(self.hic_data)
            if self.norm:
                self._ori_norm = self.norm[:]
                # change the factor value in normalization description
                try:
                    self._normalization = (
                        self._normalization.split('_factor:')[0] +
                        '_factor:'+
                        str(int(self._normalization.split('factor:')[1])
                            * (resolution // self.resolution)))
                except IndexError: # no factor there
                    pass
        self.resolution = resolution
        fact = self.resolution // self._ori_resolution
        # super for!
        try:
            size = len(self._ori_hic[0])
        except TypeError:
            size = len(self._ori_norm[0])
        self.size     = size // fact
        rest = size % fact
        if rest:
            self.size += 1
        self.hic_data = [HiC_data([], size // fact + (1 if rest else 0))]
        self.norm     = [HiC_data([], size // fact + (1 if rest else 0))]
        def resize(mtrx, copee):
            "resize both hic_data and normalized data"
            for i in range(0, size, fact):
                for j in range(0, size, fact):
                    val = 0
                    for k in range(fact):
                        if i + k >= size:
                            break
                        for l in  range(fact):
                            if j + l >= size:
                                break
                            val += copee[(i + k) * size + j + l]
                    if val:
                        mtrx[i//fact * self.size + j//fact] = val
        try:
            resize(self.hic_data[0], self._ori_hic[0])
        except TypeError:
            pass
        try:
            resize(self.norm[0], self._ori_norm[0])
        except TypeError:
            pass
        # we need to recalculate zeros:
        if self._filtered_cols:
            stderr.write('WARNING: definition of filtered columns lost at ' +
                         'this resolution\n')
            self._filtered_cols = False
        if not keep_original:
            del(self._ori_hic)
            del(self._ori_norm)


[docs]    def filter_columns(self, silent=False, draw_hist=False, savefig=None,
                       perc_zero=99, by_mean=True, min_count=None):
        """
        Call filtering function, to remove artifactual columns in a given Hi-C
        matrix. This function will detect columns with very low interaction
        counts. Filtered out columns will be stored in the dictionary Experiment._zeros.

        :param False silent: does not warn for removed columns
        :param False draw_hist: shows the distribution of mean values by column
           the polynomial fit, and the cut applied.
        :param None savefig: path to a file where to save the image generated;
           if None, the image will be shown using matplotlib GUI (the extension
           of the file name will determine the desired format).
        :param 99 perc_zero: maximum percentage of cells with no interactions
           allowed.
        :param None min_count: minimum number of reads mapped to a bin (recommended
           value could be 2500). If set this option overrides the perc_zero
           filtering... This option is slightly slower.
        :param True by_mean: filter columns by mean column value using
           :func:`pytadbit.utils.hic_filtering.filter_by_mean` function

        """
        try:
            data = self.hic_data[0]
        except:
            data = self.norm[0]
        data.filter_columns(draw_hist=draw_hist, savefig=savefig, perc_zero=perc_zero,
                            by_mean=by_mean, min_count=min_count, silent=silent)
        self._zeros = data.bads        
        self._filtered_cols = True


[docs]    def load_hic_data(self, hic_data, parser=None, wanted_resolution=None,
                      data_resolution=None, silent=False, **kwargs):
        """
        Add a Hi-C experiment to the Chromosome object.

        :param None hic_data: whether a file or a list of lists corresponding to
           the Hi-C data
        :param name: name of the experiment
        :param False force: overwrite the experiments loaded under the same
           name
        :param None parser: a parser function that returns a tuple of lists
           representing the data matrix and the length of a row/column.
           With the file example.tsv:

           ::

             chrT_001    chrT_002    chrT_003    chrT_004
             chrT_001    629    164    88    105
             chrT_002    86    612    175    110
             chrT_003    159    216    437    105
             chrT_004    100    111    146    278

           the output of parser('example.tsv') would be:
           ``[([629, 86, 159, 100, 164, 612, 216, 111, 88, 175, 437, 146, 105,
           110, 105, 278]), 4]``
        :param None resolution: resolution of the experiment in the file; it
           will be adjusted to the resolution of the experiment. By default the
           file is expected to contain a Hi-C experiment with the same resolution
           as the :class:`pytadbit.Experiment` created, and no change is made
        :param True filter_columns: filter the columns with unexpectedly high
           content of low values
        :param False silent: does not warn for removed columns

        """
        self.hic_data = read_matrix(hic_data, parser=parser, one=False)
        self._ori_size       = self.size       = len(self.hic_data[0])
        self._ori_resolution = self.resolution = (data_resolution or
                                                  self._ori_resolution)
        wanted_resolution = wanted_resolution or self.resolution
        self.set_resolution(wanted_resolution, keep_original=False)
        if self.hic_data[0].bads:
            self._zeros = self.hic_data[0].bads
            self._filtered_cols = True

[docs]    def load_norm_data(self, norm_data, parser=None, resolution=None,
                       normalization='visibility', **kwargs):
        """
        Add a normalized Hi-C experiment to the Chromosome object.

        :param None norm_data: whether a file or a list of lists corresponding to
           the normalized Hi-C data
        :param name: name of the experiment
        :param False force: overwrite the experiments loaded under the same
           name
        :param None parser: a parser function that returns a tuple of lists
           representing the data matrix and the length of a row/column.
           With the file example.tsv:

           ::

             chrT_001    chrT_002    chrT_003    chrT_004
             chrT_001    12.5    164    8.8    0.5
             chrT_002    8.6    61.2    1.5    1.1
             chrT_003    15.9    21.6    3.7    0.5
             chrT_004    0.0    1.1    1.6    2.8

        :param None resolution: resolution of the experiment in the file; it
           will be adjusted to the resolution of the experiment. By default the
           file is expected to contain a Hi-C experiment with the same resolution
           as the :class:`pytadbit.Experiment` created, and no change is made
        :param True filter_columns: filter the columns with unexpectedly high
           content of low values
        :param False silent: does not warn for removed columns

        """
        self.norm = read_matrix(norm_data, parser=parser, hic=False, one=False)
        self._ori_size       = self.size       = len(self.norm[0])
        self._ori_resolution = self.resolution = resolution or self._ori_resolution
        if not self._zeros: # in case we do not have original Hi-C data
            for i in range(self.size):
                if all([isnan(j) for j in
                        [self.norm[0][k] for k in
                         range(i * self.size, i * self.size + self.size)]]):
                    self._zeros[i] = None
        # remove NaNs, we do not need them as we have zeroes
        for i in list(self.norm[0].keys()):
            if isnan(self.norm[0][i]):
                del(self.norm[0][i])
        self._normalization = normalization
        if self.norm[0].bads:
            self._zeros = self.norm[0].bads
            self._filtered_cols = True


[docs]    def load_tad_def(self, tad_def, weights=None):
        """
         Add the Topologically Associated Domains definition detection to Slice

        :param None tad_def: a file or a dict with pre-computed TADs for this
           experiment
        :param None name: name of the experiment, if None f_name will be used
        :param None weights: Store information about the weights, corresponding
           to the normalization of the Hi-C data (see TADbit function
           documentation)

        """
        tads, norm = parse_tads(tad_def)
        last = max(tads.keys())
        if not self.size:
            self.size = tads[last]['end']
        self.tads = tads
        if not self.norm:
            self.norm  = weights or norm
            if self.norm:
                self._normalization = 'visibility'
        if self._normalization:
            norms = self.norm[0]
        elif self.hic_data:
            norms = self.hic_data[0]
        else:
            warn("WARNING: raw Hi-C data not available, " +
                 "TAD's height fixed to 1")
            norms = None
        diags = []
        siz = self.size
        sp1 = siz + 1
        zeros = self._zeros or {}
        if norms:
            for k in range(1, siz):
                s_k = siz * k
                diags.append(sum([norms[i * sp1 + s_k]
                                 if not (i in zeros
                                         or (i + k) in zeros) else 1. # 1 is the mean
                                  for i in range(siz - k)]) // (siz - k))
        for tad in tads:
            start, end = (int(tads[tad]['start']) + 1,
                          int(tads[tad]['end']) + 1)
            if norms:
                matrix = sum([norms[i + siz * j]
                             if not (i in zeros
                                     or j in zeros) else 1.
                              for i in range(start - 1, end - 1)
                              for j in range(i + 1, end - 1)])
            try:
                if norms:
                    height = float(matrix) / sum(
                        [diags[i-1] * (end - start - i)
                         for i in range(1, end - start)])
                else:
                    height = tads[tad].get('height', 1.0)
            except ZeroDivisionError:
                height = 0.
            tads[tad]['height'] = height


[docs]    def normalize_hic(self, factor=1, iterations=0, max_dev=0.1, silent=False):
        """
        Normalize the Hi-C data. This normalization step does the same of
        the :func:`pytadbit.tadbit.tadbit` function (default parameters),

        It fills the Experiment.norm variable with the Hi-C values divided by
        the calculated weight.

        The weight of a given cell in column i and row j corresponds to the
        square root of the product of the sum of column i by the sum of row
        j.

        normalization is done according to this formula:

        .. math::

          weight_{(I,J)} = \\frac{\\sum^N_{j=0}\\sum^N_{i=0}(matrix(i,j))}
                                 {\\sum^N_{i=0}(matrix(i,J)) \\times \\sum^N_{j=0}(matrix(I,j))}

        with N being the number or rows/columns of the Hi-C matrix in both
        cases.

        :param 1 factor: final mean number of normalized interactions wanted
           per cell
        :param False silent: does not warn when overwriting weights
        :param None rowsums: input a list of rowsums calculated elsewhere
        """

        if not self.hic_data:
            raise Exception('ERROR: No Hi-C data loaded\n')
        if self.norm and not silent:
            stderr.write('WARNING: removing previous weights\n')
        size = self.size
        self.hic_data[0].normalize_hic(iterations=iterations, max_dev=max_dev,
                                       silent=silent, factor=factor)
        bias = self.hic_data[0].bias  # faster quick ref
        hic_data = self.hic_data[0]
        self.norm = [HiC_data([(i + j * size, float(hic_data[i, j]) /
                                bias[i] / bias[j])
                               for i in bias for j in bias], size)]
        # no need to use lists, tuples use less memory
        if factor:
            self._normalization = 'visibility_factor:' + str(factor)
        else:
            self._normalization = 'visibility'


[docs]    def get_hic_zscores(self, normalized=True, zscored=True, remove_zeros=True):
        """
        Normalize the Hi-C raw data. The result will be stored into
        the private Experiment._zscore list.

        :param True normalized: whether to normalize the result using the
           weights (see :func:`normalize_hic`)
        :param True zscored: calculate the z-score of the data
        :param False remove_zeros: remove null interactions. Dangerous, null
           interaction are informative.

        """
        values = {}
        zeros  = {}
        self._zscores = {}
        if normalized:
            for i in range(self.size):
                # zeros are rows or columns having a zero in the diagonal
                if i in self._zeros:
                    continue
                for j in range(i + 1, self.size):
                    if j in self._zeros:
                        continue
                    if (not self.norm[0][i * self.size + j]
                        and remove_zeros):
                        zeros[(i, j)] = None
                        continue
                    values[(i, j)] = self.norm[0][i * self.size + j]
        else:
            for i in range(self.size):
                if i in self._zeros:
                    continue
                for j in range(i + 1, self.size):
                    if j in self._zeros:
                        continue
                    values[(i, j)] = self.hic_data[0][i * self.size + j]
        # compute Z-score
        if zscored:
            zscore(values)
        for i in range(self.size):
            if i in self._zeros:
                continue
            for j in range(i + 1, self.size):
                if j in self._zeros:
                    continue
                if (i, j) in zeros and remove_zeros:
                    continue
                self._zscores.setdefault(str(i), {})
                self._zscores[str(i)][str(j)] = values[(i, j)]


[docs]    def model_region(self, start=1, end=None, n_models=5000, n_keep=1000,
                     n_cpus=1, verbose=0, keep_all=False, close_bins=1,
                     outfile=None, config=CONFIG, container=None,
                     single_particle_restraints=None, use_HiC=True):
        """
        Generates of three-dimensional models using IMP, for a given segment of
        chromosome.

        :param 1 start: first bin to model (bin number)
        :param None end: last bin to model (bin number). By default goes to the
           last bin.
        :param 5000 n_models: number of modes to generate
        :param 1000 n_keep: number of models used in the final analysis
           (usually the top 20% of the generated models). The models are ranked
           according to their objective function value (the lower the better)
        :param False keep_all: whether or not to keep the discarded models (if
           True, models will be stored under tructuralModels.bad_models)
        :param 1 close_bins: number of particles away (i.e. the bin number
           difference) a particle pair must be in order to be considered as
           neighbors (e.g. 1 means consecutive particles)
        :param n_cpus: number of CPUs to use
        :param 0 verbose: the information printed can be: nothing (0), the
           objective function value the selected models (1), the objective
           function value of all the models (2), all the modeling
           information (3)
        :param None container: restrains particle to be within a given object. Can
           only be a 'cylinder', which is, in fact a cylinder of a given height to
           which are added hemispherical ends. This cylinder is defined by a radius,
           its height (with a height of 0 the cylinder becomes a sphere) and the
           force applied to the restraint. E.g. for modeling E. coli genome (2
           micrometers length and 0.5 micrometer of width), these values could be
           used: ['cylinder', 250, 1500, 50], and for a typical mammalian nuclei
           (6 micrometers diameter): ['cylinder', 3000, 0, 50]
        :param CONFIG config: a dictionary containing the standard
           parameters used to generate the models. The dictionary should
           contain the keys kforce, maxdist, upfreq and lowfreq.
           Examples can be seen by doing:

           ::

             from pytadbit.imp.CONFIG import CONFIG

           where CONFIG is a dictionarry of dictionnaries to be passed to this
           function:

           ::

             CONFIG = {
              # use these paramaters with the Hi-C data from:
              'reference' : 'victor corces dataset 2013',

              # Force applied to the restraints inferred to neighbor particles
              'kforce'    : 5,

              # Maximum experimental contact distance
              'maxdist'   : 600, # OPTIMIZATION: 500-1200

              # Minimum and maximum thresholds used to decide which experimental values have to be
              # included in the computation of restraints. Z-score values bigger than upfreq
              # and less that lowfreq will be include, whereas all the others will be rejected
              'upfreq'    : 0.3, # OPTIMIZATION: min/max Z-score

              'lowfreq'   : -0.7, # OPTIMIZATION: min/max Z-score

              # How much space (radius in nm) ocupies a nucleotide
              'scale'     : 0.005
              }

        :param True use_HiC: apply hic data restraints to the model
        :param: None single_particle_restraints: a list containing restraints to single particles.
            Each restraint in the list is itself a list with the following information:
                [bin, [position_x, position_y, position_z], type, kforce, radius]
                bin: bin number of the particle to restraint
                [position_x, position_y, position_z](nm): center of the sphere of the restraint.
                    The center of the coordinate system is the center of the base of the
                    cylinder defined as the container.
                type: 'Harmonic', 'HarmonicLowerBound', 'HarmonicUpperBound'
                kforce: weigth of the restraint
                radius (nm): radius of the sphere
        :returns: a :class:`pytadbit.imp.structuralmodels.StructuralModels` object.

        """
        if not self._normalization:
            stderr.write('WARNING: not normalized data, should run ' +
                         'Experiment.normalize_hic()\n')
        if not end:
            end = self.size
        zscores, values, zeros = self._sub_experiment_zscore(start, end)
        if self.hic_data and self.hic_data[0].chromosomes:
            coords = []
            tot = 0
            chrs = []
            chrom_offset_start = start
            chrom_offset_end = 0
            for k, v in self.hic_data[0].chromosomes.items():
                tot += v
                if start > tot:
                    chrom_offset_start = start - tot
                if end <= tot:
                    chrom_offset_end = tot - end
                    chrs.append(k)
                    break
                if start < tot and end >= tot:
                    chrs.append(k)
            for k in chrs:
                coords.append({'crm'  : k,
                      'start': 1,
                      'end'  : self.hic_data[0].chromosomes[k]})
            coords[0]['start'] = chrom_offset_start
            coords[-1]['end'] -= chrom_offset_end
        else:
            coords = {'crm'  : self.crm.name,
                      'start': start,
                      'end'  : end}
        zeros = tuple([i not in zeros for i in range(end - start + 1)])
        nloci = end - start + 1
        if verbose:
            stderr.write('Preparing to model %s particles\n' % nloci)
        return generate_3d_models(zscores, self.resolution, nloci,
                                  values=values, n_models=n_models,
                                  outfile=outfile, n_keep=n_keep, n_cpus=n_cpus,
                                  verbose=verbose, keep_all=keep_all, first=0,
                                  close_bins=close_bins, config=config, container=container,
                                  experiment=self, coords=coords, zeros=zeros,
                                  single_particle_restraints=single_particle_restraints,
                                  use_HiC=use_HiC)

[docs]    def optimal_imp_parameters(self, start=1, end=None, n_models=500, n_keep=100,
                               n_cpus=1, upfreq_range=(0, 1, 0.1), close_bins=1,
                               kbending_range=0.0,
                               lowfreq_range=(-1, 0, 0.1),
                               scale_range=[0.01][:],
                               maxdist_range=(400, 1400, 100),
                               dcutoff_range=[2][:],
                               outfile=None, verbose=True, corr='spearman',
                               off_diag=1, savedata=None,
                               container=None):
        """
        Find the optimal set of parameters to be used for the 3D modeling in
        IMP.

        :param 1 start: first bin to model (bin number)
        :param None end: last bin to model (bin number). By default goes to the
           last bin.
        :param 500 n_models: number of modes to generate
        :param 100 n_keep: number of models used in the final analysis (usually
           the top 20% of the generated models). The models are ranked
           according to their objective function value (the lower the better)
        :param 1 close_bins: number of particles away (i.e. the bin number
           difference) a particle pair must be in order to be considered as
           neighbors (e.g. 1 means consecutive particles)
        :param n_cpus: number of CPUs to use
        :param False verbose: if set to True, information about the distance,
           force and Z-score between particles will be printed
        :param (-1,0,0.1) lowfreq_range:  range of lowfreq values to be
           optimized. The last value of the input tuple is the incremental step
           for the lowfreq values
        :param (0,1,0.1,0.1) upfreq_range: range of upfreq values to be
           optimized. The last value of the input tuple is the incremental step
           for the upfreq values
        :param (400,1400,100) maxdist_range: upper and lower bounds used to
           search for the optimal maximum experimental distance. The last value
           of the input tuple is the incremental step for maxdist values
        :param [0.01] scale_range: upper and lower bounds used to search for
           the optimal scale parameter (nm per nucleotide). The last value of
           the input tuple is the incremental step for scale parameter values
        :param [2] dcutoff_range: upper and lower bounds used to search for
           the optimal distance cutoff parameter (distance, in number of beads,
           from which to consider 2 beads as being close). The last value of the
           input tuple is the incremental step for scale parameter values
        :param None container: restrains particle to be within a given object. Can
           only be a 'cylinder', which is, in fact a cylinder of a given height to
           which are added hemispherical ends. This cylinder is defined by a radius,
           its height (with a height of 0 the cylinder becomes a sphere) and the
           force applied to the restraint. E.g. for modeling E. coli genome (2
           micrometers length and 0.5 micrometer of width), these values could be
           used: ['cylinder', 250, 1500, 50], and for a typical mammalian nuclei
           (6 micrometers diameter): ['cylinder', 3000, 0, 50]
        :param True verbose: print the results to the standard output

        .. note::

          Each of the *_range* parameters accept tuples in the form
           *(start, end, step)*, or a list with the list of values to test.

           E.g.:
             * scale_range=[0.001, 0.005, 0.006] will test these three values.
             * scale_range=(0.001, 0.005, 0.001) will test the values 0.001,
               0.002, 0.003, 0.004 and 0.005


        :returns: an :class:`pytadbit.imp.impoptimizer.IMPoptimizer` object

        """
        if not self._normalization:
            stderr.write('WARNING: not normalized data, should run ' +
                         'Experiment.normalize_hic()\n')
        if not end:
            end = self.size
        optimizer = IMPoptimizer(self, start, end, n_keep=n_keep,
                                 n_models=n_models, close_bins=close_bins,
                                 container=container)
        optimizer.run_grid_search(maxdist_range=maxdist_range,
                                  kbending_range=kbending_range,
                                  upfreq_range=upfreq_range,
                                  lowfreq_range=lowfreq_range,
                                  scale_range=scale_range,
                                  dcutoff_range=dcutoff_range, corr=corr,
                                  n_cpus=n_cpus, verbose=verbose,
                                  off_diag=off_diag, savedata=savedata)

        if outfile:
            optimizer.write_result(outfile)

        return optimizer


    def _sub_experiment_zscore(self, start, end):
        """
        Get the z-scores of a sub-region of an experiment.

        The data are normalized first (visibility method) if they were not
        already. The sub-region is loaded in a temporary experiment in which
        the weights and filtered columns of the full matrix are kept, while
        the z-scores are computed on the sub-region only.

        :param start: first bin to model (bin number, starting at 1)
        :param end: last bin to model (bin number, inclusive)

        :returns: 1- z-scores, 2- matrix of values with NaNs in the diagonal
           and in bad columns and 3- positions of the bad columns (relative to
           the sub-region)
        """
        normalization = self._normalization
        if not normalization or not normalization.startswith('visibility'):
            stderr.write('WARNING: normalizing according to visibility method\n')
            self.normalize_hic()
        from pytadbit import Chromosome
        if start < 1:
            raise ValueError('ERROR: start should be higher than 0\n')
        # python counts from 0; the end coordinate keeps its original value
        # because it is inclusive
        first = start - 1
        full_size = self.size
        try:
            raw = self.get_hic_matrix()
            sub_raw = [[raw[col][row] for col in range(first, end)]
                       for row in range(first, end)]
            holder = Chromosome('tmp')
            holder.add_experiment('exp1', hic_data=[sub_raw],
                                  resolution=self.resolution,
                                  filter_columns=False)
            exp = holder.experiments[0]
            # We want the weights and zeros calculated in the full chromosome
            exp.norm = [[self.norm[0][col + full_size * row]
                         for col in range(first, end)
                         for row in range(first, end)]]
        except TypeError:  # no Hi-C data provided
            normed = self.get_hic_matrix(normalized=True)
            sub_norm = [[normed[col][row] for col in range(first, end)]
                        for row in range(first, end)]
            holder = Chromosome('tmp')
            holder.add_experiment('exp1', norm_data=[sub_norm],
                                  resolution=self.resolution,
                                  filter_columns=False)
            exp = holder.experiments[0]
        # filtered columns falling in the region, shifted to region coordinates
        exp._zeros = dict.fromkeys(bad - first for bad in self._zeros
                                   if first <= bad <= end - 1)
        if len(exp._zeros) == (end - first):
            raise Exception('ERROR: no interaction found in selected regions')
        # ... but the z-scores in this particular region
        exp.get_hic_zscores()
        values = [[float('nan') for _ in range(exp.size)]
                  for _ in range(exp.size)]
        # bins of the region that are not filtered (a filtered bin is a row or
        # column having a zero in the diagonal)
        kept = [pos for pos in range(exp.size) if pos not in exp._zeros]
        for rank, i in enumerate(kept):
            for j in kept[rank + 1:]:  # NaNs kept in the diagonal
                val = exp.norm[0][i * exp.size + j]
                values[i][j] = val
                values[j][i] = val
        return exp._zscores, values, exp._zeros


[docs]    def write_interaction_pairs(self, fname, normalized=True, zscored=True,
                                diagonal=False, cutoff=None, header=False,
                                true_position=False, uniq=True,
                                remove_zeros=False, focus=None, format='tsv'):
        """
        Creates a tab separated file with all the pairwise interactions.

        :param fname: file name where to write the  pairwise interactions
        :param True zscored: computes the z-score of the log10(data)
        :param True normalized: use the weights to normalize the data
        :param None cutoff: if defined, only the zscores above the cutoff will
           be writen to the file
        :param False uniq: only writes one representent per interacting pair
        :param False true_position: if, true writes genomic coordinates,
           otherwise, genomic bin.
        :param None focus: writes interactions between the start and stop bin
           passed to this parameter.
        :param 'tsv' format: in which to write the file, can be tab separated
           (tsv) or JSON (json)

        """
        cutoff = cutoff or float('-inf')
        if not self._zscores and zscored:
            self.get_hic_zscores()
        if not self.norm and normalized:
            raise Exception('Experiment not normalized.')
        # write to file
        if isinstance(fname, basestring):
            out = open(fname, 'w')
        elif isinstance(fname, file_types):
            out = fname
        else:
            raise Exception('Not recognize file type\n')
        if header:
            if format == 'tsv':
                out.write('elt1\telt2\t%s\n' % ('zscore' if zscored else
                                                'normalized hi-c' if normalized
                                                else 'raw hi-c'))
            elif format == 'json':
                out.write('''
{
    "metadata": {
                        "formatVersion" : 3,
                        %s
                        "species" : "%s",
                        "cellType" : "%s",
                        "experimentType" : "%s",
                        "identifier" : "%s",
                        "resolution" : %s,
                        "chromosome" : "%s",
                        "start" : %s,
                        "end" : %s
                },
    "interactions": [
                ''' % ('\n'.join(['"%s": "%s",' % (k, self.description[k])
                                  for k in self.description]),
                       self.description.get('species', ''),
                       self.cell_type,
                       self.exp_type,
                       self.identifier,
                       self.resolution,
                       self.crm.name,
                       focus[0] * self.resolution if focus else 1,
                       (focus[1] * self.resolution if focus else
                        self.resolution * self.size)))
        if focus:
            start, end = focus[0], focus[1] + 1
        else:
            start, end = 0, self.size
        for i in range(start, end):
            if i in self._zeros:
                continue
            newstart = i if uniq else 0
            for j in range(newstart, end):
                if j in self._zeros:
                    continue
                if not diagonal and i == j:
                    continue
                if zscored:
                    try:
                        if self._zscores[str(i)][str(j)] < cutoff:
                            continue
                        if self._zscores[str(i)][str(j)] == -99:
                            continue
                    except KeyError:
                        continue
                    val = self._zscores[str(i)][str(j)]
                elif normalized:
                    val = self.norm[0][self.size*i+j]
                else:
                    val = self.hic_data[0][self.size*i+j]
                if remove_zeros and not val:
                    continue
                if true_position:
                    if format == 'tsv':
                        out.write('%s\t%s\t%s\n' % (
                            self.resolution * (i + 1),
                            self.resolution * (j + 1), val))
                    elif format == 'json':
                        out.write('%s,%s,%s,\n' % (
                            self.resolution * (i + 1),
                            self.resolution * (j + 1), val))
                else:
                    if format == 'tsv':
                        out.write('%s\t%s\t%s\n' % (
                            i + 1 - start, j + 1 - start, val))
                    elif format == 'json':
                        out.write('%s,%s,%s\n' % (
                            i + 1 - start, j + 1 - start, val))
        if format == 'json':
            out.write(']}\n')
        out.close()


[docs]    def get_hic_matrix(self, focus=None, diagonal=True, normalized=False):
        """
        Return the Hi-C matrix.

        :param None focus: if a tuple is passed (start, end), wil return a Hi-C
           matrix starting at start, and ending at end (all inclusive).
        :param True diagonal: replace the values in the diagonal by one. Used
           for the filtering in order to smooth the distribution of mean values
        :para False normalized: returns normalized data instead of raw Hi-C

        :returns: list of lists representing the Hi-C data matrix of the
           current experiment
        """
        siz = self.size
        if normalized:
            try:
                hic = self.norm[0]
            except TypeError:
                raise Exception('ERROR: experiment not normalized yet')
        else:
            hic = self.hic_data[0]
        if focus:
            start, end = focus
            start -= 1
        else:
            start = 0
            end   = siz
        if diagonal:
            return [[hic[i + siz * j] for i in range(start, end)]
                    for j in range(start, end)]
        else:
            mtrx = [[hic[i + siz * j] for i in range(start, end)]
                    for j in range(start, end)]
            for i in range(start, end):
                mtrx[i][i] = 1 if mtrx[i][i] else 0
            return mtrx


[docs]    def print_hic_matrix(self, print_it=True, normalized=False, zeros=False):
        """
        Return the Hi-C matrix as string

        :param True print_it: Otherwise, returns the string
        :param False normalized: returns normalized data, instead of raw Hi-C
        :param False zeros: take into account filtered columns
        :returns: list of lists representing the Hi-C data matrix of the
           current experiment
        """
        siz = self.size
        if normalized:
            hic = self.norm[0]
        else:
            hic = self.hic_data[0]
        if zeros:
            out = '\n'.join(['\t'.join(
                ['nan' if (i in self._zeros or j in self._zeros) else
                 str(hic[i+siz * j]) for i in range(siz)])
                             for j in range(siz)])
        else:
            out = '\n'.join(['\t'.join([str(hic[i+siz * j])
                                        for i in range(siz)])
                             for j in range(siz)])
        if print_it:
            print(out)
        else:
            return out + '\n'


[docs]    def view(self, tad=None, focus=None, paint_tads=False, axe=None,
             show=True, logarithm=True, normalized=False, relative=True,
             decorate=True, savefig=None, where='both', clim=None,
             cmap='jet'):
        """
        Visualize the matrix of Hi-C interactions

        :param None tad: a given TAD in the form:
           ::

             {'start': start,
              'end'  : end,
              'brk'  : end,
              'score': score}

           **Alternatively** a list of the TADs can be passed (all the TADs
           between the first and last one passed will be showed. Thus, passing
           more than two TADs might be superfluous)
        :param None focus: a tuple with the start and end positions of the
           region to visualize
        :param False paint_tads: draw a box around the TADs defined for this
           experiment
        :param None axe: an axe object from matplotlib can be passed in order
           to customize the picture
        :param True show: either to pop-up matplotlib image or not
        :param True logarithm: show the logarithm values
        :param True normalized: show the normalized data (weights might have
           been calculated previously). *Note: white rows/columns may appear in
           the matrix displayed; these rows correspond to filtered rows (see*
           :func:`pytadbit.utils.hic_filtering.hic_filtering_for_modelling` *)*
        :param True relative: color scale is relative to the whole matrix of
           data, not only to the region displayed
        :param True decorate: draws color bar, title and axes labels
        :param None savefig: path to a file where to save the image generated;
           if None, the image will be shown using matplotlib GUI (the extension
           of the file name will determine the desired format).
        :param None clim: tuple with minimum and maximum value range for color
           scale. I.e. clim=(-4, 10)
        :param 'jet' cmap: color map from matplotlib. Can also be a
           preconfigured cmap object.
        """
        if logarithm==True:
            fun = log2
        elif logarithm:
            fun = logarithm
        else:
            fun = lambda x: x
        size = self.size
        if normalized and not self.norm:
            raise Exception('ERROR: weights not calculated for this ' +
                            'experiment. Run Experiment.normalize_hic\n')
        if tad and focus:
            raise Exception('ERROR: only one of "tad" or "focus" might be set')
        start = end = None
        if focus:
            start, end = focus
            if start == 0:
                stderr.write('WARNING: Hi-C matrix starts at 1, setting ' +
                             'starting point to 1.\n')
                start = 1
        elif isinstance(tad, dict):
            start = int(tad['start'])
            end   = int(tad['end'])
        elif isinstance(tad, list):
            if isinstance(tad[0], dict):
                start = int(sorted(tad,
                                   key=lambda x: int(x['start']))[0 ]['start'])
                end   = int(sorted(tad,
                                   key=lambda x: int(x['end'  ]))[-1]['end'  ])
        elif self.tads:
            start = self.tads[min(self.tads)]['start'] + 1
            end   = self.tads[max(self.tads)]['end'  ] + 1
        else:
            start =  1
            end   = size
        if normalized:
            norm_data = self.norm
        else:
            hic_data = self.hic_data
        if relative and not clim:
            if normalized:
                # find minimum, if value is non-zero... for logarithm
                mini = min([i for i in list(norm_data[0].values()) if i])
                if mini == int(mini):
                    vmin = min(norm_data[0].values())
                else:
                    vmin = mini
                vmin = fun(vmin or (1 if logarithm else 0))
                vmax = fun(max(norm_data[0].values()))
            else:
                vmin = fun(min(hic_data[0].values()) or
                           (1 if logarithm else 0))
                vmax = fun(max(hic_data[0].values()))
        elif clim:
            vmin, vmax = clim
        if axe is None:
            plt.figure(figsize=(8, 6))
            axe = plt.subplot(111)
        if tad or focus:
            if start > -1:
                if normalized:
                    matrix = [
                        [norm_data[0][i+size*j]
                         if (not i in self._zeros
                             and not j in self._zeros) else float('nan')
                         for i in range(int(start) - 1, int(end))]
                        for j in range(int(start) - 1, int(end))]
                else:
                    matrix = [
                        [hic_data[0][i+size*j]
                         for i in range(int(start) - 1, int(end))]
                        for j in range(int(start) - 1, int(end))]
            elif isinstance(tad, list):
                if normalized:
                    stderr.write('WARNING: List passed, not going to be ' +
                                 'normalized.\n')
                matrix = tad
            else:
                # TODO: something... matrix not declared...
                pass
        else:
            if normalized:
                matrix = [[norm_data[0][i+size*j]
                           if (not i in self._zeros
                               and not j in self._zeros) else float('nan')
                           for i in range(size)]
                          for j in range(size)]
            else:
                matrix = [[hic_data[0][i+size*j]\
                           for i in range(size)] \
                          for j in range(size)]
        if isinstance(cmap, basestring):
            cmap = plt.get_cmap(cmap).copy()
            cmap.set_bad('darkgrey', 0)
        matrix = np.array(matrix)
        if where == 'up':
            for i in range(int(end - start)):
                for j in range(i, int(end - start)):
                    matrix[i][j] = np.nan if normalized else 0
        elif where == 'down':
            for i in range(int(end - start)):
                for j in range(i + 1):
                    matrix[i][j] = np.nan if normalized else 0
        with np.errstate(divide='ignore', invalid='ignore'):
            matrix = np.ma.masked_where(np.isnan(matrix), fun(matrix))
        if relative:
            img = axe.imshow(matrix, origin='lower', vmin=vmin, vmax=vmax,
                             interpolation="nearest", cmap=cmap,
                             extent=(int(start or 1) - 0.5,
                                     int(start or 1) + len(matrix) - 0.5,
                                     int(start or 1) - 0.5,
                                     int(start or 1) + len(matrix) - 0.5))
        else:
            img = axe.imshow(matrix, origin='lower',
                             interpolation="nearest", cmap=cmap,
                             extent=(int(start or 1) - 0.5,
                                     int(start or 1) + len(matrix) - 0.5,
                                     int(start or 1) - 0.5,
                                     int(start or 1) + len(matrix) - 0.5))
        if decorate:
            cbar = axe.figure.colorbar(img)
            cbar.ax.set_ylabel('%sHi-C %sinteraction count' % (
                'Log2 ' * (logarithm==True), 'normalized ' * normalized), rotation=-90)
            axe.set_title(('Chromosome %s experiment %s %s') % (
                self.crm.name, self.name,
                'focus: %s-%s' % (start, end) if tad else ''))
            axe.set_xlabel('Genomic bin (resolution: %s)' % (self.resolution))
            if paint_tads:
                axe.set_ylabel('TAD number')
            else:
                axe.set_ylabel('Genomic bin (resolution: %s)' % (
                    self.resolution))
        if not paint_tads:
            axe.set_ylim(int(start or 1) - 0.5,
                         int(start or 1) + len(matrix) - 0.5)
            axe.set_xlim(int(start or 1) - 0.5,
                         int(start or 1) + len(matrix) - 0.5)
            if show:
                plt.show()
            if savefig:
                tadbit_savefig(savefig)
            return img
        pwidth = 1
        tads = dict([(t, self.tads[t]) for t in self.tads
                     if  ((int(self.tads[t]['start']) + 1 >= start
                           and int(self.tads[t]['end'  ]) + 1 <= end)
                          or not start)])
        for i, tad in tads.items():
            t_start = int(tad['start']) + .5
            t_end   = int(tad['end'])   + 1.5
            nwidth = float(abs(tad['score'])) / 4
            if where in ['down', 'both']:
                axe.hlines(t_start, t_start, t_end, colors='k', lw=pwidth)
            if where in ['up', 'both']:
                axe.hlines(t_end  , t_start, t_end, colors='k', lw=nwidth)
            if where in ['up', 'both']:
                axe.vlines(t_start, t_start, t_end, colors='k', lw=pwidth)
            if where in ['down', 'both']:
                axe.vlines(t_end  , t_start, t_end, colors='k', lw=nwidth)
            pwidth = nwidth
            if tad['score'] < 0:
                for j in range(0, int(t_end) - int(t_start), 2):
                    axe.plot((t_start    , t_start + j),
                             (t_end   - j, t_end      ), color='k')
                    axe.plot((t_end      , t_end   - j),
                             (t_start + j, t_start    ), color='k')
        axe.set_ylim(int(start or 1) - 0.5,
                     int(start or 1) + len(matrix) - 0.5)
        axe.set_xlim(int(start or 1) - 0.5,
                     int(start or 1) + len(matrix) - 0.5)
        if paint_tads:
            ticks = []
            labels = []
            for tad, tick in [(t, tads[t]['start'] + (tads[t]['end'] -
                                                      tads[t]['start'] - 1))
                              for t in list(tads.keys())[::(len(tads)//11 + 1)]]:
                ticks.append(tick)
                labels.append(tad + 1)
            axe.set_yticks(ticks)
            axe.set_yticklabels(labels)
        if show:
            plt.show()
        if savefig:
            tadbit_savefig(savefig)
        return img


[docs]    def write_tad_borders(self, density=False, savedata=None, normalized=False):
        """
        Print a table summarizing the TADs found by tadbit. This function outputs
        something similar to the R function.

        :param False density: if True, adds a column with the relative
           interaction frequency measured within each TAD (value of 1 means an
           interaction frequency equal to the expectation in the experiment)
        :param None savedata: path to a file where to save the density data
           generated (1 column per step + 1 for particle number). If None, print
           a table.
        :param False normalized: uses normalized data to calculate the density
        """
        if normalized and self.norm:
            norms = self.norm[0]
        elif self.hic_data:
            if normalized:
                warn("WARNING: weights not available, using raw data")
            norms = self.hic_data[0]
        else:
            warn("WARNING: raw Hi-C data not available, " +
                 "TAD's height fixed to 1")
            norms = None
        zeros = self._zeros or {}
        table = ''
        table += '%s\t%s\t%s\t%s%s\n' % ('#', 'start', 'end', 'score',
                                        '' if not density else '\tdensity')
        tads = self.tads
        sp1 = self.size + 1
        diags = []
        if norms:
            for k in range(1, self.size):
                s_k = self.size * k
                diags.append(sum([norms[i * sp1 + s_k]
                                 if not (i in zeros
                                         or (i + k) in zeros) else 0.
                                  for i in range(
                                      self.size - k)]) // (self.size - k))
        for tad in tads:
            table += '%s\t%s\t%s\t%s%s\n' % (
                tad, int(tads[tad]['start'] + 1), int(tads[tad]['end'] + 1),
                abs(tads[tad]['score']), '' if not density else
                '\t%s' % (round(float(tads[tad]['height']), 3)))
        if not savedata:
            print(table)
            return
        if isinstance(savedata, file_types):
            out = savedata
        else:
            out = open(savedata, 'w')
        out.write(table)

[docs]    def write_json(self, filename, focus=None, normalized=False):
        """
        Save hic matrix in the json format, read by TADkit.

        :param filename: location where the file will be written
        :param None focus: if a tuple is passed (start, end), json will contain a Hi-C
           matrix starting at start, and ending at end (all inclusive).
        :para False normalized: use normalized data instead of raw Hi-C

        """
        if not self.crm.species:
            warn("WARNING: no species specified in chromosome. TADkit will not be able to interpret the file")
        if not self.crm.name:
            warn("WARNING: no name specified in chromosome. TADkit will not be able to interpret the file")

        if focus:
            start, end = focus
            size = end-start+1
        else:
            start = 0
            end = size = self.size

        if size > 1200:
            warn("WARNING: this is a very big matrix, consider using focus. TADkit will not be able to render the file")

        new_hic_data = self.get_hic_matrix(focus=focus,  normalized=normalized)

        chrom_start = []
        chrom_end = []
        chrom = []
        if self.hic_data and self.hic_data[0].chromosomes:
            tot = 0
            chrs = []
            chrom_offset_start = start
            chrom_offset_end = 0
            for k, v in self.hic_data[0].chromosomes.items():
                tot += v
                if start > tot:
                    chrom_offset_start = start - tot
                if end <= tot:
                    chrom_offset_end = tot - end
                    chrs.append((k,v))
                    break
                if start < tot and end >= tot:
                    chrs.append((k,v))

            for k, v in chrs:
                chrom.append(k)
                chrom_start.append(0)
                chrom_end.append(v * self.resolution)
            chrom_start[0] = chrom_offset_start * self.resolution
            chrom_end[-1] -= chrom_offset_end * self.resolution

        else:
            chrom.append(self.crm.name)
            chrom_start.append(start * self.resolution)
            chrom_end.append(end * self.resolution)


        descr = {'chromosome'   : chrom,
                 'species'      : self.crm.species,
                 'resolution'   : self.resolution,
                 'chrom_start'  : chrom_start,
                 'chrom_end'    : chrom_end,
                 'start'        : self.resolution,
                 'end'          : size * self.resolution}

        # Fake structural models object to produce json
        sm = StructuralModels(nloci=size, models = [], bad_models = [], experiment=self, resolution=self.resolution, original_data=new_hic_data, description=descr, config={'scale':0.01})
        sm.write_json(filename=filename)

    # def generate_densities(self):
    #     """
    #     Related to the generation of 3D models.
    #     In the case of Hi-C data, the density is equal to the number of
    #     nucleotides in a bin, which is equal to the experiment resolution.
    #     """
    #     dens = {}
    #     for i in self.size:
    #         dens[i] = self.resolution
    #     return dens