Source code for pytadbit.hic_data

December 12, 2014.

from __future__ import print_function

from future import standard_library
import os
from sys                            import stderr, modules
from collections                    import OrderedDict
from warnings                       import warn
from bisect                         import bisect_right as bisect
from pickle                         import HIGHEST_PROTOCOL, dump, load

from numpy.linalg                   import LinAlgError
from numpy                          import corrcoef, nansum, array, isnan, mean
from numpy                          import meshgrid, asarray, exp, linspace, std
from numpy                          import nanpercentile as npperc, log as nplog
from numpy                          import nanmax, ma, zeros_like
from scipy.stats                    import ttest_ind, spearmanr
from scipy.special                  import gammaincc
from scipy.cluster.hierarchy        import linkage, fcluster, dendrogram
from scipy.sparse.linalg            import eigsh
from scipy.sparse                   import csr_matrix

from pytadbit.utils.extraviews      import plot_compartments
from pytadbit.utils.extraviews      import plot_compartments_summary
from pytadbit.utils.hic_filtering   import filter_by_mean, filter_by_zero_count
from pytadbit.utils.normalize_hic   import iterative, expected
from pytadbit.parsers.genome_parser import parse_fasta
from pytadbit.parsers.bed_parser    import parse_bed
from pytadbit.utils.file_handling   import mkdir
from pytadbit.utils.hmm             import gaussian_prob, best_path, train
from pytadbit.utils.tadmaths        import calinski_harabasz
    from pytadbit.parsers.cooler_parser import cooler_file
except ImportError:

except NameError:
    basestring = str

def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
    return abs(a-b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)

[docs]class HiC_data(dict): """ This may also hold the print/write-to-file matrix functions """ def __init__(self, items, size, chromosomes=None, dict_sec=None, resolution=1, masked=None, symmetricized=False): super(HiC_data, self).__init__(items) self.__size = size self._size2 = size**2 self._symmetricize() self.bias = None self.bads = masked or {} self.chromosomes = chromosomes self.sections = dict_sec self.section_pos = {} self.resolution = resolution self.expected = None self.symmetricized = symmetricized self.compartments = {} if self.chromosomes: total = 0 for crm in self.chromosomes: self.section_pos[crm] = (total, total + self.chromosomes[crm]) total += self.chromosomes[crm] if self.sections == {}: self.section_pos = {None: (0, self.__size)} self.sections = dict([((None, i), i) for i in range(0, self.__size)]) def _symmetricize(self): """ Check if matrix is symmetric (check first 10 non-zero values) and, if not, make it symmetric - if matrix is half empty, copy values on one side to the other side - if matrix is asymmetric, sum non-diagonal values """ to_sum = False symmetric = True count = 0 for n in self: i = n // self.__size j = n % self.__size if i == j or self[i, j] == self[i, j] == 0: continue if not isclose(self[i, j], self[j, i]): if self[i, j] != 0 and self[j, i] != 0: to_sum = True symmetric = False break if count > 10: return count += 1 if symmetric: # may not reach 10 values return if to_sum: for n in list(self.keys())[:]: i = n // self.__size j = n % self.__size if i != j: self[j, i] = self[i, j] = self[j, i] + self[i, j] else: for n in list(self.keys())[:]: i = n // self.__size j = n % self.__size self[j, i] = self[i, j] = self[n] def _update_size(self, size): self.__size += size self._size2 = self.__size**2 def __len__(self): return self.__size def __getitem__(self, row_col): """ slow one... for user for fast item getting, use self.get() """ try: row, col = row_col pos = row * self.__size + col if pos > self._size2: raise IndexError( 'ERROR: row or column larger than %s' % self.__size) return self.get(pos, 0) except TypeError: if row_col > self._size2: raise IndexError( 'ERROR: position %d larger than %s^2' % (row_col, self.__size)) return self.get(row_col, 0) def __setitem__(self, row_col, val): """ slow one... for user for fast item getting, use self.get() """ try: row, col = row_col pos = row * self.__size + col if pos > self._size2: print(row, col, pos) raise IndexError( 'ERROR: row or column larger than %s' % self.__size) super(HiC_data, self).__setitem__(pos, val) except TypeError: if hasattr(self, '_size2') and row_col > self._size2: raise IndexError( 'ERROR: position %d larger than %s^2' % (row_col, self.__size)) super(HiC_data, self).__setitem__(row_col, val)
[docs] def get_hic_data_as_csr(self): """ Returns a scipy sparse matrix in Compressed Sparse Row format of the Hi-C data in the dictionary :returns: scipy sparse matrix in Compressed Sparse Row format """ values = [] cols = [] rows = [] for key, value in self.items(): row, col = round(key // self.__size), key % self.__size values.append(float(value)) cols.append(col) rows.append(row) return csr_matrix((values, (rows, cols)), shape=(self.__size,self.__size))
[docs] def add_sections_from_fasta(self, fasta): """ Add genomic coordinate to HiC_data object by getting them from a FASTA file containing chromosome sequences :param fasta: path to a FASTA file """ genome = parse_fasta(fasta, verbose=False) sections = [] genome_seq = OrderedDict() size = 0 for crm in genome: genome_seq[crm] = int(len(genome[crm])) // self.resolution + 1 size += genome_seq[crm] section_sizes = {} for crm in genome_seq: len_crm = genome_seq[crm] section_sizes[(crm,)] = len_crm sections.extend([(crm, i) for i in range(len_crm)]) dict_sec = dict([(j, i) for i, j in enumerate(sections)]) self.chromosomes = genome_seq self.sections = dict_sec if self.chromosomes: total = 0 for crm in self.chromosomes: self.section_pos[crm] = (total, total + self.chromosomes[crm]) total += self.chromosomes[crm] if size != self.__size: warn('WARNING: different sizes (%d, now:%d), ' % (self.__size, size) + 'should adjust the resolution') self.__size = size self._size2 = size**2
[docs] def add_sections(self, lengths, chr_names=None, binned=False): """ Add genomic coordinate to HiC_data object by getting them from a FASTA file containing chromosome sequences. Orders matters. :param lengths: list of chromosome lengths :param None chr_names: list of corresponding chromosome names. :param False binned: if True, lengths will not be divided by resolution """ sections = [] genome_seq = OrderedDict() size = 0 resolution = 1 if binned else self.resolution for crm, length in enumerate(lengths): cnam = 'chr' + str(crm) if not chr_names else chr_names[crm] genome_seq[cnam] = int(length) // resolution + 1 size += genome_seq[cnam] section_sizes = {} for crm in genome_seq: len_crm = genome_seq[crm] section_sizes[(crm,)] = len_crm sections.extend([(crm, i) for i in range(len_crm)]) dict_sec = dict([(j, i) for i, j in enumerate(sections)]) self.chromosomes = genome_seq self.sections = dict_sec if self.chromosomes: total = 0 for crm in self.chromosomes: self.section_pos[crm] = (total, total + self.chromosomes[crm]) total += self.chromosomes[crm] if size != self.__size: warn('WARNING: different sizes (%d, now:%d), ' % (self.__size, size) + 'should adjust the resolution') self.__size = size self._size2 = size**2
[docs] def cis_trans_ratio(self, normalized=False, exclude=None, diagonal=True, equals=None): """ Counts the number of interactions occurring within chromosomes (cis) with respect to the total number of interactions :param False normalized: used normalized data :param None exclude: exclude a given list of chromosome from the ratio (may want to exclude translocated chromosomes) :param False diagonal: replace values in the diagonal by 0 or 1 :param None equals: can pass a function that would decide if 2 chromosomes have to be considered as the same. e.g. lambda x, y: x[:4]==y[:4] will consider chr2L and chr2R as being the same chromosome. WARNING: only working on consecutive chromosomes. :returns: the ratio of cis interactions over the total number of interactions. This number is expected to be between at least 40-60% in Human classic dilution Hi-C with HindIII as restriction enzyme. """ if normalized and not self.bias: raise Exception('ERROR: experiment not normalized yet') if exclude == None: exclude = [] if equals == None: equals = lambda x, y: x == y intra = 0 if not self.chromosomes: return float('nan') # define chromosomes to be merged to_skip = set() c_prev = '' for c in self.chromosomes: if equals(c, c_prev): to_skip.add(c_prev) c_prev = c sections = sorted([-1] + [self.section_pos[c][1] for c in self.section_pos if not c in to_skip]) # defines columns to be skipped bads = set(self.bads.keys()) for c in exclude: bads.update(i for i in range(*self.section_pos[c])) # diagonal if diagonal: valid = lambda x, y: True else: valid = lambda x, y: x != y # normalization if normalized: transform = lambda x, y, z: x / self.bias[y] / self.bias[z] else: transform = lambda x, y, z: x # compute ratio for k, v in self.items(): i, j = divmod(k, self.__size) if bisect(sections, i) != bisect(sections, j): continue if i in bads or j in bads: continue if valid(i, j): # diagonal thing intra += transform(v, i, j) try: return float(intra) / self.sum(bias=self.bias if normalized else None, bads=bads) except ZeroDivisionError: return 0.
[docs] def filter_columns(self, draw_hist=False, savefig=None, perc_zero=99, by_mean=True, min_count=None, silent=False): """ Call filtering function, to remove artifactual columns in a given Hi-C matrix. This function will detect columns with very low interaction counts. Filtered out columns will be stored in the dictionary Experiment._zeros. :param False draw_hist: shows the distribution of mean values by column the polynomial fit, and the cut applied. :param None savefig: path to a file where to save the image generated; if None, the image will be shown using matplotlib GUI (the extension of the file name will determine the desired format). :param 75 perc_zero: maximum percentage of cells with no interactions allowed. :param None min_count: minimum number of reads mapped to a bin (recommended value could be 2500). If set this option overrides the perc_zero filtering... This option is slightly slower. :param True by_mean: filter columns by mean column value using :func:`pytadbit.utils.hic_filtering.filter_by_mean` function """ self.bads = filter_by_zero_count(self, perc_zero, min_count=min_count, silent=silent) if by_mean: self.bads.update(filter_by_mean( self, draw_hist=draw_hist, silent=silent, savefig=savefig, bads=self.bads)) if not silent: print('Found %d of %d columns with poor signal' % (len(self.bads), len(self)))
[docs] def sum(self, bias=None, bads=None): """ Sum Hi-C data matrix WARNING: parameters are not meant to be used by external users :params None bias: expects a dictionary of biases to use normalized matrix :params None bads: extends computed bad columns :returns: the sum of the Hi-C matrix skipping bad columns """ N = self.__size norm_sum = 0 bads = bads or self.bads if bias: for k, v in self.items(): i, j = divmod(k, N) if i in bads or j in bads: continue norm_sum += v / (bias[i] * bias[j]) else: for k, v in self.items(): i, j = divmod(k, N) if i in bads or j in bads: continue norm_sum += v return norm_sum
def normalize_expected(self, **kwargs): self.expected = expected(self, bads=self.bads, **kwargs)
[docs] def normalize_hic(self, iterations=0, max_dev=0.1, silent=False, sqrt=False, factor=1): """ Normalize the Hi-C data. It fills the Experiment.norm variable with the Hi-C values divided by the calculated weight. :param 0 iteration: number of iterations :param 0.1 max_dev: iterative process stops when the maximum deviation between the sum of row is equal to this number (0.1 means 10%) :param False silent: does not warn when overwriting weights :param False sqrt: uses the square root of the computed biases :param 1 factor: final mean number of normalized interactions wanted per cell (excludes filtered, or bad, out columns) """ bias = iterative(self, iterations=iterations, max_dev=max_dev, bads=self.bads, verbose=not silent) if sqrt: bias = dict((b, bias[b]**0.5) for b in bias) if factor: if not silent: print('rescaling to factor %d' % factor) print(' - getting the sum of the matrix') # get the sum on half of the matrix norm_sum = self.sum(bias) if not silent: print(' => %.3f' % norm_sum) print(' - rescaling biases') # divide biases target = (norm_sum / float(len(self) * len(self) * factor))**0.5 bias = dict([(b, bias[b] * target) for b in bias]) self.bias = bias
[docs] def save_biases(self, fnam, protocol=None): """ Save biases, decay and bad columns in pickle format (to be loaded by the function load_hic_data_from_bam) :param fnam: path to output file """ out = open(fnam, 'wb') dump({'biases' : self.bias, 'decay' : self.expected, 'badcol' : self.bads, 'resolution': self.resolution}, out, protocol if protocol else HIGHEST_PROTOCOL) out.close()
[docs] def load_biases(self, fnam, protocol=None): """ Load biases, decay and bad columns from pickle file :param fnam: path to input pickle file """ biases = load(open(fnam,'rb')) if biases['resolution'] != self.resolution: raise Exception(('Error: resolution in Pickle (%d) does not match ' 'the one of this HiC_data object (%d)') % ( biases['resolution'], self.resolution)) self.bias = biases['biases'] self.expected = biases['decay'] self.bads = biases['badcol']
def get_as_tuple(self): return tuple([self[i, j] for j in range(len(self)) for i in range(len(self))])
[docs] def write_coord_table(self, fname, focus=None, diagonal=True, normalized=False, format='BED'): """ writes a coordinate table to a file. :param None focus: a tuple with the (start, end) position of the desired window of data (start, starting at 1, and both start and end are inclusive). Alternatively a chromosome name can be input or a tuple of chromosome name, in order to retrieve a specific inter-chromosomal region :param True diagonal: if False, diagonal is replaced by zeroes :param False normalized: get normalized data :param BED format: either "BED" chr1 \t 111 \t 222 \t chr2:333-444,55 \t 1 \t . chr2 \t 333 \t 444 \t chr1:111-222,55 \t 2 \t . or "long-range" format: chr1:111-222 \t chr2:333-444 \t 55 chr2:333-444 \t chr1:111-222 \t 55 """ if focus: if isinstance(focus, tuple) and isinstance(focus[0], int): if len(focus) == 2: start1, end1 = focus start2, end2 = focus start1 -= 1 start2 -= 1 else: start1, end1, start2, end2 = focus start1 -= 1 start2 -= 1 elif isinstance(focus, tuple) and isinstance(focus[0], basestring): start1, end1 = self.section_pos[focus[0]] start2, end2 = self.section_pos[focus[1]] else: start1, end1 = self.section_pos[focus] start2, end2 = self.section_pos[focus] else: start1 = start2 = 0 end1 = end2 = len(self) out = open(fname, 'w') if format == 'long-range': rownam = ['%s:%d-%d' % (k[0], k[1] * self.resolution, (k[1] + 1) * self.resolution) for k in sorted(self.sections, key=lambda x: self.sections[x]) if start2 <= self.sections[k] < end2] if not rownam: raise Exception('ERROR: HiC data object should have genomic coordinates') iter_rows = self.yield_matrix(focus=focus, diagonal=diagonal, normalized=normalized) pair_string = '%s\t%s\t%f\n' if normalized else '%s\t%s\t%d\n' for nrow, row in enumerate(rownam, 1): line = next(iter_rows) iter_cols = iter(line) for col in rownam[nrow:]: val = next(iter_cols) if not val: continue out.write(pair_string % (row, col, val)) elif format == 'BED': rownam = ['%s\t%d\t%d' % (k[0], k[1] * self.resolution, (k[1] + 1) * self.resolution) for k in sorted(self.sections, key=lambda x: self.sections[x]) if start2 <= self.sections[k] < end2] colnam = ['%s:%d-%d' % (k[0], k[1] * self.resolution, (k[1] + 1) * self.resolution) for k in sorted(self.sections, key=lambda x: self.sections[x]) if start2 <= self.sections[k] < end2] if not rownam: raise Exception('ERROR: Hi-C data object should have genomic coordinates') iter_rows = self.yield_matrix(focus=focus, diagonal=diagonal, normalized=normalized) pair_string = '%s\t%s,%f\t%d\t.\n' if normalized else '%s\t%s,%d\t%d\t.\n' count = 1 for nrow, row in enumerate(rownam, 1): line = next(iter_rows) iter_cols = iter(line) for col in colnam[nrow:]: val = next(iter_cols) if not val: continue out.write(pair_string % (row, col, val, count)) count += 1 else: raise Exception('ERROR: format "%s" not found\n' % format) out.close()
[docs] def write_cooler(self, fname, normalized=False): """ writes the hic_data to a cooler file. :param False normalized: get normalized data """ if 'h5py' not in modules: raise Exception('ERROR: cooler output is not available. Probably ' + 'you need to install h5py\n') if normalized and not self.bias: raise Exception('ERROR: data not normalized yet') if not all(isinstance(val, int) for _, val in self.items()): raise Exception('ERROR: raw hic data (integer values) is needed for cooler format') if self.chromosomes: if len(self.chromosomes) > 1: sections = OrderedDict((key,val*self.resolution) for key, val in self.chromosomes.items()) else: # maybe part of a matrix sections = {next(iter(self.chromosomes)): self.__size*self.resolution} else: # maybe part of a matrix sections = {"Unknown": self.__size*self.resolution} out = cooler_file(fname, self.resolution, sections, list(sections.keys())) out.create_bins() out.prepare_matrix() for key, value in self.items(): row, col = key // self.__size, key % self.__size if row > col: # only upper triangular continue out.write_iter(0, row, col, value) out.close() if normalized: weights = [self.bias[i] if not i in self.bads else 0. for i in range(self.__size)] out.write_weights(weights, weights)
[docs] def write_matrix(self, fname, focus=None, diagonal=True, normalized=False): """ writes the matrix to a file. :param None focus: a tuple with the (start, end) position of the desired window of data (start, starting at 1, and both start and end are inclusive). Alternatively a chromosome name can be input or a tuple of chromosome name, in order to retrieve a specific inter-chromosomal region :param True diagonal: if False, diagonal is replaced by zeroes :param False normalized: get normalized data """ if focus: if isinstance(focus, tuple) and isinstance(focus[0], int): if len(focus) == 2: start1, end1 = focus start2, end2 = focus start1 -= 1 start2 -= 1 else: start1, end1, start2, end2 = focus start1 -= 1 start2 -= 1 elif isinstance(focus, tuple) and isinstance(focus[0], basestring): start1, end1 = self.section_pos[focus[0]] start2, end2 = self.section_pos[focus[1]] else: start1, end1 = self.section_pos[focus] start2, end2 = self.section_pos[focus] else: start1 = start2 = 0 end1 = end2 = len(self) out = open(fname, 'w') out.write('# MASKED %s\n' % (','.join([str(k - start1) for k in list(self.bads.keys()) if start1 <= k <= end1]))) rownam = ['%s\t%d-%d' % (k[0], k[1] * self.resolution + 1, (k[1] + 1) * self.resolution) for k in sorted(self.sections, key=lambda x: self.sections[x]) if start2 <= self.sections[k] < end2] if rownam: for line in self.yield_matrix(focus=focus, diagonal=diagonal, normalized=normalized): out.write(rownam.pop(0) + '\t' + '\t'.join([str(i) for i in line]) + '\n') else: for line in self.yield_matrix(focus=focus, diagonal=diagonal, normalized=normalized): out.write('\t'.join([str(i) for i in line]) + '\n') out.close()
[docs] def get_matrix(self, focus=None, diagonal=True, normalized=False, masked=False): """ returns a matrix. :param None focus: a tuple with the (start, end) position of the desired window of data (start, starting at 1, and both start and end are inclusive). Alternatively a chromosome name can be input or a tuple of chromosome name, in order to retrieve a specific inter-chromosomal region :param True diagonal: if False, diagonal is replaced by ones, or zeroes if normalized :param False normalized: get normalized data :param False masked: return masked arrays using the definition of bad columns :returns: matrix (a list of lists of values) """ if normalized and not self.bias: raise Exception('ERROR: experiment not normalized yet') start1, start2, end1, end2 = self._focus_coords(focus) if normalized: if diagonal: matrix = [[self[i, j] / self.bias[i] / self.bias[j] for i in range(start2, end2)] for j in range(start1, end1)] else: matrix = [[self[i, j] / self.bias[i] / self.bias[j] for i in range(start2, end2)] for j in range(start1, end1)] if start1 == start2: for i in range(len(matrix)): matrix[i][i] = 0 else: if diagonal: matrix = [[self[i, j] for i in range(start2, end2)] for j in range(start1, end1)] else: matrix = [[self[i, j] for i in range(start2, end2)] for j in range(start1, end1)] if start1 == start2: for i in range(len(matrix)): matrix[i][i] = 1 if matrix[i][i] else 0 if masked: bads1 = [b - start1 for b in self.bads if start1 <= b < end1] bads2 = [b - start2 for b in self.bads if start2 <= b < end2] m = zeros_like(matrix) for bad1 in bads1: m[:,bad1] = 1 for bad2 in bads2: m[bad2,:] = 1 matrix = ma.masked_array(matrix, m) return matrix
def _focus_coords(self, focus): siz = len(self) if focus: if isinstance(focus, tuple) and isinstance(focus[0], int): if len(focus) == 2: start1, end1 = focus start2, end2 = focus start1 -= 1 start2 -= 1 else: start1, end1, start2, end2 = focus start1 -= 1 start2 -= 1 elif isinstance(focus, tuple) and isinstance(focus[0], basestring): start1, end1 = self.section_pos[focus[0].split(':')[0]] start2, end2 = self.section_pos[focus[1].split(':')[0]] if ':' in focus[0]: pos = focus[0].split(':')[1] try: pos1, pos2 = [int(p) // self.resolution for p in pos.split('-')] except ValueError: raise Exception('ERROR: should be in format "chr3:10000:20000"') start1, end1 = start1 + pos1, start1 + pos2 if ':' in focus[1]: pos = focus[0].split(':')[1] try: pos1, pos2 = [int(p) // self.resolution for p in pos.split('-')] except ValueError: raise Exception('ERROR: should be in format "chr3:10000:20000"') start2, end2 = start1 + pos1, start1 + pos2 else: start1, end1 = self.section_pos[focus.split(':')[0]] if ':' in focus: pos = focus.split(':')[1] try: pos1, pos2 = [int(p) // self.resolution for p in pos.split('-')] except ValueError: raise Exception('ERROR: should be in format "chr3:10000:20000"') start1, end1 = start1 + pos1, start1 + pos2 start2, end2 = start1, end1 else: start1 = start2 = 0 end1 = end2 = siz return start1, start2, end1, end2
[docs] def find_compartments(self, crms=None, savefig=None, savedata=None, savecorr=None, show=False, suffix='', ev_index=None, rich_in_A=None, format='png', savedir=None, max_ev=3, show_compartment_labels=False, **kwargs): """ Search for A/B compartments in each chromosome of the Hi-C matrix. Hi-C matrix is normalized by the number interaction expected at a given distance, and by visibility (one iteration of ICE). A correlation matrix is then calculated from this normalized matrix, and its first eigenvector is used to identify compartments. Changes in sign marking boundaries between compartments. Result is stored as a dictionary of compartment boundaries, keys being chromosome names. :param 99 perc_zero: to filter bad columns :param 0.05 signal_to_noise: to calculate expected interaction counts, if not enough reads are observed at a given distance the observations of the distance+1 are summed. a signal to noise ratio of < 0.05 corresponds to > 400 reads. :param None crms: only runs these given list of chromosomes :param None savefig: path to a directory to store matrices with compartment predictions, one image per chromosome, stored under 'chromosome-name_EV1.png'. :param png format: in which to save the figures. :param False show: show the plot :param None savedata: path to a new file to store compartment predictions, one file only. :param None savedir: path to a directory to store coordinates of each eigenvector, one per chromosome. Each file contains one eigenvector per column, the first one being the one used as reference. This eigenvector is also rotated according to the prediction if a `rich_in_A` array was given. :param None savecorr: path to a directory where to save correlation matrices of each chromosome :param -1 vmin: for the color scale of the plotted map (use vmin='auto', and vmax='auto' to color according to the absolute maximum found). :param 1 vmax: for the color scale of the plotted map (use vmin='auto', and vmax='auto' to color according to the absolute maximum found). :param False yield_ev1: if True yields one list per chromosome with the first eigenvector used to compute compartments. :param '' suffix: to be placed after file names of compartment images :param 3 max_ev: maximum number of EV to try :param None ev_index: a list of number referring to the index of the eigenvector to be used. By default the first eigenvector is used. WARNING: index starts at 1, default is thus a list of ones. Note: if asking for only one chromosome the list should be only of one element. :param None rich_in_A: by default compartments are identified using mean number of intra-interactions (A compartments are expected to have less). However this measure is not very accurate. Using this parameter a path to a BED or BED-Graph file with a list of genes or active epigenetic marks can be passed, and used instead of the mean interactions. :param False show_compartment_labels: if True draw A and B compartment blocks. TODO: this is really slow... Notes: building the distance matrix using the amount of interactions instead of the mean correlation, gives generally worse results. :returns: 1- a dictionary with the N (max_ev) first eigenvectors in the form: {Chromosome_name: (Eigenvalue: [Eigenvector])} Sign of the eigenvectors are changed in order to match the prediction of A/B compartments (positive is A). 2- a dictionary of statistics of enrichment for A compartments (Spearman rho). """ if not self.bads: if kwargs.get('verbose', False): print('Filtering bad columns %d' % 99) self.filter_columns(perc_zero=kwargs.get('perc_zero', 99), by_mean=False, silent=True) if len(self.bads) == len(self): self.bads = {} warn('WARNING: all columns would have been filtered out, ' 'filtering disabled') if not self.expected: if kwargs.get('verbose', False): print('Normalizing by expected values') self.expected = expected(self, bads=self.bads, **kwargs) if not self.bias: if kwargs.get('verbose', False): print('Normalizing by ICE (1 round)') self.normalize_hic(iterations=0, silent=not kwargs.get('verbose', False)) if savefig: mkdir(savefig) if savecorr: mkdir(savecorr) if savedir: mkdir(savedir) if suffix != '': suffix = '_' + suffix # parse bed file if rich_in_A and isinstance(rich_in_A, basestring): rich_in_A = parse_bed(rich_in_A, resolution=self.resolution) cmprts = {} firsts = {} ev_nums = {} count = 0 richA_stats = dict((sec, None) for sec in self.section_pos) for sec in self.section_pos: if crms and sec not in crms: continue if kwargs.get('verbose', False): print('Processing chromosome', sec) # get chromosomal matrix try: matrix = [[(float(self[i,j]) / self.expected[sec][abs(j-i)] / self.bias[i] / self.bias[j]) for i in range(*self.section_pos[sec]) if not i in self.bads] for j in range(*self.section_pos[sec]) if not j in self.bads] except KeyError: if sec in self.expected and not self.expected[sec]: matrix = [] else: matrix = [[(float(self[i,j]) / self.expected[abs(j-i)] / self.bias[i] / self.bias[j]) for i in range(*self.section_pos[sec]) if not i in self.bads] for j in range(*self.section_pos[sec]) if not j in self.bads] if not matrix: # MT chromosome will fall there warn('Chromosome %s is probably MT :)' % (sec)) cmprts[sec] = [] count += 1 continue # enforce symmetry for i in range(len(matrix)): for j in range(i+1, len(matrix)): matrix[i][j] = matrix[j][i] # compute correlation coefficient try: matrix = [list(m) for m in corrcoef(matrix)] except TypeError: # very small chromosome? warn('Chromosome %s is probably MT :)' % (sec)) cmprts[sec] = [] count += 1 continue # replace nan in correlation matrix matrix = [[0. if isnan(v) else v for v in l] for l in matrix] # write correlation matrix to file. replaces filtered row/columns by NaN if savecorr: out = open(os.path.join(savecorr, '%s_corr-matrix%s.tsv' % (sec, suffix)), 'w') start1, end1 = self.section_pos[sec] out.write('# MASKED %s\n' % (' '.join([str(k - start1) for k in list(self.bads.keys()) if start1 <= k <= end1]))) rownam = ['%s\t%d-%d' % (k[0], k[1] * self.resolution, (k[1] + 1) * self.resolution) for k in sorted(self.sections, key=lambda x: self.sections[x]) if k[0] == sec] length = self.section_pos[sec][1] - self.section_pos[sec][0] empty = 'NaN\t' * (length - 1) + 'NaN\n' badrows = 0 for row, posx in enumerate(range(self.section_pos[sec][0], self.section_pos[sec][1])): if posx in self.bads: out.write(rownam.pop(0) + '\t' + empty) badrows += 1 continue vals = [] badcols = 0 for col, posy in enumerate(range(self.section_pos[sec][0], self.section_pos[sec][1])): if posy in self.bads: vals.append('NaN') badcols += 1 continue vals.append(str(matrix[row-badrows][col-badcols])) out.write(rownam.pop(0) + '\t' +'\t'.join(vals) + '\n') out.close() # get eigenvectors try: # This eighs is very very fast, only ask for one eigenvector evals, evect = eigsh(array(matrix), k=max_ev if max_ev else (len(matrix) - 1)) except (LinAlgError, ValueError): warn('Chromosome %s too small to compute PC1' % (sec)) cmprts[sec] = [] # Y chromosome, or so... count += 1 continue # define breakpoints, and store first EVs n_first = [list(evect[:, -i]) for i in range(1, (max_ev + 1) if max_ev else len(matrix))] ev_num = (ev_index[count] - 1) if ev_index else 0 breaks = [i for i, (a, b) in enumerate(zip(n_first[ev_num][1:], n_first[ev_num][:-1])) if a * b < 0] + [len(n_first[ev_num]) - 1] breaks = [{'start': breaks[i-1] + 1 if i else 0, 'end': b} for i, b in enumerate(breaks)] # rescale EVs, matrix and breaks by inserting NaNs in bad column places beg, end = self.section_pos[sec] bads = [k - beg for k in sorted(self.bads) if beg <= k <= end] for evect in n_first: _ = [evect.insert(b, float('nan')) for b in bads] _ = [matrix.insert(b, [float('nan')] * len(matrix[0])) for b in bads] _ = [matrix[i].insert(b, float('nan')) for b in bads for i in range(len(n_first[0]))] for b in bads: # they are sorted for brk in breaks: if brk['start'] >= b: brk['start'] += 1 brk['end' ] += 1 else: brk['end' ] += brk['end'] > b bads = set(bads) # rescale first EV and change sign according to rich_in_A richA_stats[sec] = None sign = 1 if rich_in_A and sec in rich_in_A: eves = [] gccs = [] for i, v in enumerate(n_first[ev_num]): if i in bads: continue try: gc = rich_in_A[sec][i] except KeyError: continue gccs.append(gc) eves.append(v) r_stat, richA_pval = spearmanr(eves, gccs) if kwargs.get('verbose', False): print (' - Spearman correlation between "rich in A" and ' 'Eigenvector:\n' ' rho: %.7f p-val:%.7f' % (r_stat, richA_pval)) richA_stats[sec] = r_stat # switch sign and normalize sign = 1 if r_stat > 0 else -1 for i in range(len(n_first)): n_first[i] = [sign * v for v in n_first[i]] # store it ev_nums[sec] = ev_num + 1 cmprts[sec] = breaks if rich_in_A: for cmprt in cmprts[sec]: try: cmprt['dens'] = sum(rich_in_A.get(sec, {None: 0}).get(i, 0) for i in range(cmprt['start'], cmprt['end'] + 1) if not i in bads) / float(cmprt['end'] - cmprt['start']) except ZeroDivisionError: cmprt['dens'] = float('nan') cmprt['type'] = 'A' if n_first[ev_num][cmprt['start']] > 0 else'B' firsts[sec] = (evals[::-1], n_first) # needed for the plotting if savefig or show: vmin = kwargs.get('vmin', -1) vmax = kwargs.get('vmax', 1) if vmin == 'auto' == vmax: vmax = max([abs(npperc(matrix, 99.5)), abs(npperc(matrix, 0.5))]) vmin = -vmax try: if savefig: fnam = os.path.join(savefig, '%s_EV%d%s.%s' % (str(sec), ev_nums[sec], suffix, format)) else: fnam = None plot_compartments( sec, n_first[ev_num], cmprts, matrix, show, fnam, vmin=vmin, vmax=vmax, whichpc=ev_num + 1, showAB=show_compartment_labels) except AttributeError: warn(('WARNING: chromosome %s too small for plotting.' 'Skipping image creation.') % sec) except ValueError: warn(('WARNING: chromosome %s too small for plotting.' 'Skipping image creation.') % sec) self.compartments = cmprts if savedata: self.write_compartments(savedata, chroms=list(self.compartments.keys()), ev_nums=ev_nums) if savedir: ncrm = 0 for sec in self.section_pos: if crms and sec not in crms: continue ev_file = open(os.path.join( savedir, '%s_EigVect%d.tsv' % ( sec, ev_index[ncrm] if ev_index else 1)), 'w') ev_file.write('# %s\n' % ('\t'.join( 'EV_%d (%.4f)' % (i, v) for i, v in enumerate(firsts[sec][0], 1)))) ev_file.write('\n'.join(['\t'.join([str(v) for v in vs]) for vs in zip(*firsts[sec][1])])) ev_file.close() ncrm += 1 return firsts, richA_stats
[docs] def find_compartments_beta(self, crms=None, savefig=None, savedata=None, savecorr=None, show=False, suffix='', how='', label_compartments='hmm', log=None, max_mean_size=10000, ev_index=None, rich_in_A=None, max_ev=3,show_compartment_labels=False, **kwargs): """ Search for A/B compartments in each chromosome of the Hi-C matrix. Hi-C matrix is normalized by the number interaction expected at a given distance, and by visibility (one iteration of ICE). A correlation matrix is then calculated from this normalized matrix, and its first eigenvector is used to identify compartments. Changes in sign marking boundaries between compartments. Result is stored as a dictionary of compartment boundaries, keys being chromosome names. :param 99 perc_zero: to filter bad columns :param 0.05 signal_to_noise: to calculate expected interaction counts, if not enough reads are observed at a given distance the observations of the distance+1 are summed. a signal to noise ratio of < 0.05 corresponds to > 400 reads. :param None crms: only runs these given list of chromosomes :param None savefig: path to a directory to store matrices with compartment predictions, one image per chromosome, stored under 'chromosome-name.png'. :param False show: show the plot :param None savedata: path to a new file to store compartment predictions, one file only. :param None savecorr: path to a directory where to save correlation matrices of each chromosome :param -1 vmin: for the color scale of the plotted map (use vmin='auto', and vmax='auto' to color according to the absolute maximum found). :param 1 vmax: for the color scale of the plotted map (use vmin='auto', and vmax='auto' to color according to the absolute maximum found). :param False yield_ev1: if True yields one list per chromosome with the first eigenvector used to compute compartments. :param '' suffix: to be placed after file names of compartment images :param 3 max_ev: maximum number of EV to try :param None ev_index: a list of number referring to the index of the eigenvector to be used. By default the first eigenvector is used. WARNING: index starts at 1, default is thus a list of ones. Note: if asking for only one chromosome the list should be only of one element. :param None rich_in_A: by default compartments are identified using mean number of intra-interactions (A compartments are expected to have less). However this measure is not very accurate. Using this parameter a path to a BED or BED-Graph file with a list of genes or active epigenetic marks can be passed, and used instead of the mean interactions. :param None log: path to a folder where to save log of the assignment of A/B compartments :param hmm label_compartments: label compartments into A/B categories, otherwise just find borders (faster). Can be either hmm (default), or cluster. :param 'ratio' how: ratio divide by column, subratio divide by compartment, diagonal only uses diagonal :param False'show_compartment_labels': if True draw A and B compartment blocks. TODO: this is really slow... Notes: building the distance matrix using the amount of interactions instead of the mean correlation, gives generally worse results. :returns: a dictionary with the N (max_ev) first eigen vectors used to define compartment borders for each chromosome (keys are chromosome names) """ if not self.bads: if kwargs.get('verbose', False): print('Filtering bad columns %d' % 99) self.filter_columns(perc_zero=kwargs.get('perc_zero', 99), by_mean=False, silent=True) if len(self.bads) == len(self): self.bads = {} warn('WARNING: all columns would have been filtered out, ' 'filtering disabled') if not self.expected: if kwargs.get('verbose', False): print('Normalizing by expected values') self.expected = expected(self, bads=self.bads, **kwargs) if not self.bias: if kwargs.get('verbose', False): print('Normalizing by ICE (1 round)') self.normalize_hic(iterations=0, silent=not kwargs.get('verbose', False)) if savefig: mkdir(savefig) if savecorr: mkdir(savecorr) if suffix != '': suffix = '_' + suffix # parse bed file if rich_in_A: rich_in_A = parse_bed(rich_in_A, resolution=self.resolution) cmprts = {} firsts = {} ev_nums = {} count = 0 for sec in self.section_pos: if crms and sec not in crms: continue if kwargs.get('verbose', False): print('Processing chromosome', sec) matrix = [[(float(self[i,j]) / self.expected[abs(j-i)] / self.bias[i] / self.bias[j]) for i in range(*self.section_pos[sec]) if not i in self.bads] for j in range(*self.section_pos[sec]) if not j in self.bads] if not matrix: # MT chromosome will fall there warn('Chromosome %s is probably MT :)' % (sec)) cmprts[sec] = [] count += 1 continue for i in range(len(matrix)): for j in range(i+1, len(matrix)): matrix[i][j] = matrix[j][i] try: matrix = [list(m) for m in corrcoef(matrix)] except TypeError: # very small chromosome? warn('Chromosome %s is probably MT :)' % (sec)) cmprts[sec] = [] count += 1 continue # write correlation matrix to file. replaces filtered row/columns by NaN if savecorr: out = open(os.path.join(savecorr, '%s_corr-matrix.tsv' % (sec)), 'w') start1, end1 = self.section_pos[sec] out.write('# MASKED %s\n' % (' '.join([str(k - start1) for k in list(self.bads.keys()) if start1 <= k <= end1]))) rownam = ['%s\t%d-%d' % (k[0], k[1] * self.resolution, (k[1] + 1) * self.resolution) for k in sorted(self.sections, key=lambda x: self.sections[x]) if k[0] == sec] length = self.section_pos[sec][1] - self.section_pos[sec][0] empty = 'NaN\t' * (length - 1) + 'NaN\n' badrows = 0 for row, posx in enumerate(range(self.section_pos[sec][0], self.section_pos[sec][1])): if posx in self.bads: out.write(rownam.pop(0) + '\t' + empty) badrows += 1 continue vals = [] badcols = 0 for col, posy in enumerate(range(self.section_pos[sec][0], self.section_pos[sec][1])): if posy in self.bads: vals.append('NaN') badcols += 1 continue vals.append(str(matrix[row-badrows][col-badcols])) out.write(rownam.pop(0) + '\t' +'\t'.join(vals) + '\n') out.close() try: # This eighs is very very fast, only ask for one eigvector _, evect = eigsh(array(matrix), k=max_ev) except (LinAlgError, ValueError): warn('Chromosome %s too small to compute PC1' % (sec)) cmprts[sec] = [] # Y chromosome, or so... count += 1 continue index = ev_index[count] if ev_index else 1 n_first = [list(evect[:, -i]) for i in range(1, max_ev + 1)] for ev_num in range(index, max_ev + 1): first = list(evect[:, -ev_num]) breaks = [i for i, (a, b) in enumerate(zip(first[1:], first[:-1])) if a * b < 0] + [len(first) - 1] breaks = [{'start': breaks[i-1] + 1 if i else 0, 'end': b} for i, b in enumerate(breaks)] if (self.resolution * (len(breaks) - 1.0) / len(matrix) > max_mean_size): warn('WARNING: number of compartments found with the ' 'EigenVector number %d is too low (%d compartments ' 'in %d rows), for chromosome %s' % ( ev_num, len(breaks), len(matrix), sec)) else: break if (self.resolution * (len(breaks) - 1.0) / len(matrix) > max_mean_size): warn('WARNING: keeping first eigenvector, for chromosome %s' % ( sec)) ev_num = 1 if ev_index: ev_num = ev_index[count] first = list(evect[:, -ev_num]) breaks = [i for i, (a, b) in enumerate(zip(first[1:], first[:-1])) if a * b < 0] + [len(first) - 1] breaks = [{'start': breaks[i-1] + 1 if i else 0, 'end': b} for i, b in enumerate(breaks)] ev_nums[sec] = ev_num beg, end = self.section_pos[sec] bads = [k - beg for k in sorted(self.bads) if beg <= k <= end] for evect in n_first: _ = [evect.insert(b, float('nan')) for b in bads] _ = [first.insert(b, 0) for b in bads] _ = [matrix.insert(b, [float('nan')] * len(matrix[0])) for b in bads] _ = [matrix[i].insert(b, float('nan')) for b in bads for i in range(len(first))] breaks = [i for i, (a, b) in enumerate(zip(first[1:], first[:-1])) if a * b < 0] + [len(first) - 1] breaks = [{'start': breaks[i-1] + 1 if i else 0, 'end': b} for i, b in enumerate(breaks)] cmprts[sec] = breaks firsts[sec] = n_first # needed for the plotting self._apply_metric(cmprts, sec, rich_in_A, how=how) if label_compartments == 'cluster': if log: logf = os.path.join(log, sec + suffix + '.log') else: logf = None gammas = {} for n_clust in range(2, 4): for gamma in range(0, 101, 1): scorett, tt, prop = _cluster_ab_compartments( float(gamma)/100, matrix, breaks, cmprts[sec], rich_in_A, ev_num=ev_num, log=logf, save=False, verbose=kwargs.get('verbose', False), n_clust=n_clust) gammas[gamma] = scorett, tt, prop gamma = min(list(gammas.keys()), key=lambda k: gammas[k][0]) if gammas[gamma][0] - gammas[gamma][1] > 7: print( ' WARNING: minimum showing very low ' 'intermeagling of A/B compartments, trying ' 'with 3 clusters, for chromosome %s', sec) gammas = {} continue if kwargs.get('verbose', False): print(' ====> minimum:', gamma) break _ = _cluster_ab_compartments(float(gamma)/100, matrix, breaks, cmprts[sec], rich_in_A, save=True, log=logf, ev_num=ev_num, n_clust=n_clust) if savefig or show: vmin = kwargs.get('vmin', -1) vmax = kwargs.get('vmax', 1) if vmin == 'auto' == vmax: vmax = max([abs(npperc(matrix, 99.5)), abs(npperc(matrix, 0.5))]) vmin = -vmax plot_compartments( sec, first, cmprts, matrix, show, savefig + '/chr' + str(sec) + suffix + '.png' if savefig else None, vmin=vmin, vmax=vmax, whichpc=ev_num,showAB=show_compartment_labels) if label_compartments == 'cluster' or label_compartments == 'hmm': plot_compartments_summary( sec, cmprts, show, savefig + '/chr' + str(sec) + suffix + '_summ.png' if savefig else None) count += 1 if label_compartments == 'hmm': x = {} for sec in self.section_pos: beg, end = self.section_pos[sec] bads = [k - beg for k in self.bads if beg <= k <= end] try: x[sec] = [j for i, j in enumerate(firsts[sec][ev_nums[sec] - 1]) if not i in bads] except KeyError: continue # train two HMMs on the genomic data: # - one with 2 states A B # - one with 3 states A B I # - one with 4 states A a B b # - one with 5 states A a B b I models = {} for n in range(2, 6): if kwargs.get('verbose', False): print ('Training HMM for %d categories of ' 'compartments' % n) models[n] = _training(x, n, kwargs.get('verbose', False)) # apply HMM models on each chromosome results = {} for sec in self.section_pos: if not sec in x: continue beg, end = self.section_pos[sec] bads = [k - beg for k in self.bads if beg <= k <= end] if kwargs.get('verbose', False): print('Chromosome', sec) # print 'CMPRTS before ', sec, cmprts[sec] n_states, breaks = _hmm_refine_compartments( x[sec], models, bads, kwargs.get('verbose', False)) results[sec] = n_states, breaks cmprts[sec] = breaks # print 'CMPRTS after hmm', sec, cmprts[sec] self._apply_metric(cmprts, sec, rich_in_A, how=how) if rich_in_A: test = lambda x: x >= 1 else: test = lambda x: x < 1 max_type = nanmax([c['type'] for c in cmprts[sec]]) # find which category of compartment has the highest "density" atyp = 0. alen = 0. btyp = 0. blen = 0. max_type = nanmax([c['type'] for c in cmprts[sec]]) for typ in range(5): subset = set([i for i, c in enumerate(cmprts[sec]) if c['type'] == typ]) dens = sum(cmprts[sec][c]['dens'] * (cmprts[sec][c]['end'] - cmprts[sec][c]['start']) for c in subset) leng = sum((cmprts[sec][c]['end'] - cmprts[sec][c]['start'])**2 / 2. for c in subset) # leng = sum(1 for c in subset) val = float(dens) / leng if leng else 0. #if typ == 0: if typ < max_type / 2.: alen += leng atyp += val * leng # elif typ == max_type: elif typ > max_type / 2.: blen += leng btyp += val * leng for i, comp in enumerate(cmprts[sec]): if comp['type'] < max_type / 2.: # if mean density of compartments of type 0 is higher than 1 # than label them as 'B', otherwise, as 'A' if comp['type'] == 0: comp['type'] = 'A' if test(val) else 'B' else: comp['type'] = 'a' if test(val) else 'b' elif comp['type'] > max_type / 2.: if comp['type'] == max_type: comp['type'] = 'B' if test(val) else 'A' else: comp['type'] = 'b' if test(val) else 'a' elif isnan(comp['type']): comp['type'] = 'NA' else: comp['type'] = 'I' self.compartments = cmprts if savedata: self.write_compartments(savedata, chroms=list(self.compartments.keys()), ev_nums=ev_nums) return firsts
def _apply_metric(self, cmprts, sec, rich_in_A, how='ratio'): """ calculate compartment internal density if no rich_in_A, otherwise sum this list """ # print 'SEGMENTS' # print sec, self.section_pos[sec] # for i in range(0, len(cmprts[sec]), 20): # print ' ' + ''.join(['%5d/%-5d'% (s['start'], s['end']) for s in cmprts[sec][i:i+20]]) # print 'CHROMOSOME', sec for cmprt in cmprts[sec]: if rich_in_A: beg1, end1 = cmprt['start'], cmprt['end'] + 1 sec_matrix = [rich_in_A.get(sec, {None: 0}).get(i, 0) for i in range(beg1, end1) if not i in self.bads] try: cmprt['dens'] = float(sum(sec_matrix)) / len(sec_matrix) except ZeroDivisionError: cmprt['dens'] = 0. else: beg, end = self.section_pos[sec] beg1, end1 = cmprt['start'] + beg, cmprt['end'] + beg + 1 # print 'BEG:%7d, END:%7d, LEN bias:%7d, LEN self:%7d, LEN expected:%7d' % (beg1, end1, len(self.bias), # len(self), len(self.expected)) if 'diagonal' in how: sec_matrix = [(self[i,i] / self.expected[0] / self.bias[i]**2) for i in range(beg1, end1) if not i in self.bads] else: #if 'compartment' in how: sec_matrix = [(self[i,j] / self.expected[abs(j-i)] / self.bias[i] / self.bias[j]) for i in range(beg1, end1) if not i in self.bads for j in range(beg1, end1) if not j in self.bads] if '/compartment' in how: # diagonal / compartment sec_column = [(self[i,j] / self.expected[abs(j-i)] / self.bias[i] / self.bias[j]) for i in range(beg1, end1) if not i in self.bads for j in range(beg1, end1) if not j in self.bads] elif '/column' in how: sec_column = [(self[i,j] / self.expected[abs(j-i)] / self.bias[i] / self.bias[j]) for i in range(beg1, end1) if not i in self.bads for j in range(beg, end) if not j in self.bads] else: sec_column = [1.] try: if 'type' in cmprt and isnan(cmprt['type']): cmprt['dens'] = 1. else: cmprt['dens'] = float(sum(sec_matrix)) / sum(sec_column) except ZeroDivisionError: cmprt['dens'] = 1. # normalize to 1.0 try: if 'type' in cmprt: # hmm already run and have the types definded meanh = (sum(cmprt['dens'] for cmprt in cmprts[sec] if not isnan(cmprt['type'])) / sum(1 for cmprt in cmprts[sec] if not isnan(cmprt['type']))) else: meanh = (sum(cmprt['dens'] for cmprt in cmprts[sec]) / sum(1 for cmprt in cmprts[sec])) except ZeroDivisionError: meanh = 1. for cmprt in cmprts[sec]: try: if 'type' in cmprt and isnan(cmprt['type']): cmprt['dens'] = 1.0 else: cmprt['dens'] /= meanh except ZeroDivisionError: cmprt['dens'] = 1.
[docs] def write_compartments(self, savedata, chroms=None, ev_nums=None): """ Write compartments to a file. :param savedata: path to a file. :param None chroms: write only the given list of chromosomes (default all chromosomes are written, note that the first column corresponding to chromosome name will disappear in non default case) """ out = open(savedata, 'w') sections = chroms if chroms else list(self.compartments.keys()) if ev_nums: for sec in sections: try: out.write('## CHR %s\tEigenvector: %d\n' % (sec, ev_nums[sec])) except KeyError: continue out.write('#%sstart\tend\trich in A\ttype\n'% ( 'CHR\t' if len(sections) > 1 else '\t')) for sec in sections: for c in self.compartments[sec]: out.write('%s%d\t%d\t%.2f\t%s\n' % ( (str(sec) + '\t') if sections else '\t', c['start'] + 1, c['end'] + 1, c.get('dens', float('nan')), c.get('type', ''))) out.close()
[docs] def yield_matrix(self, focus=None, diagonal=True, normalized=False): """ Yields a matrix line by line. Bad row/columns are returned as null row/columns. :param None focus: a tuple with the (start, end) position of the desired window of data (start, starting at 1, and both start and end are inclusive). Alternatively a chromosome name can be input or a tuple of chromosome name, in order to retrieve a specific inter-chromosomal region :param True diagonal: if False, diagonal is replaced by zeroes :param False normalized: get normalized data :yields: matrix line by line (a line being a list of values) """ siz = len(self) if normalized and not self.bias: raise Exception('ERROR: experiment not normalized yet') if focus: if isinstance(focus, tuple) and isinstance(focus[0], int): if len(focus) == 2: start1, end1 = focus start2, end2 = focus start1 -= 1 start2 -= 1 else: start1, end1, start2, end2 = focus start1 -= 1 start2 -= 1 elif isinstance(focus, tuple) and isinstance(focus[0], basestring): start1, end1 = self.section_pos[focus[0]] start2, end2 = self.section_pos[focus[1]] else: start1, end1 = self.section_pos[focus] start2, end2 = self.section_pos[focus] else: start1 = start2 = 0 end1 = end2 = siz if normalized: for i in range(start2, end2): # if bad column: if i in self.bads: yield [0.0 for j in range(start1, end1)] # if we want the diagonal, or we don't but are looking at a # region that is not symmetric elif diagonal or start1 != start2: yield [self[i, j] / self.bias[i] / self.bias[j] for j in range(start1, end1)] # diagonal replaced by zeroes else: yield ([self[i, j] / self.bias[i] / self.bias[j] for j in range(start1, i)] + [0.0] + [self[i, j] / self.bias[i] / self.bias[j] for j in range(i + 1, end1)]) else: for i in range(start2, end2): # if bad column: if i in self.bads: yield [0 for j in range(start1, end1)] # if we want the diagonal, or we don't but are looking at a # region that is not symmetric elif diagonal or start1 != start2: yield [self[i, j] for j in range(start1, end1)] # diagonal replaced by zeroes else: yield ([self[i, j] for j in range(start1, i)] + [0] + [self[i, j] for j in range(i + 1, end1)])
def _hmm_refine_compartments(xsec, models, bads, verbose): prevll = float('-inf') prevdf = 0 results = {} for n in range(2, 6): E, pi, T = models[n] probs = gaussian_prob(xsec, E) pathm, llm = best_path(probs, pi, T) pathm = asarray(list(map(float, pathm))) df = n**2 - n + n * 2 + n - 1 len_seq = len(pathm) lrt = gammaincc((df - prevdf) / 2., (llm - prevll) / 2.) bic = -2 * llm + df * nplog(len_seq) aic = 2 * df - 2 * llm if verbose: print('Ll for %d states (%d df): %4.0f AIC: %4.0f BIC: %4.0f LRT=%f'% ( n, df, llm, aic, bic, lrt)) prevdf = df prevll = llm results[n] = {'AIC': aic, 'BIC': bic, 'LRT': lrt, 'PATH': pathm} n_states = min(results, key=lambda x: results[x]['BIC']) results = list(results[n_states]['PATH']) # print 'RESULTS', results _ = [results.insert(b, float('nan')) for b in sorted(bads)] # print 'RESULTS', results breaks = [(i, b) for i, (a, b) in enumerate(zip(results[1:], results[:-1])) if str(a) != str(b)] + [len(results) - 1] # print 'BREAKS', breaks breaks[-1] = (breaks[-1], results[-1]) # print 'BREAKS', breaks breaks = [{'start': breaks[i-1][0] + 1 if i else 0, 'end': b, 'type': a} for i, (b, a) in enumerate(breaks)] # print 'BREAKS', breaks return n_states, breaks def _training(x, n, verbose): """ define default emission transition and initial states, and train the hmm """ pi = [0.5 - ((n - 2) * 0.05)**2 if i == 0 or i == n - 1 else ((n - 2)*0.05)**2*2 / (n - 2) for i in range(n)] T = [[0.9 if i==j else 0.1/(n-1) for i in range(n)] for j in range(n)] E = asarray(list(zip(linspace(-1, 1, n), [1./n for _ in range(n)]))) # normalize values of the first eigenvector for c in x: this_mean = mean(x[c]) this_std = std (x[c]) x[c] = [v - this_mean for v in x[c]] x[c] = [v / this_std for v in x[c]] train(pi, T, E, list(x.values()), verbose=verbose, threshold=1e-6, n_iter=1000) return E, pi, T def _cluster_ab_compartments(gamma, matrix, breaks, cmprtsec, rich_in_A, save=True, ev_num=1, log=None, verbose=False, savefig=None, n_clust=2): # function to convert correlation into distances gamma += 1 func = lambda x: -abs(x)**gamma / x funczero = lambda x: 0.0 # calculate distance_matrix dist_matrix = [[0 for _ in range(len(breaks))] for _ in range(len(breaks))] scores = {} for k, cmprt in enumerate(cmprtsec): beg1, end1 = cmprt['start'], cmprt['end'] + 1 diff1 = end1 - beg1 scores[(k,k)] = dist_matrix[k][k] = -1 for l in range(k + 1, len(cmprtsec)): beg2, end2 = cmprtsec[l]['start'], cmprtsec[l]['end'] + 1 val = nansum([matrix[i][j] for i in range(beg1, end1) for j in range(beg2, end2)]) / (end2 - beg2) / diff1 try: scores[(k,l)] = dist_matrix[k][l] = scores[(l,k)] = dist_matrix[l][k] = func(val) except ZeroDivisionError: scores[(k,l)] = dist_matrix[k][l] = scores[(l,k)] = dist_matrix[l][k] = funczero(val) if isnan(scores[(k,l)]): scores[(k,l)] = dist_matrix[k][l] = scores[(l,k)] = dist_matrix[l][k] = funczero(0) # cluster compartments according to their correlation score try: clust = linkage(dist_matrix, method='ward') except UnboundLocalError: warn('WARNING: Chromosome probably too small. Skipping') return (float('inf'), float('inf'), float('inf')) # find best place to divide dendrogram (only check 1, 2, 3 or 4 clusters) solutions = {} for k in clust[:,2][-3:]: clusters = {} _ = [clusters.setdefault(j, []).append(i) for i, j in enumerate(fcluster(clust, k, criterion='distance'))] solutions[k] = {'out': clusters} solutions[k]['score'] = calinski_harabasz(scores, clusters) # plot if savefig: xedges = [b['start'] for b in breaks] yedges = [b['start'] for b in breaks] xedges += [breaks[-1]['end']] yedges += [breaks[-1]['end']] X, Y = meshgrid(xedges, yedges) import matplotlib.pyplot as plt fig = plt.figure(figsize=(10,10)) _ = fig.add_axes([0.09,0.1,0.2,0.6]) Z1 = dendrogram(clust, orientation='left') idx1 = Z1['leaves'] idx2 = Z1['leaves'] D = asarray(dist_matrix)[idx1,:] D = D[:,idx2] axmatrix = fig.add_axes([0.3,0.1,0.6,0.6]) m = axmatrix.pcolormesh(X, Y, D) axmatrix.set_aspect('equal') axmatrix.set_yticks([]) axmatrix.set_xlim((0, breaks[-1]['end'])) axmatrix.set_ylim((0, breaks[-1]['end'])) plt.colorbar(m) plt.savefig(savefig) try: # take best cluster according to calinski_harabasz score clusters = [solutions[s] for s in sorted( solutions, key=lambda x: solutions[x]['score']) if solutions[s]['score']>0][1 - n_clust]['out'] except IndexError: # warn('WARNING1: compartment clustering is not clear. Skipping') return (float('inf'), float('inf'), float('inf')) if len(clusters) != n_clust: # warn('WARNING2: compartment clustering is too clear. Skipping') return (float('inf'), float('inf'), float('inf')) # labeling compartments. A compartments shall have lower # mean intra-interactions dens = {} if rich_in_A: test = lambda x: x >= 1 else: test = lambda x: x < 1 for k in clusters: val = sum([cmprtsec[c]['dens'] for c in clusters[k]]) / len(clusters[k]) dens['A' if test(val) else 'B'] = [ cmprtsec[c]['dens'] for c in clusters[k] if cmprtsec[c]['end'] + 1 - cmprtsec[c]['start'] > 2] for c in clusters[k]: cmprtsec[c]['type'] = 'A' if test(val) else 'B' try: tt, pval = ttest_ind(dens['A'], dens['B']) except ZeroDivisionError: return (float('inf'), float('inf'), float('inf')) prop = float(len(dens['A'])) / (len(dens['A']) + len(dens['B'])) # to avoid having all A or all B # score = 5000 * (prop - 0.5)**4 - 2 # to avoid having consecutive As or Bs score = 0. prev = None for cmprt in cmprtsec: if cmprt.get('type', None) == prev: score += 1. prev = cmprt.get('type', prev) score /= len(cmprtsec) score = exp(10 * (score - 0.4)) # 5000 * (score - 0.5)**4 - 2 # score = score1 + score2 if verbose: print ('[EV%d CL%s] g:%5s prop:%5s%% tt:%7s ' 'score-interleave:%5s ' # score-proportion:%7s 'final: %7s pv:%7s' % ( ev_num, n_clust, gamma - 1, round(prop * 100, 1), round(tt, 3), round(score, 3), #round(score2, 3), round(score + tt, 3), round(pval, 5))) if log: log = open(log, 'a') log.write('[EV%d CL%s] g:%5s prop:%5s%% tt:%6s ' 'score-interleave:%6s ' # score-proportion:%7s 'final: %7s pv:%s\n' % ( ev_num, n_clust, gamma - 1, round(prop * 100, 1), round(tt, 3), round(score, 3), # round(score2, 3), round(score + tt, 3), round(pval, 4))) log.close() if not save: for cmprt in cmprtsec: if 'type' in cmprt: cmprt['type'] = None return score + tt, tt, prop