Source code for pytadbit.modelling.structuralmodels

"""
19 Jul 2013
"""
from __future__ import print_function
from future import standard_library
standard_library.install_aliases()
try:
    from pickle5                      import load  # python < 3.8
except ImportError:
    from pickle                       import load
from pickle                           import dump, HIGHEST_PROTOCOL
from subprocess                       import Popen, PIPE
from math                             import acos, degrees, pi, sqrt
from warnings                         import warn, catch_warnings, simplefilter    
from sys                              import version_info
if (version_info > (3, 0)):
    from string                           import ascii_uppercase as uc, ascii_lowercase as lc
else:
    from string                           import uppercase as uc, lowercase as lc
from random                           import random
from os.path                          import exists
from itertools                        import combinations
from uuid                             import uuid5, UUID
from hashlib                          import md5
from copy                             import deepcopy, copy

from numpy                            import exp as np_exp
from numpy                            import median as np_median
from numpy                            import mean as np_mean
from numpy                            import std as np_std, log2
from numpy                            import array, cross, dot, ma, isnan
from numpy                            import histogram, linspace, errstate
from numpy                            import nanmin, nanmax
from numpy.linalg                     import norm

from scipy.optimize                   import curve_fit
from scipy.stats                      import spearmanr, pearsonr, chisquare
from scipy.stats                      import kendalltau
from scipy.stats                      import linregress
from scipy.stats                      import normaltest, norm as sc_norm
from scipy.cluster.hierarchy          import linkage, fcluster
from scipy.spatial.distance           import squareform

from pytadbit                         import get_dependencies_version
from pytadbit.utils.three_dim_stats   import calc_consistency, mass_center
from pytadbit.utils.three_dim_stats   import dihedral, calc_eqv_rmsd
from pytadbit.utils.three_dim_stats   import get_center_of_mass, distance
from pytadbit.utils.tadmaths          import calinski_harabasz, nozero_log_list
from pytadbit.utils.tadmaths          import mean_none
from pytadbit.utils.extraviews        import plot_3d_model, setup_plot
from pytadbit.utils.extraviews        import chimera_view, tadbit_savefig
from pytadbit.utils.extraviews        import augmented_dendrogram, plot_hist_box
from pytadbit.utils.extraviews        import tad_coloring
from pytadbit.utils.extraviews        import tad_border_coloring
from pytadbit.utils.extraviews        import color_residues
from pytadbit.mapping.analyze         import scc
from pytadbit.modelling.impmodel      import IMPmodel
from pytadbit.centroid                import centroid_wrapper
from pytadbit.aligner3d               import aligner3d_wrapper
from pytadbit.squared_distance_matrix import squared_distance_matrix_calculation_wrapper
from functools import reduce

try:
    from matplotlib import pyplot as plt
    from matplotlib.cm import viridis, Reds, Blues, bwr
    from matplotlib.ticker import PercentFormatter
except ImportError:
    warn('matplotlib not found\n')

try:
    basestring
except NameError:
    basestring = str

def R2_vs_L(L, P):
    """
    Calculates the persistence length (Lp) of given section of the model.
    Persistence length is calculated according to [Bystricky2004]_ :

    .. math::

    <R^2> = 2 \\times Lp^2 \\times (\\frac{Lc}{Lp} - 1 + e^{\\frac{-Lc}{Lp}})

    with the contour length as :math:`Lc = \\frac{d}{c}` where :math:`d` is
    the genomic distance in bp and :math:`c` the linear mass density of the
    chromatin (in bp/nm).

    :returns: persistence length, or 2 times the Kuhn length
    """
    return 2.0 * P * ( L - P * ( 1.0 - np_exp( - L / P ) ) )

[docs]def load_structuralmodels(input_): """ Loads :class:`pytadbit.modelling.structuralmodels.StructuralModels` from a file (generated with :class:`pytadbit.modelling.structuralmodels.StructuralModels.save_models`). :param input: path to the pickled StructuralModels object or dictionary containing all the information. :returns: a :class:`pytadbit.modelling.imp_model.StructuralModels`. """ if isinstance(input_, basestring): with open(input_,'rb') as f_input_: svd = load(f_input_) else: svd = input_ try: return StructuralModels( nloci=svd['nloci'], models=svd['models'], bad_models=svd['bad_models'], resolution=svd['resolution'], original_data=svd['original_data'], clusters=svd['clusters'], config=svd['config'], zscores=svd['zscore'], zeros=svd['zeros'], restraints=svd.get('restraints', None), description=svd.get('description', None)) except KeyError: # old version return StructuralModels( nloci=svd['nloci'], models=svd['models'], bad_models=svd['bad_models'], resolution=svd['resolution'], original_data=svd['original_data'], clusters=svd['clusters'], config=svd['config'], zscores=svd['zscore'], restraints=svd.get('restraints', None))
[docs]class StructuralModels(object): """ This class contains three-dimensional models generated from a single Hi-C data. They can be reached either by their index (integer representing their rank according to objective function value), or by their IMP random intial number (as string). :param nloci: number of particles in the selected region :param models: a dictionary containing the generated :class:`pytadbit.modelling.impmodel.IMPmodel` to be used as 'best models' :param bad_models: a dictionary of :class:`pytadbit.modelling.impmodel.IMPmodel`, these model will not be used, just stored in case the set of 'best models' needs to be extended later-on ( :func:`pytadbit.modelling.structuralmodels.StructuralModels.define_best_models` ). :param resolution: number of nucleotides per Hi-C bin. This will be the number of nucleotides in each model's particle. :param None original_data: a list of list (equivalent to a square matrix) of the normalized Hi-C data, used to build this list of models. :param None clusters: a dictionary of type :class:`pytadbit.modelling.structuralmodels.ClusterOfModels` :param None config: a dictionary containing the parameter to be used for the generation of three dimensional models. """ def __init__(self, nloci, models, bad_models, resolution, original_data=None, zscores=None, clusters=None, config=None, experiment=None, zeros=None, restraints=None, description=None): self.__models = models self._bad_models = bad_models self.nloci = nloci self.clusters = clusters or ClusterOfModels() self.resolution = float(resolution) self._original_data = original_data # only used for correlation self._zscores = zscores # only used for plotting self._zeros = zeros or {} # filtered out columns self._config = config or {} self.experiment = experiment self._restraints = restraints self.description = description def __getitem__(self, nam): if isinstance(nam, basestring): for m in self.__models: if self.__models[m]['rand_init'] == nam: return self.__models[m] else: raise KeyError('Model %s not found\n' % (nam)) try: return self.__models[nam] except TypeError: for i, key in self.__models: if nam == i: return self.__models[key] raise KeyError('Model %s not found\n' % (i)) def __next__(self): return next(self.__models) def __iter__(self): for m in self.__models: yield self.__models[m] def __len__(self): return len(self.__models) def __repr__(self): return ('StructuralModels with %s models of %s particles\n' + ' (objective function range: %s - %s)\n' + ' (corresponding to the best models out of %s models).\n' + ' IMP modeling used this parameters:\n' + '%s\n' + ' Models where clustered into %s clusters') % ( len(self.__models), self.nloci, int(self.__models[0]['objfun']), int(self.__models[len(self.__models) - 1]['objfun']), len(self.__models) + len(self._bad_models), '\n'.join([' - %-12s: %s' % (k, v) for k, v in list(self._config.items())]), len(self.clusters)) def _extend_models(self, models, nbest=None, different_stage=False): """ Add new models to structural models to current StructuralModel. :param models: list of StructuralModels or StructuralModels instance :param None nbest: number of StructuralModels to be considered as 'good' :param False different_stage: by default, only models with new unique random initial numbers will be added. If added data corresponds to a different stage (experiment, time points...) than random_init numbers will extended with a specific hash. """ if isinstance(models, StructuralModels): models.define_best_models(len(models) + len(models._bad_models)) models = models._StructuralModels__models nbest = len(self.__models) if nbest is None else nbest nall = len(self.__models) + len(self._bad_models) self.define_best_models(nall) if different_stage: chars = list(lc) + list(map(str, list(range(10)))) sha = ''.join(chars[int(random() * len(chars))] for _ in range(12)) for m in list(models.keys()): models[m]['rand_init'] += '_%s' % (sha) ids = set(self.__models[m]['rand_init'] for m in list(self.__models.keys())) for m in list(models.keys()): if models[m]['rand_init'] in ids: warn(('WARNING: model with seed: %s already here (use ' 'different_stage=True, to force extesion)\n ' 'SKIPPING...') % (m)) del(models[m]) new_models = {} for i, m in enumerate(sorted(list(models.values()) + list(self.__models.values()), key=lambda x: x['objfun'])): new_models[i] = m new_models[i]['index'] = i self.__models = new_models # keep the same number of best models self.define_best_models(nbest)
[docs] def align_models(self, models=None, cluster=None, in_place=False, reference_model=None, by_cluster=False, **kwargs): """ Three-dimensional aligner for structural models. :param None models: if None (default) the average model will be computed using all the models. A list of numbers corresponding to a given set of models can be passed :param None cluster: compute the average model only for the models in the cluster number 'cluster' :param False in_place: if True, the result of the alignment will REPLACE the coordinates of each model. Default is too yield new coordinates of each model :param None reference_model: align given model to reference model :param False by_cluster: if True align each cluster individually in case we align all the models (models=None) """ cluster = cluster or -1 if models: models = [m if isinstance(m, int) else self[m]['index'] if isinstance(m, basestring) else m['index'] for m in models] elif cluster > -1 and len(self.clusters) > 0: models = [self[str(m)]['index'] for m in self.clusters[cluster]] else: if by_cluster and len(self.clusters) > 0: aligned_coords = [None for _ in range(len(self.__models))] for cluster in self.clusters: clust_aligned_coords = self.align_models(cluster=cluster, in_place=in_place, reference_model=reference_model) for midx, m in enumerate(self.clusters[cluster]): aligned_coords[self[str(m)]['index']] = clust_aligned_coords[midx] # Complete with models with singletons if not in_place: for midx in range(len(self.__models)): if not aligned_coords[midx]: aligned_coords[midx] = [self[midx]['x'][:], self[midx]['y'][:], self[midx]['z'][:]] return aligned_coords if not in_place else None else: models = [m for m in self.__models] ref_model = models[0] if reference_model is None else reference_model firstx, firsty, firstz = (self[ref_model]['x'][:], self[ref_model]['y'][:], self[ref_model]['z'][:]) mass_center(firstx, firsty, firstz, self._zeros) aligned = [] for sec in models: if sec == ref_model: if not in_place: aligned.append([firstx, firsty, firstz]) continue coords = aligner3d_wrapper(firstx, firsty, firstz, self[sec]['x'], self[sec]['y'], self[sec]['z'], self._zeros, self.nloci) if in_place: self[sec]['x'], self[sec]['y'], self[sec]['z'] = coords else: aligned.append(coords) if in_place: mass_center(self[ref_model]['x'], self[ref_model]['y'], self[ref_model]['z'], self._zeros) return None return aligned
[docs] def fetch_model_by_rand_init(self, rand_init, all_models=False): """ Models are stored according to their objective function value (first best), but in order to reproduce a model, we need its initial random number. This method helps to fetch the model corresponding to a given initial random number stored under StructuralModels.models[N]['rand_init']. :param rand_init: the wanted rand_init number. :param False all_models: whether or not to use 'bad' models :returns: index of 3d model """ for i, model in enumerate(self): if model['rand_init'] == rand_init: return i if all_models: for m in self._bad_models: if self._bad_models[m]['rand_init'] == rand_init: return m raise IndexError(('Model with initial random number: %s, not found\n' + '') % (rand_init))
[docs] def centroid_model(self, models=None, cluster=None, verbose=False): """ Estimates and returns the centroid model of a given group of models. :param None models: if None (default) the centroid model will be computed using all the models. A list of numbers corresponding to a given set of models can be passed :param None cluster: compute the centroid model only for the models in the cluster number 'cluster' :param False verbose: prints the distance of each model to average model (in stderr) :returns: the centroid model of a given group of models (the most model representative) """ cluster = cluster or -1 if models: models = [m if isinstance(m, int) else self[m]['index'] if isinstance(m, basestring) else m['index'] for m in models] elif cluster > -1 and len(self.clusters) > 0: models = [self[str(m)]['index'] for m in self.clusters[cluster]] else: models = [m for m in self.__models] # remove particles with zeros from calculation x = [] y = [] z = [] for model in range(len(models)): x.append([self[model]['x'][i] for i in range(self.nloci) if self._zeros[i]]) y.append([self[model]['y'][i] for i in range(self.nloci) if self._zeros[i]]) z.append([self[model]['z'][i] for i in range(self.nloci) if self._zeros[i]]) zeros = tuple([True for _ in range(len(x[0]))]) idx = centroid_wrapper(x, y, z, zeros, len(x[0]), len(models), int(verbose), 0) return models[idx]
[docs] def average_model(self, models=None, cluster=None, verbose=False): """ Builds and returns an average model representing a given group of models :param None models: if None (default) the average model will be computed using all the models. A list of numbers corresponding to a given set of models can be passed :param None cluster: compute the average model only for the models in the cluster number 'cluster' :param False verbose: prints the distance of each model to average model (in stderr) :returns: the average model of a given group of models (a new and ARTIFICIAL model) """ cluster = cluster or -1 if models: models = [m if isinstance(m, int) else self[m]['index'] if isinstance(m, basestring) else m['index'] for m in models] elif cluster > -1 and len(self.clusters) > 0: models = [self[str(m)]['index'] for m in self.clusters[cluster]] else: models = [m for m in self.__models] # remove particles with zeros from calculation x = [] y = [] z = [] for model in range(len(models)): x.append([self[model]['x'][i] for i in range(self.nloci)]) y.append([self[model]['y'][i] for i in range(self.nloci)]) z.append([self[model]['z'][i] for i in range(self.nloci)]) idx = centroid_wrapper(x, y, z, self._zeros, len(x[0]), len(models), int(verbose), 1) avgmodel = IMPmodel((('x', idx[0]), ('y', idx[1]), ('z', idx[2]), ('rand_init', 'avg'), ('objfun', None), ('radius', float(self.resolution * self._config['scale']) / 2))) return avgmodel
[docs] def cluster_models(self, fact=0.75, dcutoff=None, method='mcl', mcl_bin='mcl', tmp_file=None, verbose=True, n_cpus=1, mclargs=None, external=False, what='score', region=None): """ This function performs a clustering analysis of the generated models based on structural comparison. The result will be stored in StructuralModels.clusters Clustering is done according to a score of pairwise comparison calculated as: .. math:: score_i = eqvs_i \\times \\frac{dRMSD_i / max(dRMSD)} {RMSD_i / max(RMSD)} where :math:`eqvs_i` is the number of equivalent position for the ith pairwise model comparison. :param 0.75 fact: factor to define the percentage of equivalent positions to be considered in the clustering :param None dcutoff: distance threshold (nm) to determine if two particles are in contact, default is 1.5 times resolution times scale :param 'mcl' method: clustering method to use, which can be either 'mcl' or 'ward'. MCL method is recommended. WARD method uses a scipy implementation of this hierarchical clustering, and selects the best number of clusters using the :func:`pytadbit.utils.tadmaths.calinski_harabasz` function. :param 'mcl' mcl_bin: path to the mcl executable file, in case of the 'mcl is not in the PATH' warning message :param None tmp_file: path to a temporary file created during the clustering computation. Default will be created in /tmp/ folder :param True verbose: same as print StructuralModels.clusters :param 1 n_cpus: number of cpus to use in MCL clustering :param mclargs: list with any other command line argument to be passed to mcl (i.e,: mclargs=['-pi', '10', '-I', '2.0']) :param False external: if True returns the cluster found instead of storing it as StructuralModels.clusters :param 'score' what: Statistic used for clustering. Can be one of 'score', 'rmsd', 'drmsd' or 'eqv'. :param None region: coordinates of the region to base the clustering. Can be either one chromosome name, or the coordinate in the form: "chr3:110000000-120000000" """ tmp_file = '/tmp/tadbit_tmp_%s.txt' % ( ''.join([(uc + lc)[int(random() * 52)] for _ in range(4)])) if not dcutoff: dcutoff = int(1.5 * self.resolution * self._config['scale']) crm = None beg = 0 end = self.nloci if region: if ':' in region: crm, pos = region.split(':') beg, end = list(map(int, pos.split('-'))) else: crm = region end = None try: my_descr = dict(self.description) except TypeError: raise Exception('ERROR: chromosome not in StructuralModels\n') chrom_start = 0 if 'chromosome' in my_descr and isinstance(my_descr['chromosome'], list): if crm not in my_descr['chromosome']: raise Exception('ERROR: chromosome not in StructuralModels\n') chrom_start += sum([(n-m) for i,(m,n) in enumerate(zip(my_descr['start'],my_descr['end'])) if i < my_descr['chromosome'].index(crm)]) else: if my_descr.get('chromosome', 'Chromosome') != crm: raise Exception('ERROR: chromosome not in StructuralModels\n') if not end: end = my_descr['end'][my_descr['chromosome'].index(crm)] beg = int(float(beg + chrom_start) // self.resolution) end = int(float(end + chrom_start) // self.resolution) nloci = end - beg scores = calc_eqv_rmsd(self.__models, beg, end, self._zeros, dcutoff, what=what, normed=True) from distutils.spawn import find_executable if not find_executable(mcl_bin): print('\nWARNING: MCL not found in path using WARD clustering\n') method = 'ward' # Initialize cluster definition of models: for model in self: model['cluster'] = 'Singleton' new_singles = 0 if method == 'ward': matrix = [[0.0 for _ in range(len(self))] for _ in range(len(self))] for (i, j), score in list(scores.items()): matrix[i][j] = score if score > fact * nloci else 0.0 clust = linkage(squareform(matrix), method='ward') # score each possible cut in hierarchical clustering solutions = {} for k in clust[:, 2]: clusters = ClusterOfModels() [clusters.setdefault(j, []).append(i) for i, j in enumerate(fcluster(clust, k, criterion='distance'))] solutions[k] = {'out': clusters} solutions[k]['score'] = calinski_harabasz(scores, clusters) # take best cluster according to calinski_harabasz score clusters = [solutions[s] for s in sorted( solutions, key=lambda x: solutions[x]['score']) if solutions[s]['score'] > 0][-1]['out'] # sort clusters, the more populated, the first. clusters = dict((i + 1, j) for i, j in enumerate(sorted(list(clusters.values()), key=len, reverse=True))) if external: return clusters self.clusters = ClusterOfModels() for cluster in clusters: self.clusters[cluster] = [] for model in clusters[cluster]: self[model]['cluster'] = cluster self.clusters[cluster].append(str(self[model]['rand_init'])) self.clusters[cluster].sort( key=lambda x: self[str(x)]['objfun']) else: out_f = open(tmp_file, 'w') uniqs = list(set([tuple(sorted((m1, m2))) for m1, m2 in scores])) cut = fact * (nloci - self._zeros[beg:end].count(False)) for md1, md2 in uniqs: score = scores[(md1, md2)] if score >= cut: out_f.write('model_%s\tmodel_%s\t%s\n' % (md1, md2, score)) out_f.close() Popen('%s %s --abc -te %s -V all -o %s.mcl %s' % ( mcl_bin, tmp_file, n_cpus, tmp_file, ' '.join( mclargs or [])), stdout=PIPE, stderr=PIPE, shell=True, universal_newlines=True).communicate() clusters = ClusterOfModels() if not exists(tmp_file + '.mcl'): raise Exception('Problem with clustering, try increasing ' + '"dcutoff", now: %s\n' % (dcutoff)) with open(tmp_file + '.mcl') as f_tmp_file: for cluster, line in enumerate(f_tmp_file): models = line.split() if len(models) == 1: new_singles += 1 else: clusters[cluster + 1] = [] for model in models: model = int(model.split('_')[1]) if not external: self[model]['cluster'] = cluster + 1 clusters[cluster + 1].append( str(self[model]['rand_init'])) clusters[cluster + 1].sort( key=lambda x: self[str(x)]['objfun']) if external: return clusters self.clusters = clusters if verbose: singletons = len([1 for m in self if m['cluster'] == 'Singleton']) print(('Number of singletons excluded from clustering: %s (total' + ' singletons: %s)') % (singletons - new_singles, singletons)) print(self.clusters)
def _build_distance_matrix(self, n_best_clusters): """ """ clusters = sorted(self.clusters.keys())[:n_best_clusters] matrix = [[0.0 for _ in clusters] for _ in clusters] clust_count = dict([(c, len([m for m in self if m['cluster'] == c])) for c in clusters]) objfun = dict([(c, [m for m in self if m['cluster'] == c][0]['objfun']) for c in clusters]) for i, cl1 in enumerate(clusters): md1 = md2 = None # find model with lowest energy for each cluster for md1 in self: if md1['cluster'] == cl1: # the first one found is the best :) break for j, cl2 in enumerate(clusters[i + 1:]): # find model with lowest energy for each cluster for md2 in self: if md2['cluster'] == cl2: # the first one found is the best :) break matrix[i][j + i + 1] = calc_eqv_rmsd({0: md1, 1: md2}, 0, self.nloci, self._zeros, one=True) return clust_count, objfun, matrix
[docs] def cluster_analysis_dendrogram(self, n_best_clusters=None, color=False, axe=None, savefig=None, **kwargs): """ Representation of the clustering results. The length of the leaves if proportional to the final objective function value of each model. The branch widths are proportional to the number of models in a given cluster (or group of clusters, if it is an internal branch). :param None n_best_clusters: number of clusters to represent (by default all clusters will be shown) :param False color: color the dendrogram based on the significance of the clustering (basically it depends of the internal branch lengths) :param None savefig: path to a file where to save the image generated; if None, the image will be shown using matplotlib GUI (the extension of the file name will determine the desired format). :param 10.0 width_factor: multiplicator for the width of the line representing the number of models in a given cluster. :param 8 fontsize: size of the smallest font represented in the plot :param (8,8) figsize: a tuple of width and height, to set the size of the plot. """ if not self.clusters: warn('WARNING: no clusters found, clustering with default' + ' parameters\n') self.cluster_models() if not n_best_clusters: n_best_clusters = len(self.clusters) if n_best_clusters <= 1: warn("Need at least 2 clusters to display...") return None clust_count, objfun, matrix = self._build_distance_matrix(n_best_clusters) z = linkage(matrix) minnrj = min(objfun.values()) - 1 maxnrj = max(objfun.values()) - 1 val = (maxnrj - minnrj) maxz = max([i[2] for i in z]) for i in z: i[2] = i[2] / maxz * val dads = {} i = max(clust_count) for a, b, _, _ in z: i += 1 clust_count[i] = clust_count[a + 1] + clust_count[b + 1] dads[a + 1] = i dads[b + 1] = i d = augmented_dendrogram(clust_count, dads, objfun, color, axe, savefig, z, **kwargs) return d
[docs] def get_contact_matrix(self, models=None, cluster=None, cutoff=None, distance=False, show_bad_columns=True): """ Returns a matrix with the number of interactions observed below a given cutoff distance. :param None models: if None (default) the contact matrix will be computed using all the models. A list of numbers corresponding to a given set of models can be passed :param None cluster: compute the contact matrix only for the models in the cluster number 'cluster' :param None cutoff: distance cutoff (nm) to define whether two particles are in contact or not, default is 2 times resolution, times scale. Cutoff can also be a list of values, in wich case the returned object will be a dictionnary of matrices (keys being square cutoffs) :param False distance: returns the distance matrix of all_angles against all_angles particles instead of a contact_map matrix using the cutoff :param True show_bad_columns: show bad columns in contact map :returns: matrix frequency of interaction """ cluster = cluster or -1 if models: models = [m if isinstance(m, int) else self[m]['index'] if isinstance(m, basestring) else m['index'] for m in models] elif cluster > -1 and len(self.clusters) > 0: models = [self[str(m)]['index'] for m in self.clusters[cluster]] else: models = [m for m in self.__models] cutoff_list = True if not isinstance(cutoff, list): cutoff = [cutoff] cutoff_list = False cutoff.sort(reverse=True) if not cutoff: cutoff = [float(2 * self.resolution * self._config['scale'])] cutoff = [c**2 for c in cutoff] matrix = dict([(c, [[0. for _ in range(self.nloci)] for _ in range(self.nloci)]) for c in cutoff]) # remove (or not) interactions from bad columns if show_bad_columns: wloci = [i for i in range(self.nloci) if self._zeros[i]] else: wloci = [i for i in range(self.nloci)] models = [self[mdl] for mdl in models] frac = 1.0 / len(models) for model in models: squared_distance_matrix = squared_distance_matrix_calculation_wrapper( model['x'], model['y'], model['z'], self.nloci) #print model, len(x), len(y), len(z) for c in cutoff: for i, j in combinations(wloci, 2): if squared_distance_matrix[i][j] <= c: matrix[c][i][j] += frac # * 100 matrix[c][j][i] += frac # * 100 if cutoff_list: return matrix return list(matrix.values())[0]
[docs] def define_best_models(self, nbest): """ Defines the number of top models (based on the objective function) to keep. If keep_all is set to True in :func:`pytadbit.modelling.imp_model.generate_3d_models` or in :func:`pytadbit.experiment.Experiment.model_region`, then the full set of models (n_models parameter) will be used, otherwise only the n_keep models will be available. :param nbest: number of top models to keep (usually 20% of the generated models). """ tmp_models = self.__models tmp_models.update(self._bad_models) nbest = min(len(tmp_models), nbest) self.__models = dict((i, tmp_models[i]) for i in range(nbest)) self._bad_models = dict((i, tmp_models[i]) for i in range(nbest, len(tmp_models)))
[docs] def deconvolve(self, fact=0.75, dcutoff=None, method='mcl', mcl_bin='mcl', tmp_file=None, verbose=True, n_cpus=1, mclargs=None, what='dRMSD', n_best_clusters=10, savefig=None, represent_models=False, figsize=(11, 11), clusters=None, **kwargs): """ This function performs a deconvolution analysis of a given froup of models. It first clusters models based on structural comparison (dRMSD), and then, performs a differential contact map between each possible pair of cluster. .. note:: Clusters defined here are different from the one defined when using :func:`pytadbit.modelling.structuralmodels.StructuralModels.cluster_models`. They are also not stored into StructuralModels.clusters :param 0.75 fact: factor to define the percentage of equivalent positions to be considered in the clustering :param None dcutoff: distance threshold (nm) to determine if two particles are in contact, default is 1.5 times resolution times scale :param 'mcl' method: clustering method to use, which can be either 'mcl' or 'ward'. MCL method is recommended. WARD method uses a scipy implementation of this hierarchical clustering, and selects the best number of clusters using the :func:`pytadbit.utils.tadmaths.calinski_harabasz` function. :param 'mcl' mcl_bin: path to the mcl executable file, in case of the 'mcl is not in the PATH' warning message :param None tmp_file: path to a temporary file created during the clustering computation. Default will be created in /tmp/ folder :param True verbose: same as print StructuralModels.clusters :param 1 n_cpus: number of cpus to use in MCL clustering :param mclargs: list with any other command line argument to be passed to mcl (i.e,: mclargs=['-pi', '10', '-I', '2.0']) :param 10 n_best_clusters: number of clusters to represent :param None clusters: provide clusters as a dictionary with keys=cluster number, or name, and values list of model numbers. :param False represent_models: To generate an interactive visualization of a representative model for each cluster. Representative model depends on the value passed to this option, it can be either 'centroid' or 'best' (this last standing for the model with lowest IMP objective function value). :param None savefig: path to a file where to save the image generated; if None, the image will be shown using matplotlib GUI (the extension of the file name will determine the desired format). :param (11,11) figsize: dimension of the plot """ fact /= self.nloci if not dcutoff: dcutoff = 1.5 * self.resolution * self._config['scale'] if not clusters: clusters = self.cluster_models(fact=fact, dcutoff=dcutoff, mcl_bin=mcl_bin, method=method, tmp_file=tmp_file, n_cpus=n_cpus, mclargs=mclargs, external=True, what=what) if len(clusters) <= 1: raise Exception('ERROR: did not found clusters to be compared ' + '(try different clustering parameters).\n') if verbose: print(clusters) n_best_clusters = min(len(clusters), n_best_clusters) add = 1 if represent_models else 0 fig, axes = plt.subplots(n_best_clusters - 1 + add, n_best_clusters - 1 + add, sharex=True, sharey=True, figsize=figsize) # pre-calculate contact-matrices cmatrices = [self.get_contact_matrix( [str(m) for m in clusters[i + 1]], cutoff=dcutoff) for i in range(n_best_clusters)] for i in range(n_best_clusters - 1 + add): for j in range(n_best_clusters - 1 + add): try: axes[i, j].set(adjustable='box', aspect=1) axes[i, j].set_visible(False) except TypeError: axes.set(adjustable='box', aspect=1) axes.set_visible(False) # doing the plot for i in range(n_best_clusters - 1): for j in range(1, n_best_clusters): if j < i + 1: continue try: axes[i + add, j - 1].set_visible(True) axes[i + add, j - 1].set(adjustable='box', aspect=1) except TypeError: axes.set_visible(True) axes.set(adjustable='box', aspect=1) matrix3 = [[cmatrices[i][k][l] - cmatrices[j][k][l] for l in range(self.nloci)] for k in range(self.nloci)] try: ims = axes[i + add, j - 1].imshow( matrix3, origin='lower', cmap=bwr, interpolation="nearest", vmin=-1, vmax=1, extent=(0.5, len(matrix3) + 0.5, 0.5, len(matrix3) + 0.5)) axes[i + add, j - 1].grid() except TypeError: ims = axes.imshow( matrix3, origin='lower', cmap=bwr, interpolation="nearest", vmin=-1, vmax=1, extent=(0.5, len(matrix3) + 0.5, 0.5, len(matrix3) + 0.5)) axes.grid() if not i and not represent_models: try: axes[i + add, j - 1].set_title('Cluster #%s' % (j + 1), color='blue') except TypeError: axes.set_title('Cluster #%s' % (j + 1), color='blue') if j != i + 1: try: axes[i + add, j - 1].yaxis.set_ticks_position('none') except TypeError: axes.yaxis.set_ticks_position('none') else: try: plt.setp(axes[i + add, j - 1].get_yticklabels(), visible=True) axes[i + add, j - 1].yaxis.set_ticks_position('left') for item in axes[i + add, j - 1].get_yticklabels(): item.set_fontsize(9) except TypeError: plt.setp(axes.get_yticklabels(), visible=True) axes.yaxis.set_ticks_position('left') for item in axes.get_yticklabels(): item.set_fontsize(9) if i != j - 1: try: axes[i + add, j - 1].xaxis.set_ticks_position('none') except TypeError: axes.xaxis.set_ticks_position('none') else: try: plt.setp(axes[i + add, j - 1].get_xticklabels(), visible=True) axes[i + add, j - 1].xaxis.set_ticks_position('bottom') for item in axes[i + add, j - 1].get_xticklabels(): item.set_fontsize(9) except TypeError: plt.setp(axes.get_xticklabels(), visible=True) axes.xaxis.set_ticks_position('bottom') for item in axes.get_xticklabels(): item.set_fontsize(9) if j == n_best_clusters - 1 and not represent_models: try: axes[i + add, j - 1].yaxis.set_label_position('right') axes[i + add, j - 1].set_ylabel('Cluster #%s' % (i + 1), rotation=-90, fontsize='large', color='red', va='bottom') except TypeError: axes.yaxis.set_label_position('right') axes.set_ylabel('Cluster #%s' % (i + 1), rotation=-90, fontsize='large', color='red', va='bottom') try: axes[i + add, j - 1].set_xlim((0.5, len(matrix3) + 0.5)) axes[i + add, j - 1].set_ylim((0.5, len(matrix3) + 0.5)) except TypeError: axes.set_xlim((0.5, len(matrix3) + 0.5)) axes.set_ylim((0.5, len(matrix3) + 0.5)) # new axe for the color bar cell = fig.add_axes([0.125, 0.1, 0.01, 0.25]) cbar = fig.colorbar(ims, cax=cell, cmap=viridis) cbar.set_ticks([float(k) / 100 for k in range(-100, 150, 50)]) cbar.set_ticklabels(['%3s%% ' % (p) for p in range(-100, 150, 50)]) for item in cell.get_yticklabels(): item.set_fontsize(10) cbar.ax.set_ylabel('\n\nPercentage of models with a\n' + 'given pair of particles closer\n' + 'than the %s nm cutoff\n' % dcutoff + '(red clusters minus blue)\n\n' + (''.join([' ' * 10 + 'Cluster #%-2s: %3s models\n' % ( i + 1, len(clusters[i + 1])) for i in range(n_best_clusters)])), rotation=0, ha='left', va='center') plt.suptitle(('Deconvolution analysis for the %s top clusters ' + '(cutoff=%s nm)') % ( n_best_clusters, dcutoff), size='x-large') if represent_models: for i in range(1, n_best_clusters): ax = fig.add_subplot(n_best_clusters, n_best_clusters, i, projection='3d') ax.set_title('Cluster #%s' % (i + 1), color='blue') if represent_models == 'centroid': mdl = self[self.centroid_model( models=[m for m in clusters[i + 1]])]['index'] else: if represent_models != 'best': warn("WARNING: represent_model value should be one of" + "'centroid' or 'best' not %s\n" % ( represent_models) + "Showing best model.") self.view_models(models=[self[m]['index'] for m in clusters[i + 1]], tool='plot', axe=ax, **kwargs) for item in [ax]: item.patch.set_visible(False) for i in range(n_best_clusters - 1): ax = fig.add_subplot(n_best_clusters, n_best_clusters, n_best_clusters * (i + 2), projection='3d') self.view_models(models=[self[m]['index'] for m in clusters[i + 1]], tool='plot', axe=ax, **kwargs) ax.yaxis.set_label_position('top') ax.set_title('Cluster #%s' % (i + 1), rotation=-90, fontsize='large', color='red', position=(1, .5), va='center', ha='left') for item in [ax]: item.patch.set_visible(False) axes[0, n_best_clusters - 1].set_visible(False) if savefig: tadbit_savefig(savefig) else: plt.show()
[docs] def contact_map(self, models=None, cluster=None, cutoff=None, axe=None, savefig=None, savedata=None): """ Plots a contact map representing the frequency of interaction (defined by a distance cutoff) between two particles. :param None models: if None (default) the contact map will be computed using all the models. A list of numbers corresponding to a given set of models can be passed :param None cluster: compute the contact map only for the models in the cluster number 'cluster' :param None cutoff: distance cutoff (nm) to define whether two particles are in contact or not, default is 2 times resolution, times scale. :param None axe: a matplotlib.axes.Axes object to define the plot appearance :param None savefig: path to a file where to save the image generated; if None, the image will be shown using matplotlib GUI (the extension of the file name will determine the desired format). :param None savedata: path to a file where to save the contact map data generated, in three columns format (particle1, particle2, percentage of models where these two particles are in contact) """ if not cutoff: cutoff = 2.0 * self.resolution * self._config['scale'] matrix = self.get_contact_matrix(models, cluster, cutoff=cutoff) show = False if savedata: out = open(savedata, 'w') out.write('#Particle1\tParticle2\tModels_percentage\n') for i in range(len(matrix)): for j in range(i + 1, len(matrix)): out.write('%s\t%s\t%s\n' % (i, j, matrix[i][j])) out.close() if not savefig and not show and not axe: return # stop here, we do not want to display anything if not axe: fig = plt.figure(figsize=(8, 6)) axe = fig.add_subplot(111) show = True else: fig = axe.get_figure() cmap = plt.get_cmap('viridis') cmap.set_bad('darkgrey', 1) ims = axe.imshow(matrix, origin='lower', interpolation="nearest", vmin=0, vmax=1, cmap=cmap, extent=(0.5, self.nloci + 0.5, 0.5, self.nloci + 0.5)) axe.set_ylabel('Particle') axe.set_xlabel('Particle') cbar = axe.figure.colorbar(ims) oldlabels = cbar.ax.get_yticklabels() newlabels = [str(int(100 * float(x.get_text())))+'%' for x in oldlabels] cbar.ax.set_yticklabels(newlabels) cbar.ax.set_ylabel('Percentage of models with particles at <' + '%s nm' % (cutoff)) axe.set_title('Contact map') if savefig: tadbit_savefig(savefig) elif show: plt.show()
[docs] def accessibility(self, radius, models=None, cluster=None, nump=100, superradius=200, savefig=None, savedata=None, axe=None, plot=True, error=True, steps=(1, )): """ Calculates a mesh surface around the model (distance equal to input **radius**) and checks if each point of this mesh could be replaced by an object (i.e. a protein) of a given **radius** Outer part of the model can be excluded from the estimation of accessible surface, as the occupancy outside the model is unkown (see superradius option). :param radius: radius of the object we want to fit in the model. :param 100 nump: number of points to draw around a given particle. This number also sets the number of points drawn around edges, as each point occupies a given surface (see maths below). *Note that this number is considerably lowered by the occupancy of edges, depending of the angle formed by the edges surrounding a given particle, only 10% to 50% of the ``nump`` will be drawn in fact.* :param None savefig: path where to save chimera image :param (1, ) steps: how many particles to group for the estimation. By default 1 curve is drawn :param 200 superradius: radius of an object used to exclude outer surface of the model. Superradius must be higher than radius. This function will first define a mesh around the chromatin, representing all possible position of the center of the object we want to fit. This mesh will be at a distance of *radius* from the chromatin strand. All dots in the mesh represents an equal area (*a*), the whole surface of the chromatin strand being: :math:`A=n \\times a` (*n* being the total number of dots in the mesh). The mesh consists of spheres around particles of the model, and cylinders around edges joining particles (no overlap is allowed between sphere and cylinders or cylinder and cylinder when they are consecutive). If we want that all dots of the mesh representing the surface of the chromatin, corresponds to an equal area (:math:`a`) .. math:: a = \\frac{4\pi r^2}{s} = \\frac{2\pi r N_{(d)}}{c} with: * :math:`r` radius of the object to fit (as the input parameter **radius**) * :math:`s` number of points in sphere * :math:`c` number of points in circle (as the input parameter **nump**) * :math:`N_{(d)}` number of circles in an edge of length :math:`d` According to this, when the distance between two particles is equal to :math:`2r` (:math:`N=2r`), we would have :math:`s=c`. As : .. math:: 2\pi r = \sqrt{4\pi r^2} \\times \sqrt{\pi} It is fair to state the number of dots represented along a circle as: .. math:: c = \sqrt{s} \\times \sqrt{\pi} Thus the number of circles in an edge of length :math:`d` must be: .. math:: N_{(d)}=\\frac{s}{\sqrt{s}\sqrt{\pi}}\\times\\frac{d}{2r} :returns: a list of *1-* the number of dots in the mesh that could be occupied by an object of the given radius *2-* the total number of dots in the mesh *3-* the estimated area of the mesh (in square micrometers) *4-* the area of the mesh of a virtually straight strand of chromatin defined as :math:`contour\\times 2\pi r + 4\pi r^2` (also in micrometers) *5-* a list of number of (accessibles, inaccessible) for each particle (percentage burried can be infered afterwards by accessible/(accessible+inaccessible) ) if ploting, returns values to be plotted (list of lists, one per window), and the same for upper and lower error lines """ cluster = cluster or -1 if models: models = [m if isinstance(m, int) else self[m]['index'] if isinstance(m, basestring) else m['index'] for m in models] elif cluster > -1 and len(self.clusters) > 0: models = [self[str(m)]['index'] for m in self.clusters[cluster]] else: models = [m for m in self.__models] acc = [] for model in models: acc_vs_inacc = self[model].accessible_surface( radius, nump=nump, superradius=superradius, include_edges=False)[-1] acc.append([(float(j) / (j + k)) if (j + k) else 0.0 for _, j, k in acc_vs_inacc]) accper, errorn, errorp = self._windowize(list(zip(*acc)), steps, average=True) if savedata: out = open(savedata, 'w') out.write('# Particle\t%s\n' % ('\t'.join([ str(c) + '\t' + '2*stddev(%d)' % c for c in steps]))) for part in range(self.nloci): out.write('%s\t%s\n' % (part + 1, '\t'.join( ['nan\tnan' if part >= len(accper[c]) else (str(round(accper[c][part], 3)) + '\t' + str(round(errorp[c][part] - accper[c][part], 3))) if accper[c][part] else 'nan\tnan' for c in steps]))) out.close() if not plot: return # stop here, we do not want to display anything # plot ylabel = 'Accessibility to an object with a radius of %s nm ' % (radius) xlabel = 'Particle number' title = 'Accesibility per particle' self._generic_per_particle_plot(steps, accper, error, errorp, errorn, savefig, axe, xlabel=xlabel, ylabel=ylabel, title=title, ylim=(0, 1)) #if savefig: # tadbit_savefig(savefig) #elif not axe: # plt.show() #plt.close('all') return accper, errorp, errorn
def _get_density(self, models, interval, use_mass_center): dists = [[None] * len(models)] * interval for p in range(interval, self.nloci - interval): part1, part2, part3 = p - interval, p, p + interval if use_mass_center: subdists = [] for m in models: try: coord1 = get_center_of_mass( self[m]['x'][part1:part2], self[m]['y'][part1:part2], self[m]['z'][part1:part2], self._zeros) coord2 = get_center_of_mass( self[m]['x'][part2:part3], self[m]['y'][part2:part3], self[m]['z'][part2:part3], self._zeros) subdists.append(distance(coord1, coord2)) except ZeroDivisionError: # part1==part2 or part2==part3 subdists.append(float('nan')) dists.append([float(interval * self.resolution) / d for d in subdists]) else: dist1 = self.median_3d_dist(part1 + 1, part2 + 1, models, plot=False, median=False) dist2 = self.median_3d_dist(part2 + 1, part3 + 1, models, plot=False, median=False) dist = [(d1 + d2) for d1, d2 in zip(dist1, dist2)] dists.append([float(interval * self.resolution * 2) / d for d in dist]) return dists
[docs] def density_plot(self, models=None, cluster=None, steps=(1, 2, 3, 4, 5), interval=1, use_mass_center=False, error=False, axe=None, savefig=None, savedata=None, plot=True): """ Plots the number of nucleotides per nm of chromatin vs the modeled region bins. :param None models: if None (default) the density plot will be computed using all the models. A list of numbers corresponding to a given set of models can be passed :param None cluster: compute the density plot only for the models in the cluster number 'cluster' :param (1, 2, 3, 4, 5) steps: how many particles to group for the estimation. By default 5 curves are drawn :param False error: represent the error of the estimates :param None axe: a matplotlib.axes.Axes object to define the plot appearance :param 1 interval: distance are measure with this given interval between two bins. :param False use_mass_center: if interval is higher than one, calculates the distance between the center of mass of the particles *n* to *n+interval* and the center of mass of the particles *n+interval* and *n+2interval* :param None savefig: path to a file where to save the image generated; if None, the image will be shown using matplotlib GUI (the extension of the file name will determine the desired format). :param None savedata: path to a file where to save the density data generated (1 column per step + 1 for particle number). :param True plot: e.g. if False, only saves data. No plotting done :returns: values to be plotted (list of lists, one per window), and the same for upper and lower error lines """ if isinstance(steps, int): steps = (steps, ) models = self._get_models(models, cluster) dists = self._get_density(models, interval, use_mass_center) with catch_warnings(): simplefilter("ignore", category=RuntimeWarning) distsk, errorn, errorp = self._windowize(dists, steps, interval=interval, average=False) # write consistencies to file if savedata: out = open(savedata, 'w') out.write('#Particle\t%s\n' % ('\t'.join([ str(c) + '\t' + '2*stddev(%d)' % c for c in steps]))) for part in range(self.nloci): out.write('%s\t%s\n' % (part + 1, '\t'.join( ['nan\tnan' if part >= len(distsk[c]) else (str(round(distsk[c][part], 3)) + '\t' + str(round(errorp[c][part] - round(distsk[c][part], 3)))) if distsk[c][part] and not isnan(distsk[c][part]) else 'nan\tnan' for c in steps]))) out.close() if plot: xlabel = 'Particle number' ylabel = 'Density (bp / nm)' title = 'Chromatin density' # self._generic_per_particle_plot(steps, distsk, error, errorp, errorn, # xlabel=xlabel, ylabel=ylabel, title=title) self._generic_per_particle_plot(steps, distsk, error, errorp, errorn, savefig, axe, xlabel=xlabel, ylabel=ylabel, title=title) return distsk, errorp, errorn
def _get_interactions(self, models, cutoff): interactions = [[] for _ in range(self.nloci)] if not cutoff: cutoff = int(2 * self.resolution * self._config['scale']) cutoff2 = cutoff**2 for i in range(self.nloci): for m in models: val = 0 mdl = self[m] for j in range(self.nloci): if i == j: continue val += ((mdl['x'][i] - mdl['x'][j])**2 + (mdl['y'][i] - mdl['y'][j])**2 + (mdl['z'][i] - mdl['z'][j])**2) < cutoff2 # val += self.__square_3d_dist(i + 1, j + 1, models=[m])[0] < cutoff2 interactions[i].append(val) return interactions
[docs] def interactions(self, models=None, cluster=None, cutoff=None, steps=(1, 2, 3, 4, 5), axe=None, error=False, savefig=None, savedata=None, average=True, plot=True): """ Plots, for each particle, the number of interactions (particles closer than the given cut-off). The value given is the average for all models. :param None models: if None (default) the contact map will be computed using all the models. A list of numbers corresponding to a given set of models can be passed :param None cluster: compute the contact map only for the models in the cluster number 'cluster' :param None cutoff: distance cutoff (nm) to define whether two particles are in contact or not, default is 2 times resolution, times scale. :param (1, 2, 3, 4, 5) steps: how many particles to group for the estimation. By default 5 curves are drawn :param False error: represent the error of the estimates :param None axe: a matplotlib.axes.Axes object to define the plot appearance :param None savefig: path to a file where to save the image generated; if None, the image will be shown using matplotlib GUI (the extension of the file name will determine the desired format). :param None savedata: path to a file where to save the contact map data generated, in three columns format (particle1, particle2, percentage of models where these two particles are in contact) :param True average: calculate average interactions along models, otherwise, the median. :param True plot: e.g. only saves data. No plotting done :returns: values to be plotted (list of lists, one per window), and the same for upper and lower error lines """ if isinstance(steps, int): steps = (steps, ) models = self._get_models(models, cluster) interactions = self._get_interactions(models, cutoff) distsk, errorn, errorp = self._windowize(interactions, steps, average=average) if savedata: out = open(savedata, 'w') out.write('#Particle\t%s\n' % ( '\t'.join(['%s_interactions(%s)\t2*stddev(%s)' % ( 'Average' if average else 'Median', k, k) for k in steps]))) for i in range(self.nloci): out.write('%s\t%s\n' % (i + 1, '\t'.join( ['%s\t%s' % (str('nan' if (len(distsk[k]) <= i or distsk[k][i] is None) else round(distsk[k][i], 2)), ( str('nan' if (len(distsk[k]) <= i or distsk[k][i] is None) else round(errorp[k][i] - distsk[k][i], 2)))) for k in steps]))) out.close() if plot: ylabel = 'Number of particles closer than %s nm' % (cutoff) xlabel = 'Particle number' title = 'Interactions per particle' self._generic_per_particle_plot(steps, distsk, error, errorp, errorn, savefig, axe, xlabel=xlabel, ylabel=ylabel, title=title) return distsk, errorp, errorn
[docs] def model_consistency(self, cutoffs=None, models=None, cluster=None, axe=None, savefig=None, savedata=None, plot=True): """ Plots the particle consistency, over a given set of models, vs the modeled region bins. The consistency is a measure of the variability (or stability) of the modeled region (the higher the consistency value, the higher stability). :param None cutoffs: list of distance cutoffs (nm) used to compute the consistency. Two particle are considered consistent if their distance is less than the given cutoff, default is a tuple of 0.5, 1, 1.5 and 2 times resolution, times scale. :param None models: if None (default) the consistency will be computed using all the models. A list of numbers corresponding to a given set of models can be passed :param None cluster: compute the consistency only for the models in the cluster number 'cluster' :param '/tmp/tmp_cons' tmp_path: location of the input files for TM-score program :param '' tmsc: path to the TMscore_consistency script (assumed to be installed by default) :param None axe: a matplotlib.axes.Axes object to define the plot appearance :param None savefig: path to a file where to save the image generated; if None, the image will be shown using matplotlib GUI (the extension of the file name will determine the desired format). :param None savedata: path to a file where to save the consistency data generated (1 column per cutoff + 1 for particle number). :param True plot: e.g. only saves data. No plotting done """ models = self._get_models(models, cluster) models = [self.__models[m] for m in models] if cutoffs is None: cutoffs = (int(0.5 * self.resolution * self._config['scale']), int(1.0 * self.resolution * self._config['scale']), int(1.5 * self.resolution * self._config['scale']), int(2.0 * self.resolution * self._config['scale'])) consistencies = {} for cut in cutoffs: consistencies[cut] = calc_consistency(models, self.nloci, self._zeros, cut) # write consistencies to file if savedata: out = open(savedata, 'w') out.write('#Particle\t%s\n' % ('\t'.join([str(c) for c in cutoffs]))) for part in range(self.nloci): out.write('%s\t%s\n' % (str(part + 1), '\t'.join( [str(round(consistencies[c][part], 3)) for c in cutoffs]))) out.close() if not plot: return # plot show = False if axe else True axe = setup_plot(axe) plots = [] self._plot_polymer(axe) scat1 = plt.Line2D((0, 1), (0, 0), color=(0.15, 0.15, 0.15), marker='o', linestyle='') scat2 = plt.Line2D((0, 1), (0, 0), color=(0.7 , 0.7 , 0.7 ), marker='o', linestyle='') for i, cut in enumerate(cutoffs[::-1]): plots += axe.plot(list(range(1, self.nloci + 1)), consistencies[cut], color='darkred', alpha=1 - i / float(len(cutoffs))) try: axe.legend(plots + [scat1, scat2], ['%s nm' % (k) for k in cutoffs[::-1]] + ['particles with restraints', 'particles without restraints'], numpoints=1, fontsize='small', loc='center left', bbox_to_anchor=(1, 0.5)) except TypeError: axe.legend(plots + [scat1, scat2], ['%s nm' % (k) for k in cutoffs[::-1]] + ['particles with restraints', 'particles without restraints'], numpoints=1, loc='center left', bbox_to_anchor=(1, 0.5)) axe.set_xlim((1, self.nloci)) axe.set_ylim((0, 100)) axe.set_xlabel('Particle') axe.set_ylabel('Consistency (%)') if cluster: axe.set_title('Cluster %s' % (cluster)) elif len(models) == len(self): axe.set_title('All clusters') else: axe.set_title('Selected models') plt.subplots_adjust(left=0.1, right=0.77) if savefig: tadbit_savefig(savefig) plt.close('all') elif not axe: plt.show() plt.close('all') return consistencies
[docs] def walking_dihedral(self, models=None, cluster=None, steps=(1, 3), span=(-2, 1, 0, 1, 3), error=False, plot=True, savefig=None, axe=None, savedata=None): """ Plots the dihedral angle between successive planes. A plane is formed by 3 successive loci. :param None models: if None (default) the dihedral angle will be computed using all the models. A list of numbers corresponding to a given set of models can be passed :param None cluster: compute the dihedral angle only for the models in the cluster number 'cluster' :param (1, 3) steps: how many particles to group for the estimation. By default 2 curves are drawn :param True signed: whether to compute the sign of the angle according to a normal plane, or not. :param None axe: a matplotlib.axes.Axes object to define the plot appearance :param (-3, -1, 0, 1, 2) span: by default computes angle between a plane comprising the current residue (*0*), the residue after (*1*) and the residue 3 positions before (*-3*), and the plane with the current residue, the residue directly after (*1*) and the residue 3 positions after (*3*). e.g. In the scheme bellow it's plane ACD vs plane CDF. :param None savefig: path to a file where to save the image generated; if None, the image will be shown using matplotlib GUI (the extension of the file name will determine the desired format). :param False error: represent the error of the estimates :param True plot: e.g. if False, only saves data. No plotting done :param None savedata: path to a file where to save the angle data generated (1 column per step + 1 for particle number). :returns: values to be plotted (list of lists, one per window), and the same for upper and lower error lines :: C..........D ... ... ... ... ... ... A..........B .E .. . . . . . . . F...............G """ models = self._get_models(models, cluster) rads = {} if span[0] > 0: raise ValueError('ERROR: first element of span should be negative') if span[-1] < 0: raise ValueError('ERROR: last element of span should be negative') rads = [[None] * len(models)] * (-span[0]) for res in range(-span[0], self.nloci - span[-1]): subrad = self.dihedral_angle(res + span[0], res + span[1], res + span[2], res + span[3], res + span[4], models) # rads.append([None] * abs(span[0]) + subrad + [None] * span[3]) rads.append(subrad) rads += [[None] * len(models)] * (span[-1]) radsk, errorn, errorp = self._windowize(rads, steps, interval=0, average=False, minerr=-360) if plot: xlabel = 'Particle number' ylabel = 'Dihedral angle in degrees' title = 'Dihedral angle between consecutive loci' self._generic_per_particle_plot(steps, radsk, error, errorp, errorn, savefig, axe, xlabel=xlabel, ylabel=ylabel, title=title) if savedata: out = open(savedata, 'w') out.write('#Particle\t%s\n' % ('\t'.join([ str(c) + '\t' + '2*stddev(%d)' % c for c in steps]))) for part in range(self.nloci): out.write('%s\t%s\n' % (part + 1, '\t'.join( ['nan\tnan' if part >= len(radsk[c]) else (str(round(radsk[c][part], 3)) + '\t' + str(round(errorp[c][part] - round(radsk[c][part], 3)))) if radsk[c][part] else 'nan\tnan' for c in steps]))) out.close() if savefig: tadbit_savefig(savefig) elif not axe: plt.show() plt.close('all') if savefig: tadbit_savefig(savefig) elif not axe: plt.show() plt.close('all') return radsk, errorp, errorn
[docs] def walking_angle(self, models=None, cluster=None, steps=(1, 3), signed=True, savefig=None, savedata=None, axe=None, plot=True, error=False): """ Plots the angle between successive loci in a given model or set of models. In order to limit the noise of the measure angle is calculated between 3 loci, between each are two other loci. E.g. in the scheme bellow, angle are calculated between loci A, D and G. :param None models: if None (default) all models will be used for computation. A list of numbers corresponding to a given set of models can be passed :param None cluster: compute the angle only for the models in the cluster number 'cluster' :param (1, 3) steps: how many particles to group for the estimation. By default 2 curves are drawn :param True signed: whether to compute the sign of the angle according to a normal plane, or not. :param None axe: a matplotlib.axes.Axes object to define the plot appearance :param False error: represent the error of the estimates :param None savefig: path to a file where to save the image generated; if None, the image will be shown using matplotlib GUI (the extension of the file name will determine the desired format). :param True plot: e.g. if False, only saves data. No plotting done :param None savedata: path to a file where to save the angle data generated (1 column per step + 1 for particle number). :returns: values to be plotted (list of lists, one per window), and the same for upper and lower error lines :: C..........D ... ... ... ... ... ... A..........B .E .. . . . . . . . F...............G """ if not isinstance(steps, tuple): steps = (steps,) models = self._get_models(models, cluster) rads = [] ones = array([1., 1., 1.]) sign = 1 for model in models: mox = self[model]['x'] moy = self[model]['y'] moz = self[model]['z'] subrad = [None] * 3 for res in range(1, self.nloci - 5): subrad.append(self.angle_between_3_particles( res, res + 3, res + 6, models=[model], all_angles=False)) if signed: res1 = array([mox[res - 1], moy[res - 1], moz[res - 1]]) # faster than get_particle coordinate res2 = array([mox[res + 2], moy[res + 2], moz[res + 2]]) res3 = array([mox[res + 5], moy[res + 5], moz[res + 5]]) vec1 = res1 - res2 / norm(res1 - res2) vec2 = res1 - res3 / norm(res1 - res3) sign = dot(ones, cross(vec1, vec2)) sign = -1 if sign < 0 else 1 subrad[-1] *= sign subrad += [None] * 3 rads.append(subrad) with catch_warnings(): simplefilter("ignore", category=RuntimeWarning) radsk, errorn, errorp = self._windowize(list(zip(*rads)), steps, interval=0, average=False, minerr=-360) if plot: xlabel = 'Particle number' ylabel = 'Angle in degrees' title = 'Angle between consecutive loci' self._generic_per_particle_plot(steps, radsk, error, errorp, errorn, savefig, axe, xlabel=xlabel, ylabel=ylabel, title=title) if savedata: out = open(savedata, 'w') out.write('#Particle\t%s\n' % ('\t'.join([ str(c) + '\t' + '2*stddev(%d)' % c for c in steps]))) for part in range(self.nloci): out.write('%s\t%s\n' % (part + 1, '\t'.join( ['nan\tnan' if part >= len(radsk[c]) else (str(round(radsk[c][part], 3)) + '\t' + str(round(errorp[c][part] - round(radsk[c][part], 3)))) if radsk[c][part] and not isnan(radsk[c][part]) else 'nan\tnan' for c in steps]))) out.close() return radsk, errorp, errorn
#if savefig: # tadbit_savefig(savefig) #elif not axe: # plt.show() #plt.close('all')
[docs] def zscore_plot(self, axe=None, savefig=None, do_normaltest=False): """ Generate 3 plots. Two heatmaps of the Z-scores used for modeling, one of which is binary showing in red Z-scores higher than upper cut-off; and in blue Z-scores lower than lower cut-off. Last plot is an histogram of the distribution of Z-scores, showing selected regions. Histogram also shows the fit to normal distribution. :param None axe: a matplotlib.axes.Axes object to define the plot appearance :param None savefig: path to a file where to save the image generated; if None, the image will be shown using matplotlib GUI (the extension of the file name will determine the desired format). :param False do_normaltest: to display the result of a test of normality (D'Agostino and Pearson's test, that combines skew and kurtosis). """ zsc_mtrx = reduce(lambda x, y: x + y, [[k] + list(self._zscores[k].keys()) for k in list(self._zscores.keys())]) max_bin = max([int(i) for i in zsc_mtrx]) zsc_mtrx = [[float('nan') for _ in range(max_bin)] for _ in range(max_bin)] for i in range(max_bin): for j in range(max_bin): try: zsc_mtrx[i][j] = self._zscores[str(i)][str(j)] except KeyError: try: zsc_mtrx[i][j] = self._zscores[str(j)][str(i)] except KeyError: zsc_mtrx[i][j] = 0 masked_array = ma.array (zsc_mtrx, mask=isnan(zsc_mtrx)) masked_array_top = ma.array ( masked_array, mask=masked_array < self._config['upfreq']) masked_array_bot = ma.array ( masked_array, mask=self._config['lowfreq'] < masked_array) cmap = copy(viridis) cmap.set_bad('w', 1.) if not axe: fig = plt.figure(figsize=(25, 5.5)) else: fig = axe.get_figure() ax = fig.add_subplot(131) ims = ax.imshow(zsc_mtrx, origin='lower', interpolation="nearest", cmap=cmap) ax.set_ylabel('Particles') ax.set_xlabel('Particles') ax.set_title('Z-scores of the normalized Hi-C count') cbar = ax.figure.colorbar(ims) cbar.ax.set_ylabel('Z-score value') ax = plt.axes([.38, 0.11, .28, .61]) zdata = sorted(reduce(lambda x, y: x + y, [list(self._zscores[v].values()) for v in list(self._zscores.keys())])) try: _, _, patches = ax.hist(zdata, bins=25, linewidth=1, facecolor='none', edgecolor='k', density=True) except AttributeError: _, _, patches = ax.hist(zdata, bins=25, linewidth=1, facecolor='none', edgecolor='k') k2, pv = normaltest(zdata) normfit = sc_norm.pdf(zdata, np_mean(zdata), np_std(zdata)) normplot = ax.plot(zdata, normfit, ':o', color='grey', ms=3, alpha=.4) try: ax.hist( reduce(lambda x, y: x + y, [list(self._zscores[v].values()) for v in list(self._zscores.keys())]), bins=25, linewidth=2, facecolor='none', edgecolor='k', histtype='stepfilled', density=True) except AttributeError: ax.hist( reduce(lambda x, y: x + y, [list(self._zscores[v].values()) for v in list(self._zscores.keys())]), bins=25, linewidth=2, facecolor='none', edgecolor='k', histtype='stepfilled') height1 = height2 = 0 minv = nanmin(masked_array) maxv = nanmax(masked_array) for thispatch in patches: beg = thispatch.get_x() end = thispatch.get_x() + thispatch.get_width() ax.fill_betweenx([0] + [thispatch.get_height()] * 100, max(beg, self._config['upfreq']) if end > self._config['upfreq' ] else beg, min(end, self._config['lowfreq' ]) if beg < self._config['lowfreq'] else end, color=( Blues(beg / minv) if beg < self._config['lowfreq'] else Reds (end / maxv) if end > self._config['upfreq'] else 'w')) if end > self._config['lowfreq' ] and beg < self._config['lowfreq']: height1 = thispatch.get_height() elif beg < self._config['upfreq'] and end > self._config['upfreq' ]: height2 = thispatch.get_height() labels = [] p1 = plt.Rectangle((0, 0), 1, 1, fc=Blues(0.7), color='k') labels.append('< %.2f (force particles apart)' % ( self._config['lowfreq'])) p2 = plt.Rectangle((0, 0), 1, 1, fc="w", color='k') labels.append('Not used (no constraints)') p3 = plt.Rectangle((0, 0), 1, 1, fc=Reds(0.7), color='k') labels.append('> %.2f (force particles together)' % ( self._config['upfreq'])) try: ax.legend([p1, p2, p3, normplot[0]], labels + [ "Fitted normal distribution" + (("\n D'Agostino Pearson's normality test $K^2$=%.2f pv=%.3f" % (k2, pv)) if do_normaltest else '')], fontsize='small', frameon=False, bbox_to_anchor=(1.013, 1.3 + (.03 if do_normaltest else 0))) except TypeError: ax.legend([p1, p2, p3, normplot[0]], labels + [ "Fitted normal distribution" + (("\n D'Agostino Pearson's normality test $K^2$=%.2f pv=%.3f" % (k2, pv)) if do_normaltest else '')], frameon=False, bbox_to_anchor=(1.013, 1.3 + (.03 if do_normaltest else 0))) ax.set_xlabel('Z-scores') ax.set_ylabel('Proportion of particles') ax.vlines(self._config['lowfreq'], 0, height1, color='k', linestyle='-', lw=2, alpha=1) ax.vlines(self._config['upfreq'] , 0, height2, color='k', linestyle='-', lw=2, alpha=1) ax = fig.add_subplot(133) _ = ax.imshow(masked_array_top, origin='lower', interpolation="nearest", cmap='Reds') _ = ax.imshow(masked_array_bot, origin='lower', interpolation="nearest", cmap='Blues') ax.set_ylabel('Particles') ax.set_xlabel('Particles') ax = plt.axes([.42, 0.11, .48, .79]) ax.set_title('Binary representation of Z-scores used in the ' + 'computation of restraints') ax.set_title('Binary representation of Z-scores used in the ' + 'computation of restraints') ax.axison = False if savefig: tadbit_savefig(savefig) elif not axe: plt.show() plt.close('all')
[docs] def correlate_with_real_data(self, models=None, cluster=None, cutoff=None, off_diag=1, plot=False, axe=None, savefig=None, corr='spearman', midplot='hexbin', log_corr=True, contact_matrix=None, show_bad_columns=True): """ Plots the result of a correlation between a given group of models and original Hi-C data. :param None models: if None (default) the correlation will be computed using all the models. A list of numbers corresponding to a given set of models can be passed :param None cluster: compute the correlation only for the models in the cluster number 'cluster' :param None cutoff: distance cutoff (nm) to define whether two particles are in contact or not, default is 2 times resolution, times scale. :param None savefig: path to a file where to save the image generated; if None, the image will be shown using matplotlib GUI (the extension of the file name will determine the desired format). :param False plot: to display the plot :param True log_corr: log plot for correlation :param None axe: a matplotlib.axes.Axes object to define the plot appearance :param None contact_matrix: input a contact matrix instead of computing it from the models :param True show_bad_columns: Wether to hide or not bad columns in the contact map :returns: correlation coefficient rho, between the two matrices. A rho value greater than 0.7 indicates a very good correlation """ if not cutoff: cutoff = 2.0 * self.resolution * self._config['scale'] if contact_matrix: model_matrix = contact_matrix else: model_matrix = self.get_contact_matrix(models=models, cluster=cluster, cutoff=cutoff, show_bad_columns=show_bad_columns) oridata = [] moddata = [] for i in (v for v, z in enumerate(self._zeros) if z): for j in (v for v, z in list(enumerate(self._zeros))[i + off_diag:] if z): oriv = self._original_data[i][j] if oriv <= 0 or isnan(oriv): continue oridata.append(oriv) moddata.append(model_matrix[i][j]) # print oridata if corr == 'spearman': corr = spearmanr(moddata, oridata) elif corr == 'pearson': corr = pearsonr(moddata, oridata) elif corr == 'kendall': corr = kendalltau(moddata, oridata) elif corr == 'logpearson': corr = pearsonr(nozero_log_list(moddata), nozero_log_list(oridata)) elif corr == 'chi2': corr = chisquare(array(moddata), array(oridata)) corr = 1. / corr[0], corr[1] elif corr == 'scc': corr = scc(model_matrix, self._original_data, max_dist=int(len(model_matrix)/2)) else: raise NotImplementedError('ERROR: %s not implemented, must be one ' + 'of spearman, pearson or frobenius\n') if not plot and not savefig: return corr if not axe: fig = plt.figure(figsize=(20, 4.5)) else: fig = axe.get_figure() fig.suptitle('Correlation between normalized-real and modeled ' + 'contact maps (correlation=%.4f)' % (corr[0]), size='x-large') ax = fig.add_subplot(131) # imshow of the modeled data cmap = copy(plt.get_cmap('viridis')) cmap.set_bad('darkgrey', 1) ims = ax.imshow(model_matrix, origin='lower', interpolation="nearest", vmin=0, vmax=1, cmap=cmap, extent=(0.5, self.nloci + 0.5, 0.5, self.nloci + 0.5)) ax.set_ylabel('Particle') ax.set_xlabel('Particle') cbar = ax.figure.colorbar(ims) cbar.ax.yaxis.set_major_formatter(PercentFormatter(1.0)) cbar.ax.set_ylabel('Percentage of models with particles at <' + '%s nm' % (cutoff)) ax.set_title('Contact map') # correlation ax = fig.add_subplot(132) try: if log_corr: minmoddata = float(min([m for m in moddata if m])) minoridata = float(min([m for m in oridata if m])) moddata, oridata = (log2([(m if m else minmoddata / 2) * 100 for m in moddata]), log2([m if m else minoridata / 2 for m in oridata])) except: warn('WARNING: unable to log for correlation with real data...') slope, intercept, r_value, p_value, _ = linregress(moddata, oridata) # slope, intercept, r_value, p_value, std_err = linregress(moddata, oridata) if midplot == 'classic': lnr = ax.plot(moddata, intercept + slope * array (moddata), color='k', ls='--', alpha=.7) ax.legend(lnr, ['p-value: %.3f, R: %.3f' % (p_value, r_value)]) ax.plot(moddata, oridata, 'ro', alpha=0.5) ax.set_xlabel('Modelled data') ax.set_ylabel('Real data') elif midplot == 'hexbin': hb = ax.hexbin(moddata, oridata, mincnt=1, gridsize=50, cmap=plt.cm.Spectral_r) lnr = ax.plot(moddata, intercept + slope * array (moddata), color='k', ls='--', alpha=.7) ax.set_xlabel( '%sroportion of models with a particle pair closer than cutoff' % ( 'Log p' if log_corr else 'P')) ax.set_ylabel('%sormalized Hi-C count for a particle pair' % ( 'Log n' if log_corr else 'N')) cbaxes = fig.add_axes([0.41, 0.42, 0.005, 0.45]) cbar = plt.colorbar(hb, cax=cbaxes) # orientation='horizontal') cbar.set_label('Number of particle pairs') elif midplot == 'triple': maxval = max(oridata) minval = min(oridata) ax.set_visible(False) axleft = fig.add_axes([0.42, 0.18, 0.1, 0.65]) axleft.spines['right'].set_color('none') axleft.spines['bottom'].set_color('none') axleft.spines['left'].set_smart_bounds(True) axleft.spines['top'].set_smart_bounds(True) axleft.xaxis.set_ticks_position('top') axleft.yaxis.set_ticks_position('left') axleft.set_ylabel('Normalized Hi-C count for a particle pair') axleft.patch.set_visible(False) axbott = fig.add_axes([0.44, 0.13, 0.17, 0.5]) axbott.spines['left'].set_color('none') axbott.spines['top'].set_color('none') axbott.spines['left'].set_smart_bounds(True) axbott.spines['bottom'].set_smart_bounds(True) axbott.xaxis.set_ticks_position('bottom') axbott.yaxis.set_ticks_position('right') axbott.patch.set_visible(False) axbott.set_xlabel('Proportion of models with a particle pair ' + ' interacting') axmidl = fig.add_axes([0.44, 0.18, 0.17, 0.65]) axbott.hist(moddata, bins=20, alpha=.2) x, _ = histogram([i if str(i) != '-inf' else 0. for i in oridata], bins=20) axleft.barh(linspace(minval, maxval, 20), x, height=(maxval - minval) / 20, alpha=.2) axleft.set_ylim((minval - (maxval - minval) / 20, maxval + (maxval - minval) / 20)) axmidl.plot(moddata, oridata, 'k.', alpha=.3) axmidl.plot(moddata, intercept + slope * array (moddata), color='k', ls='--', alpha=.7) axmidl.set_ylim(axleft.get_ylim()) axmidl.set_xlim(axbott.get_xlim()) axmidl.axis('off') # axmidl.patch.set_visible(False) ax.set_title('Real versus modelled data') ax = fig.add_subplot(133) cmap = copy(plt.get_cmap('viridis')) cmap.set_bad('darkgrey', 1) with errstate(divide='ignore'): ims = ax.imshow(log2(self._original_data), origin='lower', interpolation="nearest", cmap=cmap, extent=(0.5, self.nloci + 0.5, 0.5, self.nloci + 0.5)) ax.set_ylabel('Genomic bin') ax.set_xlabel('Genomic bin') ax.set_title('Normalized Hi-C count') cbar = ax.figure.colorbar(ims) cbar.ax.set_ylabel('Log2 (normalized Hi-C data)') if savefig: tadbit_savefig(savefig) elif not axe: plt.show() plt.close('all') return corr
[docs] def view_centroid(self, **kwargs): """ shortcut for view_models(tool='plot', show='highlighted', highlight='centroid') :param kwargs: any parameters to be passed to view_models (i.e.: view_centroid(azimuth=30, elevation=10, show_axe=True, label=True)) """ self.view_models(tool='plot', show='highlighted', highlight='centroid', **kwargs)
[docs] def view_models(self, models=None, cluster=None, tool='chimera', show='all', highlight='centroid', savefig=None, cmd=None, color='index', align=True, smooth=False, particle_size=50, lw_main=3, alpha_part=0.5, **kwargs): """ Visualize a selected model in the three dimensions (either with Chimera or through matplotlib). :param None models: if None (default) the visualization will be computed using all the models. A list of numbers corresponding to a given set of models can be passed :param None cluster: compute the visualization only for the models in the cluster number 'cluster' :param 'chimera' tool: path to the external tool used to visualize the model. Can also be 'plot', to use matplotlib. :param None savefig: path to a file where to save the image OR movie generated (depending on the extension; accepted formats are png, mov and webm). if set to None, the image or movie will be shown using the default GUI. :param False alpha: only for matplotlib ('plot' option), transparency of particles. :param False smooth: only for matplotlib ('plot' option), spline smoothing (by default smoothing is 0.001). :param 50 particle_size: only for matplotlib ('plot' option), redefine size of particles. If None, resolution times scale is used. :param 'index' color: can be: * a string as: * '**index**' to color particles according to their position in the model (:func:`pytadbit.utils.extraviews.color_residues`) * '**tad**' to color particles according to the TAD they belong to (:func:`pytadbit.utils.extraviews.tad_coloring`) * '**border**' to color particles marking borders. Color according to their score (:func:`pytadbit.utils.extraviews.tad_border_coloring`) coloring function like. * a function, that takes as argument a model and any other parameter passed through the kwargs. * a list of (r, g, b) tuples (as long as the number of particles). Each r, g, b between 0 and 1. :param 'centroid' highlight: higlights a given model, or group of models. Can be either 'all', 'centroid' or 'best' ('best' being the model with the lowest IMP objective function value :param 'all' show: models to be displayed. Can be either 'all', 'grid' or 'highlighted'. :param None cmd: list of commands to be passed to the viewer. The chimera list is: :: focus set bg_color white windowsize 800 600 bonddisplay never #0 represent wire shape tube #0 radius 5 bandLength 100 segmentSubdivisions 1 followBonds on clip yon -500 ~label set subdivision 1 set depth_cue set dc_color black set dc_start 0.5 set dc_end 1 scale 0.8 Followed by the movie command to record movies: :: movie record supersample 1 turn y 3 120 wait 120 movie stop movie encode output SAVEFIG Or the copy command for images: :: copy file SAVEFIG png Passing as the following list as 'cmd' parameter: :: cmd = ['focus', 'set bg_color white', 'windowsize 800 600', 'bonddisplay never #0', 'shape tube #0 radius 10 bandLength 200 segmentSubdivisions 100 followBonds on', 'clip yon -500', '~label', 'set subdivision 1', 'set depth_cue', 'set dc_color black', 'set dc_start 0.5', 'set dc_end 1', 'scale 0.8'] will return the default image (other commands can be passed to modified the final image/movie). :param True align: show aligned models :param 15 radius: radius for the chimera particles :param kwargs: see :func:`pytadbit.utils.extraviews.plot_3d_model` or :func:`pytadbit.utils.extraviews.chimera_view` for other arguments to pass to this function. See also coloring function """ cluster = cluster or -1 if models: models = [m if isinstance(m, int) else self[m]['index'] if isinstance(m, basestring) else m['index'] for m in models] elif cluster > -1 and len(self.clusters) > 0: models = [self[str(m)]['index'] for m in self.clusters[cluster]] else: models = [m for m in self.__models] models = [m['rand_init'] if 'IMPmodel' in str(type(m)) else m for m in models] if smooth is True: smooth = None if color in ['tad', 'border'] and 'tads' not in kwargs: start = (float(self[models[0]]['description']['start']) / self[models[0]]['description']['resolution'] - 1) end = (float(self[models[0]]['description']['end' ]) / self[models[0]]['description']['resolution']) kwargs.update((('tads', self.experiment.tads), ('mstart', start ), ('mend', end))) centroid_model = models[0] if 'centroid' in [show, highlight] and len(models) > 1: centroid_model = self.centroid_model(models) if highlight == 'centroid': mdl = centroid_model elif highlight == 'best': mdl = self[sorted(models, key=lambda x: self[x]['objfun'])[0]]['index'] else: if highlight != 'all': warn("WARNING: represent_model value should be one of" + "'centroid', 'best' or 'all' not %s\n" % ( highlight) + "Highlighting no models.") mdl = 'all' # View with Matplotlib if tool == 'plot': pltshow = 'axe' not in kwargs model_coords = [] if len(models) > 1 and align: for model in self.align_models(models, **kwargs): model_coords.append(model) elif kwargs.get('reference_model', None) is not None and align: model_coords.append(self.align_models(models, **kwargs)[0]) else: for model in models: model_coords.append(( self[model]['x'], self[model]['y'], self[model]['z'])) if show in ['all', 'highlighted']: if 'axe' not in kwargs: fig = plt.figure(figsize=kwargs.get('figsize', (8, 8))) kwargs['axe'] = fig.add_subplot(1, 1, 1, projection='3d') for i in models: if particle_size is None: ps = self[i]['radius'] * 2 else: ps = particle_size if show == 'all' or i == mdl or mdl == 'all': plot_3d_model( *model_coords[models.index(i)], color=color, thin=False if highlight == 'all' else (i != mdl), particle_size=ps, smooth=smooth, alpha_part=alpha_part, lw_main=lw_main, **kwargs) if pltshow: try: kwargs['axe'].set_title('Model %s highlighted as %s' % ( self[mdl]['rand_init'], highlight)) except KeyError: kwargs['axe'].set_title('All models highlighted') else: sqrmdl = sqrt(len(models)) cols = int(round(sqrmdl + (0.0 if int(sqrmdl) == sqrmdl else .5))) rows = int(sqrmdl + .5) if pltshow: fig = plt.figure() for i in range(cols): for j in range(rows): if i * rows + j >= len(models): break this = self[models[i * rows + j]]['index'] if pltshow: kwargs['axe'] = fig.add_subplot( rows, cols, i * rows + j + 1, projection='3d') plot_3d_model( *model_coords[i * rows + j], color=color, thin=False if highlight == 'all' else (this != mdl), particle_size=ps, smooth=smooth, alpha_part=alpha_part, lw_main=lw_main, **kwargs) if pltshow: kwargs['axe'].set_title( 'Model %s' % self[this]['rand_init']) if savefig: tadbit_savefig(savefig) elif pltshow: plt.show() return # View with Chimera cmm_files = [] radius = 10 for model_num in models: if show in ['all', 'grid'] or model_num == mdl or mdl == 'all': self.write_cmm('/tmp/', model_num=model_num, color=color, **kwargs) cmm_files.append('/tmp/model.%s.cmm' % (self[model_num]['rand_init'])) radius = self[model_num]['radius'] chimera_view(cmm_files, savefig=savefig, chimera_bin=tool, chimera_cmd=cmd, highlight=(0 if (show == 'highlighted' and mdl != 'all') else models.index(mdl) if mdl != 'all' else mdl), align=align, grid=show == 'grid', radius=radius)
[docs] def angle_between_3_particles(self, parta, partb, partc, models=None, cluster=None, radian=False, all_angles=False): """ Calculates the angle between 3 particles. Given three particles A, B and C, the angle g (angle ACB, shown below): :: A /| /i| c/ | / | / | B )g |b \ | \ | a\ | \h| \| C is given by the theorem of Al-Kashi: .. math:: b^2 = a^2 + c^2 - 2ac\cos(g) :param parta: A particle number :param partb: A particle number :param partc: A particle number :param None models: if None (default) the angle will be computed using all the models. A list of numbers corresponding to a given set of models can be passed :param None cluster: compute the angle only for the models in the cluster number 'cluster' :param False radian: if True, return value in radians (in degrees otherwise) :returns: an angle, either in degrees or radians. If all_angles is true returns a list of the angle g, h, i (see picture above) """ # WARNING: here particle numbers are +1, they will be reduced # inside median_3d_dist a2 = np_median(self.__square_3d_dist(partb, partc, models=models, cluster=cluster)) c2 = np_median(self.__square_3d_dist(parta, partb, models=models, cluster=cluster)) b2 = np_median(self.__square_3d_dist(parta, partc, models=models, cluster=cluster)) a = a2**0.5 c = c2**0.5 try: g = acos((a2 - b2 + c2) / (2 * a * c)) except ValueError: g = 0. if not all_angles: return g if radian else degrees(g) b = b2**0.5 try: h = acos((a2 + b2 - c2) / (2 * a * b)) except ValueError: h = 0. i = pi - g - h return (g, h, i)
[docs] def particle_coordinates(self, part, models=None, cluster=None): """ Returns the mean coordinate of a given particle in a group of models. :param part: the index number of a particle :param None models: if None (default) the angle will be computed using all the models. A list of numbers corresponding to a given set of models can be passed :param None cluster: compute the angle only for the models in the cluster number 'cluster' """ cluster = cluster or -1 if models: models = [m if isinstance(m, int) else self[m]['index'] if isinstance(m, basestring) else m['index'] for m in models] elif cluster > -1 and len(self.clusters) > 0: models = [self[str(m)]['index'] for m in self.clusters[cluster]] else: models = [m for m in self.__models] part -= 1 xis = 0. yis = 0. zis = 0. for mod in models: xis += self[mod]['x'][part] yis += self[mod]['y'][part] zis += self[mod]['z'][part] xis /= len(models) yis /= len(models) zis /= len(models) return [xis, yis, zis]
[docs] def dihedral_angle(self, pa, pb, pc, pd, pe, models): """ Calculates the dihedral angle between 2 planes formed by 5 particles (one common to both planes). :param None models: if None (default) the angle will be computed using all the models. A list of numbers corresponding to a given set of models can be passed :param None cluster: compute the angle only for the models in the cluster number 'cluster' """ pa -= 1 pb -= 1 pc -= 1 pd -= 1 pe -= 1 dangles = [] for m in models: parta = array([self[m]['x'][pa], self[m]['y'][pa], self[m]['z'][pa]]) partb = array([self[m]['x'][pb], self[m]['y'][pb], self[m]['z'][pb]]) partc = array([self[m]['x'][pc], self[m]['y'][pc], self[m]['z'][pc]]) partd = array([self[m]['x'][pd], self[m]['y'][pd], self[m]['z'][pd]]) parte = array([self[m]['x'][pe], self[m]['y'][pe], self[m]['z'][pe]]) dangles.append(dihedral(parta, partb, partc, partd, parte)) return dangles
[docs] def median_3d_dist(self, part1, part2, models=None, cluster=None, plot=True, median=True, axe=None, savefig=None): """ Computes the median distance between two particles over a set of models :param part1: number corresponding to the first particle :param part2: number corresponding to the second particle :param None models: if None (default) the distance will be computed using all the models. A list of numbers corresponding to a given set of models can be passed :param None cluster: compute the distance only for the models in the cluster number 'cluster' :param True plot: if True, display a histogram and a box-plot of the distribution of the calculated distances. If False, return either the full list of the calculated distances or their median value :param True median: return either the full list of the calculated distances (False) or their median value (True), when 'plot' is set to False :param None axe: a matplotlib.axes.Axes object to define the plot appearance :param None savefig: path to a file where to save the image generated; if None, the image will be shown using matplotlib GUI (the extension of the file name will determine the desired format). :returns: if 'plot' is False, return either the full list of the calculated distances or their median value distances, either the list of distances. """ cluster = cluster or -1 if models: models = [m if isinstance(m, int) else self[m]['index'] if isinstance(m, basestring) else m['index'] for m in models] elif cluster > -1 and len(self.clusters) > 0: models = [self[str(m)]['index'] for m in self.clusters[cluster]] else: models = self.__models models = [self[mdl] for mdl in models] dists = [mdl.distance(part1, part2) for mdl in models] if not plot: if median: return np_median(dists) else: return dists plot_hist_box(dists, part1, part2, axe, savefig)
def __square_3d_dist(self, part1, part2, models=None, cluster=None): """ same as median_3d_dist, but return the square of the distance instead """ cluster = cluster or -1 part1 -= 1 part2 -= 1 if models: models = [m if isinstance(m, int) else self[m]['index'] if isinstance(m, basestring) else m['index'] for m in models] elif cluster > -1 and len(self.clusters) > 0: models = [self[str(m)]['index'] for m in self.clusters[cluster]] else: models = [m for m in self.__models] models = [self[mdl] for mdl in models] return [(mdl['x'][part1] - mdl['x'][part2])**2 + (mdl['y'][part1] - mdl['y'][part2])**2 + (mdl['z'][part1] - mdl['z'][part2])**2 for mdl in models] def __fast_square_3d_dist(self, part1, part2, models): """ same as median_3d_dist, but return the square of the distance instead we know what we are doing. """ return [(mdl['x'][part1] - mdl['x'][part2])**2 + (mdl['y'][part1] - mdl['y'][part2])**2 + (mdl['z'][part1] - mdl['z'][part2])**2 for mdl in models]
[docs] def objective_function_model(self, model, log=False, smooth=True, axe=None, savefig=None): """ This function plots the objective function value per each Monte-Carlo step :param model: the number of the model to plot :param False log: log plot :param True smooth: curve smoothing :param None savefig: path to a file where to save the image generated; if None, the image will be shown using matplotlib GUI (the extension of the file name will determine the desired format). """ self[model].objective_function(log=log, smooth=smooth, axe=axe, savefig=savefig)
[docs] def infer_unrestrained_particle_coords(self, xcoords, ycoords, zcoords): """ if a given particle (and direct neighbors) have no restraints. Infer the coordinates by linear interpolation using closest particles with restraints. :param xcoords: list of x coordinates :param ycoords: list of y coordinates :param zcoords: list of z coordinates :transforms: three input lists corresponding to the x, y and z coordinates of the FULL model """ gaps = [] gap = 0 for pos in range(self.nloci): if not self._zeros[pos]: gap += 1 elif gap > 1: if gap > 3: beg, end = pos - gap - 1, pos gaps.append((beg, end)) gap = 0 else: gap = 0 for beg, end in gaps: x1 = xcoords[beg] y1 = ycoords[beg] z1 = zcoords[beg] x2 = xcoords[end] y2 = ycoords[end] z2 = zcoords[end] div = end - beg - 1 xstep = (x2 - x1) / div ystep = (y2 - y1) / div zstep = (z2 - z1) / div for pos, i in enumerate(list(range(beg + 1, end)), 1): xcoords[i] = x1 + xstep * pos ycoords[i] = y1 + ystep * pos zcoords[i] = z1 + zstep * pos
[docs] def write_cmm(self, directory, model_num=None, models=None, cluster=None, color='index', rndname=True, infer_unrestrained=False, **kwargs): """ Save a model in the cmm format, read by Chimera (http://www.cgl.ucsf.edu/chimera). **Note:** If none of model_num, models or cluster parameter are set, ALL the models will be written. :param directory: location where the file will be written (note: the name of the file will be model_1.cmm if model number is 1) :param None model_num: the number of the model to save :param None models: a list of numbers corresponding to a given set of models to save :param None cluster: save the models in the cluster number 'cluster' :param True rndname: If True, file names will be formatted as: model.RND.cmm, where RND is the random number feed used by IMP to generate the corresponding model. If False, the format will be: model_NUM_RND.cmm where NUM is the rank of the model in terms of objective function value :param 'index' color: can be: * a string as: * '**index**' to color particles according to their position in the model (:func:`pytadbit.utils.extraviews.color_residues`) * '**tad**' to color particles according to the TAD they belong to (:func:`pytadbit.utils.extraviews.tad_coloring`) * '**border**' to color particles marking borders. Color according to their score (:func:`pytadbit.utils.extraviews.tad_border_coloring`) coloring function like. * a function, that takes as argument a model and any other parameter passed through the kwargs. * a list of (r, g, b) tuples (as long as the number of particles). Each r, g, b between 0 and 1. :param kwargs: any extra argument will be passed to the coloring function """ cluster = cluster or -1 model_num = model_num or -1 if model_num > -1: models = [model_num] elif models: models = [m if isinstance(m, int) else self[m]['index'] if isinstance(m, basestring) else m['index'] for m in models] elif cluster > -1 and len(self.clusters) > 0: models = [self[str(m)]['index'] for m in self.clusters[cluster]] else: models = [m for m in self.__models] for model_num in models: try: model = self[model_num] except KeyError: model = self._bad_models[model_num] model.write_cmm(directory, color=color, rndname=rndname, model_num=model_num, **kwargs)
[docs] def write_json(self, filename, color='index', models=None, cluster=None, title=None, infer_unrestrained=False, **kwargs): """ Save a model in the json format, read by TADkit. **Note:** If none of model_num, models or cluster parameter are set, ALL the models will be written. :param directory: location where the file will be written (note: the name of the file will be model_1.cmm if model number is 1) :param None model_num: the number of the model to save :param True rndname: If True, file names will be formatted as: model.RND.cmm, where RND is the random number feed used by IMP to generate the corresponding model. If False, the format will be: model_NUM_RND.cmm where NUM is the rank of the model in terms of objective function value :param None filename: overide the default file name writing :param kwargs: any extra argument will be passed to the coloring function :param False infer_unrestrained: infer unrestrained particle coordinates by linear interpolation using closest particles with restraints """ cluster = cluster or -1 form = ''' { "metadata" : { "version" : 1.0, "type" : "dataset", "generator": "TADbit" }, "object": {\n%(descr)s "uuid": "%(sha)s", "title": "%(title)s", "bp_per_nm": %(scale)s, "datatype": "xyz", "components": 3, "source": "local", "dependencies": %(dep)s }, "models": [\n%(xyz)s ], "clusters":%(cluster)s, "centroids":%(centroid)s, "restraints": %(restr)s, "hic_data": { "data": {''' form_end = '''}, "n": %(len_hic_data)i , "tads": [%(tad_def)s]} } ''' fil = {} fil['title'] = title or "Sample TADbit data" fil['scale'] = str(self._config['scale']) versions = get_dependencies_version(dico=True) fil['dep'] = str(dict((k.strip(), v.strip()) for k, v in list(versions.items()) if k in [' TADbit', 'IMP', 'MCL'])).replace("'", '"') ukw = 'UNKNOWN' def tocamel(key): key = ''.join([(w[0].upper() + w[1:]) if i else w for i, w in enumerate(key.split('_'))]) key = ''.join((w[0].upper() + w[1:]) if i else w for i, w in enumerate(key.split(' '))) return key try: try: my_descr = dict(self.description) except TypeError: my_descr = {} if not my_descr.get('start', 0): my_descr['start'] = 0 if not my_descr.get('end', 0): my_descr['end' ] = self.nloci my_descr['chrom'] = my_descr['chromosome'] if 'chromosome' in my_descr and isinstance(my_descr['chromosome'], list) else ["%s" % (my_descr.get('chromosome', 'Chromosome'))] if 'chromosome' in my_descr: del my_descr['chromosome'] if 'chrom_start' not in my_descr: warn("WARNING: chrom_start variable wasn't set, setting it to" + " the position in the experiment matrix (%s)" % ( str([m for m in my_descr['start']] if isinstance(my_descr['start'], list) else my_descr['start']))) my_descr['chrom_start'] = [m for m in my_descr['start']] if isinstance(my_descr['start'], list) else my_descr['start'] if 'chrom_end' not in my_descr: warn("WARNING: chrom_end variable wasn't set, setting it to" + " the position in the experiment matrix (%s)" % ( str([m for m in my_descr['end']] if isinstance(my_descr['end'], list) else my_descr['end']))) my_descr['chrom_end'] = [m for m in my_descr['end']] if isinstance(my_descr['end'], list) else my_descr['end'] if not my_descr['species']: warn("WARNING: species wasn't set, The resulting JSON will not work properly in TADkit.") # coordinates inside an array in case different models # from different places in the genome if not isinstance(my_descr['chrom_start'],list): my_descr['chrom_start'] = [my_descr['chrom_start']] my_descr['chrom_end' ] = [my_descr['chrom_end' ]] fil['descr'] = ',\n'.join( (' ' * 19) + '"%s" : %s' % (tocamel(k), ('"%s"' % (v)) if not ((isinstance(v, int) and not isinstance(v, bool)) or isinstance(v, list) or isinstance(v, float)) else str(v).replace("'", '"')) for k, v in list(my_descr.items())) if fil['descr']: fil['descr'] += ',' except AttributeError: fil['descr'] = '"description": "Just some models"' aligned_coords = self.align_models(models=models, cluster=cluster, by_cluster=True) if models: models = [m if isinstance(m, int) else self[m]['index'] if isinstance(m, basestring) else m['index'] for m in models] fil['cluster'] = '[]' fil['centroid'] = '[]' elif cluster > -1 and len(self.clusters) > 0: models = [self[str(m)]['index'] for m in self.clusters[cluster]] fil['cluster'] = '[[' + ','.join(self.clusters[cluster]) + ']]' fil['centroid'] = '[' + self[self.centroid_model(cluster=cluster)]['rand_init'] + ']' else: models = [m for m in self.__models] fil['cluster'] = '[' + ','.join('[' + ','.join(self.clusters[c]) + ']' for c in self.clusters) + ']' fil['centroid'] = '[' + ','.join( [self[self.centroid_model(cluster=c)]['rand_init'] for c in self.clusters]) + ']' fil['xyz'] = [] for m_idx in range(len(models)): m = models[m_idx] if infer_unrestrained: self.infer_unrestrained_particle_coords( aligned_coords[m_idx][0], aligned_coords[m_idx][1], aligned_coords[m_idx][2]) model = {'rand_init': self[models[m_idx]]['rand_init'], 'x': aligned_coords[m_idx][0], 'y': aligned_coords[m_idx][1], 'z': aligned_coords[m_idx][2]} fil['xyz'].append((' ' * 18) + '{"ref": %s,"data": [' % ( model['rand_init']) + ','.join( '%.0f,%.0f,%.0f' % (model['x'][i], model['y'][i], model['z'][i]) for i in range(len(model['x']))) + ']}') fil['xyz'] = ',\n'.join(fil['xyz']) # creates a UUID for this particular set of coordinates AND for TADbit version fil['sha'] = str(uuid5(UUID(md5(versions[' TADbit'].encode('utf-8')).hexdigest()), fil['xyz'])) try: fil['restr'] = '[' + ','.join('[%s,%s,"%s",%f]' % ( k[0], k[1], self._restraints[k][0], self._restraints[k][2]) for k in self._restraints) + ']' except: fil['restr'] = '[]' fil['len_hic_data'] = len(self._original_data) try: fil['tad_def'] = ','.join( '[' + ','.join([str(i), str(self.experiment.tads[tad]['start'] * self.resolution + my_descr['chrom_start'][0]), str(self.experiment.tads[tad]['end'] * self.resolution + my_descr['chrom_start'][0]), str(self.experiment.tads[tad]['score'])]) + ']' for i,tad in enumerate(self.experiment.tads) if self.experiment.tads[tad]['start'] >= my_descr['start'] and self.experiment.tads[tad]['end'] <= my_descr['end']) except: fil['tad_def'] = '' out_f = open(filename, 'w') out_f.write(form % fil) size = len(self._original_data) out_f.write(','.join('"{}":{}'.format((i * size) + j, round(ncol, 6)) for i, nrow in enumerate(self._original_data) for j, ncol in enumerate(nrow) if ncol and not isnan(ncol))) out_f.write(form_end % fil) out_f.close()
[docs] def write_xyz(self, directory, model_num=None, models=None, cluster=None, get_path=False, rndname=True): """ Writes a xyz file containing the 3D coordinates of each particle in the model. .. note:: If none of model_num, models or cluster parameter are set, ALL the models will be written. :param directory: location where the file will be written (note: the file name will be model.1.xyz, if the model number is 1) :param None model_num: the number of the model to save :param None models: a list of numbers corresponding to a given set of models to be written :param None cluster: save the models in the cluster number 'cluster' :param True rndname: If True, file names will be formatted as: model.RND.xyz, where RND is the random number feed used by IMP to generate the corresponding model. If False, the format will be: model_NUM_RND.xyz where NUM is the rank of the model in terms of objective function value :param False get_path: whether to return, or not, the full path where the file has been written """ cluster = cluster or -1 model_num = model_num or -1 if model_num > -1: models = [model_num] elif models: models = [m if isinstance(m, int) else self[m]['index'] if isinstance(m, basestring) else m['index'] for m in models] elif cluster > -1 and len(self.clusters) > 0: models = [self[str(m)]['index'] for m in self.clusters[cluster]] else: models = [m for m in self.__models] for model_num in models: try: model = self[model_num] except KeyError: model = self._bad_models[model_num] path_f = model.write_xyz(directory, model_num=model_num, get_path=get_path, rndname=rndname) if get_path: return path_f
[docs] def write_xyz_babel(self, directory, model_num=None, models=None, cluster=None, get_path=False, rndname=True): """ Writes a xyz file containing the 3D coordinates of each particle in the model using a file format compatible with babel (http://openbabel.org/wiki/XYZ_%28format%29). .. note:: If none of model_num, models or cluster parameter are set, ALL the models will be written. :param directory: location where the file will be written (note: the file name will be model.1.xyz, if the model number is 1) :param None model_num: the number of the model to save :param None models: a list of numbers corresponding to a given set of models to be written :param None cluster: save the models in the cluster number 'cluster' :param True rndname: If True, file names will be formatted as: model.RND.xyz, where RND is the random number feed used by IMP to generate the corresponding model. If False, the format will be: model_NUM_RND.xyz where NUM is the rank of the model in terms of objective function value :param False get_path: whether to return, or not, the full path where the file has been written """ cluster = cluster or -1 model_num = model_num or -1 if model_num > -1: models = [model_num] elif models: models = [m if isinstance(m, int) else self[m]['index'] if isinstance(m, basestring) else m['index'] for m in models] elif cluster > -1 and len(self.clusters) > 0: models = [self[str(m)]['index'] for m in self.clusters[cluster]] else: models = [m for m in self.__models] for model_num in models: try: model = self[model_num] except KeyError: model = self._bad_models[model_num] path_f = model.write_xyz_babel(directory, model_num=model_num, get_path=get_path, rndname=rndname) if get_path: return path_f
[docs] def get_persistence_length(self, begin=0, end=None, axe=None, savefig=None, savedata=None, plot=True): """ Calculates the persistence length (Lp) of given section of the model. Persistence length is calculated according to [Bystricky2004]_ : .. math:: <R^2> = 2 \\times Lp^2 \\times (\\frac{Lc}{Lp} - 1 + e^{\\frac{-Lc}{Lp}}) with the contour length as :math:`Lc = \\frac{d}{c}` where :math:`d` is the genomic dstance in bp and :math:`c` the linear mass density of the chromatin (in bp/nm). :param 0 begin: starting particle of the region to consider :param None end: ending particle of the region to consider :returns: (float) persistence length """ if not end: end = self.nloci wloci = [i for i in range(self.nloci)] # Maximum genomic distance max_gen_dist=end-begin # Quantities for all models R2_all = [0]*max_gen_dist R4_all = [0]*max_gen_dist cnt_all = [0]*max_gen_dist for model in range(len(self.__models)): # Quantities within each model R2 = [0]*max_gen_dist R4 = [0]*max_gen_dist cnt = [0]*max_gen_dist x = [] ; y = [] ; z = [] for locus in range(begin,end): x.append(self[model]['x'][locus]) y.append(self[model]['y'][locus]) z.append(self[model]['z'][locus]) #Compute the contact matrix squared_distance_matrix = squared_distance_matrix_calculation_wrapper(x, y, z, max_gen_dist) # Compute the average R2 per single model for i, j in combinations(wloci, 2): R2[abs(i-j)] += squared_distance_matrix[i][j] R4[abs(i-j)] += (squared_distance_matrix[i][j]*squared_distance_matrix[i][j]) cnt[abs(i-j)] += 1 for i in range(max_gen_dist): if cnt[i] != 0: R2[i] = R2[i] / cnt[i] R4[i] = R4[i] / cnt[i] else: R2[i] = 0.0 R4[i] = 0.0 for i in range(max_gen_dist): R2_all[i] += R2[i] R4_all[i] += R4[i] cnt_all[i] += 1 avgs = [] std_devs = [] for i in range(max_gen_dist): if cnt_all[i] != 0: avg = R2_all[i]/cnt_all[i] avg2 = R4_all[i]/cnt_all[i] std_dev = sqrt(avg2-avg*avg) avgs.append(avg) std_devs.append(std_dev) else: avgs.append(0) std_devs.append(0) x = linspace(0.0 , float(max_gen_dist), num=max_gen_dist, endpoint=False) persistence_length, pcov = curve_fit(R2_vs_L, x, avgs) # write a 3 column file with genomic_distance | R2 | std_dev_R2 if savedata: out = open(savedata, 'w') out.write('#gen_dist\tR2\tstd_dev_R2\n') out.write('#persistence_length = %f\n' % (persistence_length)) for gen_dist in range(max_gen_dist): out.write('%f\t%f\t%f\n' % (gen_dist,avgs[gen_dist],std_devs[gen_dist])) out.close() if not plot: return persistence_length[0] # If plot we do the plot of R2 vs L show = False if axe else True axe = setup_plot(axe) plots = [] plots = axe.plot(list(range(0, self.nloci)), avgs, color='black') axe.set_xlim((0, max_gen_dist)) axe.set_xlabel('Genomic distance (particle)') axe.set_ylim((0, max(avgs))) axe.set_ylabel('<$R^{2}$> $(nm^2)$') if savefig: tadbit_savefig(savefig) elif show: plt.show() plt.close('all') return persistence_length[0]
[docs] def save_models(self, outfile, minimal=()): """ Saves all the models in pickle format (python object written to disk). :param path_f: path where to save the pickle file :param () minimal: list of items to exclude from save. Options: - 'restraints': used for modeling common to all models - 'zscores': used generate restraints common to all models - 'original_data': used generate Z-scores common to all models - 'log_objfun': generated during modeling model specific """ out = open(outfile, 'wb') dump(self._reduce_models(minimal=minimal), out, HIGHEST_PROTOCOL) out.close()
def _reduce_models(self, minimal=()): """ reduce structural models objects to a dictionary to be saved :param () minimal: list of items to exclude from save. Options: - 'restraints': used for modeling common to all models - 'zscores': used generate restraints common to all models - 'original_data': used generate Z-scores common to all models - 'objfun': generated during modeling model specific :returns: this dictionary """ to_save = {} if 'objfun' in minimal: for m in self.__models: self.__models[m]['log_objfun'] = None to_save['models'] = self.__models else: to_save['models'] = self.__models to_save['bad_models'] = self._bad_models to_save['description'] = self.description to_save['nloci'] = self.nloci to_save['clusters'] = self.clusters to_save['resolution'] = self.resolution to_save['original_data'] = None if 'original_data' in minimal else self._original_data to_save['config'] = self._config to_save['zscore'] = {} if 'zscores' in minimal else self._zscores to_save['restraints'] = {} if 'restraints' in minimal else self._restraints to_save['zeros'] = self._zeros return to_save def _get_models(self, models, cluster): """ Internal function to transform cluster name, model name, or model list into proper list of models that can be processed by StructuralModels functions """ cluster = cluster or -1 if models: models = [m if isinstance(m, int) else self[m]['index'] if isinstance(m, basestring) else m['index'] for m in models] elif cluster > -1 and len(self.clusters) > 0: models = [self[str(m)]['index'] for m in self.clusters[cluster]] else: models = [m for m in self.__models] return models def _windowize(self, dists, steps, average=True, interval=0, minerr=0.): lmodels = len(dists[0]) distsk = {1: dists} for k in steps[1:] if steps[0] == 1 else steps: distsk[k] = [None for _ in range(k // 2 + interval)] for i in range(interval, self.nloci - k - interval + 1): distsk[k].append(reduce(lambda x, y: x + y, [dists[i + j] for j in range(k)])) if k == 1: continue # calculate the mean for steps larger than 1 distsk[k][-1] = [mean_none([distsk[k][-1][i + lmodels * j] for j in range(k)]) for i in range(lmodels)] new_distsk = {} errorp = {} errorn = {} for k, dists in list(distsk.items()): new_distsk[k] = [] errorp[k] = [] errorn[k] = [] for part in dists: if not part: new_distsk[k].append(None) errorp[k].append(None) errorn[k].append(None) continue try: mean_part = (np_mean([p for p in part if p is not None]) if average else np_median([p for p in part if p is not None])) except IndexError: # bug in new version of numpy? mean_part = float('nan') new_distsk[k].append(mean_part) try: errorn[k].append(mean_part - 2 * np_std( [p for p in part if p is not None])) if errorn[k][-1] < minerr: errorn[k][-1] = minerr errorp[k].append(mean_part + 2 * np_std( [p for p in part if p is not None])) if errorp[k][-1] < minerr: errorp[k][-1] = minerr except TypeError: errorn[k].append(None) errorp[k].append(None) return new_distsk, errorn, errorp def _plot_polymer(self, axe): where = axe.get_ylim()[0] axe.scatter(list(range(1, self.nloci + 1)), [where] * self.nloci, s=40, color=[(0.15, 0.15, 0.15) if i else (0.7, 0.7, 0.7) for i in self._zeros], clip_on=False, zorder=100, alpha=0.75) axe.set_ylim((where, axe.get_ylim()[1])) def _generic_per_particle_plot(self, steps, distsk, error, errorp, errorn, savefig, axe, xlabel='', ylabel='', title='', colors=None, ylim=None): if not colors: colors = ['grey', 'darkgreen', 'darkblue', 'purple', 'darkorange', 'darkred'][-len(steps):] if len(steps) > 6: raise Exception('Sorry not enough colors to do this :)') ax = setup_plot(axe) plots = [] for k in steps: plots += ax.plot(list(range(1, len(distsk[k]) + 1)), distsk[k], color=colors[steps.index(k)], lw=steps.index(k) + 1, alpha=0.5) if error: for k in steps: plots += ax.plot(list(range(1, len(errorp[k]) + 1)), errorp[k], color=colors[steps.index(k)], ls='--') ax.plot(list(range(1, len(errorp[k]) + 1)), errorn[k], color=colors[steps.index(k)], ls='--') ax.set_ylabel(ylabel) ax.set_xlabel(xlabel) self._plot_polymer(ax) scat1 = plt.Line2D((0, 1), (0, 0), color=(0.15, 0.15, 0.15), marker='o', linestyle='') scat2 = plt.Line2D((0, 1), (0, 0), color=(0.7, 0.7, 0.7 ), marker='o', linestyle='') try: ax.legend(plots + [scat1, scat2], ['Average for %s particle%s' % (k, 's' if k else '') for k in steps] + ( ['+/- 2 standard deviations' for k in steps] if error else []) + ['particles with restraints', 'particles without restraints'], fontsize='small', numpoints=1, bbox_to_anchor=(1, 0.5), loc='center left') except TypeError: ax.legend(plots + [scat1, scat2], ['Average for %s particle%s' % (k, 's' if k else '') for k in steps] + ( ['+/- 2 standard deviations' for k in steps] if error else [] + ['particles with restraints', 'particles without restraints']), numpoints=1, bbox_to_anchor=(1, 0.5), loc='center left') ax.set_xlim((1, self.nloci)) if ylim: a, b = ax.get_ylim() ax.set_ylim((max(0, a), min(1, b))) ax.set_title(title) plt.subplots_adjust(left=0.1, right=0.77) if savefig: tadbit_savefig(savefig) plt.close('all') elif not axe: plt.show() plt.close('all')
class ClusterOfModels(dict): def __str__(self): out1 = ' Cluster #%s has %s models [top model: %s]\n' out = 'Total number of clusters: %s\n%s' % ( len(self), ''.join([out1 % (k, len(self[k]), self[k][0]) for k in self])) return out