"""
November 7, 2013.
"""
from __future__ import print_function
from future import standard_library
from builtins import next
standard_library.install_aliases()
from sys import stderr, modules
from io import IOBase
from collections import OrderedDict
from warnings import warn
from math import sqrt, isnan
try:
from pickle5 import load # python < 3.8
except ImportError:
from pickle import load
from pysam import AlignmentFile
import numpy as np
from pytadbit.parsers.gzopen import gzopen
from pytadbit import HiC_data
from pytadbit.parsers.hic_bam_parser import get_matrix
try:
from pytadbit.parsers.cooler_parser import parse_cooler, is_cooler
except ImportError:
def is_cooler(*unused):
stderr.write('WARNING: cannot detect if input is a cooler file. Probably ' +
'you need to install h5py\n')
return False
pass
try:
file_types = file, IOBase
except NameError:
file_types = (IOBase,)
try:
basestring
except NameError:
basestring = str
HIC_DATA = True
class AutoReadFail(Exception):
"""
Exception to handle failed autoreader.
"""
pass
def is_asymmetric(matrix):
"""
Helper functions for the autoreader.
"""
maxn = len(matrix)
for i in range(maxn):
maxi = matrix[i] # slightly more efficient
for j in range(i+1, maxn):
if maxi[j] != matrix[j][i]:
if isnan(maxi[j]) and isnan(matrix[j][i]):
continue
return True
return False
def is_asymmetric_dico(hic):
"""
Helper functions for the optimal_reader
"""
ncol = len(hic)
for i in range(ncol):
for j in range(i, ncol):
p1 = i * ncol + j
p2 = j * ncol + i
if hic.get(p1, 0) != hic.get(p2, 0):
return True
return False
def symmetrize_dico(hic):
"""
Make an HiC_data object symmetric by summing two halves of the matrix
"""
ncol = len(hic)
for i in range(ncol):
incol = i * ncol
for j in range(i, ncol):
p1 = incol + j
p2 = j * ncol + i
val = hic.get(p1, 0) + hic.get(p2, 0)
if val:
hic[p1] = hic[p2] = val
def symmetrize(matrix):
"""
Make a matrix symmetric by summing two halves of the matrix
"""
maxn = len(matrix)
for i in range(maxn):
for j in range(i, maxn):
matrix[i][j] = matrix[j][i] = matrix[i][j] + matrix[j][i]
def optimal_reader(f, normalized=False, resolution=1):
"""
Reads a matrix generated by TADbit.
Can be slower than autoreader, but uses almost a third of the memory
:param f: an iterable (typically an open file).
:param False normalized: if the matrix is normalized
:param 1 resolution: resolution of the matrix
"""
# get masked bins
masked = {}
pos = 0
for line in f:
if line[0] != '#':
break
pos += len(line)
if line.startswith('# MASKED'):
masked = dict([(int(n), True) for n in line.split()[2:]])
f.seek(pos)
# super fast
header = [tuple(line.split(None, 2)[:2]) for line in f]
f.seek(pos)
ncol = len(header)
# Get the numeric values and remove extra columns
num = float if normalized else int
chromosomes, sections, resolution = _header_to_section(header, resolution)
#############################################################
# monkey patch HiC_data to make it faster
def fast_setitem(self, key, val):
"Use directly dict setitem"
super(HiC_data, self).__setitem__(key, val)
def fast_getitem(self, key):
"Use directly dict setitem"
try:
return super(HiC_data, self).__getitem__(key)
except KeyError:
return 0
original_setitem = HiC_data.__setitem__
original_getitem = HiC_data.__getitem__
# apply_async the patch
HiC_data.__setitem__ = fast_setitem
HiC_data.__getitem__ = fast_getitem
hic = HiC_data(((j, num(v))
for i, line in enumerate(f)
for j, v in enumerate(line.split()[2:], i * ncol)
if num(v)), size=ncol, masked=masked,
dict_sec=sections, chromosomes=chromosomes,
resolution=resolution, symmetricized=False)
# make it symmetric
if is_asymmetric_dico(hic):
hic.symmetricized = True
symmetrize_dico(hic)
# undo patching
HiC_data.__setitem__ = original_setitem
HiC_data.__getitem__ = original_getitem
hic.__setitem__ = original_setitem
hic.__getitem__ = original_getitem
#############################################################
return hic
def __read_file_header(f):
"""
Read file header, inside first commented lines of a file
:returns masked dict, chromsomes orderedDict, crm, beg, end, resolution:
"""
masked = {}
chromosomes = OrderedDict()
crm, beg, end, reso = None, None, None, None
fpos = f.tell()
for line in f:
if line[0] != '#':
break
fpos += len(line)
if line.startswith('# MASKED'):
try:
masked = dict([(int(n), True) for n in line.split()[-1].split(',')])
except ValueError: # nothing here
pass
elif line.startswith('# CRM'):
_, _, crm, size = line.split()
chromosomes[crm] = int(size)
elif 'resolution:' in line:
_, coords, reso = line.split()
try:
crm, pos = coords.split(':')
beg, end = list(map(int, pos.split('-')))
except ValueError:
crm = coords
beg, end = None, None
reso = int(reso.split(':')[1])
f.seek(fpos)
if crm == 'full':
crm = None
return masked, chromosomes, crm, beg, end, reso
def abc_reader(f):
"""
Read matrix stored in 3 column format (bin1, bin2, value)
:param f: an iterable (typically an open file).
:returns: An iterator to be converted in dictionary, matrix size, raw_names
as list of tuples (chr, pos), dictionary of masked bins, and boolean
reporter of symetric transformation
"""
masked, chroms, crm, beg, end, reso = __read_file_header(f) # TODO rest of it not used here
sections = {}
size = 0
for c in chroms:
sections[c] = size
size += chroms[c] // reso + 1
if beg:
header = [(crm, '%d-%d' % (l * reso + 1, (l + 1) * reso))
for l in range(beg, end)]
else:
header = [(c, '%d-%d' % (l * reso + 1, (l + 1) * reso))
for c in chroms
for l in range(sections[c], sections[c] + chroms[c] // reso + 1)]
num = int if HIC_DATA else float
offset = (beg or 0) * (1 + size)
def _disect(x):
a, b, v = x.split()
return (int(a) + int(b) * size + offset, num(v))
items = tuple(_disect(line) for line in f)
return items, size, header, masked, False
def __is_abc(f):
"""
Only works for matrices with more than 3 bins
"""
fpos = f.tell()
count = 0
for line in f:
if line.startswith('#'):
continue
count += 1
if len(line.split()) != 3:
f.seek(fpos)
return False
if count > 3:
f.seek(fpos)
return True
f.seek(fpos)
return False
def autoreader(f):
"""
Auto-detect matrix format of HiC data file.
:param f: an iterable (typically an open file).
:returns: An iterator to be converted in dictionary, matrix size, raw_names
as list of tuples (chr, pos), dictionary of masked bins, and boolean
reporter of symetric transformation
"""
masked = __read_file_header(f)[0] # TODO rest of it not used here
# Skip initial comment lines and read in the whole file
# as a list of lists.
line = next(f)
items = [line.split()] + [line.split() for line in f]
# Count the number of elements per line after the first.
# Wrapping in a set is a trick to make sure that every line
# has the same number of elements.
S = set([len(line) for line in items[1:]])
ncol = S.pop()
# If the set 'S' is not empty, at least two lines have a
# different number of items.
if S:
raise AutoReadFail('ERROR: unequal column number')
# free little memory
del(S)
nrow = len(items)
# Auto-detect the format, there are only 4 cases.
if ncol == nrow:
try:
_ = [float(item) for item in items[0]
if not item.lower() in ['na', 'nan']]
# Case 1: pure number matrix.
header = False
trim = 0
except ValueError:
# Case 2: matrix with row and column names.
header = True
trim = 1
warn('WARNING: found header')
else:
if len(items[0]) == len(items[1]):
# Case 3: matrix with row information.
header = False
trim = ncol - nrow
# warn('WARNING: found %d colum(s) of row names' % trim)
else:
# Case 4: matrix with header and row information.
header = True
trim = ncol - nrow + 1
warn('WARNING: found header and %d colum(s) of row names' % trim)
# Remove header line if needed.
if header and not trim:
header = items.pop(0)
nrow -= 1
elif not trim:
header = list(range(1, nrow + 1))
elif not header:
header = [tuple([a for a in line[:trim]]) for line in items]
else:
del(items[0])
nrow -= 1
header = [tuple([a for a in line[:trim]]) for line in items]
# Get the numeric values and remove extra columns
num = int if HIC_DATA else float
try:
items = [[num(a) for a in line[trim:]] for line in items]
except ValueError:
if not HIC_DATA:
raise AutoReadFail('ERROR: non numeric values')
try:
# Dekker data 2009, uses integer but puts a comma...
items = [[int(float(a)+.5) for a in line[trim:]] for line in items]
warn('WARNING: non integer values')
except ValueError:
try:
# Some data may contain 'NaN' or 'NA'
items = [
[0 if a.lower() in ['na', 'nan']
else int(float(a)+.5) for a in line[trim:]]
for line in items]
warn('WARNING: NA or NaN founds, set to zero')
except ValueError:
raise AutoReadFail('ERROR: non numeric values')
# Check that the matrix is square.
ncol -= trim
if ncol != nrow:
raise AutoReadFail('ERROR: non square matrix')
symmetricized = False
if is_asymmetric(items):
warn('WARNING: matrix not symmetric: summing cell_ij with cell_ji')
symmetrize(items)
symmetricized = True
return (((i + j * ncol, a) for i, line in enumerate(items)
for j, a in enumerate(line) if a),
ncol, header, masked, symmetricized)
def _header_to_section(header, resolution):
"""
converts row-names of the form 'chr12\t1000-2000' into sections, suitable
to create HiC_data objects. Also creates chromosomes, from the reads
"""
sections = {}
chromosomes = None
if (isinstance(header, list)
and isinstance(header[0], tuple)
and len(header[0]) > 1):
chromosomes = OrderedDict()
for i, h in enumerate(header):
if '-' in h[1]:
a, b = list(map(int, h[1].split('-')))
if resolution==1:
resolution = abs(b - a) + 1
elif resolution != abs(b - a) + 1:
raise Exception('ERROR: found different resolution, ' +
'check headers')
else:
a = int(h[1])
if resolution==1 and i:
resolution = abs(a - b) + 1
elif resolution == 1:
b = a
sections[(h[0], a // resolution)] = i
chromosomes.setdefault(h[0], 0)
chromosomes[h[0]] += 1
return chromosomes, sections, resolution
[docs]def read_matrix(things, parser=None, hic=True, resolution=1, **kwargs):
"""
Read and checks a matrix from a file (using
:func:`pytadbit.parser.hic_parser.autoreader`) or a list.
:param things: might be either a file name, a file handler or a list of
list (all with same length)
:param None parser: a parser function that returns a tuple of lists
representing the data matrix,
with this file example.tsv:
::
chrT_001 chrT_002 chrT_003 chrT_004
chrT_001 629 164 88 105
chrT_002 86 612 175 110
chrT_003 159 216 437 105
chrT_004 100 111 146 278
the output of parser('example.tsv') might be:
``([629, 86, 159, 100, 164, 612, 216, 111, 88, 175, 437, 146, 105, 110,
105, 278])``
:param 1 resolution: resolution of the matrix
:param True hic: if False, TADbit assumes that files contains normalized
data
:returns: the corresponding matrix concatenated into a huge list, also
returns number or rows
"""
one = kwargs.get('one', True)
global HIC_DATA
HIC_DATA = hic
if not isinstance(things, list):
things = [things]
matrices = []
for thing in things:
if isinstance(thing, HiC_data):
matrices.append(thing)
elif isinstance(thing, file_types):
parser = parser or (abc_reader if __is_abc(thing) else autoreader)
matrix, size, header, masked, sym = parser(thing)
thing.close()
chromosomes, sections, resolution = _header_to_section(header,
resolution)
matrices.append(HiC_data(matrix, size, dict_sec=sections,
chromosomes=chromosomes,
resolution=resolution,
symmetricized=sym, masked=masked))
elif isinstance(thing, basestring):
if is_cooler(thing, resolution if resolution > 1 else None):
matrix, size, header, masked, sym = parse_cooler(thing,
resolution if resolution > 1 else None,
not hic)
else:
try:
with gzopen(thing) as f_thing:
parser = parser or (abc_reader if __is_abc(f_thing) else autoreader)
matrix, size, header, masked, sym = parser(f_thing)
except IOError:
if len(thing.split('\n')) > 1:
parser = parser or (abc_reader if __is_abc(thing.split('\n')) else autoreader)
matrix, size, header, masked, sym = parser(thing.split('\n'))
else:
raise IOError('\n ERROR: file %s not found\n' % thing)
sections = dict([(h, i) for i, h in enumerate(header)])
chromosomes, sections, resolution = _header_to_section(header,
resolution)
matrices.append(HiC_data(matrix, size, dict_sec=sections,
chromosomes=chromosomes, masked=masked,
resolution=resolution,
symmetricized=sym))
elif isinstance(thing, list):
if all([len(thing)==len(l) for l in thing]):
size = len(thing)
matrix = [(i + j * size, v) for i, l in enumerate(thing)
for j, v in enumerate(l) if v]
else:
raise Exception('must be list of lists, all with same length.')
matrices.append(HiC_data(matrix, size))
elif isinstance(thing, tuple):
# case we know what we are doing and passing directly list of tuples
matrix = thing
siz = sqrt(len(thing))
if int(siz) != siz:
raise AttributeError('ERROR: matrix should be square.\n')
size = int(siz)
matrices.append(HiC_data(matrix, size))
elif isinstance(thing, (np.ndarray, np.generic) ):
try:
row, col = thing.shape
if row != col:
raise Exception('matrix needs to be square.')
matrix = thing.reshape(-1).tolist()[0]
size = row
except Exception as exc:
print('Error found:', exc)
matrices.append(HiC_data(matrix, size))
else:
raise Exception('Unable to read this file or whatever it is :)')
if one:
return matrices[0]
else:
return matrices
[docs]def load_hic_data_from_reads(fnam, resolution, **kwargs):
"""
:param fnam: tsv file with reads1 and reads2
:param resolution: the resolution of the experiment (size of a bin in
bases)
:param genome_seq: a dictionary containing the genomic sequence by
chromosome
:param False get_sections: for very very high resolution, when the column
index does not fit in memory
"""
sections = []
genome_seq = OrderedDict()
fhandler = open(fnam)
line = next(fhandler)
size = 0
while line.startswith('#'):
if line.startswith('# CRM '):
crm, clen = line[6:].split()
genome_seq[crm] = int(clen) // resolution + 1
size += genome_seq[crm]
line = next(fhandler)
if kwargs.get('get_sections', True):
for crm in genome_seq:
len_crm = genome_seq[crm]
sections.extend([(crm, i) for i in range(len_crm)])
dict_sec = dict([(j, i) for i, j in enumerate(sections)])
imx = HiC_data((), size, genome_seq, dict_sec, resolution=resolution)
try:
while True:
_, cr1, ps1, _, _, _, _, cr2, ps2, _ = line.split('\t', 9)
try:
ps1 = dict_sec[(cr1, int(ps1) // resolution)]
ps2 = dict_sec[(cr2, int(ps2) // resolution)]
except KeyError:
ps1 = int(ps1) // resolution
ps2 = int(ps2) // resolution
imx[ps1, ps2] += 1
imx[ps2, ps1] += 1
line = next(fhandler)
except StopIteration:
pass
fhandler.close()
imx.symmetricized = True
return imx
def load_hic_data_from_bam(fnam, resolution, biases=None, tmpdir='.', ncpus=8,
filter_exclude=(1, 2, 3, 4, 6, 7, 8, 9, 10),
region=None, verbose=True, clean=True):
"""
:param fnam: TADbit-generated BAM file with read-ends1 and read-ends2
:param resolution: the resolution of the experiment (size of a bin in
bases)
:param None biases: path to pickle file where are stored the biases. Keys
in this file should be: 'biases', 'badcol', 'decay' and 'resolution'
:param '.' tmpdir: path to folder where to create temporary files
:param 8 ncpus:
:param (1, 2, 3, 4, 6, 7, 8, 9, 10) filter exclude: filters to define the
set of valid pair of reads.
:param None region: chromosome name, if None, all genome will be loaded
:returns: HiC_data object
"""
bam = AlignmentFile(fnam)
genome_seq = OrderedDict((c, l) for c, l in
zip(bam.references,
[x // resolution + 1 for x in bam.lengths]))
bam.close()
sections = []
if region:
size = genome_seq[region]
sections.extend([(region, i) for i in range(size)])
else:
for crm in genome_seq:
len_crm = genome_seq[crm]
sections.extend([(crm, i) for i in range(len_crm)])
size = sum(genome_seq.values())
chromosomes = {region: genome_seq[region]} if region else genome_seq
dict_sec = dict([(j, i) for i, j in enumerate(sections)])
imx = HiC_data((), size, chromosomes=chromosomes, dict_sec=dict_sec,
resolution=resolution)
if biases:
if isinstance(biases, basestring):
biases = load(open(biases,'rb'))
if biases['resolution'] != resolution:
raise Exception('ERROR: resolution of biases do not match to the '
'one wanted (%d vs %d)' % (
biases['resolution'], resolution))
if region:
chrom_start = 0
chrom_end = 0
for crm in genome_seq:
if crm == region:
chrom_end = chrom_start + genome_seq[crm]
break
len_crm = genome_seq[crm]
chrom_start += len_crm
imx.bias = dict((k - chrom_start, v)
for k, v in biases.get('biases', {}).items()
if chrom_start <= k < chrom_end)
imx.bads = dict((k - chrom_start, v)
for k, v in biases.get('badcol', {}).items()
if chrom_start <= k < chrom_end)
else:
imx.bads = biases['badcol']
imx.bias = biases['biases']
imx.expected = biases['decay']
get_matrix(fnam, resolution, biases=None, filter_exclude=filter_exclude,
normalization='raw', tmpdir=tmpdir, clean=clean,
ncpus=ncpus, dico=imx, region1=region, verbose=verbose)
imx._symmetricize()
imx.symmetricized = True
return imx