from __future__ import print_function
from itertools import combinations
from os import path, system
from sys import stdout
from collections import OrderedDict
from multiprocessing import cpu_count
from distutils.version import LooseVersion
from subprocess import Popen, PIPE
from pytadbit.utils.file_handling import mkdir, magic_open, which
def eq_reads(rd1, rd2):
"""
Compare reads accounting for multicontacts
"""
return rd1.split('~', 1)[0] == rd2.split('~', 1)[0]
def gt_reads(rd1, rd2):
"""
Compare reads accounting for multicontacts
"""
return rd1.split('~', 1)[0] > rd2.split('~', 1)[0]
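
# e.g. with hypothetical read names, eq_reads('read1~1', 'read1~2') is True
# and gt_reads('read2~1', 'read1~2') is True: the '~' suffix marking
# multi-contact fragments is ignored when comparing read names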
def merge_2d_beds(path1, path2, outpath):
"""
Merge two result files (file resulting from get_intersection or from
the filtering) into one.
:param path1: path to first file
:param path2: path to second file
:returns: number of reads processed
"""
fh1 = open(path1)
fh2 = open(path2)
# parse header
chromosomes = OrderedDict()
parsed = 0
for line in fh1:
if not line.startswith('#'):
break
parsed += len(line)
if not line.startswith('# CRM'):
continue
_, _, crm, val = line.split()
chromosomes[crm] = val
fh1.seek(parsed)
# check other headers
parsed = 0
for line in fh2:
if not line.startswith('#'):
break
parsed += len(line)
if not line.startswith('# CRM'):
continue
_, _, crm, val = line.split()
if chromosomes[crm] != val:
raise Exception('ERROR: files are the result of mapping on '
'different reference genomes')
fh2.seek(parsed)
    # compare read names, ignoring any '~' multi-contact suffix
    def greater(x, y):
        return (x.split('\t', 1)[0].split('~')[0] >
                y.split('\t', 1)[0].split('~')[0])
# write headers
out = open(outpath, 'w')
for crm in chromosomes:
out.write('# CRM %s\t%s\n' % (crm, chromosomes[crm]))
    # merge-sort the two files on read name
read1 = next(fh1)
read2 = next(fh2)
number_reads = 0
while True:
if greater(read2, read1):
out.write(read1)
number_reads += 1
try:
read1 = next(fh1)
except StopIteration:
out.write(read2)
number_reads += 1
break
else:
out.write(read2)
number_reads += 1
try:
read2 = next(fh2)
except StopIteration:
out.write(read1)
number_reads += 1
break
for read in fh1:
out.write(read)
number_reads += 1
for read in fh2:
out.write(read)
number_reads += 1
fh1.close()
fh2.close()
out.close()
return number_reads
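
# A minimal usage sketch for merge_2d_beds (hypothetical file names; assumes
# both inputs were produced by get_intersection, or by the filtering, on the
# same reference genome):
#
#     nreads = merge_2d_beds('rep1_reads.tsv', 'rep2_reads.tsv',
#                            'merged_reads.tsv')
#     print('%d reads written' % nreads)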
def merge_bams(bam1, bam2, output_bam, cpus=cpu_count(), samtools='samtools',
               verbose=True):
"""
Merge two bam files with samtools into one.
:param bam1: path to first file
:param bam2: path to second file
"""
samtools = which(samtools)
if verbose:
        print(' - Merging experiments')
system(samtools + ' merge -@ %d %s %s %s' % (cpus, output_bam, bam1, bam2))
if verbose:
print(' - Indexing new BAM file')
# check samtools version number and modify command line
version = LooseVersion([l.split()[1]
for l in Popen(samtools, stderr=PIPE,
universal_newlines=True).communicate()[1].split('\n')
if 'Version' in l][0])
if version >= LooseVersion('1.3.1'):
system(samtools + ' index -@ %d %s' % (cpus, output_bam))
else:
system(samtools + ' index %s' % (output_bam))
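
# A minimal usage sketch for merge_bams (hypothetical file names; assumes a
# samtools binary is available in the PATH):
#
#     merge_bams('rep1.bam', 'rep2.bam', 'merged.bam', cpus=8)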
def get_intersection(fname1, fname2, out_path, verbose=False, compress=False):
"""
Merges the two files corresponding to each reads sides. Reads found in both
files are merged and written in an output file.
Dealing with multiple contacts:
- a pairwise contact is created for each possible combnation of the
multicontacts. The name of the read is extended by '# 1/3' in case
the reported pairwise contact corresponds to the first of 3 possibles
- it may happen that different contacts are mapped on a single RE fragment
(if each are on different end), in which case:
- if no other fragment from this read are mapped than, both are kept
- otherwise, they are merged into one longer (as if they were mapped
in the positive strand)
:param fname1: path to a tab separated file generated by the function
:func:`pytadbit.parsers.sam_parser.parse_sam`
:param fname2: path to a tab separated file generated by the function
:func:`pytadbit.parsers.sam_parser.parse_sam`
:param out_path: path to an outfile. It will written in a similar format as
the inputs
:param False compress: compress (gzip) input files. This is done in the
background while next input files are parsed.
:returns: final number of pair of interacting fragments, and a dictionary with
the number of multiple contacts (keys of the dictionary being the number of
fragment cought together, can be 3, 4, 5..)
"""
# Get the headers of the two files
reads1 = magic_open(fname1)
line1 = next(reads1)
header1 = ''
while line1.startswith('#'):
if line1.startswith('# CRM'):
header1 += line1
line1 = next(reads1)
read1 = line1.split('\t', 1)[0]
reads2 = magic_open(fname2)
line2 = next(reads2)
header2 = ''
while line2.startswith('#'):
if line2.startswith('# CRM'):
header2 += line2
line2 = next(reads2)
read2 = line2.split('\t', 1)[0]
if header1 != header2:
        raise Exception('ERROR: files seem to have been mapped on different '
                        'reference genomes\n')
# prepare to write read pairs into different files
# depending on genomic position
nchunks = 1024
global CHROM_START
CHROM_START = {}
cum_pos = 0
for line in header1.split('\n'):
if line.startswith('# CRM'):
_, _, crm, pos = line.split()
CHROM_START[crm] = cum_pos
cum_pos += int(pos)
lchunk = cum_pos // nchunks
buf = dict([(i, []) for i in range(nchunks + 1)])
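    # e.g. for a ~3 Gb genome and 1024 chunks, lchunk is ~3 Mb: a read pair
    # whose upstream mate has cumulative position p is buffered in
    # buf[p // lchunk] (numbers are hypothetical, for illustration only)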
# prepare temporary directories
tmp_dir = out_path + '_tmp_files'
mkdir(tmp_dir)
for i in range(nchunks // int(nchunks**0.5) + 1):
mkdir(path.join(tmp_dir, 'rep_%03d' % i))
    # iterate over reads in each of the two input files,
    # store them into a dictionary and then into temporary files
    # (the dictionary is emptied every 1 million entries)
if verbose:
        print('Getting intersection of reads 1 and reads 2:')
count = 0
count_dots = -1
multiples = {}
try:
while True:
if verbose:
if not count_dots % 10:
stdout.write(' ')
if not count_dots % 50:
stdout.write('%s\n ' % (
                        (' %4d million reads' % (count_dots)) if
count_dots else ''))
if count_dots >= 0:
stdout.write('.')
stdout.flush()
count_dots += 1
            for _ in range(1000000):  # process 1 million lines, then flush to files
                # same read id in both lines: put the more upstream one
                # first and store the pair
if eq_reads(read1, read2):
count += 1
_process_lines(line1, line2, buf, multiples, lchunk)
line1 = next(reads1)
read1 = line1.split('\t', 1)[0]
line2 = next(reads2)
read2 = line2.split('\t', 1)[0]
# if first element of line1 is greater than the one of line2:
elif gt_reads(read1, read2):
line2 = next(reads2)
read2 = line2.split('\t', 1)[0]
else:
line1 = next(reads1)
read1 = line1.split('\t', 1)[0]
write_to_files(buf, tmp_dir, nchunks)
except StopIteration:
reads1.close()
reads2.close()
write_to_files(buf, tmp_dir, nchunks)
if verbose:
        print('\nFound %d pairs of reads mapping uniquely' % count)
# compression
if compress:
if verbose:
print('compressing input files')
procs = [Popen(['gzip', f]) for f in (fname1, fname2)]
# sort each tmp file according to first element (idx) and write them
# to output file (without the idx)
# sort also according to read 2 (to filter duplicates)
# and also according to strand
if verbose:
print('Sorting each temporary file by genomic coordinate')
out = open(out_path, 'w')
out.write(header1)
for b in buf:
if verbose:
stdout.write('\r %4d/%d sorted files' % (b + 1, len(buf)))
stdout.flush()
with open(path.join(tmp_dir, 'rep_%03d' % (b // int(nchunks**0.5)),
'tmp_%05d.tsv' % b)) as f_tmp:
out.write(''.join(['\t'.join(l[1:]) for l in sorted(
[l.split('\t') for l in f_tmp],
key=lambda x: (x[0], x[8], x[9], x[6]))]))
out.close()
if compress:
for proc in procs:
proc.communicate()
system('rm -rf ' + fname1)
system('rm -rf ' + fname2)
if verbose:
print('\nRemoving temporary files...')
system('rm -rf ' + tmp_dir)
return count, multiples
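
# A minimal usage sketch for get_intersection (hypothetical file names; the
# two inputs are the per-side files produced by parse_sam on the same
# reference genome):
#
#     count, multiples = get_intersection('reads1.tsv', 'reads2.tsv',
#                                         'intersection.tsv', verbose=True)
#     # e.g. multiples == {3: 120, 4: 7} would mean 120 reads with three
#     # fragments and 7 reads with four fragments caught together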
def _loc_reads(r1, r2):
"""
Put upstream read before, get position in buf
"""
pos1 = CHROM_START[r1[1]] + int(r1[2])
pos2 = CHROM_START[r2[1]] + int(r2[2])
if pos1 > pos2:
r1, r2 = r2, r1
pos1, pos2 = pos2, pos1
return r1, r2, pos1
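
# e.g. with CHROM_START == {'chr1': 0, 'chr2': 249250621} (hypothetical
# values), a read mapped at position 1000 on chr2 gets cumulative
# position 249251621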
def write_to_files(buf, tmp_dir, nchunks):
    """
    Flush each buffered chunk to its temporary file and empty the buffer
    """
    for b in buf:
        out = open(path.join(tmp_dir, 'rep_%03d' % (b // int(nchunks**0.5)),
                             'tmp_%05d.tsv' % b), 'a')
        out.write('\n'.join(buf[b]))
        if buf[b]:  # avoid writing a lone newline when the buffer is empty
            out.write('\n')
        out.close()
        del buf[b][:]
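
# Temporary files are spread over sub-directories holding sqrt(nchunks) files
# each: e.g. with nchunks=1024, chunk 42 goes to rep_001/tmp_00042.tsv
# (42 // 32 == 1)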
def _process_lines(line1, line2, buf, multiples, lchunk):
    """
    Pair the two sides of a read, expanding multi-contacts into all their
    pairwise contacts, and append the resulting line(s) to the buffer
    """
    # case we have potential multi-contacts
    if '|||' in line1 or '|||' in line2:
elts = {}
for read in line1.split('|||'):
nam, crm, pos, strd, nts, beg, end = read.strip().split('\t')
elts.setdefault((crm, beg, end), []).append(
(nam, crm, pos, strd, nts, beg, end))
for read in line2.split('|||'):
nam, crm, pos, strd, nts, beg, end = read.strip().split('\t')
elts.setdefault((crm, beg, end), []).append(
(nam, crm, pos, strd, nts, beg, end))
# write contacts by pairs
# loop over RE fragments
for elt in elts:
            # case a single read-frag maps to the current RE fragment
            if len(elts[elt]) == 1:
                elts[elt] = elts[elt][0]
            # case all read-frags fell into a single RE fragment:
            # we keep only the first and the last
elif len(elts) == 1:
elts[elt] = sorted(
elts[elt], key=lambda x: int(x[2]))[::len(elts[elt])-1]
elts1 = {elt: elts[elt][0]}
elts2 = {elt: elts[elt][1]}
            # case we have several read-frags in this RE fragment
else:
# take first and last
map1, map2 = sorted(
elts[elt], key=lambda x: int(x[2]))[::len(elts[elt])-1]
strand = map1[3]
# if the 2 strands are different keep the longest fragment
if strand != map2[3]:
map1 = tuple(max(elts[elt], key=lambda x: int(x[4])))
elts[elt] = map1
continue
elts[elt] = map1
# sum up read-frags in the RE fragment by putting
# them on the same strand
# use the strand of the first fragment as reference
if strand == '1':
beg = int(map1[2])
nts = int(map2[2]) + int(map2[4]) - beg
else:
beg = int(map2[2])
nts = beg - (int(map1[2]) - (int(map1[4])))
elts[elt] = tuple(list(map1[:2]) + [str(beg), strand, str(nts)]
+ list(map1[5:]))
contacts = len(elts) - 1
if contacts > 1:
multiples.setdefault(contacts, 0)
multiples[contacts] += 1
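            # number of pairwise contacts among the (contacts + 1) fragments:
            # C(n, 2), e.g. 3 fragments yield 3 pairwise contacts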
prod_cont = contacts * (contacts + 1) // 2
for i, (r1, r2) in enumerate(combinations(list(elts.values()), 2)):
r1, r2, idx = _loc_reads(r1, r2)
buf[idx // lchunk].append('%d\t%s#%d/%d\t%s\t%s' % (
idx, r1[0], i + 1, prod_cont, '\t'.join(r1[1:]),
'\t'.join(r2[1:])))
elif contacts == 1:
r1, r2, idx = _loc_reads(list(elts.values())[0], list(elts.values())[1])
buf[idx // lchunk].append('%d\t%s\t%s' % (idx, '\t'.join(r1), '\t'.join(r2[1:])))
else:
r1, r2, idx = _loc_reads(list(elts1.values())[0], list(elts2.values())[0])
buf[idx // lchunk].append('%d\t%s\t%s' % (idx, '\t'.join(r1), '\t'.join(r2[1:])))
else:
r1, r2, idx = _loc_reads(line1.strip().split('\t'), line2.strip().split('\t'))
buf[idx // lchunk].append('%d\t%s\t%s' % (idx, '\t'.join(r1), '\t'.join(r2[1:])))
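
# Illustration of the multi-contact expansion above (hypothetical read): a
# read mapping to 3 RE fragments A, B and C yields the 3 pairwise contacts
# A-B, A-C and B-C, written as 'name#1/3', 'name#2/3' and 'name#3/3'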