Source code for pytadbit.mapping

from __future__ import print_function
from itertools                    import combinations
from os                           import path, system
from sys                          import stdout
from collections                  import OrderedDict
from multiprocessing              import cpu_count
from distutils.version            import LooseVersion
from subprocess                   import Popen, PIPE

from pytadbit.utils.file_handling import mkdir, magic_open, which


def eq_reads(rd1, rd2):
    """
    Compare read identifiers for equality, ignoring the '~' suffix that
    tags multicontact fragments
    """
    return rd1.split('~', 1)[0] == rd2.split('~', 1)[0]


def gt_reads(rd1, rd2):
    """
    Compare read identifiers for ordering, ignoring the '~' suffix that
    tags multicontact fragments
    """
    return rd1.split('~', 1)[0] > rd2.split('~', 1)[0]
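
A quick illustration of how these two helpers behave (the read identifiers
below are hypothetical; everything after the first '~' marks a multicontact
fragment and is ignored by the comparison)::

    >>> eq_reads('read1~1', 'read1~2')
    True
    >>> gt_reads('read2~1', 'read1~9')
    True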


def merge_2d_beds(path1, path2, outpath):
    """
    Merge two result files (files resulting from get_intersection or from
    the filtering) into one.

    :param path1: path to first file
    :param path2: path to second file
    :param outpath: path to the merged output file

    :returns: number of reads processed
    """
    fh1 = open(path1)
    fh2 = open(path2)
    # parse header
    chromosomes = OrderedDict()
    parsed = 0
    for line in fh1:
        if not line.startswith('#'):
            break
        parsed += len(line)
        if not line.startswith('# CRM'):
            continue
        _, _, crm, val = line.split()
        chromosomes[crm] = val
    fh1.seek(parsed)
    # check the other file's headers
    parsed = 0
    for line in fh2:
        if not line.startswith('#'):
            break
        parsed += len(line)
        if not line.startswith('# CRM'):
            continue
        _, _, crm, val = line.split()
        if chromosomes[crm] != val:
            raise Exception('ERROR: files are the result of mapping on '
                            'different reference genomes')
    fh2.seek(parsed)
    # comparison function
    greater = lambda x, y: (x.split('\t', 1)[0].split('~')[0] >
                            y.split('\t', 1)[0].split('~')[0])
    # write headers
    out = open(outpath, 'w')
    for crm in chromosomes:
        out.write('# CRM %s\t%s\n' % (crm, chromosomes[crm]))
    # merge-sort the two files
    read1 = next(fh1)
    read2 = next(fh2)
    number_reads = 0
    while True:
        if greater(read2, read1):
            out.write(read1)
            number_reads += 1
            try:
                read1 = next(fh1)
            except StopIteration:
                out.write(read2)
                number_reads += 1
                break
        else:
            out.write(read2)
            number_reads += 1
            try:
                read2 = next(fh2)
            except StopIteration:
                out.write(read1)
                number_reads += 1
                break
    # flush whatever remains of the non-exhausted file
    for read in fh1:
        out.write(read)
        number_reads += 1
    for read in fh2:
        out.write(read)
        number_reads += 1
    fh1.close()
    fh2.close()
    out.close()
    return number_reads
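
A minimal usage sketch of the function above (file names are hypothetical;
both inputs must result from mapping onto the same reference genome)::

    from pytadbit.mapping import merge_2d_beds

    nreads = merge_2d_beds('rep1_reads.tsv', 'rep2_reads.tsv',
                           'merged_reads.tsv')
    print('%d reads written to merged_reads.tsv' % nreads)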
def merge_bams(bam1, bam2, output_bam, cpus=cpu_count(),
               samtools='samtools', verbose=True):
    """
    Merge two BAM files into one with samtools.

    :param bam1: path to first file
    :param bam2: path to second file
    :param output_bam: path to the merged output BAM file
    :param cpu_count() cpus: number of threads passed to samtools
    :param 'samtools' samtools: path to samtools binary
    :param True verbose: print progress messages
    """
    samtools = which(samtools)
    if verbose:
        print(' - Merging experiments')
    system(samtools + ' merge -@ %d %s %s %s' % (cpus, output_bam,
                                                 bam1, bam2))
    if verbose:
        print(' - Indexing new BAM file')
    # check samtools version number and modify command line accordingly
    # (threaded indexing is only available from samtools 1.3.1 onwards)
    version = LooseVersion([l.split()[1]
                            for l in Popen(samtools, stderr=PIPE,
                                           universal_newlines=True
                                           ).communicate()[1].split('\n')
                            if 'Version' in l][0])
    if version >= LooseVersion('1.3.1'):
        system(samtools + ' index -@ %d %s' % (cpus, output_bam))
    else:
        system(samtools + ' index %s' % (output_bam))
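
A minimal usage sketch (paths are hypothetical; samtools must be installed
and reachable through the PATH, since merging and indexing are delegated
to it)::

    from pytadbit.mapping import merge_bams

    merge_bams('rep1.bam', 'rep2.bam', 'merged.bam', cpus=8)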
def get_intersection(fname1, fname2, out_path, verbose=False,
                     compress=False):
    """
    Merges the two files corresponding to each of the read ends. Reads
    found in both files are paired and written to an output file.

    Dealing with multiple contacts:

     - a pairwise contact is created for each possible combination of the
       multicontacts. The name of the read is extended by '# 1/3' in case
       the reported pairwise contact corresponds to the first of 3
       possible combinations
     - it may happen that different contacts are mapped onto a single RE
       fragment (if each comes from a different read end), in which case:

        - if no other fragment from this read is mapped, then both are
          kept
        - otherwise, they are merged into one longer fragment (as if they
          were mapped on the positive strand)

    :param fname1: path to a tab separated file generated by the function
       :func:`pytadbit.parsers.sam_parser.parse_sam`
    :param fname2: path to a tab separated file generated by the function
       :func:`pytadbit.parsers.sam_parser.parse_sam`
    :param out_path: path to an outfile. It will be written in a format
       similar to the inputs
    :param False verbose: print progress messages
    :param False compress: compress (gzip) input files. This is done in
       the background while the next input files are parsed.

    :returns: final number of pairs of interacting fragments, and a
       dictionary with the number of multiple contacts (the keys of the
       dictionary being the number of fragments caught together, can be
       3, 4, 5...)
    """
    # Get the headers of the two files
    reads1 = magic_open(fname1)
    line1 = next(reads1)
    header1 = ''
    while line1.startswith('#'):
        if line1.startswith('# CRM'):
            header1 += line1
        line1 = next(reads1)
    read1 = line1.split('\t', 1)[0]

    reads2 = magic_open(fname2)
    line2 = next(reads2)
    header2 = ''
    while line2.startswith('#'):
        if line2.startswith('# CRM'):
            header2 += line2
        line2 = next(reads2)
    read2 = line2.split('\t', 1)[0]

    if header1 != header2:
        raise Exception('ERROR: files seem to be mapped onto different '
                        'reference genomes\n')

    # prepare to write read pairs into different files
    # depending on genomic position
    nchunks = 1024
    global CHROM_START
    CHROM_START = {}
    cum_pos = 0
    for line in header1.split('\n'):
        if line.startswith('# CRM'):
            _, _, crm, pos = line.split()
            CHROM_START[crm] = cum_pos
            cum_pos += int(pos)
    lchunk = cum_pos // nchunks
    buf = dict([(i, []) for i in range(nchunks + 1)])

    # prepare temporary directories
    tmp_dir = out_path + '_tmp_files'
    mkdir(tmp_dir)
    for i in range(nchunks // int(nchunks**0.5) + 1):
        mkdir(path.join(tmp_dir, 'rep_%03d' % i))

    # iterate over reads in each of the two input files
    # and store them into a dictionary and then into temporary files
    # (the dictionary is emptied every 1 million entries)
    if verbose:
        print('Getting intersection of reads 1 and reads 2:')
    count = 0
    count_dots = -1
    multiples = {}
    try:
        while True:
            if verbose:
                if not count_dots % 10:
                    stdout.write(' ')
                if not count_dots % 50:
                    stdout.write('%s\n ' % (
                        (' %4d million reads' % (count_dots))
                        if count_dots else ''))
                if count_dots >= 0:
                    stdout.write('.')
                stdout.flush()
                count_dots += 1
            for _ in range(1000000):
                # same read id in both lines: put the more upstream read
                # first and store the pair
                if eq_reads(read1, read2):
                    count += 1
                    _process_lines(line1, line2, buf, multiples, lchunk)
                    line1 = next(reads1)
                    read1 = line1.split('\t', 1)[0]
                    line2 = next(reads2)
                    read2 = line2.split('\t', 1)[0]
                # if first element of line1 is greater than the one of line2:
                elif gt_reads(read1, read2):
                    line2 = next(reads2)
                    read2 = line2.split('\t', 1)[0]
                else:
                    line1 = next(reads1)
                    read1 = line1.split('\t', 1)[0]
            write_to_files(buf, tmp_dir, nchunks)
    except StopIteration:
        reads1.close()
        reads2.close()
        write_to_files(buf, tmp_dir, nchunks)

    if verbose:
        print('\nFound %d pairs of reads mapping uniquely' % count)

    # compression
    if compress:
        if verbose:
            print('compressing input files')
        procs = [Popen(['gzip', f]) for f in (fname1, fname2)]

    # sort each tmp file according to the first element (idx) and write
    # them to the output file (without the idx)
    # sort also according to read 2 (to filter duplicates)
    # and also according to strand
    if verbose:
        print('Sorting each temporary file by genomic coordinate')
    out = open(out_path, 'w')
    out.write(header1)
    for b in buf:
        if verbose:
            stdout.write('\r %4d/%d sorted files' % (b + 1, len(buf)))
            stdout.flush()
        with open(path.join(tmp_dir, 'rep_%03d' % (b // int(nchunks**0.5)),
                            'tmp_%05d.tsv' % b)) as f_tmp:
            out.write(''.join(['\t'.join(l[1:]) for l in sorted(
                [l.split('\t') for l in f_tmp],
                key=lambda x: (x[0], x[8], x[9], x[6]))]))
    out.close()

    if compress:
        for proc in procs:
            proc.communicate()
        system('rm -rf ' + fname1)
        system('rm -rf ' + fname2)

    if verbose:
        print('\nRemoving temporary files...')
    system('rm -rf ' + tmp_dir)

    return count, multiples
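
A minimal usage sketch (file names are hypothetical; both inputs are
expected to come from :func:`pytadbit.parsers.sam_parser.parse_sam` run
against the same reference genome)::

    from pytadbit.mapping import get_intersection

    count, multiples = get_intersection('reads1.tsv', 'reads2.tsv',
                                        'intersected.tsv', verbose=True)
    print('%d unique read pairs' % count)
    print('multicontact histogram: %s' % multiples)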
def _loc_reads(r1, r2):
    """
    Put the upstream read first, get position in buf
    """
    pos1 = CHROM_START[r1[1]] + int(r1[2])
    pos2 = CHROM_START[r2[1]] + int(r2[2])
    if pos1 > pos2:
        r1, r2 = r2, r1
        pos1, pos2 = pos2, pos1
    return r1, r2, pos1


def write_to_files(buf, tmp_dir, nchunks):
    for b in buf:
        out = open(path.join(tmp_dir, 'rep_%03d' % (b // int(nchunks**0.5)),
                             'tmp_%05d.tsv' % b), 'a')
        out.write('\n'.join(buf[b]))
        if buf[b]:  # add a trailing newline only if something was written
            out.write('\n')
        out.close()
        del(buf[b][:])


def _process_lines(line1, line2, buf, multiples, lchunk):
    # case we have potential multicontacts
    if '|||' in line1 or '|||' in line2:
        elts = {}
        for read in line1.split('|||'):
            nam, crm, pos, strd, nts, beg, end = read.strip().split('\t')
            elts.setdefault((crm, beg, end), []).append(
                (nam, crm, pos, strd, nts, beg, end))
        for read in line2.split('|||'):
            nam, crm, pos, strd, nts, beg, end = read.strip().split('\t')
            elts.setdefault((crm, beg, end), []).append(
                (nam, crm, pos, strd, nts, beg, end))
        # write contacts by pairs
        # loop over RE fragments
        for elt in elts:
            # case we have a single read-frag inside current fragment
            if len(elts[elt]) == 1:
                elts[elt] = elts[elt][0]
            # case all fragments fell into a single RE frag:
            # we take only first and last
            elif len(elts) == 1:
                elts[elt] = sorted(
                    elts[elt], key=lambda x: int(x[2]))[::len(elts[elt]) - 1]
                elts1 = {elt: elts[elt][0]}
                elts2 = {elt: elts[elt][1]}
            # case we have several read-frags in this RE fragment
            else:
                # take first and last
                map1, map2 = sorted(
                    elts[elt], key=lambda x: int(x[2]))[::len(elts[elt]) - 1]
                strand = map1[3]
                # if the 2 strands are different keep the longest fragment
                if strand != map2[3]:
                    map1 = tuple(max(elts[elt], key=lambda x: int(x[4])))
                    elts[elt] = map1
                    continue
                elts[elt] = map1
                # sum up read-frags in the RE fragment by putting
                # them on the same strand
                # use the strand of the first fragment as reference
                if strand == '1':
                    beg = int(map1[2])
                    nts = int(map2[2]) + int(map2[4]) - beg
                else:
                    beg = int(map2[2])
                    nts = beg - (int(map1[2]) - (int(map1[4])))
                elts[elt] = tuple(list(map1[:2]) + [str(beg), strand,
                                                    str(nts)]
                                  + list(map1[5:]))
        contacts = len(elts) - 1
        if contacts > 1:
            multiples.setdefault(contacts, 0)
            multiples[contacts] += 1
            prod_cont = contacts * (contacts + 1) // 2
            for i, (r1, r2) in enumerate(combinations(list(elts.values()),
                                                      2)):
                r1, r2, idx = _loc_reads(r1, r2)
                buf[idx // lchunk].append('%d\t%s#%d/%d\t%s\t%s' % (
                    idx, r1[0], i + 1, prod_cont,
                    '\t'.join(r1[1:]), '\t'.join(r2[1:])))
        elif contacts == 1:
            r1, r2, idx = _loc_reads(list(elts.values())[0],
                                     list(elts.values())[1])
            buf[idx // lchunk].append('%d\t%s\t%s' % (
                idx, '\t'.join(r1), '\t'.join(r2[1:])))
        else:
            r1, r2, idx = _loc_reads(list(elts1.values())[0],
                                     list(elts2.values())[0])
            buf[idx // lchunk].append('%d\t%s\t%s' % (
                idx, '\t'.join(r1), '\t'.join(r2[1:])))
    else:
        r1, r2, idx = _loc_reads(line1.strip().split('\t'),
                                 line2.strip().split('\t'))
        buf[idx // lchunk].append('%d\t%s\t%s' % (
            idx, '\t'.join(r1), '\t'.join(r2[1:])))
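
To make the multicontact expansion in `_process_lines` concrete: a read
mapped over ``n`` distinct RE fragments produces ``n * (n - 1) / 2``
pairwise contacts, each tagged ``#i/total`` in the read name. A standalone
sketch of the same arithmetic (fragment names are hypothetical)::

    from itertools import combinations

    frags = ['fragA', 'fragB', 'fragC']          # n = 3 RE fragments
    contacts = len(frags) - 1                    # 2 contacts beyond the first frag
    prod_cont = contacts * (contacts + 1) // 2   # 3 pairs, i.e. C(3, 2)
    for i, (f1, f2) in enumerate(combinations(frags, 2)):
        print('read#%d/%d\t%s -- %s' % (i + 1, prod_cont, f1, f2))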