Source code for pytadbit.parsers.sam_parser

"""
17 nov. 2014
"""
from __future__ import print_function
from builtins import next

from itertools import combinations
from bisect import bisect_right as bisect
from pysam import Samfile
from pytadbit.mapping.restriction_enzymes import map_re_sites
from warnings import warn
import os
from sys import stdout

try:
    basestring
except NameError:
    basestring = str


def parse_sam(f_names1, f_names2=None, out_file1=None, out_file2=None,
              genome_seq=None, re_name=None, verbose=False, clean=True,
              mapper=None, **kwargs):
    """
    Parse sam/bam file using pysam tools.

    Keep a summary of the results into 2 tab-separated files that will
    contain 7 columns: read ID, Chromosome, position, strand (either 0 or
    1), mapped sequence length, position of the closest upstream RE site,
    position of the closest downstream RE site.

    :param f_names1: a list of paths to sam/bam files corresponding to the
       mapping of read1, can also be just one file
    :param f_names2: a list of paths to sam/bam files corresponding to the
       mapping of read2, can also be just one file
    :param out_file1: path to outfile tab separated format containing mapped
       read1 information
    :param out_file2: path to outfile tab separated format containing mapped
       read2 information
    :param genome_seq: a dictionary generated by
       :func:`pytadbit.parsers.genome_parser.parse_fasta` containing the
       genomic sequence
    :param re_name: name of the restriction enzyme used
    :param None mapper: software used to map (supported are GEM and BOWTIE2).
       Guessed from file by default.
    """
    # not nice, dirty fix in order to allow this function to only parse
    # one SAM file
    if not out_file1:
        raise Exception('ERROR: out_file1 should be given\n')
    if not re_name:
        raise Exception('ERROR: re_name should be given\n')
    if not genome_seq:
        raise Exception('ERROR: genome_seq should be given\n')
    if (f_names2 and not out_file2) or (not f_names2 and out_file2):
        raise Exception('ERROR: out_file2 AND f_names2 needed\n')

    frag_chunk = kwargs.get('frag_chunk', 100000)
    if verbose:
        print('Searching and mapping RE sites to the reference genome')
    frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk,
                         verbose=verbose)

    if isinstance(f_names1, basestring):
        f_names1 = [f_names1]
    if isinstance(f_names2, basestring):
        f_names2 = [f_names2]
    if f_names2:
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
    else:
        fnames = (f_names1,)
        outfiles = (out_file1,)

    # max number of reads per intermediate file for sorting
    max_size = 1000000
    windows = {}
    multis = {}
    procs = []
    for read in range(len(fnames)):
        if verbose:
            print('Loading read' + str(read + 1))
        windows[read] = {}
        num = 0
        # iteration over reads
        nfile = 0
        tmp_files = []
        reads = []
        for fnam in fnames[read]:
            try:
                fhandler = Samfile(fnam)
            except IOError:
                print('WARNING: file "%s" not found' % fnam)
                continue
            except ValueError:
                raise Exception('ERROR: not a SAM/BAM file\n%s' % fnam)
            # get the iteration number of the iterative mapping
            try:
                num = int(fnam.split('.')[-1].split(':')[0])
            except ValueError:
                num += 1
            # set read counter
            windows[read].setdefault(num, 0)
            # guess mapper used
            if not mapper:
                mapper = fhandler.header['PG'][0]['ID']
            if mapper.lower() == 'gem':
                condition = lambda x: x[1][0][0] != 'N'
            elif mapper.lower() in ['bowtie', 'bowtie2']:
                condition = lambda x: 'XS' == x[0][0]
            else:
                warn('WARNING: unrecognized mapper used to generate file\n')
                condition = lambda x: x[1][1] != 1
            if verbose:
                print('loading SAM file from %s: %s' % (mapper, fnam))
            # get chromosome names from the SAM/BAM header (getrname)
            i = 0
            crm_dict = {}
            while True:
                try:
                    crm_dict[i] = fhandler.getrname(i)
                    i += 1
                except ValueError:
                    break
            # iteration over reads
            sub_count = 0  # to empty read buffer
            for r in fhandler:
                if r.is_unmapped:
                    continue
                if condition(r.tags):
                    continue
                positive = not r.is_reverse
                crm = crm_dict[r.tid]
                len_seq = len(r.seq)
                if positive:
                    pos = r.pos + 1
                else:
                    pos = r.pos + len_seq
                try:
                    frag_piece = frags[crm][pos // frag_chunk]
                except KeyError:
                    # Chromosome not in hash
                    continue
                idx = bisect(frag_piece, pos)
                try:
                    next_re = frag_piece[idx]
                except IndexError:
                    # case where part of the read is mapped outside chromosome
                    count = 0
                    while idx >= len(frag_piece) and count < len_seq:
                        pos -= 1
                        count += 1
                        frag_piece = frags[crm][pos // frag_chunk]
                        idx = bisect(frag_piece, pos)
                    if count >= len_seq:
                        raise Exception('Read mapped mostly outside '
                                        'chromosome\n(also reference genome '
                                        'can be truncated)')
                    next_re = frag_piece[idx]
                prev_re = frag_piece[idx - 1 if idx else 0]
                name = r.qname
                reads.append('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % (
                    name, crm, pos, positive, len_seq, prev_re, next_re))
                windows[read][num] += 1
                sub_count += 1
                if sub_count >= max_size:
                    sub_count = 0
                    nfile += 1
                    write_reads_to_file(reads, outfiles[read], tmp_files,
                                        nfile)
        nfile += 1
        write_reads_to_file(reads, outfiles[read], tmp_files, nfile)

        # we now have sorted temporary files
        # we do merge sort for each pair
        if verbose:
            stdout.write('Merge sort')
            stdout.flush()
        while len(tmp_files) > 1:
            file1 = tmp_files.pop(0)
            try:
                file2 = tmp_files.pop(0)
            except IndexError:
                break
            if verbose:
                stdout.write('.')
                stdout.flush()
            nfile += 1
            tmp_files.append(merge_sort(file1, file2, outfiles[read], nfile))
        if verbose:
            stdout.write('\n')
        tmp_name = tmp_files[0]

        if verbose:
            print('Getting Multiple contacts')
        reads_fh = open(outfiles[read], 'w')
        ## Also pipe file header
        # chromosome sizes (in order)
        reads_fh.write('# Chromosome lengths (order matters):\n')
        for crm in genome_seq:
            reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
        reads_fh.write('# Mapped\treads count by iteration\n')
        for size in windows[read]:
            reads_fh.write('# MAPPED %d %d\n' % (size, windows[read][size]))

        ## Multicontacts
        tmp_reads_fh = open(tmp_name)
        try:
            read_line = next(tmp_reads_fh)
        except StopIteration:
            raise StopIteration('ERROR!\n Nothing parsed, check input files '
                                'and chromosome names (in genome.fasta and '
                                'SAM/MAP files).')
        prev_head = read_line.split('\t', 1)[0]
        prev_head = prev_head.split('~', 1)[0]
        prev_read = read_line
        multis[read] = {}
        multi = 0
        for read_line in tmp_reads_fh:
            head = read_line.split('\t', 1)[0]
            head = head.split('~', 1)[0]
            if head == prev_head:
                # same read ID: merge fragments into one multi-contact line
                prev_read = prev_read.strip() + '|||' + read_line
                multi += 1
            else:
                reads_fh.write(prev_read)
                prev_read = read_line
                try:
                    multis[read][multi] += 1
                except KeyError:
                    multis[read][multi] = 1
                multi = 0
            prev_head = head
        reads_fh.write(prev_read)
        reads_fh.close()
        tmp_reads_fh.close()
        if clean:
            os.system('rm -rf ' + tmp_name)
    # wait for compression to finish
    for p in procs:
        p.communicate()
    return windows, multis
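
# Usage sketch: a minimal, hedged example of how parse_sam() might be called
# after iterative mapping. All file paths and the enzyme name below are
# hypothetical placeholders; parse_fasta is the genome loader referenced in
# the docstring above.
def _example_parse_sam():
    from pytadbit.parsers.genome_parser import parse_fasta
    genome_seq = parse_fasta('genome.fasta')  # hypothetical FASTA path
    windows, multis = parse_sam(
        ['read1_iter1.bam', 'read1_iter2.bam'],  # hypothetical read1 maps
        ['read2_iter1.bam', 'read2_iter2.bam'],  # hypothetical read2 maps
        out_file1='reads1.tsv', out_file2='reads2.tsv',
        genome_seq=genome_seq, re_name='HindIII', verbose=True)
    # windows[i] maps mapping-iteration number to mapped-read counts;
    # multis[i] counts reads spanning several RE fragments
    return windows, multis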
def parse_gem_3c(f_name, out_file, genome_lengths, frags, verbose=False,
                 tmp_format=False, **kwargs):
    """
    Parse a gem 3c sam file using pysam tools.

    :param f_name: path to sam file corresponding to the mapping of reads
    :param out_file: path to outfile tab separated format containing paired
       read information
    :param genome_lengths: a dictionary containing the length of the genomic
       sequence per chromosome
    :param False tmp_format: if True, leave the file prepared to be merged
       with other map files
    """
    frag_chunk = kwargs.get('frag_chunk', 100000)
    try:
        fhandler = Samfile(f_name)
    except IOError:
        raise Exception('ERROR: file "%s" not found' % f_name)

    # max number of reads in buffer
    max_size = 1000000

    # get chromosome names from the SAM/BAM header (getrname)
    i = 0
    crm_dict = {}
    while True:
        try:
            crm_dict[i] = fhandler.getrname(i)
            i += 1
        except ValueError:
            break
    # iteration over reads
    sub_count = 0
    nfile = 0
    tmp_files = []
    reads = []
    cur_name = ''
    write_pairs = False
    read1 = None
    read2 = []
    samiter = fhandler.fetch(until_eof=True)
    r = None
    try:
        r = next(samiter)
    except StopIteration:
        # empty SAM file
        return None
    while r:
        if not r.is_paired or r.is_unmapped or r.mapq < 4:
            try:
                r = next(samiter)
            except StopIteration:
                break
            continue
        if r.is_read1 and cur_name != r.qname:
            if read1 is None:
                read1 = r
                cur_name = r.qname
                try:
                    r = next(samiter)
                except StopIteration:
                    break
                continue
            else:
                # new read ID: flush the pair collected so far
                write_pairs = True
        if not write_pairs:
            if r.is_read2 or r.is_supplementary:
                read2.append(r)
            try:
                r = next(samiter)
            except StopIteration:
                break
            continue
        else:
            if not read2:
                write_pairs = False
                read1 = None
                try:
                    r = next(samiter)
                except StopIteration:
                    break
                continue
            reads_grp = []
            read_id = read1.query_name
            for read in [read1] + read2:
                if read.query_name != read_id:
                    continue
                positive = not read.is_reverse
                crm = crm_dict[read.tid]
                len_seq = read.reference_end - read.pos
                if positive:
                    pos = read.pos + 1
                else:
                    pos = read.pos + len_seq
                try:
                    frag_piece = frags[crm][pos // frag_chunk]
                except KeyError:
                    # Chromosome not in hash: discard the whole group
                    reads_grp = []
                    break
                idx = bisect(frag_piece, pos)
                try:
                    next_re = frag_piece[idx]
                except IndexError:
                    # case where part of the read is mapped outside chromosome
                    count = 0
                    while idx >= len(frag_piece) and count < len_seq:
                        pos -= 1
                        count += 1
                        frag_piece = frags[crm][pos // frag_chunk]
                        idx = bisect(frag_piece, pos)
                    if count >= len_seq:
                        raise Exception('Read mapped mostly outside '
                                        'chromosome\n')
                    next_re = frag_piece[idx]
                prev_re = frag_piece[idx - 1 if idx else 0]
                reads_grp.append([read.tid, crm, pos, positive, len_seq,
                                  prev_re, next_re])

            if len(reads_grp) > 2:
                _merge_multis(reads_grp)
            elif len(reads_grp) < 2:
                reads_grp = []

            reads_multi = []
            for paired_reads in combinations(reads_grp, 2):
                read_multi = [item for sublist
                              in sorted(paired_reads,
                                        key=lambda x: (x[0], x[2]))
                              for item in sublist]
                if read_multi:
                    reads_multi.append(read_multi)
                    sub_count += 1

            paired_total = len(reads_multi)
            paired_nbr = 0
            for pair_read in reads_multi:
                read_name_id = read_id
                paired_nbr += 1
                if paired_total > 1:
                    read_name_id += '#%d/%d' % (paired_nbr, paired_total)
                reads.append([read_name_id] + pair_read)

            if sub_count >= max_size:
                sub_count = 0
                nfile += 1
                reads = sorted(reads,
                               key=lambda x: (x[1], x[3], x[8], x[10]))
                read_lines = ['%s\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%d\t'
                              '%d\t%d\t%d\t%d\n' % tuple(read)
                              for read in reads]
                write_paired_reads_to_file(read_lines, out_file, tmp_files,
                                           nfile)
                del reads[:]
            write_pairs = False
            read1 = None
            del read2[:]
    if reads:
        nfile += 1
        reads = sorted(reads, key=lambda x: (x[1], x[3], x[8], x[10]))
        read_lines = ['%s\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t'
                      '%d\t%d\n' % tuple(read) for read in reads]
        write_paired_reads_to_file(read_lines, out_file, tmp_files, nfile)

    # we now have sorted temporary files
    # we do merge sort for each pair
    if verbose:
        stdout.write('Merge sort')
        stdout.flush()
    while len(tmp_files) > 1:
        file1 = tmp_files.pop(0)
        try:
            file2 = tmp_files.pop(0)
        except IndexError:
            break
        if verbose:
            stdout.write('.')
            stdout.flush()
        nfile += 1
        tmp_files.append(merge_sort(file1, file2, out_file, nfile,
                                    paired=True))
    if verbose:
        stdout.write('\n')

    if tmp_format:
        os.rename(tmp_files[0], out_file)
    else:
        map_out = open(out_file, 'w')
        tmp_reads_fh = open(tmp_files[0])
        for crm in genome_lengths:
            map_out.write('# CRM %s\t%d\n' % (crm, genome_lengths[crm]))
        for read_line in tmp_reads_fh:
            read = read_line.split('\t')
            # drop the numeric chromosome ids (columns 2 and 9) kept only
            # for sorting
            map_out.write('\t'.join([read[0]] + read[2:8] + read[9:]))
        map_out.close()
        tmp_reads_fh.close()
        os.system('rm -rf ' + tmp_files[0])
    return out_file


def write_reads_to_file(reads, outfiles, tmp_files, nfile):
    if not reads:  # the buffer can be empty
        return
    # build the temporary file name next to the final output file
    tmp_name = os.path.join(*outfiles.split('/')[:-1]
                            + [('tmp_%03d_' % nfile)
                               + outfiles.split('/')[-1]])
    # restore the leading slash lost by split() for absolute paths
    tmp_name = ('/' * outfiles.startswith('/')) + tmp_name
    tmp_files.append(tmp_name)
    out = open(tmp_name, 'w')
    # sort the buffer by read ID (ignoring the '~' suffix of split reads)
    out.write(''.join(sorted(reads,
                             key=lambda x: x.split('\t', 1)[0].split('~')[0])))
    out.close()
    del reads[:]  # empty the buffer in place


def write_paired_reads_to_file(reads, outfiles, tmp_files, nfile):
    if not reads:  # the buffer can be empty
        return
    # build the temporary file name next to the final output file
    tmp_name = os.path.join(*outfiles.split('/')[:-1]
                            + [('tmp_%03d_' % nfile)
                               + outfiles.split('/')[-1]])
    # restore the leading slash lost by split() for absolute paths
    tmp_name = ('/' * outfiles.startswith('/')) + tmp_name
    tmp_files.append(tmp_name)
    out = open(tmp_name, 'w')
    out.write(''.join(reads))  # reads are already sorted by the caller
    out.close()
    del reads[:]  # empty the buffer in place


def _merge_multis(reads_multi):
    merged_reads = []
    elts = {}
    for read in reads_multi:
        elts.setdefault((read[1], read[5], read[6]), []).append(read)
    # write contacts by pairs
    # loop over RE fragments
    for elt in elts:
        # case we have 1 read-frag inside current fragment
        if len(elts[elt]) == 1:
            merged_reads.append(elts[elt][0])
        # case all fragments fell into a single RE frag
        # we take only first and last
        elif len(elts) == 1:
            elts[elt] = sorted(elts[elt],
                               key=lambda x: int(x[2]))[::len(elts[elt]) - 1]
            merged_reads.append(elts[elt][0])
            merged_reads.append(elts[elt][1])
        # case we have several read-frags in this RE fragment
        else:
            # take first and last
            map1, map2 = sorted(elts[elt],
                                key=lambda x: int(x[2]))[::len(elts[elt]) - 1]
            strand = map1[3]
            # if the 2 strands are different keep the longest fragment
            if strand != map2[3]:
                map1 = max(elts[elt], key=lambda x: int(x[4]))
                merged_reads.append(map1)
                continue
            # sum up read-frags in the RE fragment by putting them on the
            # same strand, using the strand of the first fragment as
            # reference
            if strand == 1:
                beg = int(map1[2])
                nts = int(map2[2]) + int(map2[4]) - beg
            else:
                beg = int(map2[2])
                nts = beg - (int(map1[2]) - int(map1[4]))
            # keep numeric types for the downstream tab-separated formatting
            merged_reads.append(list(map1[:2]) + [beg, strand, nts]
                                + list(map1[5:]))
    # modify the list in place so that the caller sees the merged reads
    reads_multi[:] = merged_reads


def merge_sort(file1, file2, outfiles, nfile, paired=False):
    # build the temporary file name next to the final output file
    tmp_name = os.path.join(*outfiles.split('/')[:-1]
                            + [('tmp_merged_%03d_' % nfile)
                               + outfiles.split('/')[-1]])
    # restore the leading slash lost by split() for absolute paths
    tmp_name = ('/' * outfiles.startswith('/')) + tmp_name
    tmp_file = open(tmp_name, 'w')
    fh1 = open(file1)
    fh2 = open(file2)
    if paired:
        # sort on chromosome id and position of both mates
        greater = lambda x, y: [y, x] == sorted(
            [y, x], key=lambda j: (int(j.split('\t')[1]),
                                   float(j.split('\t')[3]),
                                   int(j.split('\t')[8]),
                                   float(j.split('\t')[10])))
    else:
        # sort on read ID (ignoring the '~' suffix of split reads)
        greater = lambda x, y: (x.split('\t', 1)[0].split('~')[0] >
                                y.split('\t', 1)[0].split('~')[0])
    read1 = next(fh1)
    read2 = next(fh2)
    while True:
        if greater(read2, read1):
            tmp_file.write(read1)
            try:
                read1 = next(fh1)
            except StopIteration:
                tmp_file.write(read2)
                break
        else:
            tmp_file.write(read2)
            try:
                read2 = next(fh2)
            except StopIteration:
                tmp_file.write(read1)
                break
    # copy whatever remains of the non-exhausted file
    for read in fh1:
        tmp_file.write(read)
    for read in fh2:
        tmp_file.write(read)
    fh1.close()
    fh2.close()
    tmp_file.close()
    os.system('rm -f ' + file1)
    os.system('rm -f ' + file2)
    return tmp_name
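
# Usage sketch: a minimal, hedged example of the external merge sort driven
# by merge_sort() above. Each temporary file already holds sorted lines;
# pairs of files are folded until a single sorted file remains. File names
# here are hypothetical placeholders.
def _example_merge_sort(tmp_files, out_file):
    # e.g. tmp_files = ['tmp_001_reads.tsv', 'tmp_002_reads.tsv',
    #                   'tmp_003_reads.tsv'], each sorted by read ID
    nfile = len(tmp_files)
    while len(tmp_files) > 1:
        file1 = tmp_files.pop(0)
        file2 = tmp_files.pop(0)
        nfile += 1
        # paired=False sorts on read IDs; merge_sort deletes its two inputs
        tmp_files.append(merge_sort(file1, file2, out_file, nfile))
    return tmp_files[0]  # path to the fully merged, sorted file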